Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Generic socket support routines. Memory allocators, socket lock/release
8 * handler for protocols to use and generic option handler.
9 *
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 */
85
86#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87
88#include <asm/unaligned.h>
89#include <linux/capability.h>
90#include <linux/errno.h>
91#include <linux/errqueue.h>
92#include <linux/types.h>
93#include <linux/socket.h>
94#include <linux/in.h>
95#include <linux/kernel.h>
96#include <linux/module.h>
97#include <linux/proc_fs.h>
98#include <linux/seq_file.h>
99#include <linux/sched.h>
100#include <linux/sched/mm.h>
101#include <linux/timer.h>
102#include <linux/string.h>
103#include <linux/sockios.h>
104#include <linux/net.h>
105#include <linux/mm.h>
106#include <linux/slab.h>
107#include <linux/interrupt.h>
108#include <linux/poll.h>
109#include <linux/tcp.h>
110#include <linux/init.h>
111#include <linux/highmem.h>
112#include <linux/user_namespace.h>
113#include <linux/static_key.h>
114#include <linux/memcontrol.h>
115#include <linux/prefetch.h>
116#include <linux/compat.h>
117
118#include <linux/uaccess.h>
119
120#include <linux/netdevice.h>
121#include <net/protocol.h>
122#include <linux/skbuff.h>
123#include <net/net_namespace.h>
124#include <net/request_sock.h>
125#include <net/sock.h>
126#include <linux/net_tstamp.h>
127#include <net/xfrm.h>
128#include <linux/ipsec.h>
129#include <net/cls_cgroup.h>
130#include <net/netprio_cgroup.h>
131#include <linux/sock_diag.h>
132
133#include <linux/filter.h>
134#include <net/sock_reuseport.h>
135#include <net/bpf_sk_storage.h>
136
137#include <trace/events/sock.h>
138
139#include <net/tcp.h>
140#include <net/busy_poll.h>
141
142static DEFINE_MUTEX(proto_list_mutex);
143static LIST_HEAD(proto_list);
144
145static void sock_inuse_add(struct net *net, int val);
146
147/**
148 * sk_ns_capable - General socket capability test
149 * @sk: Socket to use a capability on or through
150 * @user_ns: The user namespace of the capability to use
151 * @cap: The capability to use
152 *
153 * Test to see if the opener of the socket had the capability @cap when
154 * the socket was created and if the current process has the capability
155 * @cap in the user namespace @user_ns.
156 */
157bool sk_ns_capable(const struct sock *sk,
158 struct user_namespace *user_ns, int cap)
159{
160 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
161 ns_capable(user_ns, cap);
162}
163EXPORT_SYMBOL(sk_ns_capable);
164
165/**
166 * sk_capable - Socket global capability test
167 * @sk: Socket to use a capability on or through
168 * @cap: The global capability to use
169 *
170 * Test to see if the opener of the socket had the capability @cap when
171 * the socket was created and if the current process has the capability
172 * @cap in all user namespaces.
173 */
174bool sk_capable(const struct sock *sk, int cap)
175{
176 return sk_ns_capable(sk, &init_user_ns, cap);
177}
178EXPORT_SYMBOL(sk_capable);
179
180/**
181 * sk_net_capable - Network namespace socket capability test
182 * @sk: Socket to use a capability on or through
183 * @cap: The capability to use
184 *
185 * Test to see if the opener of the socket had the capability @cap when the
186 * socket was created and if the current process has the capability @cap over
187 * the network namespace the socket is a member of.
188 */
189bool sk_net_capable(const struct sock *sk, int cap)
190{
191 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
192}
193EXPORT_SYMBOL(sk_net_capable);
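
/* A minimal usage sketch: a protocol's setsockopt path might gate a
 * privileged option on the socket opener's capabilities rather than on the
 * current process alone, e.g.:
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 *
 * sk_capable() is the same test against &init_user_ns, and sk_ns_capable()
 * lets the caller pick the user namespace explicitly.
 */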
194
195/*
196 * Each address family might have different locking rules, so we have
197 * one slock key per address family and separate keys for internal and
198 * userspace sockets.
199 */
200static struct lock_class_key af_family_keys[AF_MAX];
201static struct lock_class_key af_family_kern_keys[AF_MAX];
202static struct lock_class_key af_family_slock_keys[AF_MAX];
203static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
204
205/*
206 * Make lock validator output more readable. (We pre-construct these
207 * strings at build time, so that runtime initialization of socket
208 * locks is fast.)
209 */
210
211#define _sock_locks(x) \
212 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
213 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
214 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
215 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
216 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
217 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
218 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
219 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
220 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
221 x "27" , x "28" , x "AF_CAN" , \
222 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
223 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
224 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
225 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
226 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
227 x "AF_MAX"
228
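/* For example, _sock_locks("sk_lock-") expands to the literal list
 * "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX", "sk_lock-AF_INET", ...,
 * "sk_lock-AF_MAX", so each address family gets its own lockdep class name.
 */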
229static const char *const af_family_key_strings[AF_MAX+1] = {
230 _sock_locks("sk_lock-")
231};
232static const char *const af_family_slock_key_strings[AF_MAX+1] = {
233 _sock_locks("slock-")
234};
235static const char *const af_family_clock_key_strings[AF_MAX+1] = {
236 _sock_locks("clock-")
237};
238
239static const char *const af_family_kern_key_strings[AF_MAX+1] = {
240 _sock_locks("k-sk_lock-")
241};
242static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
243 _sock_locks("k-slock-")
244};
245static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
246 _sock_locks("k-clock-")
247};
248static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
249 _sock_locks("rlock-")
250};
251static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
252 _sock_locks("wlock-")
253};
254static const char *const af_family_elock_key_strings[AF_MAX+1] = {
255 _sock_locks("elock-")
256};
257
258/*
259 * sk_callback_lock and sk queues locking rules are per-address-family,
260 * so split the lock classes by using a per-AF key:
261 */
262static struct lock_class_key af_callback_keys[AF_MAX];
263static struct lock_class_key af_rlock_keys[AF_MAX];
264static struct lock_class_key af_wlock_keys[AF_MAX];
265static struct lock_class_key af_elock_keys[AF_MAX];
266static struct lock_class_key af_kern_callback_keys[AF_MAX];
267
268/* Run time adjustable parameters. */
269__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
270EXPORT_SYMBOL(sysctl_wmem_max);
271__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
272EXPORT_SYMBOL(sysctl_rmem_max);
273__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
274__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
275
276/* Maximal space eaten by iovec or ancillary data plus some space */
277int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
278EXPORT_SYMBOL(sysctl_optmem_max);
279
280int sysctl_tstamp_allow_data __read_mostly = 1;
281
282DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
283EXPORT_SYMBOL_GPL(memalloc_socks_key);
284
285/**
286 * sk_set_memalloc - sets %SOCK_MEMALLOC
287 * @sk: socket to set it on
288 *
289 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
290 * It's the responsibility of the admin to adjust min_free_kbytes
291 * to meet the requirements.
292 */
293void sk_set_memalloc(struct sock *sk)
294{
295 sock_set_flag(sk, SOCK_MEMALLOC);
296 sk->sk_allocation |= __GFP_MEMALLOC;
297 static_branch_inc(&memalloc_socks_key);
298}
299EXPORT_SYMBOL_GPL(sk_set_memalloc);
300
301void sk_clear_memalloc(struct sock *sk)
302{
303 sock_reset_flag(sk, SOCK_MEMALLOC);
304 sk->sk_allocation &= ~__GFP_MEMALLOC;
305 static_branch_dec(&memalloc_socks_key);
306
307 /*
308 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
309 * progress of swapping. SOCK_MEMALLOC may be cleared while
310 * it has rmem allocations due to the last swapfile being deactivated
311 * but there is a risk that the socket is unusable due to exceeding
312 * the rmem limits. Reclaim the reserves and obey rmem limits again.
313 */
314 sk_mem_reclaim(sk);
315}
316EXPORT_SYMBOL_GPL(sk_clear_memalloc);
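
/* A minimal usage sketch (as done by swap-over-network transports): a
 * socket that carries memory-reclaim traffic flags itself so that its
 * allocations may dip into the emergency reserves:
 *
 *	sk_set_memalloc(sock->sk);
 *	...
 *	sk_clear_memalloc(sock->sk);
 *
 * sk_clear_memalloc() is called once the socket no longer backs swap I/O.
 */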
317
318int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
319{
320 int ret;
321 unsigned int noreclaim_flag;
322
323 /* these should have been dropped before queueing */
324 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
325
326 noreclaim_flag = memalloc_noreclaim_save();
327 ret = sk->sk_backlog_rcv(sk, skb);
328 memalloc_noreclaim_restore(noreclaim_flag);
329
330 return ret;
331}
332EXPORT_SYMBOL(__sk_backlog_rcv);
333
334static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
335{
336 struct __kernel_sock_timeval tv;
337
338 if (timeo == MAX_SCHEDULE_TIMEOUT) {
339 tv.tv_sec = 0;
340 tv.tv_usec = 0;
341 } else {
342 tv.tv_sec = timeo / HZ;
343 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
344 }
345
346 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
347 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
348 *(struct old_timeval32 *)optval = tv32;
349 return sizeof(tv32);
350 }
351
352 if (old_timeval) {
353 struct __kernel_old_timeval old_tv;
354 old_tv.tv_sec = tv.tv_sec;
355 old_tv.tv_usec = tv.tv_usec;
356 *(struct __kernel_old_timeval *)optval = old_tv;
357 return sizeof(old_tv);
358 }
359
360 *(struct __kernel_sock_timeval *)optval = tv;
361 return sizeof(tv);
362}
363
364static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
365 bool old_timeval)
366{
367 struct __kernel_sock_timeval tv;
368
369 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
370 struct old_timeval32 tv32;
371
372 if (optlen < sizeof(tv32))
373 return -EINVAL;
374
375 if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
376 return -EFAULT;
377 tv.tv_sec = tv32.tv_sec;
378 tv.tv_usec = tv32.tv_usec;
379 } else if (old_timeval) {
380 struct __kernel_old_timeval old_tv;
381
382 if (optlen < sizeof(old_tv))
383 return -EINVAL;
384 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
385 return -EFAULT;
386 tv.tv_sec = old_tv.tv_sec;
387 tv.tv_usec = old_tv.tv_usec;
388 } else {
389 if (optlen < sizeof(tv))
390 return -EINVAL;
391 if (copy_from_sockptr(&tv, optval, sizeof(tv)))
392 return -EFAULT;
393 }
394 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
395 return -EDOM;
396
397 if (tv.tv_sec < 0) {
398 static int warned __read_mostly;
399
400 *timeo_p = 0;
401 if (warned < 10 && net_ratelimit()) {
402 warned++;
403 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
404 __func__, current->comm, task_pid_nr(current));
405 }
406 return 0;
407 }
408 *timeo_p = MAX_SCHEDULE_TIMEOUT;
409 if (tv.tv_sec == 0 && tv.tv_usec == 0)
410 return 0;
411 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
412 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
413 return 0;
414}
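
/* Worked example, assuming HZ == 250: a {2, 500000} timeval maps to
 * 2 * 250 + DIV_ROUND_UP(500000, 1000000 / 250) = 625 jiffies in
 * sock_set_timeout(), and sock_get_timeout() maps 625 jiffies back to
 * tv_sec = 2, tv_usec = 500000. A zero timeval means "no timeout" and is
 * stored as MAX_SCHEDULE_TIMEOUT.
 */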
415
416static bool sock_needs_netstamp(const struct sock *sk)
417{
418 switch (sk->sk_family) {
419 case AF_UNSPEC:
420 case AF_UNIX:
421 return false;
422 default:
423 return true;
424 }
425}
426
427static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
428{
429 if (sk->sk_flags & flags) {
430 sk->sk_flags &= ~flags;
431 if (sock_needs_netstamp(sk) &&
432 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
433 net_disable_timestamp();
434 }
435}
436
437
438int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
439{
440 unsigned long flags;
441 struct sk_buff_head *list = &sk->sk_receive_queue;
442
443 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
444 atomic_inc(&sk->sk_drops);
445 trace_sock_rcvqueue_full(sk, skb);
446 return -ENOMEM;
447 }
448
449 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
450 atomic_inc(&sk->sk_drops);
451 return -ENOBUFS;
452 }
453
454 skb->dev = NULL;
455 skb_set_owner_r(skb, sk);
456
457 /* We escape from the RCU protected region here, so make sure we
458 * don't leak a non-refcounted dst.
459 */
460 skb_dst_force(skb);
461
462 spin_lock_irqsave(&list->lock, flags);
463 sock_skb_set_dropcount(sk, skb);
464 __skb_queue_tail(list, skb);
465 spin_unlock_irqrestore(&list->lock, flags);
466
467 if (!sock_flag(sk, SOCK_DEAD))
468 sk->sk_data_ready(sk);
469 return 0;
470}
471EXPORT_SYMBOL(__sock_queue_rcv_skb);
472
473int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
474{
475 int err;
476
477 err = sk_filter(sk, skb);
478 if (err)
479 return err;
480
481 return __sock_queue_rcv_skb(sk, skb);
482}
483EXPORT_SYMBOL(sock_queue_rcv_skb);
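
/* A minimal receive-path sketch: a protocol's rcv handler typically hands
 * each skb to the matched socket and frees it itself on failure, e.g.:
 *
 *	if (sock_queue_rcv_skb(sk, skb) < 0) {
 *		kfree_skb(skb);
 *		return NET_RX_DROP;
 *	}
 *	return NET_RX_SUCCESS;
 */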
484
485int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
486 const int nested, unsigned int trim_cap, bool refcounted)
487{
488 int rc = NET_RX_SUCCESS;
489
490 if (sk_filter_trim_cap(sk, skb, trim_cap))
491 goto discard_and_relse;
492
493 skb->dev = NULL;
494
495 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
496 atomic_inc(&sk->sk_drops);
497 goto discard_and_relse;
498 }
499 if (nested)
500 bh_lock_sock_nested(sk);
501 else
502 bh_lock_sock(sk);
503 if (!sock_owned_by_user(sk)) {
504 /*
505 * trylock + unlock semantics:
506 */
507 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
508
509 rc = sk_backlog_rcv(sk, skb);
510
511 mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
512 } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
513 bh_unlock_sock(sk);
514 atomic_inc(&sk->sk_drops);
515 goto discard_and_relse;
516 }
517
518 bh_unlock_sock(sk);
519out:
520 if (refcounted)
521 sock_put(sk);
522 return rc;
523discard_and_relse:
524 kfree_skb(skb);
525 goto out;
526}
527EXPORT_SYMBOL(__sk_receive_skb);
528
529INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
530 u32));
531INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
532 u32));
533struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
534{
535 struct dst_entry *dst = __sk_dst_get(sk);
536
537 if (dst && dst->obsolete &&
538 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
539 dst, cookie) == NULL) {
540 sk_tx_queue_clear(sk);
541 sk->sk_dst_pending_confirm = 0;
542 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
543 dst_release(dst);
544 return NULL;
545 }
546
547 return dst;
548}
549EXPORT_SYMBOL(__sk_dst_check);
550
551struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
552{
553 struct dst_entry *dst = sk_dst_get(sk);
554
555 if (dst && dst->obsolete &&
556 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
557 dst, cookie) == NULL) {
558 sk_dst_reset(sk);
559 dst_release(dst);
560 return NULL;
561 }
562
563 return dst;
564}
565EXPORT_SYMBOL(sk_dst_check);
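
/* A minimal output-path sketch: callers revalidate the cached route before
 * use and fall back to a fresh lookup when the cache has gone stale, e.g.:
 *
 *	dst = sk_dst_check(sk, 0);
 *	if (!dst)
 *		dst = do_route_lookup(sk);
 *
 * where do_route_lookup() stands in for a protocol-specific lookup.
 * sk_dst_check() returns a referenced entry, so the caller must eventually
 * dst_release() it; __sk_dst_check() is the variant for callers that hold
 * the socket lock and work on the uncounted cache entry.
 */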
566
567static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
568{
569 int ret = -ENOPROTOOPT;
570#ifdef CONFIG_NETDEVICES
571 struct net *net = sock_net(sk);
572
573 /* Sorry... */
574 ret = -EPERM;
575 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
576 goto out;
577
578 ret = -EINVAL;
579 if (ifindex < 0)
580 goto out;
581
582 sk->sk_bound_dev_if = ifindex;
583 if (sk->sk_prot->rehash)
584 sk->sk_prot->rehash(sk);
585 sk_dst_reset(sk);
586
587 ret = 0;
588
589out:
590#endif
591
592 return ret;
593}
594
595int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
596{
597 int ret;
598
599 if (lock_sk)
600 lock_sock(sk);
601 ret = sock_bindtoindex_locked(sk, ifindex);
602 if (lock_sk)
603 release_sock(sk);
604
605 return ret;
606}
607EXPORT_SYMBOL(sock_bindtoindex);
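
/* A minimal sketch for in-kernel users: callers that already know the
 * ifindex can bind without going through the string-based SO_BINDTODEVICE
 * path, e.g.:
 *
 *	err = sock_bindtoindex(sock->sk, ifindex, true);
 *
 * Passing lock_sk == false is for callers that already hold the socket lock.
 */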
608
609static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
610{
611 int ret = -ENOPROTOOPT;
612#ifdef CONFIG_NETDEVICES
613 struct net *net = sock_net(sk);
614 char devname[IFNAMSIZ];
615 int index;
616
617 ret = -EINVAL;
618 if (optlen < 0)
619 goto out;
620
621 /* Bind this socket to a particular device like "eth0",
622 * as specified in the passed interface name. If the
623 * name is "" or the option length is zero the socket
624 * is not bound.
625 */
626 if (optlen > IFNAMSIZ - 1)
627 optlen = IFNAMSIZ - 1;
628 memset(devname, 0, sizeof(devname));
629
630 ret = -EFAULT;
631 if (copy_from_sockptr(devname, optval, optlen))
632 goto out;
633
634 index = 0;
635 if (devname[0] != '\0') {
636 struct net_device *dev;
637
638 rcu_read_lock();
639 dev = dev_get_by_name_rcu(net, devname);
640 if (dev)
641 index = dev->ifindex;
642 rcu_read_unlock();
643 ret = -ENODEV;
644 if (!dev)
645 goto out;
646 }
647
648 return sock_bindtoindex(sk, index, true);
649out:
650#endif
651
652 return ret;
653}
654
655static int sock_getbindtodevice(struct sock *sk, char __user *optval,
656 int __user *optlen, int len)
657{
658 int ret = -ENOPROTOOPT;
659#ifdef CONFIG_NETDEVICES
660 struct net *net = sock_net(sk);
661 char devname[IFNAMSIZ];
662
663 if (sk->sk_bound_dev_if == 0) {
664 len = 0;
665 goto zero;
666 }
667
668 ret = -EINVAL;
669 if (len < IFNAMSIZ)
670 goto out;
671
672 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
673 if (ret)
674 goto out;
675
676 len = strlen(devname) + 1;
677
678 ret = -EFAULT;
679 if (copy_to_user(optval, devname, len))
680 goto out;
681
682zero:
683 ret = -EFAULT;
684 if (put_user(len, optlen))
685 goto out;
686
687 ret = 0;
688
689out:
690#endif
691
692 return ret;
693}
694
695bool sk_mc_loop(struct sock *sk)
696{
697 if (dev_recursion_level())
698 return false;
699 if (!sk)
700 return true;
701 switch (sk->sk_family) {
702 case AF_INET:
703 return inet_sk(sk)->mc_loop;
704#if IS_ENABLED(CONFIG_IPV6)
705 case AF_INET6:
706 return inet6_sk(sk)->mc_loop;
707#endif
708 }
709 WARN_ON_ONCE(1);
710 return true;
711}
712EXPORT_SYMBOL(sk_mc_loop);
713
714void sock_set_reuseaddr(struct sock *sk)
715{
716 lock_sock(sk);
717 sk->sk_reuse = SK_CAN_REUSE;
718 release_sock(sk);
719}
720EXPORT_SYMBOL(sock_set_reuseaddr);
721
722void sock_set_reuseport(struct sock *sk)
723{
724 lock_sock(sk);
725 sk->sk_reuseport = true;
726 release_sock(sk);
727}
728EXPORT_SYMBOL(sock_set_reuseport);
729
730void sock_no_linger(struct sock *sk)
731{
732 lock_sock(sk);
733 sk->sk_lingertime = 0;
734 sock_set_flag(sk, SOCK_LINGER);
735 release_sock(sk);
736}
737EXPORT_SYMBOL(sock_no_linger);
738
739void sock_set_priority(struct sock *sk, u32 priority)
740{
741 lock_sock(sk);
742 sk->sk_priority = priority;
743 release_sock(sk);
744}
745EXPORT_SYMBOL(sock_set_priority);
746
747void sock_set_sndtimeo(struct sock *sk, s64 secs)
748{
749 lock_sock(sk);
750 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
751 sk->sk_sndtimeo = secs * HZ;
752 else
753 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
754 release_sock(sk);
755}
756EXPORT_SYMBOL(sock_set_sndtimeo);
757
758static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
759{
760 if (val) {
761 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
762 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
763 sock_set_flag(sk, SOCK_RCVTSTAMP);
764 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
765 } else {
766 sock_reset_flag(sk, SOCK_RCVTSTAMP);
767 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
768 }
769}
770
771void sock_enable_timestamps(struct sock *sk)
772{
773 lock_sock(sk);
774 __sock_set_timestamps(sk, true, false, true);
775 release_sock(sk);
776}
777EXPORT_SYMBOL(sock_enable_timestamps);
778
779void sock_set_keepalive(struct sock *sk)
780{
781 lock_sock(sk);
782 if (sk->sk_prot->keepalive)
783 sk->sk_prot->keepalive(sk, true);
784 sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
785 release_sock(sk);
786}
787EXPORT_SYMBOL(sock_set_keepalive);
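
/* A minimal sketch of how kernel socket users (e.g. in-kernel RPC or
 * storage transports) use the helpers above instead of sock_setsockopt():
 *
 *	struct socket *sock;
 *
 *	err = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
 *	if (err)
 *		return err;
 *	sock_set_reuseaddr(sock->sk);
 *	sock_no_linger(sock->sk);
 *	sock_set_sndtimeo(sock->sk, 5);
 *	sock_set_keepalive(sock->sk);
 */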
788
789static void __sock_set_rcvbuf(struct sock *sk, int val)
790{
791 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
792 * as a negative value.
793 */
794 val = min_t(int, val, INT_MAX / 2);
795 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
796
797 /* We double it on the way in to account for "struct sk_buff" etc.
798 * overhead. Applications assume that the SO_RCVBUF setting they make
799 * will allow that much actual data to be received on that socket.
800 *
801 * Applications are unaware that "struct sk_buff" and other overheads
802 * allocate from the receive buffer during socket buffer allocation.
803 *
804 * And after considering the possible alternatives, returning the value
805 * we actually used in getsockopt is the most desirable behavior.
806 */
807 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
808}
809
810void sock_set_rcvbuf(struct sock *sk, int val)
811{
812 lock_sock(sk);
813 __sock_set_rcvbuf(sk, val);
814 release_sock(sk);
815}
816EXPORT_SYMBOL(sock_set_rcvbuf);
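
/* Worked example: assuming 65536 does not exceed sysctl_rmem_max, a
 * setsockopt(SO_RCVBUF, 65536) request ends up with sk->sk_rcvbuf == 131072,
 * and a later getsockopt(SO_RCVBUF) reports that doubled value, as described
 * in __sock_set_rcvbuf() above.
 */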
817
818static void __sock_set_mark(struct sock *sk, u32 val)
819{
820 if (val != sk->sk_mark) {
821 sk->sk_mark = val;
822 sk_dst_reset(sk);
823 }
824}
825
826void sock_set_mark(struct sock *sk, u32 val)
827{
828 lock_sock(sk);
829 __sock_set_mark(sk, val);
830 release_sock(sk);
831}
832EXPORT_SYMBOL(sock_set_mark);
833
834/*
835 * This is meant for all protocols to use and covers goings on
836 * at the socket level. Everything here is generic.
837 */
838
839int sock_setsockopt(struct socket *sock, int level, int optname,
840 sockptr_t optval, unsigned int optlen)
841{
842 struct sock_txtime sk_txtime;
843 struct sock *sk = sock->sk;
844 int val;
845 int valbool;
846 struct linger ling;
847 int ret = 0;
848
849 /*
850 * Options without arguments
851 */
852
853 if (optname == SO_BINDTODEVICE)
854 return sock_setbindtodevice(sk, optval, optlen);
855
856 if (optlen < sizeof(int))
857 return -EINVAL;
858
859 if (copy_from_sockptr(&val, optval, sizeof(val)))
860 return -EFAULT;
861
862 valbool = val ? 1 : 0;
863
864 lock_sock(sk);
865
866 switch (optname) {
867 case SO_DEBUG:
868 if (val && !capable(CAP_NET_ADMIN))
869 ret = -EACCES;
870 else
871 sock_valbool_flag(sk, SOCK_DBG, valbool);
872 break;
873 case SO_REUSEADDR:
874 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
875 break;
876 case SO_REUSEPORT:
877 sk->sk_reuseport = valbool;
878 break;
879 case SO_TYPE:
880 case SO_PROTOCOL:
881 case SO_DOMAIN:
882 case SO_ERROR:
883 ret = -ENOPROTOOPT;
884 break;
885 case SO_DONTROUTE:
886 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
887 sk_dst_reset(sk);
888 break;
889 case SO_BROADCAST:
890 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
891 break;
892 case SO_SNDBUF:
893 /* Don't error on this; BSD doesn't, and if you think
894 * about it this is right. Otherwise apps have to
895 * play 'guess the biggest size' games. RCVBUF/SNDBUF
896 * are treated in BSD as hints.
897 */
898 val = min_t(u32, val, sysctl_wmem_max);
899set_sndbuf:
900 /* Ensure val * 2 fits into an int, to prevent max_t()
901 * from treating it as a negative value.
902 */
903 val = min_t(int, val, INT_MAX / 2);
904 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
905 WRITE_ONCE(sk->sk_sndbuf,
906 max_t(int, val * 2, SOCK_MIN_SNDBUF));
907 /* Wake up sending tasks if we upped the value. */
908 sk->sk_write_space(sk);
909 break;
910
911 case SO_SNDBUFFORCE:
912 if (!capable(CAP_NET_ADMIN)) {
913 ret = -EPERM;
914 break;
915 }
916
917 /* No negative values (to prevent underflow, as val will be
918 * multiplied by 2).
919 */
920 if (val < 0)
921 val = 0;
922 goto set_sndbuf;
923
924 case SO_RCVBUF:
925 /* Don't error on this; BSD doesn't, and if you think
926 * about it this is right. Otherwise apps have to
927 * play 'guess the biggest size' games. RCVBUF/SNDBUF
928 * are treated in BSD as hints.
929 */
930 __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
931 break;
932
933 case SO_RCVBUFFORCE:
934 if (!capable(CAP_NET_ADMIN)) {
935 ret = -EPERM;
936 break;
937 }
938
939 /* No negative values (to prevent underflow, as val will be
940 * multiplied by 2).
941 */
942 __sock_set_rcvbuf(sk, max(val, 0));
943 break;
944
945 case SO_KEEPALIVE:
946 if (sk->sk_prot->keepalive)
947 sk->sk_prot->keepalive(sk, valbool);
948 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
949 break;
950
951 case SO_OOBINLINE:
952 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
953 break;
954
955 case SO_NO_CHECK:
956 sk->sk_no_check_tx = valbool;
957 break;
958
959 case SO_PRIORITY:
960 if ((val >= 0 && val <= 6) ||
961 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
962 sk->sk_priority = val;
963 else
964 ret = -EPERM;
965 break;
966
967 case SO_LINGER:
968 if (optlen < sizeof(ling)) {
969 ret = -EINVAL; /* 1003.1g */
970 break;
971 }
972 if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
973 ret = -EFAULT;
974 break;
975 }
976 if (!ling.l_onoff)
977 sock_reset_flag(sk, SOCK_LINGER);
978 else {
979#if (BITS_PER_LONG == 32)
980 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
981 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
982 else
983#endif
984 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
985 sock_set_flag(sk, SOCK_LINGER);
986 }
987 break;
988
989 case SO_BSDCOMPAT:
990 break;
991
992 case SO_PASSCRED:
993 if (valbool)
994 set_bit(SOCK_PASSCRED, &sock->flags);
995 else
996 clear_bit(SOCK_PASSCRED, &sock->flags);
997 break;
998
999 case SO_TIMESTAMP_OLD:
1000 __sock_set_timestamps(sk, valbool, false, false);
1001 break;
1002 case SO_TIMESTAMP_NEW:
1003 __sock_set_timestamps(sk, valbool, true, false);
1004 break;
1005 case SO_TIMESTAMPNS_OLD:
1006 __sock_set_timestamps(sk, valbool, false, true);
1007 break;
1008 case SO_TIMESTAMPNS_NEW:
1009 __sock_set_timestamps(sk, valbool, true, true);
1010 break;
1011 case SO_TIMESTAMPING_NEW:
1012 case SO_TIMESTAMPING_OLD:
1013 if (val & ~SOF_TIMESTAMPING_MASK) {
1014 ret = -EINVAL;
1015 break;
1016 }
1017
1018 if (val & SOF_TIMESTAMPING_OPT_ID &&
1019 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
1020 if (sk->sk_protocol == IPPROTO_TCP &&
1021 sk->sk_type == SOCK_STREAM) {
1022 if ((1 << sk->sk_state) &
1023 (TCPF_CLOSE | TCPF_LISTEN)) {
1024 ret = -EINVAL;
1025 break;
1026 }
1027 sk->sk_tskey = tcp_sk(sk)->snd_una;
1028 } else {
1029 sk->sk_tskey = 0;
1030 }
1031 }
1032
1033 if (val & SOF_TIMESTAMPING_OPT_STATS &&
1034 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
1035 ret = -EINVAL;
1036 break;
1037 }
1038
1039 sk->sk_tsflags = val;
1040 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
1041
1042 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
1043 sock_enable_timestamp(sk,
1044 SOCK_TIMESTAMPING_RX_SOFTWARE);
1045 else
1046 sock_disable_timestamp(sk,
1047 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
1048 break;
1049
1050 case SO_RCVLOWAT:
1051 if (val < 0)
1052 val = INT_MAX;
1053 if (sock->ops->set_rcvlowat)
1054 ret = sock->ops->set_rcvlowat(sk, val);
1055 else
1056 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1057 break;
1058
1059 case SO_RCVTIMEO_OLD:
1060 case SO_RCVTIMEO_NEW:
1061 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1062 optlen, optname == SO_RCVTIMEO_OLD);
1063 break;
1064
1065 case SO_SNDTIMEO_OLD:
1066 case SO_SNDTIMEO_NEW:
1067 ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1068 optlen, optname == SO_SNDTIMEO_OLD);
1069 break;
1070
1071 case SO_ATTACH_FILTER: {
1072 struct sock_fprog fprog;
1073
1074 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1075 if (!ret)
1076 ret = sk_attach_filter(&fprog, sk);
1077 break;
1078 }
1079 case SO_ATTACH_BPF:
1080 ret = -EINVAL;
1081 if (optlen == sizeof(u32)) {
1082 u32 ufd;
1083
1084 ret = -EFAULT;
1085 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1086 break;
1087
1088 ret = sk_attach_bpf(ufd, sk);
1089 }
1090 break;
1091
1092 case SO_ATTACH_REUSEPORT_CBPF: {
1093 struct sock_fprog fprog;
1094
1095 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1096 if (!ret)
1097 ret = sk_reuseport_attach_filter(&fprog, sk);
1098 break;
1099 }
1100 case SO_ATTACH_REUSEPORT_EBPF:
1101 ret = -EINVAL;
1102 if (optlen == sizeof(u32)) {
1103 u32 ufd;
1104
1105 ret = -EFAULT;
1106 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1107 break;
1108
1109 ret = sk_reuseport_attach_bpf(ufd, sk);
1110 }
1111 break;
1112
1113 case SO_DETACH_REUSEPORT_BPF:
1114 ret = reuseport_detach_prog(sk);
1115 break;
1116
1117 case SO_DETACH_FILTER:
1118 ret = sk_detach_filter(sk);
1119 break;
1120
1121 case SO_LOCK_FILTER:
1122 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1123 ret = -EPERM;
1124 else
1125 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1126 break;
1127
1128 case SO_PASSSEC:
1129 if (valbool)
1130 set_bit(SOCK_PASSSEC, &sock->flags);
1131 else
1132 clear_bit(SOCK_PASSSEC, &sock->flags);
1133 break;
1134 case SO_MARK:
1135 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1136 ret = -EPERM;
1137 break;
1138 }
1139
1140 __sock_set_mark(sk, val);
1141 break;
1142
1143 case SO_RXQ_OVFL:
1144 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1145 break;
1146
1147 case SO_WIFI_STATUS:
1148 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1149 break;
1150
1151 case SO_PEEK_OFF:
1152 if (sock->ops->set_peek_off)
1153 ret = sock->ops->set_peek_off(sk, val);
1154 else
1155 ret = -EOPNOTSUPP;
1156 break;
1157
1158 case SO_NOFCS:
1159 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1160 break;
1161
1162 case SO_SELECT_ERR_QUEUE:
1163 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1164 break;
1165
1166#ifdef CONFIG_NET_RX_BUSY_POLL
1167 case SO_BUSY_POLL:
1168 /* allow unprivileged users to decrease the value */
1169 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1170 ret = -EPERM;
1171 else {
1172 if (val < 0)
1173 ret = -EINVAL;
1174 else
1175 sk->sk_ll_usec = val;
1176 }
1177 break;
1178 case SO_PREFER_BUSY_POLL:
1179 if (valbool && !capable(CAP_NET_ADMIN))
1180 ret = -EPERM;
1181 else
1182 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1183 break;
1184 case SO_BUSY_POLL_BUDGET:
1185 if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
1186 ret = -EPERM;
1187 } else {
1188 if (val < 0 || val > U16_MAX)
1189 ret = -EINVAL;
1190 else
1191 WRITE_ONCE(sk->sk_busy_poll_budget, val);
1192 }
1193 break;
1194#endif
1195
1196 case SO_MAX_PACING_RATE:
1197 {
1198 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1199
1200 if (sizeof(ulval) != sizeof(val) &&
1201 optlen >= sizeof(ulval) &&
1202 copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1203 ret = -EFAULT;
1204 break;
1205 }
1206 if (ulval != ~0UL)
1207 cmpxchg(&sk->sk_pacing_status,
1208 SK_PACING_NONE,
1209 SK_PACING_NEEDED);
1210 sk->sk_max_pacing_rate = ulval;
1211 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1212 break;
1213 }
1214 case SO_INCOMING_CPU:
1215 WRITE_ONCE(sk->sk_incoming_cpu, val);
1216 break;
1217
1218 case SO_CNX_ADVICE:
1219 if (val == 1)
1220 dst_negative_advice(sk);
1221 break;
1222
1223 case SO_ZEROCOPY:
1224 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1225 if (!((sk->sk_type == SOCK_STREAM &&
1226 sk->sk_protocol == IPPROTO_TCP) ||
1227 (sk->sk_type == SOCK_DGRAM &&
1228 sk->sk_protocol == IPPROTO_UDP)))
1229 ret = -ENOTSUPP;
1230 } else if (sk->sk_family != PF_RDS) {
1231 ret = -ENOTSUPP;
1232 }
1233 if (!ret) {
1234 if (val < 0 || val > 1)
1235 ret = -EINVAL;
1236 else
1237 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1238 }
1239 break;
1240
1241 case SO_TXTIME:
1242 if (optlen != sizeof(struct sock_txtime)) {
1243 ret = -EINVAL;
1244 break;
1245 } else if (copy_from_sockptr(&sk_txtime, optval,
1246 sizeof(struct sock_txtime))) {
1247 ret = -EFAULT;
1248 break;
1249 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1250 ret = -EINVAL;
1251 break;
1252 }
1253 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1254 * scheduler has enough safeguards.
1255 */
1256 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1257 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1258 ret = -EPERM;
1259 break;
1260 }
1261 sock_valbool_flag(sk, SOCK_TXTIME, true);
1262 sk->sk_clockid = sk_txtime.clockid;
1263 sk->sk_txtime_deadline_mode =
1264 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1265 sk->sk_txtime_report_errors =
1266 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1267 break;
1268
1269 case SO_BINDTOIFINDEX:
1270 ret = sock_bindtoindex_locked(sk, val);
1271 break;
1272
1273 default:
1274 ret = -ENOPROTOOPT;
1275 break;
1276 }
1277 release_sock(sk);
1278 return ret;
1279}
1280EXPORT_SYMBOL(sock_setsockopt);
1281
1282
1283static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1284 struct ucred *ucred)
1285{
1286 ucred->pid = pid_vnr(pid);
1287 ucred->uid = ucred->gid = -1;
1288 if (cred) {
1289 struct user_namespace *current_ns = current_user_ns();
1290
1291 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1292 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1293 }
1294}
1295
1296static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1297{
1298 struct user_namespace *user_ns = current_user_ns();
1299 int i;
1300
1301 for (i = 0; i < src->ngroups; i++)
1302 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1303 return -EFAULT;
1304
1305 return 0;
1306}
1307
1308int sock_getsockopt(struct socket *sock, int level, int optname,
1309 char __user *optval, int __user *optlen)
1310{
1311 struct sock *sk = sock->sk;
1312
1313 union {
1314 int val;
1315 u64 val64;
1316 unsigned long ulval;
1317 struct linger ling;
1318 struct old_timeval32 tm32;
1319 struct __kernel_old_timeval tm;
1320 struct __kernel_sock_timeval stm;
1321 struct sock_txtime txtime;
1322 } v;
1323
1324 int lv = sizeof(int);
1325 int len;
1326
1327 if (get_user(len, optlen))
1328 return -EFAULT;
1329 if (len < 0)
1330 return -EINVAL;
1331
1332 memset(&v, 0, sizeof(v));
1333
1334 switch (optname) {
1335 case SO_DEBUG:
1336 v.val = sock_flag(sk, SOCK_DBG);
1337 break;
1338
1339 case SO_DONTROUTE:
1340 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1341 break;
1342
1343 case SO_BROADCAST:
1344 v.val = sock_flag(sk, SOCK_BROADCAST);
1345 break;
1346
1347 case SO_SNDBUF:
1348 v.val = sk->sk_sndbuf;
1349 break;
1350
1351 case SO_RCVBUF:
1352 v.val = sk->sk_rcvbuf;
1353 break;
1354
1355 case SO_REUSEADDR:
1356 v.val = sk->sk_reuse;
1357 break;
1358
1359 case SO_REUSEPORT:
1360 v.val = sk->sk_reuseport;
1361 break;
1362
1363 case SO_KEEPALIVE:
1364 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1365 break;
1366
1367 case SO_TYPE:
1368 v.val = sk->sk_type;
1369 break;
1370
1371 case SO_PROTOCOL:
1372 v.val = sk->sk_protocol;
1373 break;
1374
1375 case SO_DOMAIN:
1376 v.val = sk->sk_family;
1377 break;
1378
1379 case SO_ERROR:
1380 v.val = -sock_error(sk);
1381 if (v.val == 0)
1382 v.val = xchg(&sk->sk_err_soft, 0);
1383 break;
1384
1385 case SO_OOBINLINE:
1386 v.val = sock_flag(sk, SOCK_URGINLINE);
1387 break;
1388
1389 case SO_NO_CHECK:
1390 v.val = sk->sk_no_check_tx;
1391 break;
1392
1393 case SO_PRIORITY:
1394 v.val = sk->sk_priority;
1395 break;
1396
1397 case SO_LINGER:
1398 lv = sizeof(v.ling);
1399 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
1400 v.ling.l_linger = sk->sk_lingertime / HZ;
1401 break;
1402
1403 case SO_BSDCOMPAT:
1404 break;
1405
1406 case SO_TIMESTAMP_OLD:
1407 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1408 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1409 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1410 break;
1411
1412 case SO_TIMESTAMPNS_OLD:
1413 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1414 break;
1415
1416 case SO_TIMESTAMP_NEW:
1417 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1418 break;
1419
1420 case SO_TIMESTAMPNS_NEW:
1421 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1422 break;
1423
1424 case SO_TIMESTAMPING_OLD:
1425 v.val = sk->sk_tsflags;
1426 break;
1427
1428 case SO_RCVTIMEO_OLD:
1429 case SO_RCVTIMEO_NEW:
1430 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1431 break;
1432
1433 case SO_SNDTIMEO_OLD:
1434 case SO_SNDTIMEO_NEW:
1435 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1436 break;
1437
1438 case SO_RCVLOWAT:
1439 v.val = sk->sk_rcvlowat;
1440 break;
1441
1442 case SO_SNDLOWAT:
1443 v.val = 1;
1444 break;
1445
1446 case SO_PASSCRED:
1447 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1448 break;
1449
1450 case SO_PEERCRED:
1451 {
1452 struct ucred peercred;
1453 if (len > sizeof(peercred))
1454 len = sizeof(peercred);
1455 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1456 if (copy_to_user(optval, &peercred, len))
1457 return -EFAULT;
1458 goto lenout;
1459 }
1460
1461 case SO_PEERGROUPS:
1462 {
1463 int ret, n;
1464
1465 if (!sk->sk_peer_cred)
1466 return -ENODATA;
1467
1468 n = sk->sk_peer_cred->group_info->ngroups;
1469 if (len < n * sizeof(gid_t)) {
1470 len = n * sizeof(gid_t);
1471 return put_user(len, optlen) ? -EFAULT : -ERANGE;
1472 }
1473 len = n * sizeof(gid_t);
1474
1475 ret = groups_to_user((gid_t __user *)optval,
1476 sk->sk_peer_cred->group_info);
1477 if (ret)
1478 return ret;
1479 goto lenout;
1480 }
1481
1482 case SO_PEERNAME:
1483 {
1484 char address[128];
1485
1486 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1487 if (lv < 0)
1488 return -ENOTCONN;
1489 if (lv < len)
1490 return -EINVAL;
1491 if (copy_to_user(optval, address, len))
1492 return -EFAULT;
1493 goto lenout;
1494 }
1495
1496 /* Dubious BSD thing... Probably nobody even uses it, but
1497 * the UNIX standard wants it for whatever reason... -DaveM
1498 */
1499 case SO_ACCEPTCONN:
1500 v.val = sk->sk_state == TCP_LISTEN;
1501 break;
1502
1503 case SO_PASSSEC:
1504 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1505 break;
1506
1507 case SO_PEERSEC:
1508 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1509
1510 case SO_MARK:
1511 v.val = sk->sk_mark;
1512 break;
1513
1514 case SO_RXQ_OVFL:
1515 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1516 break;
1517
1518 case SO_WIFI_STATUS:
1519 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1520 break;
1521
1522 case SO_PEEK_OFF:
1523 if (!sock->ops->set_peek_off)
1524 return -EOPNOTSUPP;
1525
1526 v.val = sk->sk_peek_off;
1527 break;
1528 case SO_NOFCS:
1529 v.val = sock_flag(sk, SOCK_NOFCS);
1530 break;
1531
1532 case SO_BINDTODEVICE:
1533 return sock_getbindtodevice(sk, optval, optlen, len);
1534
1535 case SO_GET_FILTER:
1536 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1537 if (len < 0)
1538 return len;
1539
1540 goto lenout;
1541
1542 case SO_LOCK_FILTER:
1543 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1544 break;
1545
1546 case SO_BPF_EXTENSIONS:
1547 v.val = bpf_tell_extensions();
1548 break;
1549
1550 case SO_SELECT_ERR_QUEUE:
1551 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1552 break;
1553
1554#ifdef CONFIG_NET_RX_BUSY_POLL
1555 case SO_BUSY_POLL:
1556 v.val = sk->sk_ll_usec;
1557 break;
1558 case SO_PREFER_BUSY_POLL:
1559 v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1560 break;
1561#endif
1562
1563 case SO_MAX_PACING_RATE:
1564 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1565 lv = sizeof(v.ulval);
1566 v.ulval = sk->sk_max_pacing_rate;
1567 } else {
1568 /* 32bit version */
1569 v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1570 }
1571 break;
1572
1573 case SO_INCOMING_CPU:
1574 v.val = READ_ONCE(sk->sk_incoming_cpu);
1575 break;
1576
1577 case SO_MEMINFO:
1578 {
1579 u32 meminfo[SK_MEMINFO_VARS];
1580
1581 sk_get_meminfo(sk, meminfo);
1582
1583 len = min_t(unsigned int, len, sizeof(meminfo));
1584 if (copy_to_user(optval, &meminfo, len))
1585 return -EFAULT;
1586
1587 goto lenout;
1588 }
1589
1590#ifdef CONFIG_NET_RX_BUSY_POLL
1591 case SO_INCOMING_NAPI_ID:
1592 v.val = READ_ONCE(sk->sk_napi_id);
1593
1594 /* aggregate non-NAPI IDs down to 0 */
1595 if (v.val < MIN_NAPI_ID)
1596 v.val = 0;
1597
1598 break;
1599#endif
1600
1601 case SO_COOKIE:
1602 lv = sizeof(u64);
1603 if (len < lv)
1604 return -EINVAL;
1605 v.val64 = sock_gen_cookie(sk);
1606 break;
1607
1608 case SO_ZEROCOPY:
1609 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1610 break;
1611
1612 case SO_TXTIME:
1613 lv = sizeof(v.txtime);
1614 v.txtime.clockid = sk->sk_clockid;
1615 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1616 SOF_TXTIME_DEADLINE_MODE : 0;
1617 v.txtime.flags |= sk->sk_txtime_report_errors ?
1618 SOF_TXTIME_REPORT_ERRORS : 0;
1619 break;
1620
1621 case SO_BINDTOIFINDEX:
1622 v.val = sk->sk_bound_dev_if;
1623 break;
1624
1625 default:
1626 /* We implement the SO_SNDLOWAT etc. to not be settable
1627 * (1003.1g 7).
1628 */
1629 return -ENOPROTOOPT;
1630 }
1631
1632 if (len > lv)
1633 len = lv;
1634 if (copy_to_user(optval, &v, len))
1635 return -EFAULT;
1636lenout:
1637 if (put_user(len, optlen))
1638 return -EFAULT;
1639 return 0;
1640}
1641
1642/*
1643 * Initialize an sk_lock.
1644 *
1645 * (We also register the sk_lock with the lock validator.)
1646 */
1647static inline void sock_lock_init(struct sock *sk)
1648{
1649 if (sk->sk_kern_sock)
1650 sock_lock_init_class_and_name(
1651 sk,
1652 af_family_kern_slock_key_strings[sk->sk_family],
1653 af_family_kern_slock_keys + sk->sk_family,
1654 af_family_kern_key_strings[sk->sk_family],
1655 af_family_kern_keys + sk->sk_family);
1656 else
1657 sock_lock_init_class_and_name(
1658 sk,
1659 af_family_slock_key_strings[sk->sk_family],
1660 af_family_slock_keys + sk->sk_family,
1661 af_family_key_strings[sk->sk_family],
1662 af_family_keys + sk->sk_family);
1663}
1664
1665/*
1666 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1667 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1668 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1669 */
1670static void sock_copy(struct sock *nsk, const struct sock *osk)
1671{
1672 const struct proto *prot = READ_ONCE(osk->sk_prot);
1673#ifdef CONFIG_SECURITY_NETWORK
1674 void *sptr = nsk->sk_security;
1675#endif
1676
1677 /* If we move sk_tx_queue_mapping out of the private section,
1678 * we must check if sk_tx_queue_clear() is called after
1679 * sock_copy() in sk_clone_lock().
1680 */
1681 BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
1682 offsetof(struct sock, sk_dontcopy_begin) ||
1683 offsetof(struct sock, sk_tx_queue_mapping) >=
1684 offsetof(struct sock, sk_dontcopy_end));
1685
1686 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1687
1688 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1689 prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1690
1691#ifdef CONFIG_SECURITY_NETWORK
1692 nsk->sk_security = sptr;
1693 security_sk_clone(osk, nsk);
1694#endif
1695}
1696
1697static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1698 int family)
1699{
1700 struct sock *sk;
1701 struct kmem_cache *slab;
1702
1703 slab = prot->slab;
1704 if (slab != NULL) {
1705 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1706 if (!sk)
1707 return sk;
1708 if (want_init_on_alloc(priority))
1709 sk_prot_clear_nulls(sk, prot->obj_size);
1710 } else
1711 sk = kmalloc(prot->obj_size, priority);
1712
1713 if (sk != NULL) {
1714 if (security_sk_alloc(sk, family, priority))
1715 goto out_free;
1716
1717 if (!try_module_get(prot->owner))
1718 goto out_free_sec;
1719 }
1720
1721 return sk;
1722
1723out_free_sec:
1724 security_sk_free(sk);
1725out_free:
1726 if (slab != NULL)
1727 kmem_cache_free(slab, sk);
1728 else
1729 kfree(sk);
1730 return NULL;
1731}
1732
1733static void sk_prot_free(struct proto *prot, struct sock *sk)
1734{
1735 struct kmem_cache *slab;
1736 struct module *owner;
1737
1738 owner = prot->owner;
1739 slab = prot->slab;
1740
1741 cgroup_sk_free(&sk->sk_cgrp_data);
1742 mem_cgroup_sk_free(sk);
1743 security_sk_free(sk);
1744 if (slab != NULL)
1745 kmem_cache_free(slab, sk);
1746 else
1747 kfree(sk);
1748 module_put(owner);
1749}
1750
1751/**
1752 * sk_alloc - All socket objects are allocated here
1753 * @net: the applicable net namespace
1754 * @family: protocol family
1755 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1756 * @prot: struct proto associated with this new sock instance
1757 * @kern: is this to be a kernel socket?
1758 */
1759struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1760 struct proto *prot, int kern)
1761{
1762 struct sock *sk;
1763
1764 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1765 if (sk) {
1766 sk->sk_family = family;
1767 /*
1768 * See comment in struct sock definition to understand
1769 * why we need sk_prot_creator -acme
1770 */
1771 sk->sk_prot = sk->sk_prot_creator = prot;
1772 sk->sk_kern_sock = kern;
1773 sock_lock_init(sk);
1774 sk->sk_net_refcnt = kern ? 0 : 1;
1775 if (likely(sk->sk_net_refcnt)) {
1776 get_net(net);
1777 sock_inuse_add(net, 1);
1778 }
1779
1780 sock_net_set(sk, net);
1781 refcount_set(&sk->sk_wmem_alloc, 1);
1782
1783 mem_cgroup_sk_alloc(sk);
1784 cgroup_sk_alloc(&sk->sk_cgrp_data);
1785 sock_update_classid(&sk->sk_cgrp_data);
1786 sock_update_netprioidx(&sk->sk_cgrp_data);
1787 sk_tx_queue_clear(sk);
1788 }
1789
1790 return sk;
1791}
1792EXPORT_SYMBOL(sk_alloc);
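
/* A minimal allocation sketch for a protocol's ->create() handler, where
 * my_proto stands in for that protocol's struct proto:
 *
 *	sk = sk_alloc(net, PF_INET, GFP_KERNEL, &my_proto, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *
 * sock_init_data() then wires the freshly allocated sock into the struct
 * socket and sets the generic defaults.
 */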
1793
1794/* Sockets having SOCK_RCU_FREE will call this function after one RCU
1795 * grace period. This is the case for UDP sockets and TCP listeners.
1796 */
1797static void __sk_destruct(struct rcu_head *head)
1798{
1799 struct sock *sk = container_of(head, struct sock, sk_rcu);
1800 struct sk_filter *filter;
1801
1802 if (sk->sk_destruct)
1803 sk->sk_destruct(sk);
1804
1805 filter = rcu_dereference_check(sk->sk_filter,
1806 refcount_read(&sk->sk_wmem_alloc) == 0);
1807 if (filter) {
1808 sk_filter_uncharge(sk, filter);
1809 RCU_INIT_POINTER(sk->sk_filter, NULL);
1810 }
1811
1812 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1813
1814#ifdef CONFIG_BPF_SYSCALL
1815 bpf_sk_storage_free(sk);
1816#endif
1817
1818 if (atomic_read(&sk->sk_omem_alloc))
1819 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1820 __func__, atomic_read(&sk->sk_omem_alloc));
1821
1822 if (sk->sk_frag.page) {
1823 put_page(sk->sk_frag.page);
1824 sk->sk_frag.page = NULL;
1825 }
1826
1827 if (sk->sk_peer_cred)
1828 put_cred(sk->sk_peer_cred);
1829 put_pid(sk->sk_peer_pid);
1830 if (likely(sk->sk_net_refcnt))
1831 put_net(sock_net(sk));
1832 sk_prot_free(sk->sk_prot_creator, sk);
1833}
1834
1835void sk_destruct(struct sock *sk)
1836{
1837 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1838
1839 if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1840 reuseport_detach_sock(sk);
1841 use_call_rcu = true;
1842 }
1843
1844 if (use_call_rcu)
1845 call_rcu(&sk->sk_rcu, __sk_destruct);
1846 else
1847 __sk_destruct(&sk->sk_rcu);
1848}
1849
1850static void __sk_free(struct sock *sk)
1851{
1852 if (likely(sk->sk_net_refcnt))
1853 sock_inuse_add(sock_net(sk), -1);
1854
1855 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1856 sock_diag_broadcast_destroy(sk);
1857 else
1858 sk_destruct(sk);
1859}
1860
1861void sk_free(struct sock *sk)
1862{
1863 /*
1864 * We subtract one from sk_wmem_alloc so we can tell whether
1865 * some packets are still in some tx queue.
1866 * If it is not zero, sock_wfree() will call __sk_free(sk) later.
1867 */
1868 if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1869 __sk_free(sk);
1870}
1871EXPORT_SYMBOL(sk_free);
1872
1873static void sk_init_common(struct sock *sk)
1874{
1875 skb_queue_head_init(&sk->sk_receive_queue);
1876 skb_queue_head_init(&sk->sk_write_queue);
1877 skb_queue_head_init(&sk->sk_error_queue);
1878
1879 rwlock_init(&sk->sk_callback_lock);
1880 lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1881 af_rlock_keys + sk->sk_family,
1882 af_family_rlock_key_strings[sk->sk_family]);
1883 lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1884 af_wlock_keys + sk->sk_family,
1885 af_family_wlock_key_strings[sk->sk_family]);
1886 lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1887 af_elock_keys + sk->sk_family,
1888 af_family_elock_key_strings[sk->sk_family]);
1889 lockdep_set_class_and_name(&sk->sk_callback_lock,
1890 af_callback_keys + sk->sk_family,
1891 af_family_clock_key_strings[sk->sk_family]);
1892}
1893
1894/**
1895 * sk_clone_lock - clone a socket, and lock its clone
1896 * @sk: the socket to clone
1897 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1898 *
1899 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1900 */
1901struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1902{
1903 struct proto *prot = READ_ONCE(sk->sk_prot);
1904 struct sk_filter *filter;
1905 bool is_charged = true;
1906 struct sock *newsk;
1907
1908 newsk = sk_prot_alloc(prot, priority, sk->sk_family);
1909 if (!newsk)
1910 goto out;
1911
1912 sock_copy(newsk, sk);
1913
1914 newsk->sk_prot_creator = prot;
1915
1916 /* SANITY */
1917 if (likely(newsk->sk_net_refcnt))
1918 get_net(sock_net(newsk));
1919 sk_node_init(&newsk->sk_node);
1920 sock_lock_init(newsk);
1921 bh_lock_sock(newsk);
1922 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
1923 newsk->sk_backlog.len = 0;
1924
1925 atomic_set(&newsk->sk_rmem_alloc, 0);
1926
1927 /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
1928 refcount_set(&newsk->sk_wmem_alloc, 1);
1929
1930 atomic_set(&newsk->sk_omem_alloc, 0);
1931 sk_init_common(newsk);
1932
1933 newsk->sk_dst_cache = NULL;
1934 newsk->sk_dst_pending_confirm = 0;
1935 newsk->sk_wmem_queued = 0;
1936 newsk->sk_forward_alloc = 0;
1937 atomic_set(&newsk->sk_drops, 0);
1938 newsk->sk_send_head = NULL;
1939 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1940 atomic_set(&newsk->sk_zckey, 0);
1941
1942 sock_reset_flag(newsk, SOCK_DONE);
1943
1944 /* sk->sk_memcg will be populated at accept() time */
1945 newsk->sk_memcg = NULL;
1946
1947 cgroup_sk_clone(&newsk->sk_cgrp_data);
1948
1949 rcu_read_lock();
1950 filter = rcu_dereference(sk->sk_filter);
1951 if (filter != NULL)
1952 /* Though it's an empty new sock, the charging may fail
1953 * if sysctl_optmem_max was changed between creation of
1954 * the original socket and cloning.
1955 */
1956 is_charged = sk_filter_charge(newsk, filter);
1957 RCU_INIT_POINTER(newsk->sk_filter, filter);
1958 rcu_read_unlock();
1959
1960 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1961 /* We need to make sure that we don't uncharge the new
1962 * socket if we couldn't charge it in the first place
1963 * as otherwise we uncharge the parent's filter.
1964 */
1965 if (!is_charged)
1966 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1967 sk_free_unlock_clone(newsk);
1968 newsk = NULL;
1969 goto out;
1970 }
1971 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1972
1973 if (bpf_sk_storage_clone(sk, newsk)) {
1974 sk_free_unlock_clone(newsk);
1975 newsk = NULL;
1976 goto out;
1977 }
1978
1979 /* Clear sk_user_data if parent had the pointer tagged
1980 * as not suitable for copying when cloning.
1981 */
1982 if (sk_user_data_is_nocopy(newsk))
1983 newsk->sk_user_data = NULL;
1984
1985 newsk->sk_err = 0;
1986 newsk->sk_err_soft = 0;
1987 newsk->sk_priority = 0;
1988 newsk->sk_incoming_cpu = raw_smp_processor_id();
1989 if (likely(newsk->sk_net_refcnt))
1990 sock_inuse_add(sock_net(newsk), 1);
1991
1992 /* Before updating sk_refcnt, we must commit prior changes to memory
1993 * (Documentation/RCU/rculist_nulls.rst for details)
1994 */
1995 smp_wmb();
1996 refcount_set(&newsk->sk_refcnt, 2);
1997
1998 /* Increment the counter in the same struct proto as the master
1999 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
2000 * is the same as sk->sk_prot->socks, as this field was copied
2001 * with memcpy).
2002 *
2003 * This _changes_ the previous behaviour, where
2004 * tcp_create_openreq_child was always incrementing the
2005 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
2006 * to be taken into account in all callers. -acme
2007 */
2008 sk_refcnt_debug_inc(newsk);
2009 sk_set_socket(newsk, NULL);
2010 sk_tx_queue_clear(newsk);
2011 RCU_INIT_POINTER(newsk->sk_wq, NULL);
2012
2013 if (newsk->sk_prot->sockets_allocated)
2014 sk_sockets_allocated_inc(newsk);
2015
2016 if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2017 net_enable_timestamp();
2018out:
2019 return newsk;
2020}
2021EXPORT_SYMBOL_GPL(sk_clone_lock);
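
/* A minimal usage sketch, following the locking rule in the kerneldoc
 * above: the clone comes back bh-locked with a refcount of 2, and the
 * caller unlocks it once it has finished fixing up the child:
 *
 *	newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *	if (newsk) {
 *		... protocol-specific child setup ...
 *		bh_unlock_sock(newsk);
 *	}
 */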
2022
2023void sk_free_unlock_clone(struct sock *sk)
2024{
2025 /* It is still a raw copy of the parent, so invalidate
2026 * its destructor and do a plain sk_free(). */
2027 sk->sk_destruct = NULL;
2028 bh_unlock_sock(sk);
2029 sk_free(sk);
2030}
2031EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2032
2033void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2034{
2035 u32 max_segs = 1;
2036
2037 sk_dst_set(sk, dst);
2038 sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
2039 if (sk->sk_route_caps & NETIF_F_GSO)
2040 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2041 sk->sk_route_caps &= ~sk->sk_route_nocaps;
2042 if (sk_can_gso(sk)) {
2043 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2044 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2045 } else {
2046 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2047 sk->sk_gso_max_size = dst->dev->gso_max_size;
2048 max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
2049 }
2050 }
2051 sk->sk_gso_max_segs = max_segs;
2052}
2053EXPORT_SYMBOL_GPL(sk_setup_caps);
2054
2055/*
2056 * Simple resource managers for sockets.
2057 */
2058
2059
2060/*
2061 * Write buffer destructor automatically called from kfree_skb.
2062 */
2063void sock_wfree(struct sk_buff *skb)
2064{
2065 struct sock *sk = skb->sk;
2066 unsigned int len = skb->truesize;
2067
2068 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2069 /*
2070 * Keep a reference on sk_wmem_alloc; this will be released
2071 * after the sk_write_space() call.
2072 */
2073 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2074 sk->sk_write_space(sk);
2075 len = 1;
2076 }
2077 /*
2078 * If sk_wmem_alloc reaches 0, we must finish what sk_free()
2079 * could not do because of in-flight packets.
2080 */
2081 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2082 __sk_free(sk);
2083}
2084EXPORT_SYMBOL(sock_wfree);
2085
2086/* This variant of sock_wfree() is used by TCP,
2087 * since it sets SOCK_USE_WRITE_QUEUE.
2088 */
2089void __sock_wfree(struct sk_buff *skb)
2090{
2091 struct sock *sk = skb->sk;
2092
2093 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2094 __sk_free(sk);
2095}
2096
2097void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2098{
2099 skb_orphan(skb);
2100 skb->sk = sk;
2101#ifdef CONFIG_INET
2102 if (unlikely(!sk_fullsock(sk))) {
2103 skb->destructor = sock_edemux;
2104 sock_hold(sk);
2105 return;
2106 }
2107#endif
2108 skb->destructor = sock_wfree;
2109 skb_set_hash_from_sk(skb, sk);
2110 /*
2111 * We used to take a refcount on sk, but the following operation
2112 * is enough to guarantee sk_free() won't free this sock until
2113 * all in-flight packets are completed
2114 */
2115 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2116}
2117EXPORT_SYMBOL(skb_set_owner_w);
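
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * a protocol that builds its own skb can charge it to the sending socket
 * with skb_set_owner_w(), so that a later kfree_skb() runs sock_wfree()
 * and releases the sk_wmem_alloc taken here.  The function name
 * example_xmit_one is an assumption for illustration only.
 */
static int example_xmit_one(struct sock *sk)
{
	struct sk_buff *skb = alloc_skb(128, sk->sk_allocation);

	if (!skb)
		return -ENOBUFS;
	skb_set_owner_w(skb, sk);	/* charges skb->truesize to sk_wmem_alloc */
	/* ... build headers/payload and hand the skb to the lower layer ... */
	kfree_skb(skb);			/* destructor sock_wfree() uncharges it */
	return 0;
}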
2118
2119static bool can_skb_orphan_partial(const struct sk_buff *skb)
2120{
2121#ifdef CONFIG_TLS_DEVICE
2122 /* Drivers depend on in-order delivery for crypto offload;
2123 * a partial orphan breaks the out-of-order-OK logic.
2124 */
2125 if (skb->decrypted)
2126 return false;
2127#endif
2128 return (skb->destructor == sock_wfree ||
2129 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2130}
2131
2132/* This helper is used by netem, as it can hold packets in its
2133 * delay queue. We want to allow the owner socket to send more
2134 * packets, as if they were already TX completed by a typical driver.
2135 * But we also want to keep skb->sk set because some packet schedulers
2136 * rely on it (sch_fq for example).
2137 */
2138void skb_orphan_partial(struct sk_buff *skb)
2139{
2140 if (skb_is_tcp_pure_ack(skb))
2141 return;
2142
2143 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2144 return;
2145
2146 skb_orphan(skb);
2147}
2148EXPORT_SYMBOL(skb_orphan_partial);
2149
2150/*
2151 * Read buffer destructor automatically called from kfree_skb.
2152 */
2153void sock_rfree(struct sk_buff *skb)
2154{
2155 struct sock *sk = skb->sk;
2156 unsigned int len = skb->truesize;
2157
2158 atomic_sub(len, &sk->sk_rmem_alloc);
2159 sk_mem_uncharge(sk, len);
2160}
2161EXPORT_SYMBOL(sock_rfree);
2162
2163/*
2164 * Buffer destructor for skbs that are not used directly in read or write
2165 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2166 */
2167void sock_efree(struct sk_buff *skb)
2168{
2169 sock_put(skb->sk);
2170}
2171EXPORT_SYMBOL(sock_efree);
2172
2173/* Buffer destructor for prefetch/receive path where reference count may
2174 * not be held, e.g. for listen sockets.
2175 */
2176#ifdef CONFIG_INET
2177void sock_pfree(struct sk_buff *skb)
2178{
2179 if (sk_is_refcounted(skb->sk))
2180 sock_gen_put(skb->sk);
2181}
2182EXPORT_SYMBOL(sock_pfree);
2183#endif /* CONFIG_INET */
2184
2185kuid_t sock_i_uid(struct sock *sk)
2186{
2187 kuid_t uid;
2188
2189 read_lock_bh(&sk->sk_callback_lock);
2190 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2191 read_unlock_bh(&sk->sk_callback_lock);
2192 return uid;
2193}
2194EXPORT_SYMBOL(sock_i_uid);
2195
2196unsigned long sock_i_ino(struct sock *sk)
2197{
2198 unsigned long ino;
2199
2200 read_lock_bh(&sk->sk_callback_lock);
2201 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2202 read_unlock_bh(&sk->sk_callback_lock);
2203 return ino;
2204}
2205EXPORT_SYMBOL(sock_i_ino);
2206
2207/*
2208 * Allocate a skb from the socket's send buffer.
2209 */
2210struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2211 gfp_t priority)
2212{
2213 if (force ||
2214 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2215 struct sk_buff *skb = alloc_skb(size, priority);
2216
2217 if (skb) {
2218 skb_set_owner_w(skb, sk);
2219 return skb;
2220 }
2221 }
2222 return NULL;
2223}
2224EXPORT_SYMBOL(sock_wmalloc);
2225
2226static void sock_ofree(struct sk_buff *skb)
2227{
2228 struct sock *sk = skb->sk;
2229
2230 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2231}
2232
2233struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2234 gfp_t priority)
2235{
2236 struct sk_buff *skb;
2237
2238 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2239 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2240 sysctl_optmem_max)
2241 return NULL;
2242
2243 skb = alloc_skb(size, priority);
2244 if (!skb)
2245 return NULL;
2246
2247 atomic_add(skb->truesize, &sk->sk_omem_alloc);
2248 skb->sk = sk;
2249 skb->destructor = sock_ofree;
2250 return skb;
2251}
2252
2253/*
2254 * Allocate a memory block from the socket's option memory buffer.
2255 */
2256void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2257{
2258 if ((unsigned int)size <= sysctl_optmem_max &&
2259 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2260 void *mem;
2261 /* First do the add, to avoid the race if kmalloc
2262 * might sleep.
2263 */
2264 atomic_add(size, &sk->sk_omem_alloc);
2265 mem = kmalloc(size, priority);
2266 if (mem)
2267 return mem;
2268 atomic_sub(size, &sk->sk_omem_alloc);
2269 }
2270 return NULL;
2271}
2272EXPORT_SYMBOL(sock_kmalloc);
2273
2274 /* Free an option memory block. Note that we actually want the inline
2275 * here as this allows gcc to detect the nullify and fold away the
2276 * condition entirely.
2277 */
2278static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2279 const bool nullify)
2280{
2281 if (WARN_ON_ONCE(!mem))
2282 return;
2283 if (nullify)
2284 kfree_sensitive(mem);
2285 else
2286 kfree(mem);
2287 atomic_sub(size, &sk->sk_omem_alloc);
2288}
2289
2290void sock_kfree_s(struct sock *sk, void *mem, int size)
2291{
2292 __sock_kfree_s(sk, mem, size, false);
2293}
2294EXPORT_SYMBOL(sock_kfree_s);
2295
2296void sock_kzfree_s(struct sock *sk, void *mem, int size)
2297{
2298 __sock_kfree_s(sk, mem, size, true);
2299}
2300EXPORT_SYMBOL(sock_kzfree_s);
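
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * the usual pairing for option memory.  A protocol bounds per-socket
 * option allocations with sock_kmalloc() and returns them with
 * sock_kfree_s(), or sock_kzfree_s() for sensitive data, always passing
 * the same size.  example_set_opt is an assumed, illustrative name.
 */
static int example_set_opt(struct sock *sk, sockptr_t optval, int optlen)
{
	void *buf = sock_kmalloc(sk, optlen, GFP_KERNEL);

	if (!buf)
		return -ENOBUFS;
	if (copy_from_sockptr(buf, optval, optlen)) {
		sock_kfree_s(sk, buf, optlen);
		return -EFAULT;
	}
	/* ... consume buf ... */
	sock_kzfree_s(sk, buf, optlen);	/* zeroes the block before freeing */
	return 0;
}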
2301
2302/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2303 I think these locks should be removed for datagram sockets.
2304 */
2305static long sock_wait_for_wmem(struct sock *sk, long timeo)
2306{
2307 DEFINE_WAIT(wait);
2308
2309 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2310 for (;;) {
2311 if (!timeo)
2312 break;
2313 if (signal_pending(current))
2314 break;
2315 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2316 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2317 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2318 break;
2319 if (sk->sk_shutdown & SEND_SHUTDOWN)
2320 break;
2321 if (sk->sk_err)
2322 break;
2323 timeo = schedule_timeout(timeo);
2324 }
2325 finish_wait(sk_sleep(sk), &wait);
2326 return timeo;
2327}
2328
2329
2330/*
2331 * Generic send/receive buffer handlers
2332 */
2333
2334struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2335 unsigned long data_len, int noblock,
2336 int *errcode, int max_page_order)
2337{
2338 struct sk_buff *skb;
2339 long timeo;
2340 int err;
2341
2342 timeo = sock_sndtimeo(sk, noblock);
2343 for (;;) {
2344 err = sock_error(sk);
2345 if (err != 0)
2346 goto failure;
2347
2348 err = -EPIPE;
2349 if (sk->sk_shutdown & SEND_SHUTDOWN)
2350 goto failure;
2351
2352 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2353 break;
2354
2355 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2356 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2357 err = -EAGAIN;
2358 if (!timeo)
2359 goto failure;
2360 if (signal_pending(current))
2361 goto interrupted;
2362 timeo = sock_wait_for_wmem(sk, timeo);
2363 }
2364 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2365 errcode, sk->sk_allocation);
2366 if (skb)
2367 skb_set_owner_w(skb, sk);
2368 return skb;
2369
2370interrupted:
2371 err = sock_intr_errno(timeo);
2372failure:
2373 *errcode = err;
2374 return NULL;
2375}
2376EXPORT_SYMBOL(sock_alloc_send_pskb);
2377
2378struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2379 int noblock, int *errcode)
2380{
2381 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2382}
2383EXPORT_SYMBOL(sock_alloc_send_skb);
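
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * a datagram sendmsg() typically obtains write space through
 * sock_alloc_send_skb(), which blocks according to the send timeout and
 * reports the failure reason (-EAGAIN, -EPIPE, -ERESTARTSYS, ...) via
 * *errcode.  example_dgram_sendmsg and the 16-byte headroom are
 * illustrative assumptions.
 */
static int example_dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	int err;
	struct sk_buff *skb = sock_alloc_send_skb(sk, len + 16,
						  msg->msg_flags & MSG_DONTWAIT,
						  &err);

	if (!skb)
		return err;
	skb_reserve(skb, 16);
	err = memcpy_from_msg(skb_put(skb, len), msg, len);
	if (err) {
		kfree_skb(skb);
		return err;
	}
	/* ... queue the skb for transmission; freed here only to stay self-contained ... */
	kfree_skb(skb);
	return len;
}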
2384
2385int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2386 struct sockcm_cookie *sockc)
2387{
2388 u32 tsflags;
2389
2390 switch (cmsg->cmsg_type) {
2391 case SO_MARK:
2392 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2393 return -EPERM;
2394 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2395 return -EINVAL;
2396 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2397 break;
2398 case SO_TIMESTAMPING_OLD:
2399 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2400 return -EINVAL;
2401
2402 tsflags = *(u32 *)CMSG_DATA(cmsg);
2403 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2404 return -EINVAL;
2405
2406 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2407 sockc->tsflags |= tsflags;
2408 break;
2409 case SCM_TXTIME:
2410 if (!sock_flag(sk, SOCK_TXTIME))
2411 return -EINVAL;
2412 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2413 return -EINVAL;
2414 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2415 break;
2416 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2417 case SCM_RIGHTS:
2418 case SCM_CREDENTIALS:
2419 break;
2420 default:
2421 return -EINVAL;
2422 }
2423 return 0;
2424}
2425EXPORT_SYMBOL(__sock_cmsg_send);
2426
2427int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2428 struct sockcm_cookie *sockc)
2429{
2430 struct cmsghdr *cmsg;
2431 int ret;
2432
2433 for_each_cmsghdr(cmsg, msg) {
2434 if (!CMSG_OK(msg, cmsg))
2435 return -EINVAL;
2436 if (cmsg->cmsg_level != SOL_SOCKET)
2437 continue;
2438 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2439 if (ret)
2440 return ret;
2441 }
2442 return 0;
2443}
2444EXPORT_SYMBOL(sock_cmsg_send);
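
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * a sendmsg() implementation seeds a sockcm_cookie from the socket
 * defaults with sockcm_init() and then lets sock_cmsg_send() override it
 * from SOL_SOCKET control messages (SO_MARK, SO_TIMESTAMPING_OLD,
 * SCM_TXTIME).  example_parse_cmsgs is an assumed name.
 */
static int example_parse_cmsgs(struct sock *sk, struct msghdr *msg,
			       struct sockcm_cookie *sockc)
{
	sockcm_init(sockc, sk);		/* start from the sk_tsflags defaults */
	if (msg->msg_controllen)
		return sock_cmsg_send(sk, msg, sockc);
	return 0;
}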
2445
2446static void sk_enter_memory_pressure(struct sock *sk)
2447{
2448 if (!sk->sk_prot->enter_memory_pressure)
2449 return;
2450
2451 sk->sk_prot->enter_memory_pressure(sk);
2452}
2453
2454static void sk_leave_memory_pressure(struct sock *sk)
2455{
2456 if (sk->sk_prot->leave_memory_pressure) {
2457 sk->sk_prot->leave_memory_pressure(sk);
2458 } else {
2459 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2460
2461 if (memory_pressure && READ_ONCE(*memory_pressure))
2462 WRITE_ONCE(*memory_pressure, 0);
2463 }
2464}
2465
2466#define SKB_FRAG_PAGE_ORDER get_order(32768)
2467DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2468
2469/**
2470 * skb_page_frag_refill - check that a page_frag contains enough room
2471 * @sz: minimum size of the fragment we want to get
2472 * @pfrag: pointer to page_frag
2473 * @gfp: priority for memory allocation
2474 *
2475 * Note: While this allocator tries to use high order pages, there is
2476 * no guarantee that allocations succeed. Therefore, @sz MUST be
2477 * less than or equal to PAGE_SIZE.
2478 */
2479bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2480{
2481 if (pfrag->page) {
2482 if (page_ref_count(pfrag->page) == 1) {
2483 pfrag->offset = 0;
2484 return true;
2485 }
2486 if (pfrag->offset + sz <= pfrag->size)
2487 return true;
2488 put_page(pfrag->page);
2489 }
2490
2491 pfrag->offset = 0;
2492 if (SKB_FRAG_PAGE_ORDER &&
2493 !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2494 /* Avoid direct reclaim but allow kswapd to wake */
2495 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2496 __GFP_COMP | __GFP_NOWARN |
2497 __GFP_NORETRY,
2498 SKB_FRAG_PAGE_ORDER);
2499 if (likely(pfrag->page)) {
2500 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2501 return true;
2502 }
2503 }
2504 pfrag->page = alloc_page(gfp);
2505 if (likely(pfrag->page)) {
2506 pfrag->size = PAGE_SIZE;
2507 return true;
2508 }
2509 return false;
2510}
2511EXPORT_SYMBOL(skb_page_frag_refill);
2512
2513bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2514{
2515 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2516 return true;
2517
2518 sk_enter_memory_pressure(sk);
2519 sk_stream_moderate_sndbuf(sk);
2520 return false;
2521}
2522EXPORT_SYMBOL(sk_page_frag_refill);
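
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * the common append-to-page-frag pattern.  sk_page_frag() selects the
 * per-task or per-socket page_frag, sk_page_frag_refill() guarantees
 * room, and the caller copies at pfrag->offset before advancing it.
 * example_append is an assumed name.
 */
static int example_append(struct sock *sk, struct msghdr *msg, int copy)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOMEM;
	copy = min_t(int, copy, pfrag->size - pfrag->offset);
	if (copy_page_from_iter(pfrag->page, pfrag->offset, copy,
				&msg->msg_iter) != copy)
		return -EFAULT;
	/* ... attach (pfrag->page, pfrag->offset, copy) as an skb fragment ... */
	pfrag->offset += copy;
	return copy;
}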
2523
2524void __lock_sock(struct sock *sk)
2525 __releases(&sk->sk_lock.slock)
2526 __acquires(&sk->sk_lock.slock)
2527{
2528 DEFINE_WAIT(wait);
2529
2530 for (;;) {
2531 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2532 TASK_UNINTERRUPTIBLE);
2533 spin_unlock_bh(&sk->sk_lock.slock);
2534 schedule();
2535 spin_lock_bh(&sk->sk_lock.slock);
2536 if (!sock_owned_by_user(sk))
2537 break;
2538 }
2539 finish_wait(&sk->sk_lock.wq, &wait);
2540}
2541
2542void __release_sock(struct sock *sk)
2543 __releases(&sk->sk_lock.slock)
2544 __acquires(&sk->sk_lock.slock)
2545{
2546 struct sk_buff *skb, *next;
2547
2548 while ((skb = sk->sk_backlog.head) != NULL) {
2549 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2550
2551 spin_unlock_bh(&sk->sk_lock.slock);
2552
2553 do {
2554 next = skb->next;
2555 prefetch(next);
2556 WARN_ON_ONCE(skb_dst_is_noref(skb));
2557 skb_mark_not_on_list(skb);
2558 sk_backlog_rcv(sk, skb);
2559
2560 cond_resched();
2561
2562 skb = next;
2563 } while (skb != NULL);
2564
2565 spin_lock_bh(&sk->sk_lock.slock);
2566 }
2567
2568 /*
2569 * Doing the zeroing here guarantees we cannot loop forever
2570 * while a wild producer attempts to flood us.
2571 */
2572 sk->sk_backlog.len = 0;
2573}
2574
2575void __sk_flush_backlog(struct sock *sk)
2576{
2577 spin_lock_bh(&sk->sk_lock.slock);
2578 __release_sock(sk);
2579 spin_unlock_bh(&sk->sk_lock.slock);
2580}
2581
2582/**
2583 * sk_wait_data - wait for data to arrive at sk_receive_queue
2584 * @sk: sock to wait on
2585 * @timeo: for how long
2586 * @skb: last skb seen on sk_receive_queue
2587 *
2588 * Now socket state including sk->sk_err is changed only under the lock,
2589 * hence we may omit checks after joining the wait queue.
2590 * We check the receive queue before schedule() only as an optimization;
2591 * it is very likely that release_sock() added new data.
2592 */
2593int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2594{
2595 DEFINE_WAIT_FUNC(wait, woken_wake_function);
2596 int rc;
2597
2598 add_wait_queue(sk_sleep(sk), &wait);
2599 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2600 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2601 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2602 remove_wait_queue(sk_sleep(sk), &wait);
2603 return rc;
2604}
2605EXPORT_SYMBOL(sk_wait_data);
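
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * the canonical blocking-receive loop around sk_wait_data().  The caller
 * must hold the socket lock; sk_wait_data() drops and retakes it while
 * sleeping.  example_wait_for_skb is an assumed name.
 */
static struct sk_buff *example_wait_for_skb(struct sock *sk, int flags, int *err)
{
	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
	struct sk_buff *skb;

	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
		if (!timeo) {
			*err = -EAGAIN;
			return NULL;
		}
		if (signal_pending(current)) {
			*err = sock_intr_errno(timeo);
			return NULL;
		}
		sk_wait_data(sk, &timeo, NULL);
	}
	return skb;
}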
2606
2607/**
2608 * __sk_mem_raise_allocated - increase memory_allocated
2609 * @sk: socket
2610 * @size: memory size to allocate
2611 * @amt: pages to allocate
2612 * @kind: allocation type
2613 *
2614 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2615 */
2616int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2617{
2618 struct proto *prot = sk->sk_prot;
2619 long allocated = sk_memory_allocated_add(sk, amt);
2620 bool charged = true;
2621
2622 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2623 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2624 goto suppress_allocation;
2625
2626 /* Under limit. */
2627 if (allocated <= sk_prot_mem_limits(sk, 0)) {
2628 sk_leave_memory_pressure(sk);
2629 return 1;
2630 }
2631
2632 /* Under pressure. */
2633 if (allocated > sk_prot_mem_limits(sk, 1))
2634 sk_enter_memory_pressure(sk);
2635
2636 /* Over hard limit. */
2637 if (allocated > sk_prot_mem_limits(sk, 2))
2638 goto suppress_allocation;
2639
2640 /* guarantee minimum buffer size under pressure */
2641 if (kind == SK_MEM_RECV) {
2642 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2643 return 1;
2644
2645 } else { /* SK_MEM_SEND */
2646 int wmem0 = sk_get_wmem0(sk, prot);
2647
2648 if (sk->sk_type == SOCK_STREAM) {
2649 if (sk->sk_wmem_queued < wmem0)
2650 return 1;
2651 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2652 return 1;
2653 }
2654 }
2655
2656 if (sk_has_memory_pressure(sk)) {
2657 u64 alloc;
2658
2659 if (!sk_under_memory_pressure(sk))
2660 return 1;
2661 alloc = sk_sockets_allocated_read_positive(sk);
2662 if (sk_prot_mem_limits(sk, 2) > alloc *
2663 sk_mem_pages(sk->sk_wmem_queued +
2664 atomic_read(&sk->sk_rmem_alloc) +
2665 sk->sk_forward_alloc))
2666 return 1;
2667 }
2668
2669suppress_allocation:
2670
2671 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2672 sk_stream_moderate_sndbuf(sk);
2673
2674 /* Fail only if socket is _under_ its sndbuf.
2675 * In this case we cannot block, so we have to fail.
2676 */
2677 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2678 return 1;
2679 }
2680
2681 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2682 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2683
2684 sk_memory_allocated_sub(sk, amt);
2685
2686 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2687 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2688
2689 return 0;
2690}
2691EXPORT_SYMBOL(__sk_mem_raise_allocated);
2692
2693/**
2694 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2695 * @sk: socket
2696 * @size: memory size to allocate
2697 * @kind: allocation type
2698 *
2699 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2700 * rmem allocation. This function assumes that protocols which have
2701 * memory_pressure use sk_wmem_queued as write buffer accounting.
2702 */
2703int __sk_mem_schedule(struct sock *sk, int size, int kind)
2704{
2705 int ret, amt = sk_mem_pages(size);
2706
2707 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2708 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2709 if (!ret)
2710 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2711 return ret;
2712}
2713EXPORT_SYMBOL(__sk_mem_schedule);
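
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * protocols normally do not call __sk_mem_schedule() directly; they use
 * the sk_wmem_schedule()/sk_rmem_schedule() wrappers, which consume
 * sk_forward_alloc first and fall back to the slow path above only when
 * the per-socket quantum is exhausted.  example_charge_send is assumed.
 */
static bool example_charge_send(struct sock *sk, struct sk_buff *skb)
{
	if (!sk_wmem_schedule(sk, skb->truesize))
		return false;			/* over the protocol memory limits */
	sk_mem_charge(sk, skb->truesize);	/* consume forward allocation */
	return true;
}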
2714
2715/**
2716 * __sk_mem_reduce_allocated - reclaim memory_allocated
2717 * @sk: socket
2718 * @amount: number of quanta
2719 *
2720 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2721 */
2722void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2723{
2724 sk_memory_allocated_sub(sk, amount);
2725
2726 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2727 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2728
2729 if (sk_under_memory_pressure(sk) &&
2730 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2731 sk_leave_memory_pressure(sk);
2732}
2733EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2734
2735/**
2736 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2737 * @sk: socket
2738 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2739 */
2740void __sk_mem_reclaim(struct sock *sk, int amount)
2741{
2742 amount >>= SK_MEM_QUANTUM_SHIFT;
2743 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2744 __sk_mem_reduce_allocated(sk, amount);
2745}
2746EXPORT_SYMBOL(__sk_mem_reclaim);
2747
2748int sk_set_peek_off(struct sock *sk, int val)
2749{
2750 sk->sk_peek_off = val;
2751 return 0;
2752}
2753EXPORT_SYMBOL_GPL(sk_set_peek_off);
2754
2755/*
2756 * Set of default routines for initialising struct proto_ops when
2757 * the protocol does not support a particular function. In certain
2758 * cases where it makes no sense for a protocol to have a "do nothing"
2759 * function, some default processing is provided.
2760 */
2761
2762int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2763{
2764 return -EOPNOTSUPP;
2765}
2766EXPORT_SYMBOL(sock_no_bind);
2767
2768int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2769 int len, int flags)
2770{
2771 return -EOPNOTSUPP;
2772}
2773EXPORT_SYMBOL(sock_no_connect);
2774
2775int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2776{
2777 return -EOPNOTSUPP;
2778}
2779EXPORT_SYMBOL(sock_no_socketpair);
2780
2781int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2782 bool kern)
2783{
2784 return -EOPNOTSUPP;
2785}
2786EXPORT_SYMBOL(sock_no_accept);
2787
2788int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2789 int peer)
2790{
2791 return -EOPNOTSUPP;
2792}
2793EXPORT_SYMBOL(sock_no_getname);
2794
2795int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2796{
2797 return -EOPNOTSUPP;
2798}
2799EXPORT_SYMBOL(sock_no_ioctl);
2800
2801int sock_no_listen(struct socket *sock, int backlog)
2802{
2803 return -EOPNOTSUPP;
2804}
2805EXPORT_SYMBOL(sock_no_listen);
2806
2807int sock_no_shutdown(struct socket *sock, int how)
2808{
2809 return -EOPNOTSUPP;
2810}
2811EXPORT_SYMBOL(sock_no_shutdown);
2812
2813int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2814{
2815 return -EOPNOTSUPP;
2816}
2817EXPORT_SYMBOL(sock_no_sendmsg);
2818
2819int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2820{
2821 return -EOPNOTSUPP;
2822}
2823EXPORT_SYMBOL(sock_no_sendmsg_locked);
2824
2825int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2826 int flags)
2827{
2828 return -EOPNOTSUPP;
2829}
2830EXPORT_SYMBOL(sock_no_recvmsg);
2831
2832int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2833{
2834 /* Mirror missing mmap method error code */
2835 return -ENODEV;
2836}
2837EXPORT_SYMBOL(sock_no_mmap);
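
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * a minimal proto_ops table for a datagram-only family that fills the
 * operations it does not implement with the sock_no_*() stubs above.
 * example_release/example_proto_ops are assumed names and PF_UNSPEC
 * stands in for a real address family.
 */
static int example_release(struct socket *sock)
{
	return 0;	/* a real family would detach and put its sock here */
}

static const struct proto_ops example_proto_ops = {
	.family		= PF_UNSPEC,
	.owner		= THIS_MODULE,
	.release	= example_release,
	.poll		= datagram_poll,
	/* .sendmsg/.recvmsg would point at the family's real handlers */
	.bind		= sock_no_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.mmap		= sock_no_mmap,
	.sendpage	= sock_no_sendpage,
};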
2838
2839/*
2840 * When a file is received (via SCM_RIGHTS, etc), we must bump the
2841 * various sock-based usage counts.
2842 */
2843void __receive_sock(struct file *file)
2844{
2845 struct socket *sock;
2846
2847 sock = sock_from_file(file);
2848 if (sock) {
2849 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
2850 sock_update_classid(&sock->sk->sk_cgrp_data);
2851 }
2852}
2853
2854ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2855{
2856 ssize_t res;
2857 struct msghdr msg = {.msg_flags = flags};
2858 struct kvec iov;
2859 char *kaddr = kmap(page);
2860 iov.iov_base = kaddr + offset;
2861 iov.iov_len = size;
2862 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2863 kunmap(page);
2864 return res;
2865}
2866EXPORT_SYMBOL(sock_no_sendpage);
2867
2868ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2869 int offset, size_t size, int flags)
2870{
2871 ssize_t res;
2872 struct msghdr msg = {.msg_flags = flags};
2873 struct kvec iov;
2874 char *kaddr = kmap(page);
2875
2876 iov.iov_base = kaddr + offset;
2877 iov.iov_len = size;
2878 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2879 kunmap(page);
2880 return res;
2881}
2882EXPORT_SYMBOL(sock_no_sendpage_locked);
2883
2884/*
2885 * Default Socket Callbacks
2886 */
2887
2888static void sock_def_wakeup(struct sock *sk)
2889{
2890 struct socket_wq *wq;
2891
2892 rcu_read_lock();
2893 wq = rcu_dereference(sk->sk_wq);
2894 if (skwq_has_sleeper(wq))
2895 wake_up_interruptible_all(&wq->wait);
2896 rcu_read_unlock();
2897}
2898
2899static void sock_def_error_report(struct sock *sk)
2900{
2901 struct socket_wq *wq;
2902
2903 rcu_read_lock();
2904 wq = rcu_dereference(sk->sk_wq);
2905 if (skwq_has_sleeper(wq))
2906 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2907 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2908 rcu_read_unlock();
2909}
2910
2911void sock_def_readable(struct sock *sk)
2912{
2913 struct socket_wq *wq;
2914
2915 rcu_read_lock();
2916 wq = rcu_dereference(sk->sk_wq);
2917 if (skwq_has_sleeper(wq))
2918 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2919 EPOLLRDNORM | EPOLLRDBAND);
2920 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2921 rcu_read_unlock();
2922}
2923
2924static void sock_def_write_space(struct sock *sk)
2925{
2926 struct socket_wq *wq;
2927
2928 rcu_read_lock();
2929
2930 /* Do not wake up a writer until he can make "significant"
2931 * progress. --DaveM
2932 */
2933 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
2934 wq = rcu_dereference(sk->sk_wq);
2935 if (skwq_has_sleeper(wq))
2936 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2937 EPOLLWRNORM | EPOLLWRBAND);
2938
2939 /* Should agree with poll, otherwise some programs break */
2940 if (sock_writeable(sk))
2941 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2942 }
2943
2944 rcu_read_unlock();
2945}
2946
2947static void sock_def_destruct(struct sock *sk)
2948{
2949}
2950
2951void sk_send_sigurg(struct sock *sk)
2952{
2953 if (sk->sk_socket && sk->sk_socket->file)
2954 if (send_sigurg(&sk->sk_socket->file->f_owner))
2955 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2956}
2957EXPORT_SYMBOL(sk_send_sigurg);
2958
2959void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2960 unsigned long expires)
2961{
2962 if (!mod_timer(timer, expires))
2963 sock_hold(sk);
2964}
2965EXPORT_SYMBOL(sk_reset_timer);
2966
2967void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2968{
2969 if (del_timer(timer))
2970 __sock_put(sk);
2971}
2972EXPORT_SYMBOL(sk_stop_timer);
2973
2974void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
2975{
2976 if (del_timer_sync(timer))
2977 __sock_put(sk);
2978}
2979EXPORT_SYMBOL(sk_stop_timer_sync);
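
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * the reference-counting contract of the timer helpers above.
 * sk_reset_timer() takes a sock reference only when it arms a previously
 * idle timer, and sk_stop_timer() drops one only when it deletes a
 * pending timer, so the handler releases its reference when it finishes.
 * example_timer_handler is an assumed name; sk_timer is the generic
 * timer embedded in struct sock.
 */
static void example_timer_handler(struct timer_list *t)
{
	struct sock *sk = from_timer(sk, t, sk_timer);

	/* ... protocol work, possibly re-arming via sk_reset_timer() ... */
	sock_put(sk);		/* pairs with sock_hold() in sk_reset_timer() */
}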
2980
2981void sock_init_data(struct socket *sock, struct sock *sk)
2982{
2983 sk_init_common(sk);
2984 sk->sk_send_head = NULL;
2985
2986 timer_setup(&sk->sk_timer, NULL, 0);
2987
2988 sk->sk_allocation = GFP_KERNEL;
2989 sk->sk_rcvbuf = sysctl_rmem_default;
2990 sk->sk_sndbuf = sysctl_wmem_default;
2991 sk->sk_state = TCP_CLOSE;
2992 sk_set_socket(sk, sock);
2993
2994 sock_set_flag(sk, SOCK_ZAPPED);
2995
2996 if (sock) {
2997 sk->sk_type = sock->type;
2998 RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
2999 sock->sk = sk;
3000 sk->sk_uid = SOCK_INODE(sock)->i_uid;
3001 } else {
3002 RCU_INIT_POINTER(sk->sk_wq, NULL);
3003 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
3004 }
3005
3006 rwlock_init(&sk->sk_callback_lock);
3007 if (sk->sk_kern_sock)
3008 lockdep_set_class_and_name(
3009 &sk->sk_callback_lock,
3010 af_kern_callback_keys + sk->sk_family,
3011 af_family_kern_clock_key_strings[sk->sk_family]);
3012 else
3013 lockdep_set_class_and_name(
3014 &sk->sk_callback_lock,
3015 af_callback_keys + sk->sk_family,
3016 af_family_clock_key_strings[sk->sk_family]);
3017
3018 sk->sk_state_change = sock_def_wakeup;
3019 sk->sk_data_ready = sock_def_readable;
3020 sk->sk_write_space = sock_def_write_space;
3021 sk->sk_error_report = sock_def_error_report;
3022 sk->sk_destruct = sock_def_destruct;
3023
3024 sk->sk_frag.page = NULL;
3025 sk->sk_frag.offset = 0;
3026 sk->sk_peek_off = -1;
3027
3028 sk->sk_peer_pid = NULL;
3029 sk->sk_peer_cred = NULL;
3030 sk->sk_write_pending = 0;
3031 sk->sk_rcvlowat = 1;
3032 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
3033 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
3034
3035 sk->sk_stamp = SK_DEFAULT_STAMP;
3036#if BITS_PER_LONG==32
3037 seqlock_init(&sk->sk_stamp_seq);
3038#endif
3039 atomic_set(&sk->sk_zckey, 0);
3040
3041#ifdef CONFIG_NET_RX_BUSY_POLL
3042 sk->sk_napi_id = 0;
3043 sk->sk_ll_usec = sysctl_net_busy_read;
3044#endif
3045
3046 sk->sk_max_pacing_rate = ~0UL;
3047 sk->sk_pacing_rate = ~0UL;
3048 WRITE_ONCE(sk->sk_pacing_shift, 10);
3049 sk->sk_incoming_cpu = -1;
3050
3051 sk_rx_queue_clear(sk);
3052 /*
3053 * Before updating sk_refcnt, we must commit prior changes to memory
3054 * (Documentation/RCU/rculist_nulls.rst for details)
3055 */
3056 smp_wmb();
3057 refcount_set(&sk->sk_refcnt, 1);
3058 atomic_set(&sk->sk_drops, 0);
3059}
3060EXPORT_SYMBOL(sock_init_data);
3061
3062void lock_sock_nested(struct sock *sk, int subclass)
3063{
3064 might_sleep();
3065 spin_lock_bh(&sk->sk_lock.slock);
3066 if (sk->sk_lock.owned)
3067 __lock_sock(sk);
3068 sk->sk_lock.owned = 1;
3069 spin_unlock(&sk->sk_lock.slock);
3070 /*
3071 * The sk_lock has mutex_lock() semantics here:
3072 */
3073 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3074 local_bh_enable();
3075}
3076EXPORT_SYMBOL(lock_sock_nested);
3077
3078void release_sock(struct sock *sk)
3079{
3080 spin_lock_bh(&sk->sk_lock.slock);
3081 if (sk->sk_backlog.tail)
3082 __release_sock(sk);
3083
3084 /* Warning : release_cb() might need to release sk ownership,
3085 * ie call sock_release_ownership(sk) before us.
3086 */
3087 if (sk->sk_prot->release_cb)
3088 sk->sk_prot->release_cb(sk);
3089
3090 sock_release_ownership(sk);
3091 if (waitqueue_active(&sk->sk_lock.wq))
3092 wake_up(&sk->sk_lock.wq);
3093 spin_unlock_bh(&sk->sk_lock.slock);
3094}
3095EXPORT_SYMBOL(release_sock);
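
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * the usual process-context bracket.  lock_sock() takes ownership,
 * sleeping in __lock_sock() if needed, and release_sock() above replays
 * any backlog queued while the socket was owned.  example_set_priority
 * is an assumed name.
 */
static void example_set_priority(struct sock *sk, u32 val)
{
	lock_sock(sk);
	sk->sk_priority = val;	/* illustrative field update under the lock */
	release_sock(sk);
}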
3096
3097/**
3098 * lock_sock_fast - fast version of lock_sock
3099 * @sk: socket
3100 *
3101 * This version should be used for very small sections, where the process won't block.
3102 * It returns false if the fast path is taken:
3103 *
3104 * sk_lock.slock locked, owned = 0, BH disabled
3105 *
3106 * It returns true if the slow path is taken:
3107 *
3108 * sk_lock.slock unlocked, owned = 1, BH enabled
3109 */
3110bool lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3111{
3112 might_sleep();
3113 spin_lock_bh(&sk->sk_lock.slock);
3114
3115 if (!sk->sk_lock.owned)
3116 /*
3117 * Note : We must disable BH
3118 */
3119 return false;
3120
3121 __lock_sock(sk);
3122 sk->sk_lock.owned = 1;
3123 spin_unlock(&sk->sk_lock.slock);
3124 /*
3125 * The sk_lock has mutex_lock() semantics here:
3126 */
3127 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
3128 __acquire(&sk->sk_lock.slock);
3129 local_bh_enable();
3130 return true;
3131}
3132EXPORT_SYMBOL(lock_sock_fast);
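
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * callers pair lock_sock_fast() with unlock_sock_fast(), handing back
 * the returned "slow" flag so the matching unlock path (spin_unlock_bh()
 * or release_sock()) is taken.  example_read_err is an assumed name.
 */
static int example_read_err(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);
	int err = sk->sk_err;	/* a short, non-blocking access */

	unlock_sock_fast(sk, slow);
	return err;
}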
3133
3134int sock_gettstamp(struct socket *sock, void __user *userstamp,
3135 bool timeval, bool time32)
3136{
3137 struct sock *sk = sock->sk;
3138 struct timespec64 ts;
3139
3140 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3141 ts = ktime_to_timespec64(sock_read_timestamp(sk));
3142 if (ts.tv_sec == -1)
3143 return -ENOENT;
3144 if (ts.tv_sec == 0) {
3145 ktime_t kt = ktime_get_real();
3146 sock_write_timestamp(sk, kt);
3147 ts = ktime_to_timespec64(kt);
3148 }
3149
3150 if (timeval)
3151 ts.tv_nsec /= 1000;
3152
3153#ifdef CONFIG_COMPAT_32BIT_TIME
3154 if (time32)
3155 return put_old_timespec32(&ts, userstamp);
3156#endif
3157#ifdef CONFIG_SPARC64
3158 /* beware of padding in sparc64 timeval */
3159 if (timeval && !in_compat_syscall()) {
3160 struct __kernel_old_timeval __user tv = {
3161 .tv_sec = ts.tv_sec,
3162 .tv_usec = ts.tv_nsec,
3163 };
3164 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3165 return -EFAULT;
3166 return 0;
3167 }
3168#endif
3169 return put_timespec64(&ts, userstamp);
3170}
3171EXPORT_SYMBOL(sock_gettstamp);
3172
3173void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3174{
3175 if (!sock_flag(sk, flag)) {
3176 unsigned long previous_flags = sk->sk_flags;
3177
3178 sock_set_flag(sk, flag);
3179 /*
3180 * we just set one of the two flags which require net
3181 * time stamping, but time stamping might have been on
3182 * already because of the other one
3183 */
3184 if (sock_needs_netstamp(sk) &&
3185 !(previous_flags & SK_FLAGS_TIMESTAMP))
3186 net_enable_timestamp();
3187 }
3188}
3189
3190int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3191 int level, int type)
3192{
3193 struct sock_exterr_skb *serr;
3194 struct sk_buff *skb;
3195 int copied, err;
3196
3197 err = -EAGAIN;
3198 skb = sock_dequeue_err_skb(sk);
3199 if (skb == NULL)
3200 goto out;
3201
3202 copied = skb->len;
3203 if (copied > len) {
3204 msg->msg_flags |= MSG_TRUNC;
3205 copied = len;
3206 }
3207 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3208 if (err)
3209 goto out_free_skb;
3210
3211 sock_recv_timestamp(msg, sk, skb);
3212
3213 serr = SKB_EXT_ERR(skb);
3214 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3215
3216 msg->msg_flags |= MSG_ERRQUEUE;
3217 err = copied;
3218
3219out_free_skb:
3220 kfree_skb(skb);
3221out:
3222 return err;
3223}
3224EXPORT_SYMBOL(sock_recv_errqueue);
3225
3226/*
3227 * Get a socket option on a socket.
3228 *
3229 * FIX: POSIX 1003.1g is very ambiguous here. It states that
3230 * asynchronous errors should be reported by getsockopt. We assume
3231 * this means if you specify SO_ERROR (otherwise what's the point of it).
3232 */
3233int sock_common_getsockopt(struct socket *sock, int level, int optname,
3234 char __user *optval, int __user *optlen)
3235{
3236 struct sock *sk = sock->sk;
3237
3238 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3239}
3240EXPORT_SYMBOL(sock_common_getsockopt);
3241
3242int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3243 int flags)
3244{
3245 struct sock *sk = sock->sk;
3246 int addr_len = 0;
3247 int err;
3248
3249 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3250 flags & ~MSG_DONTWAIT, &addr_len);
3251 if (err >= 0)
3252 msg->msg_namelen = addr_len;
3253 return err;
3254}
3255EXPORT_SYMBOL(sock_common_recvmsg);
3256
3257/*
3258 * Set socket options on an inet socket.
3259 */
3260int sock_common_setsockopt(struct socket *sock, int level, int optname,
3261 sockptr_t optval, unsigned int optlen)
3262{
3263 struct sock *sk = sock->sk;
3264
3265 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3266}
3267EXPORT_SYMBOL(sock_common_setsockopt);
3268
3269void sk_common_release(struct sock *sk)
3270{
3271 if (sk->sk_prot->destroy)
3272 sk->sk_prot->destroy(sk);
3273
3274 /*
3275 * Observation: when sk_common_release is called, processes have
3276 * no access to socket. But net still has.
3277 * Step one, detach it from networking:
3278 *
3279 * A. Remove from hash tables.
3280 */
3281
3282 sk->sk_prot->unhash(sk);
3283
3284 /*
3285 * At this point the socket cannot receive new packets, but it is possible
3286 * that some packets are in flight because some CPU runs the receiver and
3287 * did a hash table lookup before we unhashed the socket. They will reach the
3288 * receive queue and will be purged by the socket destructor.
3289 *
3290 * Also, we still have packets pending on the receive queue and, probably,
3291 * our own packets waiting in device queues. sock_destroy will drain the
3292 * receive queue, but transmitted packets will delay socket destruction
3293 * until the last reference is released.
3294 */
3295
3296 sock_orphan(sk);
3297
3298 xfrm_sk_free_policy(sk);
3299
3300 sk_refcnt_debug_release(sk);
3301
3302 sock_put(sk);
3303}
3304EXPORT_SYMBOL(sk_common_release);
3305
3306void sk_get_meminfo(const struct sock *sk, u32 *mem)
3307{
3308 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3309
3310 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3311 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3312 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3313 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3314 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3315 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3316 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3317 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3318 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3319}
3320
3321#ifdef CONFIG_PROC_FS
3322#define PROTO_INUSE_NR 64 /* should be enough for the first time */
3323struct prot_inuse {
3324 int val[PROTO_INUSE_NR];
3325};
3326
3327static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3328
3329void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3330{
3331 __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3332}
3333EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3334
3335int sock_prot_inuse_get(struct net *net, struct proto *prot)
3336{
3337 int cpu, idx = prot->inuse_idx;
3338 int res = 0;
3339
3340 for_each_possible_cpu(cpu)
3341 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3342
3343 return res >= 0 ? res : 0;
3344}
3345EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3346
3347static void sock_inuse_add(struct net *net, int val)
3348{
3349 this_cpu_add(*net->core.sock_inuse, val);
3350}
3351
3352int sock_inuse_get(struct net *net)
3353{
3354 int cpu, res = 0;
3355
3356 for_each_possible_cpu(cpu)
3357 res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3358
3359 return res;
3360}
3361
3362EXPORT_SYMBOL_GPL(sock_inuse_get);
3363
3364static int __net_init sock_inuse_init_net(struct net *net)
3365{
3366 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3367 if (net->core.prot_inuse == NULL)
3368 return -ENOMEM;
3369
3370 net->core.sock_inuse = alloc_percpu(int);
3371 if (net->core.sock_inuse == NULL)
3372 goto out;
3373
3374 return 0;
3375
3376out:
3377 free_percpu(net->core.prot_inuse);
3378 return -ENOMEM;
3379}
3380
3381static void __net_exit sock_inuse_exit_net(struct net *net)
3382{
3383 free_percpu(net->core.prot_inuse);
3384 free_percpu(net->core.sock_inuse);
3385}
3386
3387static struct pernet_operations net_inuse_ops = {
3388 .init = sock_inuse_init_net,
3389 .exit = sock_inuse_exit_net,
3390};
3391
3392static __init int net_inuse_init(void)
3393{
3394 if (register_pernet_subsys(&net_inuse_ops))
3395 panic("Cannot initialize net inuse counters");
3396
3397 return 0;
3398}
3399
3400core_initcall(net_inuse_init);
3401
3402static int assign_proto_idx(struct proto *prot)
3403{
3404 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3405
3406 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3407 pr_err("PROTO_INUSE_NR exhausted\n");
3408 return -ENOSPC;
3409 }
3410
3411 set_bit(prot->inuse_idx, proto_inuse_idx);
3412 return 0;
3413}
3414
3415static void release_proto_idx(struct proto *prot)
3416{
3417 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3418 clear_bit(prot->inuse_idx, proto_inuse_idx);
3419}
3420#else
3421static inline int assign_proto_idx(struct proto *prot)
3422{
3423 return 0;
3424}
3425
3426static inline void release_proto_idx(struct proto *prot)
3427{
3428}
3429
3430static void sock_inuse_add(struct net *net, int val)
3431{
3432}
3433#endif
3434
3435static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3436{
3437 if (!twsk_prot)
3438 return;
3439 kfree(twsk_prot->twsk_slab_name);
3440 twsk_prot->twsk_slab_name = NULL;
3441 kmem_cache_destroy(twsk_prot->twsk_slab);
3442 twsk_prot->twsk_slab = NULL;
3443}
3444
3445static int tw_prot_init(const struct proto *prot)
3446{
3447 struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3448
3449 if (!twsk_prot)
3450 return 0;
3451
3452 twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3453 prot->name);
3454 if (!twsk_prot->twsk_slab_name)
3455 return -ENOMEM;
3456
3457 twsk_prot->twsk_slab =
3458 kmem_cache_create(twsk_prot->twsk_slab_name,
3459 twsk_prot->twsk_obj_size, 0,
3460 SLAB_ACCOUNT | prot->slab_flags,
3461 NULL);
3462 if (!twsk_prot->twsk_slab) {
3463 pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3464 prot->name);
3465 return -ENOMEM;
3466 }
3467
3468 return 0;
3469}
3470
3471static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3472{
3473 if (!rsk_prot)
3474 return;
3475 kfree(rsk_prot->slab_name);
3476 rsk_prot->slab_name = NULL;
3477 kmem_cache_destroy(rsk_prot->slab);
3478 rsk_prot->slab = NULL;
3479}
3480
3481static int req_prot_init(const struct proto *prot)
3482{
3483 struct request_sock_ops *rsk_prot = prot->rsk_prot;
3484
3485 if (!rsk_prot)
3486 return 0;
3487
3488 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3489 prot->name);
3490 if (!rsk_prot->slab_name)
3491 return -ENOMEM;
3492
3493 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3494 rsk_prot->obj_size, 0,
3495 SLAB_ACCOUNT | prot->slab_flags,
3496 NULL);
3497
3498 if (!rsk_prot->slab) {
3499 pr_crit("%s: Can't create request sock SLAB cache!\n",
3500 prot->name);
3501 return -ENOMEM;
3502 }
3503 return 0;
3504}
3505
3506int proto_register(struct proto *prot, int alloc_slab)
3507{
3508 int ret = -ENOBUFS;
3509
3510 if (alloc_slab) {
3511 prot->slab = kmem_cache_create_usercopy(prot->name,
3512 prot->obj_size, 0,
3513 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3514 prot->slab_flags,
3515 prot->useroffset, prot->usersize,
3516 NULL);
3517
3518 if (prot->slab == NULL) {
3519 pr_crit("%s: Can't create sock SLAB cache!\n",
3520 prot->name);
3521 goto out;
3522 }
3523
3524 if (req_prot_init(prot))
3525 goto out_free_request_sock_slab;
3526
3527 if (tw_prot_init(prot))
3528 goto out_free_timewait_sock_slab;
3529 }
3530
3531 mutex_lock(&proto_list_mutex);
3532 ret = assign_proto_idx(prot);
3533 if (ret) {
3534 mutex_unlock(&proto_list_mutex);
3535 goto out_free_timewait_sock_slab;
3536 }
3537 list_add(&prot->node, &proto_list);
3538 mutex_unlock(&proto_list_mutex);
3539 return ret;
3540
3541out_free_timewait_sock_slab:
3542 if (alloc_slab)
3543 tw_prot_cleanup(prot->twsk_prot);
3544out_free_request_sock_slab:
3545 if (alloc_slab) {
3546 req_prot_cleanup(prot->rsk_prot);
3547
3548 kmem_cache_destroy(prot->slab);
3549 prot->slab = NULL;
3550 }
3551out:
3552 return ret;
3553}
3554EXPORT_SYMBOL(proto_register);
3555
3556void proto_unregister(struct proto *prot)
3557{
3558 mutex_lock(&proto_list_mutex);
3559 release_proto_idx(prot);
3560 list_del(&prot->node);
3561 mutex_unlock(&proto_list_mutex);
3562
3563 kmem_cache_destroy(prot->slab);
3564 prot->slab = NULL;
3565
3566 req_prot_cleanup(prot->rsk_prot);
3567 tw_prot_cleanup(prot->twsk_prot);
3568}
3569EXPORT_SYMBOL(proto_unregister);
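
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * init/exit pairing for the registration API above, as a module would
 * wire it up with module_init()/module_exit().  example_prot is a
 * deliberately minimal, assumed struct proto; passing alloc_slab=1 asks
 * proto_register() to create the per-protocol slab cache of obj_size
 * bytes (plus request/timewait caches when provided).
 */
static struct proto example_prot = {
	.name		= "EXAMPLE",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct sock),
};

static int __init example_proto_module_init(void)
{
	return proto_register(&example_prot, 1);
}

static void __exit example_proto_module_exit(void)
{
	proto_unregister(&example_prot);
}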
3570
3571int sock_load_diag_module(int family, int protocol)
3572{
3573 if (!protocol) {
3574 if (!sock_is_registered(family))
3575 return -ENOENT;
3576
3577 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3578 NETLINK_SOCK_DIAG, family);
3579 }
3580
3581#ifdef CONFIG_INET
3582 if (family == AF_INET &&
3583 protocol != IPPROTO_RAW &&
3584 protocol < MAX_INET_PROTOS &&
3585 !rcu_access_pointer(inet_protos[protocol]))
3586 return -ENOENT;
3587#endif
3588
3589 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3590 NETLINK_SOCK_DIAG, family, protocol);
3591}
3592EXPORT_SYMBOL(sock_load_diag_module);
3593
3594#ifdef CONFIG_PROC_FS
3595static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3596 __acquires(proto_list_mutex)
3597{
3598 mutex_lock(&proto_list_mutex);
3599 return seq_list_start_head(&proto_list, *pos);
3600}
3601
3602static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3603{
3604 return seq_list_next(v, &proto_list, pos);
3605}
3606
3607static void proto_seq_stop(struct seq_file *seq, void *v)
3608 __releases(proto_list_mutex)
3609{
3610 mutex_unlock(&proto_list_mutex);
3611}
3612
3613static char proto_method_implemented(const void *method)
3614{
3615 return method == NULL ? 'n' : 'y';
3616}
3617static long sock_prot_memory_allocated(struct proto *proto)
3618{
3619 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3620}
3621
3622static const char *sock_prot_memory_pressure(struct proto *proto)
3623{
3624 return proto->memory_pressure != NULL ?
3625 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3626}
3627
3628static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3629{
3630
3631 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
3632 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3633 proto->name,
3634 proto->obj_size,
3635 sock_prot_inuse_get(seq_file_net(seq), proto),
3636 sock_prot_memory_allocated(proto),
3637 sock_prot_memory_pressure(proto),
3638 proto->max_header,
3639 proto->slab == NULL ? "no" : "yes",
3640 module_name(proto->owner),
3641 proto_method_implemented(proto->close),
3642 proto_method_implemented(proto->connect),
3643 proto_method_implemented(proto->disconnect),
3644 proto_method_implemented(proto->accept),
3645 proto_method_implemented(proto->ioctl),
3646 proto_method_implemented(proto->init),
3647 proto_method_implemented(proto->destroy),
3648 proto_method_implemented(proto->shutdown),
3649 proto_method_implemented(proto->setsockopt),
3650 proto_method_implemented(proto->getsockopt),
3651 proto_method_implemented(proto->sendmsg),
3652 proto_method_implemented(proto->recvmsg),
3653 proto_method_implemented(proto->sendpage),
3654 proto_method_implemented(proto->bind),
3655 proto_method_implemented(proto->backlog_rcv),
3656 proto_method_implemented(proto->hash),
3657 proto_method_implemented(proto->unhash),
3658 proto_method_implemented(proto->get_port),
3659 proto_method_implemented(proto->enter_memory_pressure));
3660}
3661
3662static int proto_seq_show(struct seq_file *seq, void *v)
3663{
3664 if (v == &proto_list)
3665 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3666 "protocol",
3667 "size",
3668 "sockets",
3669 "memory",
3670 "press",
3671 "maxhdr",
3672 "slab",
3673 "module",
3674 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3675 else
3676 proto_seq_printf(seq, list_entry(v, struct proto, node));
3677 return 0;
3678}
3679
3680static const struct seq_operations proto_seq_ops = {
3681 .start = proto_seq_start,
3682 .next = proto_seq_next,
3683 .stop = proto_seq_stop,
3684 .show = proto_seq_show,
3685};
3686
3687static __net_init int proto_init_net(struct net *net)
3688{
3689 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3690 sizeof(struct seq_net_private)))
3691 return -ENOMEM;
3692
3693 return 0;
3694}
3695
3696static __net_exit void proto_exit_net(struct net *net)
3697{
3698 remove_proc_entry("protocols", net->proc_net);
3699}
3700
3701
3702static __net_initdata struct pernet_operations proto_net_ops = {
3703 .init = proto_init_net,
3704 .exit = proto_exit_net,
3705};
3706
3707static int __init proto_init(void)
3708{
3709 return register_pernet_subsys(&proto_net_ops);
3710}
3711
3712subsys_initcall(proto_init);
3713
3714 #endif /* CONFIG_PROC_FS */
3715
3716#ifdef CONFIG_NET_RX_BUSY_POLL
3717bool sk_busy_loop_end(void *p, unsigned long start_time)
3718{
3719 struct sock *sk = p;
3720
3721 return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3722 sk_busy_loop_timeout(sk, start_time);
3723}
3724EXPORT_SYMBOL(sk_busy_loop_end);
3725#endif /* CONFIG_NET_RX_BUSY_POLL */
3726
3727int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
3728{
3729 if (!sk->sk_prot->bind_add)
3730 return -EOPNOTSUPP;
3731 return sk->sk_prot->bind_add(sk, addr, addr_len);
3732}
3733EXPORT_SYMBOL(sock_bind_add);