Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Generic socket support routines. Memory allocators, socket lock/release
8 * handler for protocols to use and generic option handler.
9 *
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 */
85
86#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87
88#include <asm/unaligned.h>
89#include <linux/capability.h>
90#include <linux/errno.h>
91#include <linux/errqueue.h>
92#include <linux/types.h>
93#include <linux/socket.h>
94#include <linux/in.h>
95#include <linux/kernel.h>
96#include <linux/module.h>
97#include <linux/proc_fs.h>
98#include <linux/seq_file.h>
99#include <linux/sched.h>
100#include <linux/sched/mm.h>
101#include <linux/timer.h>
102#include <linux/string.h>
103#include <linux/sockios.h>
104#include <linux/net.h>
105#include <linux/mm.h>
106#include <linux/slab.h>
107#include <linux/interrupt.h>
108#include <linux/poll.h>
109#include <linux/tcp.h>
110#include <linux/init.h>
111#include <linux/highmem.h>
112#include <linux/user_namespace.h>
113#include <linux/static_key.h>
114#include <linux/memcontrol.h>
115#include <linux/prefetch.h>
116#include <linux/compat.h>
117
118#include <linux/uaccess.h>
119
120#include <linux/netdevice.h>
121#include <net/protocol.h>
122#include <linux/skbuff.h>
123#include <net/net_namespace.h>
124#include <net/request_sock.h>
125#include <net/sock.h>
126#include <linux/net_tstamp.h>
127#include <net/xfrm.h>
128#include <linux/ipsec.h>
129#include <net/cls_cgroup.h>
130#include <net/netprio_cgroup.h>
131#include <linux/sock_diag.h>
132
133#include <linux/filter.h>
134#include <net/sock_reuseport.h>
135#include <net/bpf_sk_storage.h>
136
137#include <trace/events/sock.h>
138
139#include <net/tcp.h>
140#include <net/busy_poll.h>
141
142#include <linux/ethtool.h>
143
144static DEFINE_MUTEX(proto_list_mutex);
145static LIST_HEAD(proto_list);
146
147/**
148 * sk_ns_capable - General socket capability test
149 * @sk: Socket to use a capability on or through
150 * @user_ns: The user namespace of the capability to use
151 * @cap: The capability to use
152 *
153 * Test to see if the opener of the socket had the capability @cap when the
154 * socket was created and the current process has the capability @cap in the
155 * user namespace @user_ns.
156 */
157bool sk_ns_capable(const struct sock *sk,
158 struct user_namespace *user_ns, int cap)
159{
160 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
161 ns_capable(user_ns, cap);
162}
163EXPORT_SYMBOL(sk_ns_capable);
164
165/**
166 * sk_capable - Socket global capability test
167 * @sk: Socket to use a capability on or through
168 * @cap: The global capability to use
169 *
170 * Test to see if the opener of the socket had the capability @cap when the
171 * socket was created and the current process has the capability @cap in all
172 * user namespaces.
173 */
174bool sk_capable(const struct sock *sk, int cap)
175{
176 return sk_ns_capable(sk, &init_user_ns, cap);
177}
178EXPORT_SYMBOL(sk_capable);
179
180/**
181 * sk_net_capable - Network namespace socket capability test
182 * @sk: Socket to use a capability on or through
183 * @cap: The capability to use
184 *
185 * Test to see if the opener of the socket had the capability @cap when the
186 * socket was created and the current process has the capability @cap over
187 * the network namespace the socket is a member of.
188 */
189bool sk_net_capable(const struct sock *sk, int cap)
190{
191 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
192}
193EXPORT_SYMBOL(sk_net_capable);
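
/*
 * Example (illustrative sketch, not part of this file): a protocol's
 * setsockopt handler could gate a privileged, per-netns option with
 * sk_net_capable(). "example_set_priv_option" and its caller are
 * assumptions for the sketch.
 */
static int example_set_priv_option(struct sock *sk, int val)
{
	if (!sk_net_capable(sk, CAP_NET_ADMIN))
		return -EPERM;

	/* ... apply the privileged setting using val ... */
	return 0;
}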
194
195/*
196 * Each address family might have different locking rules, so we have
197 * one slock key per address family and separate keys for internal and
198 * userspace sockets.
199 */
200static struct lock_class_key af_family_keys[AF_MAX];
201static struct lock_class_key af_family_kern_keys[AF_MAX];
202static struct lock_class_key af_family_slock_keys[AF_MAX];
203static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
204
205/*
206 * Make lock validator output more readable. (We pre-construct these
207 * strings at build time, so that runtime initialization of socket
208 * locks is fast):
209 */
210
211#define _sock_locks(x) \
212 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
213 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
214 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
215 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
216 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
217 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
218 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
219 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
220 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
221 x "27" , x "28" , x "AF_CAN" , \
222 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
223 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
224 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
225 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
226 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
227 x "AF_MCTP" , \
228 x "AF_MAX"
229
230static const char *const af_family_key_strings[AF_MAX+1] = {
231 _sock_locks("sk_lock-")
232};
233static const char *const af_family_slock_key_strings[AF_MAX+1] = {
234 _sock_locks("slock-")
235};
236static const char *const af_family_clock_key_strings[AF_MAX+1] = {
237 _sock_locks("clock-")
238};
239
240static const char *const af_family_kern_key_strings[AF_MAX+1] = {
241 _sock_locks("k-sk_lock-")
242};
243static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
244 _sock_locks("k-slock-")
245};
246static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
247 _sock_locks("k-clock-")
248};
249static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
250 _sock_locks("rlock-")
251};
252static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
253 _sock_locks("wlock-")
254};
255static const char *const af_family_elock_key_strings[AF_MAX+1] = {
256 _sock_locks("elock-")
257};
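
/*
 * For reference: with the expansion above, af_family_key_strings[AF_INET]
 * is "sk_lock-AF_INET", af_family_slock_key_strings[AF_INET] is
 * "slock-AF_INET", and the kernel-socket variants use the "k-" prefixed
 * forms such as "k-sk_lock-AF_INET".
 */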
258
259/*
260 * sk_callback_lock and sk queues locking rules are per-address-family,
261 * so split the lock classes by using a per-AF key:
262 */
263static struct lock_class_key af_callback_keys[AF_MAX];
264static struct lock_class_key af_rlock_keys[AF_MAX];
265static struct lock_class_key af_wlock_keys[AF_MAX];
266static struct lock_class_key af_elock_keys[AF_MAX];
267static struct lock_class_key af_kern_callback_keys[AF_MAX];
268
269/* Run time adjustable parameters. */
270__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
271EXPORT_SYMBOL(sysctl_wmem_max);
272__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
273EXPORT_SYMBOL(sysctl_rmem_max);
274__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
275__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
276
277/* Maximal space eaten by iovec or ancillary data plus some space */
278int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
279EXPORT_SYMBOL(sysctl_optmem_max);
280
281int sysctl_tstamp_allow_data __read_mostly = 1;
282
283DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
284EXPORT_SYMBOL_GPL(memalloc_socks_key);
285
286/**
287 * sk_set_memalloc - sets %SOCK_MEMALLOC
288 * @sk: socket to set it on
289 *
290 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
291 * It's the responsibility of the admin to adjust min_free_kbytes
292 * to meet the requirements.
293 */
294void sk_set_memalloc(struct sock *sk)
295{
296 sock_set_flag(sk, SOCK_MEMALLOC);
297 sk->sk_allocation |= __GFP_MEMALLOC;
298 static_branch_inc(&memalloc_socks_key);
299}
300EXPORT_SYMBOL_GPL(sk_set_memalloc);
301
302void sk_clear_memalloc(struct sock *sk)
303{
304 sock_reset_flag(sk, SOCK_MEMALLOC);
305 sk->sk_allocation &= ~__GFP_MEMALLOC;
306 static_branch_dec(&memalloc_socks_key);
307
308 /*
309 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
310 * progress of swapping. SOCK_MEMALLOC may be cleared while
311 * it has rmem allocations due to the last swapfile being deactivated
312 * but there is a risk that the socket is unusable due to exceeding
313 * the rmem limits. Reclaim the reserves and obey rmem limits again.
314 */
315 sk_mem_reclaim(sk);
316}
317EXPORT_SYMBOL_GPL(sk_clear_memalloc);
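
/*
 * Example (illustrative sketch, not part of this file): a block or
 * filesystem transport that may sit under swap I/O would flag its kernel
 * socket so it can dip into the emergency reserves, and clear the flag
 * again once swap no longer depends on it. "transport_sk" is a
 * hypothetical socket owned by such a subsystem.
 */
static void example_swap_transport_memalloc(struct sock *transport_sk, bool on)
{
	if (on)
		sk_set_memalloc(transport_sk);
	else
		sk_clear_memalloc(transport_sk);
}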
318
319int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
320{
321 int ret;
322 unsigned int noreclaim_flag;
323
324 /* these should have been dropped before queueing */
325 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
326
327 noreclaim_flag = memalloc_noreclaim_save();
328 ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
329 tcp_v6_do_rcv,
330 tcp_v4_do_rcv,
331 sk, skb);
332 memalloc_noreclaim_restore(noreclaim_flag);
333
334 return ret;
335}
336EXPORT_SYMBOL(__sk_backlog_rcv);
337
338void sk_error_report(struct sock *sk)
339{
340 sk->sk_error_report(sk);
341
342 switch (sk->sk_family) {
343 case AF_INET:
344 fallthrough;
345 case AF_INET6:
346 trace_inet_sk_error_report(sk);
347 break;
348 default:
349 break;
350 }
351}
352EXPORT_SYMBOL(sk_error_report);
353
354int sock_get_timeout(long timeo, void *optval, bool old_timeval)
355{
356 struct __kernel_sock_timeval tv;
357
358 if (timeo == MAX_SCHEDULE_TIMEOUT) {
359 tv.tv_sec = 0;
360 tv.tv_usec = 0;
361 } else {
362 tv.tv_sec = timeo / HZ;
363 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
364 }
365
366 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
367 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
368 *(struct old_timeval32 *)optval = tv32;
369 return sizeof(tv32);
370 }
371
372 if (old_timeval) {
373 struct __kernel_old_timeval old_tv;
374 old_tv.tv_sec = tv.tv_sec;
375 old_tv.tv_usec = tv.tv_usec;
376 *(struct __kernel_old_timeval *)optval = old_tv;
377 return sizeof(old_tv);
378 }
379
380 *(struct __kernel_sock_timeval *)optval = tv;
381 return sizeof(tv);
382}
383EXPORT_SYMBOL(sock_get_timeout);
384
385int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
386 sockptr_t optval, int optlen, bool old_timeval)
387{
388 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
389 struct old_timeval32 tv32;
390
391 if (optlen < sizeof(tv32))
392 return -EINVAL;
393
394 if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
395 return -EFAULT;
396 tv->tv_sec = tv32.tv_sec;
397 tv->tv_usec = tv32.tv_usec;
398 } else if (old_timeval) {
399 struct __kernel_old_timeval old_tv;
400
401 if (optlen < sizeof(old_tv))
402 return -EINVAL;
403 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
404 return -EFAULT;
405 tv->tv_sec = old_tv.tv_sec;
406 tv->tv_usec = old_tv.tv_usec;
407 } else {
408 if (optlen < sizeof(*tv))
409 return -EINVAL;
410 if (copy_from_sockptr(tv, optval, sizeof(*tv)))
411 return -EFAULT;
412 }
413
414 return 0;
415}
416EXPORT_SYMBOL(sock_copy_user_timeval);
417
418static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
419 bool old_timeval)
420{
421 struct __kernel_sock_timeval tv;
422 int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
423
424 if (err)
425 return err;
426
427 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
428 return -EDOM;
429
430 if (tv.tv_sec < 0) {
431 static int warned __read_mostly;
432
433 *timeo_p = 0;
434 if (warned < 10 && net_ratelimit()) {
435 warned++;
436 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
437 __func__, current->comm, task_pid_nr(current));
438 }
439 return 0;
440 }
441 *timeo_p = MAX_SCHEDULE_TIMEOUT;
442 if (tv.tv_sec == 0 && tv.tv_usec == 0)
443 return 0;
444 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
445 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
446 return 0;
447}
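
/*
 * Worked example (assuming HZ == 1000): a userspace SO_RCVTIMEO_NEW value
 * of { .tv_sec = 2, .tv_usec = 500000 } is converted above to
 * *timeo_p == 2 * HZ + DIV_ROUND_UP(500000, 1000) == 2500 jiffies, and
 * sock_get_timeout() turns those 2500 jiffies back into 2.5 seconds when
 * the timeout is read with getsockopt().
 */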
448
449static bool sock_needs_netstamp(const struct sock *sk)
450{
451 switch (sk->sk_family) {
452 case AF_UNSPEC:
453 case AF_UNIX:
454 return false;
455 default:
456 return true;
457 }
458}
459
460static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
461{
462 if (sk->sk_flags & flags) {
463 sk->sk_flags &= ~flags;
464 if (sock_needs_netstamp(sk) &&
465 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
466 net_disable_timestamp();
467 }
468}
469
470
471int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
472{
473 unsigned long flags;
474 struct sk_buff_head *list = &sk->sk_receive_queue;
475
476 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
477 atomic_inc(&sk->sk_drops);
478 trace_sock_rcvqueue_full(sk, skb);
479 return -ENOMEM;
480 }
481
482 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
483 atomic_inc(&sk->sk_drops);
484 return -ENOBUFS;
485 }
486
487 skb->dev = NULL;
488 skb_set_owner_r(skb, sk);
489
490 /* we escape from the RCU-protected region, make sure we don't leak
491 * a non-refcounted dst
492 */
493 skb_dst_force(skb);
494
495 spin_lock_irqsave(&list->lock, flags);
496 sock_skb_set_dropcount(sk, skb);
497 __skb_queue_tail(list, skb);
498 spin_unlock_irqrestore(&list->lock, flags);
499
500 if (!sock_flag(sk, SOCK_DEAD))
501 sk->sk_data_ready(sk);
502 return 0;
503}
504EXPORT_SYMBOL(__sock_queue_rcv_skb);
505
506int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
507{
508 int err;
509
510 err = sk_filter(sk, skb);
511 if (err)
512 return err;
513
514 return __sock_queue_rcv_skb(sk, skb);
515}
516EXPORT_SYMBOL(sock_queue_rcv_skb);
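
/*
 * Example (illustrative sketch, not part of this file): a simple datagram
 * protocol's input routine hands packets to the owning socket with
 * sock_queue_rcv_skb() and drops them when the filter or the receive
 * queue rejects them. "example_lookup_sock()" is a hypothetical
 * per-protocol socket lookup helper.
 */
static int example_proto_rcv(struct sk_buff *skb)
{
	struct sock *sk = example_lookup_sock(skb);

	if (!sk || sock_queue_rcv_skb(sk, skb) < 0) {
		kfree_skb(skb);
		return NET_RX_DROP;
	}
	return NET_RX_SUCCESS;
}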
517
518int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
519 const int nested, unsigned int trim_cap, bool refcounted)
520{
521 int rc = NET_RX_SUCCESS;
522
523 if (sk_filter_trim_cap(sk, skb, trim_cap))
524 goto discard_and_relse;
525
526 skb->dev = NULL;
527
528 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
529 atomic_inc(&sk->sk_drops);
530 goto discard_and_relse;
531 }
532 if (nested)
533 bh_lock_sock_nested(sk);
534 else
535 bh_lock_sock(sk);
536 if (!sock_owned_by_user(sk)) {
537 /*
538 * trylock + unlock semantics:
539 */
540 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
541
542 rc = sk_backlog_rcv(sk, skb);
543
544 mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
545 } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
546 bh_unlock_sock(sk);
547 atomic_inc(&sk->sk_drops);
548 goto discard_and_relse;
549 }
550
551 bh_unlock_sock(sk);
552out:
553 if (refcounted)
554 sock_put(sk);
555 return rc;
556discard_and_relse:
557 kfree_skb(skb);
558 goto out;
559}
560EXPORT_SYMBOL(__sk_receive_skb);
561
562INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
563 u32));
564INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
565 u32));
566struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
567{
568 struct dst_entry *dst = __sk_dst_get(sk);
569
570 if (dst && dst->obsolete &&
571 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
572 dst, cookie) == NULL) {
573 sk_tx_queue_clear(sk);
574 sk->sk_dst_pending_confirm = 0;
575 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
576 dst_release(dst);
577 return NULL;
578 }
579
580 return dst;
581}
582EXPORT_SYMBOL(__sk_dst_check);
583
584struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
585{
586 struct dst_entry *dst = sk_dst_get(sk);
587
588 if (dst && dst->obsolete &&
589 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
590 dst, cookie) == NULL) {
591 sk_dst_reset(sk);
592 dst_release(dst);
593 return NULL;
594 }
595
596 return dst;
597}
598EXPORT_SYMBOL(sk_dst_check);
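
/*
 * Example (illustrative sketch, not part of this file): an output path
 * revalidates the socket's cached route before using it and falls back to
 * a fresh lookup when the cached dst has become obsolete. The cookie
 * value 0 and "example_route_output()" are assumptions for the sketch.
 */
static struct dst_entry *example_get_route(struct sock *sk)
{
	struct dst_entry *dst = sk_dst_check(sk, 0);

	if (!dst)
		dst = example_route_output(sk);	/* hypothetical re-lookup */

	return dst;
}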
599
600static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
601{
602 int ret = -ENOPROTOOPT;
603#ifdef CONFIG_NETDEVICES
604 struct net *net = sock_net(sk);
605
606 /* Sorry... */
607 ret = -EPERM;
608 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
609 goto out;
610
611 ret = -EINVAL;
612 if (ifindex < 0)
613 goto out;
614
615 sk->sk_bound_dev_if = ifindex;
616 if (sk->sk_prot->rehash)
617 sk->sk_prot->rehash(sk);
618 sk_dst_reset(sk);
619
620 ret = 0;
621
622out:
623#endif
624
625 return ret;
626}
627
628int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
629{
630 int ret;
631
632 if (lock_sk)
633 lock_sock(sk);
634 ret = sock_bindtoindex_locked(sk, ifindex);
635 if (lock_sk)
636 release_sock(sk);
637
638 return ret;
639}
640EXPORT_SYMBOL(sock_bindtoindex);
641
642static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
643{
644 int ret = -ENOPROTOOPT;
645#ifdef CONFIG_NETDEVICES
646 struct net *net = sock_net(sk);
647 char devname[IFNAMSIZ];
648 int index;
649
650 ret = -EINVAL;
651 if (optlen < 0)
652 goto out;
653
654 /* Bind this socket to a particular device like "eth0",
655 * as specified in the passed interface name. If the
656 * name is "" or the option length is zero the socket
657 * is not bound.
658 */
659 if (optlen > IFNAMSIZ - 1)
660 optlen = IFNAMSIZ - 1;
661 memset(devname, 0, sizeof(devname));
662
663 ret = -EFAULT;
664 if (copy_from_sockptr(devname, optval, optlen))
665 goto out;
666
667 index = 0;
668 if (devname[0] != '\0') {
669 struct net_device *dev;
670
671 rcu_read_lock();
672 dev = dev_get_by_name_rcu(net, devname);
673 if (dev)
674 index = dev->ifindex;
675 rcu_read_unlock();
676 ret = -ENODEV;
677 if (!dev)
678 goto out;
679 }
680
681 return sock_bindtoindex(sk, index, true);
682out:
683#endif
684
685 return ret;
686}
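
/*
 * Example (illustrative, userspace side): binding a socket to a device by
 * name lands in sock_setbindtodevice() above, while binding by interface
 * index goes through SO_BINDTOIFINDEX. The name "eth0" and the index 4
 * are assumptions for the sketch.
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0"));
 *
 *	int ifindex = 4;
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTOIFINDEX, &ifindex, sizeof(ifindex));
 */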
687
688static int sock_getbindtodevice(struct sock *sk, char __user *optval,
689 int __user *optlen, int len)
690{
691 int ret = -ENOPROTOOPT;
692#ifdef CONFIG_NETDEVICES
693 struct net *net = sock_net(sk);
694 char devname[IFNAMSIZ];
695
696 if (sk->sk_bound_dev_if == 0) {
697 len = 0;
698 goto zero;
699 }
700
701 ret = -EINVAL;
702 if (len < IFNAMSIZ)
703 goto out;
704
705 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
706 if (ret)
707 goto out;
708
709 len = strlen(devname) + 1;
710
711 ret = -EFAULT;
712 if (copy_to_user(optval, devname, len))
713 goto out;
714
715zero:
716 ret = -EFAULT;
717 if (put_user(len, optlen))
718 goto out;
719
720 ret = 0;
721
722out:
723#endif
724
725 return ret;
726}
727
728bool sk_mc_loop(struct sock *sk)
729{
730 if (dev_recursion_level())
731 return false;
732 if (!sk)
733 return true;
734 switch (sk->sk_family) {
735 case AF_INET:
736 return inet_sk(sk)->mc_loop;
737#if IS_ENABLED(CONFIG_IPV6)
738 case AF_INET6:
739 return inet6_sk(sk)->mc_loop;
740#endif
741 }
742 WARN_ON_ONCE(1);
743 return true;
744}
745EXPORT_SYMBOL(sk_mc_loop);
746
747void sock_set_reuseaddr(struct sock *sk)
748{
749 lock_sock(sk);
750 sk->sk_reuse = SK_CAN_REUSE;
751 release_sock(sk);
752}
753EXPORT_SYMBOL(sock_set_reuseaddr);
754
755void sock_set_reuseport(struct sock *sk)
756{
757 lock_sock(sk);
758 sk->sk_reuseport = true;
759 release_sock(sk);
760}
761EXPORT_SYMBOL(sock_set_reuseport);
762
763void sock_no_linger(struct sock *sk)
764{
765 lock_sock(sk);
766 sk->sk_lingertime = 0;
767 sock_set_flag(sk, SOCK_LINGER);
768 release_sock(sk);
769}
770EXPORT_SYMBOL(sock_no_linger);
771
772void sock_set_priority(struct sock *sk, u32 priority)
773{
774 lock_sock(sk);
775 sk->sk_priority = priority;
776 release_sock(sk);
777}
778EXPORT_SYMBOL(sock_set_priority);
779
780void sock_set_sndtimeo(struct sock *sk, s64 secs)
781{
782 lock_sock(sk);
783 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
784 sk->sk_sndtimeo = secs * HZ;
785 else
786 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
787 release_sock(sk);
788}
789EXPORT_SYMBOL(sock_set_sndtimeo);
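
/*
 * Example (illustrative sketch, not part of this file): in-kernel socket
 * users call these helpers instead of going through sock_setsockopt()
 * with sockptr wrappers. "srv" is assumed to be a kernel TCP socket
 * created with sock_create_kern().
 */
static void example_tune_kernel_listener(struct socket *srv)
{
	sock_set_reuseaddr(srv->sk);
	sock_no_linger(srv->sk);
	sock_set_sndtimeo(srv->sk, 5);	/* five second send timeout */
}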
790
791static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
792{
793 if (val) {
794 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
795 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
796 sock_set_flag(sk, SOCK_RCVTSTAMP);
797 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
798 } else {
799 sock_reset_flag(sk, SOCK_RCVTSTAMP);
800 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
801 }
802}
803
804void sock_enable_timestamps(struct sock *sk)
805{
806 lock_sock(sk);
807 __sock_set_timestamps(sk, true, false, true);
808 release_sock(sk);
809}
810EXPORT_SYMBOL(sock_enable_timestamps);
811
812void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
813{
814 switch (optname) {
815 case SO_TIMESTAMP_OLD:
816 __sock_set_timestamps(sk, valbool, false, false);
817 break;
818 case SO_TIMESTAMP_NEW:
819 __sock_set_timestamps(sk, valbool, true, false);
820 break;
821 case SO_TIMESTAMPNS_OLD:
822 __sock_set_timestamps(sk, valbool, false, true);
823 break;
824 case SO_TIMESTAMPNS_NEW:
825 __sock_set_timestamps(sk, valbool, true, true);
826 break;
827 }
828}
829
830static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
831{
832 struct net *net = sock_net(sk);
833 struct net_device *dev = NULL;
834 bool match = false;
835 int *vclock_index;
836 int i, num;
837
838 if (sk->sk_bound_dev_if)
839 dev = dev_get_by_index(net, sk->sk_bound_dev_if);
840
841 if (!dev) {
842 pr_err("%s: sock not bound to a device\n", __func__);
843 return -EOPNOTSUPP;
844 }
845
846 num = ethtool_get_phc_vclocks(dev, &vclock_index);
847 dev_put(dev);
848
849 for (i = 0; i < num; i++) {
850 if (*(vclock_index + i) == phc_index) {
851 match = true;
852 break;
853 }
854 }
855
856 if (num > 0)
857 kfree(vclock_index);
858
859 if (!match)
860 return -EINVAL;
861
862 sk->sk_bind_phc = phc_index;
863
864 return 0;
865}
866
867int sock_set_timestamping(struct sock *sk, int optname,
868 struct so_timestamping timestamping)
869{
870 int val = timestamping.flags;
871 int ret;
872
873 if (val & ~SOF_TIMESTAMPING_MASK)
874 return -EINVAL;
875
876 if (val & SOF_TIMESTAMPING_OPT_ID &&
877 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
878 if (sk_is_tcp(sk)) {
879 if ((1 << sk->sk_state) &
880 (TCPF_CLOSE | TCPF_LISTEN))
881 return -EINVAL;
882 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
883 } else {
884 atomic_set(&sk->sk_tskey, 0);
885 }
886 }
887
888 if (val & SOF_TIMESTAMPING_OPT_STATS &&
889 !(val & SOF_TIMESTAMPING_OPT_TSONLY))
890 return -EINVAL;
891
892 if (val & SOF_TIMESTAMPING_BIND_PHC) {
893 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
894 if (ret)
895 return ret;
896 }
897
898 sk->sk_tsflags = val;
899 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
900
901 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
902 sock_enable_timestamp(sk,
903 SOCK_TIMESTAMPING_RX_SOFTWARE);
904 else
905 sock_disable_timestamp(sk,
906 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
907 return 0;
908}
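
/*
 * Example (illustrative, userspace side): requesting software RX/TX
 * timestamps uses the flags-only form handled above; passing a full
 * struct so_timestamping additionally lets the caller bind a PHC vclock
 * through SOF_TIMESTAMPING_BIND_PHC.
 *
 *	int flags = SOF_TIMESTAMPING_RX_SOFTWARE |
 *		    SOF_TIMESTAMPING_TX_SOFTWARE |
 *		    SOF_TIMESTAMPING_SOFTWARE;
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &flags, sizeof(flags));
 */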
909
910void sock_set_keepalive(struct sock *sk)
911{
912 lock_sock(sk);
913 if (sk->sk_prot->keepalive)
914 sk->sk_prot->keepalive(sk, true);
915 sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
916 release_sock(sk);
917}
918EXPORT_SYMBOL(sock_set_keepalive);
919
920static void __sock_set_rcvbuf(struct sock *sk, int val)
921{
922 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
923 * as a negative value.
924 */
925 val = min_t(int, val, INT_MAX / 2);
926 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
927
928 /* We double it on the way in to account for "struct sk_buff" etc.
929 * overhead. Applications assume that the SO_RCVBUF setting they make
930 * will allow that much actual data to be received on that socket.
931 *
932 * Applications are unaware that "struct sk_buff" and other overheads
933 * allocate from the receive buffer during socket buffer allocation.
934 *
935 * And after considering the possible alternatives, returning the value
936 * we actually used in getsockopt is the most desirable behavior.
937 */
938 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
939}
940
941void sock_set_rcvbuf(struct sock *sk, int val)
942{
943 lock_sock(sk);
944 __sock_set_rcvbuf(sk, val);
945 release_sock(sk);
946}
947EXPORT_SYMBOL(sock_set_rcvbuf);
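
/*
 * Worked example: after setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &(int){65536},
 * sizeof(int)), and assuming the value is within sysctl_rmem_max, sk_rcvbuf
 * ends up as 131072, and that doubled value is what a later
 * getsockopt(SO_RCVBUF) reports back to the application.
 */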
948
949static void __sock_set_mark(struct sock *sk, u32 val)
950{
951 if (val != sk->sk_mark) {
952 sk->sk_mark = val;
953 sk_dst_reset(sk);
954 }
955}
956
957void sock_set_mark(struct sock *sk, u32 val)
958{
959 lock_sock(sk);
960 __sock_set_mark(sk, val);
961 release_sock(sk);
962}
963EXPORT_SYMBOL(sock_set_mark);
964
965static void sock_release_reserved_memory(struct sock *sk, int bytes)
966{
967 /* Round down bytes to multiple of pages */
968 bytes &= ~(SK_MEM_QUANTUM - 1);
969
970 WARN_ON(bytes > sk->sk_reserved_mem);
971 sk->sk_reserved_mem -= bytes;
972 sk_mem_reclaim(sk);
973}
974
975static int sock_reserve_memory(struct sock *sk, int bytes)
976{
977 long allocated;
978 bool charged;
979 int pages;
980
981 if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
982 return -EOPNOTSUPP;
983
984 if (!bytes)
985 return 0;
986
987 pages = sk_mem_pages(bytes);
988
989 /* pre-charge to memcg */
990 charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
991 GFP_KERNEL | __GFP_RETRY_MAYFAIL);
992 if (!charged)
993 return -ENOMEM;
994
995 /* pre-charge to forward_alloc */
996 allocated = sk_memory_allocated_add(sk, pages);
997 /* If the system goes into memory pressure with this
998 * precharge, give up and return an error.
999 */
1000 if (allocated > sk_prot_mem_limits(sk, 1)) {
1001 sk_memory_allocated_sub(sk, pages);
1002 mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1003 return -ENOMEM;
1004 }
1005 sk->sk_forward_alloc += pages << SK_MEM_QUANTUM_SHIFT;
1006
1007 sk->sk_reserved_mem += pages << SK_MEM_QUANTUM_SHIFT;
1008
1009 return 0;
1010}
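
/*
 * Example (illustrative, userspace side): pre-reserving roughly 1 MB of
 * socket memory for a memcg-accounted socket; a later call with a smaller
 * value releases the difference through sock_release_reserved_memory().
 *
 *	int bytes = 1 << 20;
 *	setsockopt(fd, SOL_SOCKET, SO_RESERVE_MEM, &bytes, sizeof(bytes));
 */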
1011
1012/*
1013 * This is meant for all protocols to use and covers goings on
1014 * at the socket level. Everything here is generic.
1015 */
1016
1017int sock_setsockopt(struct socket *sock, int level, int optname,
1018 sockptr_t optval, unsigned int optlen)
1019{
1020 struct so_timestamping timestamping;
1021 struct sock_txtime sk_txtime;
1022 struct sock *sk = sock->sk;
1023 int val;
1024 int valbool;
1025 struct linger ling;
1026 int ret = 0;
1027
1028 /*
1029 * Options without arguments
1030 */
1031
1032 if (optname == SO_BINDTODEVICE)
1033 return sock_setbindtodevice(sk, optval, optlen);
1034
1035 if (optlen < sizeof(int))
1036 return -EINVAL;
1037
1038 if (copy_from_sockptr(&val, optval, sizeof(val)))
1039 return -EFAULT;
1040
1041 valbool = val ? 1 : 0;
1042
1043 lock_sock(sk);
1044
1045 switch (optname) {
1046 case SO_DEBUG:
1047 if (val && !capable(CAP_NET_ADMIN))
1048 ret = -EACCES;
1049 else
1050 sock_valbool_flag(sk, SOCK_DBG, valbool);
1051 break;
1052 case SO_REUSEADDR:
1053 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1054 break;
1055 case SO_REUSEPORT:
1056 sk->sk_reuseport = valbool;
1057 break;
1058 case SO_TYPE:
1059 case SO_PROTOCOL:
1060 case SO_DOMAIN:
1061 case SO_ERROR:
1062 ret = -ENOPROTOOPT;
1063 break;
1064 case SO_DONTROUTE:
1065 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1066 sk_dst_reset(sk);
1067 break;
1068 case SO_BROADCAST:
1069 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1070 break;
1071 case SO_SNDBUF:
1072 /* Don't return an error on this; BSD doesn't, and if you
1073 * think about it, this is right. Otherwise apps have to
1074 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1075 * are treated in BSD as hints.
1076 */
1077 val = min_t(u32, val, sysctl_wmem_max);
1078set_sndbuf:
1079 /* Ensure val * 2 fits into an int, to prevent max_t()
1080 * from treating it as a negative value.
1081 */
1082 val = min_t(int, val, INT_MAX / 2);
1083 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1084 WRITE_ONCE(sk->sk_sndbuf,
1085 max_t(int, val * 2, SOCK_MIN_SNDBUF));
1086 /* Wake up sending tasks if we upped the value. */
1087 sk->sk_write_space(sk);
1088 break;
1089
1090 case SO_SNDBUFFORCE:
1091 if (!capable(CAP_NET_ADMIN)) {
1092 ret = -EPERM;
1093 break;
1094 }
1095
1096 /* No negative values (to prevent underflow, as val will be
1097 * multiplied by 2).
1098 */
1099 if (val < 0)
1100 val = 0;
1101 goto set_sndbuf;
1102
1103 case SO_RCVBUF:
1104 /* Don't return an error on this; BSD doesn't, and if you
1105 * think about it, this is right. Otherwise apps have to
1106 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1107 * are treated in BSD as hints.
1108 */
1109 __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
1110 break;
1111
1112 case SO_RCVBUFFORCE:
1113 if (!capable(CAP_NET_ADMIN)) {
1114 ret = -EPERM;
1115 break;
1116 }
1117
1118 /* No negative values (to prevent underflow, as val will be
1119 * multiplied by 2).
1120 */
1121 __sock_set_rcvbuf(sk, max(val, 0));
1122 break;
1123
1124 case SO_KEEPALIVE:
1125 if (sk->sk_prot->keepalive)
1126 sk->sk_prot->keepalive(sk, valbool);
1127 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1128 break;
1129
1130 case SO_OOBINLINE:
1131 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1132 break;
1133
1134 case SO_NO_CHECK:
1135 sk->sk_no_check_tx = valbool;
1136 break;
1137
1138 case SO_PRIORITY:
1139 if ((val >= 0 && val <= 6) ||
1140 ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1141 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1142 sk->sk_priority = val;
1143 else
1144 ret = -EPERM;
1145 break;
1146
1147 case SO_LINGER:
1148 if (optlen < sizeof(ling)) {
1149 ret = -EINVAL; /* 1003.1g */
1150 break;
1151 }
1152 if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1153 ret = -EFAULT;
1154 break;
1155 }
1156 if (!ling.l_onoff)
1157 sock_reset_flag(sk, SOCK_LINGER);
1158 else {
1159#if (BITS_PER_LONG == 32)
1160 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
1161 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1162 else
1163#endif
1164 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
1165 sock_set_flag(sk, SOCK_LINGER);
1166 }
1167 break;
1168
1169 case SO_BSDCOMPAT:
1170 break;
1171
1172 case SO_PASSCRED:
1173 if (valbool)
1174 set_bit(SOCK_PASSCRED, &sock->flags);
1175 else
1176 clear_bit(SOCK_PASSCRED, &sock->flags);
1177 break;
1178
1179 case SO_TIMESTAMP_OLD:
1180 case SO_TIMESTAMP_NEW:
1181 case SO_TIMESTAMPNS_OLD:
1182 case SO_TIMESTAMPNS_NEW:
1183 sock_set_timestamp(sk, optname, valbool);
1184 break;
1185
1186 case SO_TIMESTAMPING_NEW:
1187 case SO_TIMESTAMPING_OLD:
1188 if (optlen == sizeof(timestamping)) {
1189 if (copy_from_sockptr(&timestamping, optval,
1190 sizeof(timestamping))) {
1191 ret = -EFAULT;
1192 break;
1193 }
1194 } else {
1195 memset(&timestamping, 0, sizeof(timestamping));
1196 timestamping.flags = val;
1197 }
1198 ret = sock_set_timestamping(sk, optname, timestamping);
1199 break;
1200
1201 case SO_RCVLOWAT:
1202 if (val < 0)
1203 val = INT_MAX;
1204 if (sock->ops->set_rcvlowat)
1205 ret = sock->ops->set_rcvlowat(sk, val);
1206 else
1207 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1208 break;
1209
1210 case SO_RCVTIMEO_OLD:
1211 case SO_RCVTIMEO_NEW:
1212 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1213 optlen, optname == SO_RCVTIMEO_OLD);
1214 break;
1215
1216 case SO_SNDTIMEO_OLD:
1217 case SO_SNDTIMEO_NEW:
1218 ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1219 optlen, optname == SO_SNDTIMEO_OLD);
1220 break;
1221
1222 case SO_ATTACH_FILTER: {
1223 struct sock_fprog fprog;
1224
1225 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1226 if (!ret)
1227 ret = sk_attach_filter(&fprog, sk);
1228 break;
1229 }
1230 case SO_ATTACH_BPF:
1231 ret = -EINVAL;
1232 if (optlen == sizeof(u32)) {
1233 u32 ufd;
1234
1235 ret = -EFAULT;
1236 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1237 break;
1238
1239 ret = sk_attach_bpf(ufd, sk);
1240 }
1241 break;
1242
1243 case SO_ATTACH_REUSEPORT_CBPF: {
1244 struct sock_fprog fprog;
1245
1246 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1247 if (!ret)
1248 ret = sk_reuseport_attach_filter(&fprog, sk);
1249 break;
1250 }
1251 case SO_ATTACH_REUSEPORT_EBPF:
1252 ret = -EINVAL;
1253 if (optlen == sizeof(u32)) {
1254 u32 ufd;
1255
1256 ret = -EFAULT;
1257 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1258 break;
1259
1260 ret = sk_reuseport_attach_bpf(ufd, sk);
1261 }
1262 break;
1263
1264 case SO_DETACH_REUSEPORT_BPF:
1265 ret = reuseport_detach_prog(sk);
1266 break;
1267
1268 case SO_DETACH_FILTER:
1269 ret = sk_detach_filter(sk);
1270 break;
1271
1272 case SO_LOCK_FILTER:
1273 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1274 ret = -EPERM;
1275 else
1276 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1277 break;
1278
1279 case SO_PASSSEC:
1280 if (valbool)
1281 set_bit(SOCK_PASSSEC, &sock->flags);
1282 else
1283 clear_bit(SOCK_PASSSEC, &sock->flags);
1284 break;
1285 case SO_MARK:
1286 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1287 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1288 ret = -EPERM;
1289 break;
1290 }
1291
1292 __sock_set_mark(sk, val);
1293 break;
1294
1295 case SO_RXQ_OVFL:
1296 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1297 break;
1298
1299 case SO_WIFI_STATUS:
1300 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1301 break;
1302
1303 case SO_PEEK_OFF:
1304 if (sock->ops->set_peek_off)
1305 ret = sock->ops->set_peek_off(sk, val);
1306 else
1307 ret = -EOPNOTSUPP;
1308 break;
1309
1310 case SO_NOFCS:
1311 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1312 break;
1313
1314 case SO_SELECT_ERR_QUEUE:
1315 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1316 break;
1317
1318#ifdef CONFIG_NET_RX_BUSY_POLL
1319 case SO_BUSY_POLL:
1320 /* allow unprivileged users to decrease the value */
1321 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1322 ret = -EPERM;
1323 else {
1324 if (val < 0)
1325 ret = -EINVAL;
1326 else
1327 WRITE_ONCE(sk->sk_ll_usec, val);
1328 }
1329 break;
1330 case SO_PREFER_BUSY_POLL:
1331 if (valbool && !capable(CAP_NET_ADMIN))
1332 ret = -EPERM;
1333 else
1334 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1335 break;
1336 case SO_BUSY_POLL_BUDGET:
1337 if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
1338 ret = -EPERM;
1339 } else {
1340 if (val < 0 || val > U16_MAX)
1341 ret = -EINVAL;
1342 else
1343 WRITE_ONCE(sk->sk_busy_poll_budget, val);
1344 }
1345 break;
1346#endif
1347
1348 case SO_MAX_PACING_RATE:
1349 {
1350 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1351
1352 if (sizeof(ulval) != sizeof(val) &&
1353 optlen >= sizeof(ulval) &&
1354 copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1355 ret = -EFAULT;
1356 break;
1357 }
1358 if (ulval != ~0UL)
1359 cmpxchg(&sk->sk_pacing_status,
1360 SK_PACING_NONE,
1361 SK_PACING_NEEDED);
1362 sk->sk_max_pacing_rate = ulval;
1363 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1364 break;
1365 }
1366 case SO_INCOMING_CPU:
1367 WRITE_ONCE(sk->sk_incoming_cpu, val);
1368 break;
1369
1370 case SO_CNX_ADVICE:
1371 if (val == 1)
1372 dst_negative_advice(sk);
1373 break;
1374
1375 case SO_ZEROCOPY:
1376 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1377 if (!(sk_is_tcp(sk) ||
1378 (sk->sk_type == SOCK_DGRAM &&
1379 sk->sk_protocol == IPPROTO_UDP)))
1380 ret = -ENOTSUPP;
1381 } else if (sk->sk_family != PF_RDS) {
1382 ret = -ENOTSUPP;
1383 }
1384 if (!ret) {
1385 if (val < 0 || val > 1)
1386 ret = -EINVAL;
1387 else
1388 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1389 }
1390 break;
1391
1392 case SO_TXTIME:
1393 if (optlen != sizeof(struct sock_txtime)) {
1394 ret = -EINVAL;
1395 break;
1396 } else if (copy_from_sockptr(&sk_txtime, optval,
1397 sizeof(struct sock_txtime))) {
1398 ret = -EFAULT;
1399 break;
1400 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1401 ret = -EINVAL;
1402 break;
1403 }
1404 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1405 * scheduler has enough safeguards.
1406 */
1407 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1408 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1409 ret = -EPERM;
1410 break;
1411 }
1412 sock_valbool_flag(sk, SOCK_TXTIME, true);
1413 sk->sk_clockid = sk_txtime.clockid;
1414 sk->sk_txtime_deadline_mode =
1415 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1416 sk->sk_txtime_report_errors =
1417 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1418 break;
1419
1420 case SO_BINDTOIFINDEX:
1421 ret = sock_bindtoindex_locked(sk, val);
1422 break;
1423
1424 case SO_BUF_LOCK:
1425 if (val & ~SOCK_BUF_LOCK_MASK) {
1426 ret = -EINVAL;
1427 break;
1428 }
1429 sk->sk_userlocks = val | (sk->sk_userlocks &
1430 ~SOCK_BUF_LOCK_MASK);
1431 break;
1432
1433 case SO_RESERVE_MEM:
1434 {
1435 int delta;
1436
1437 if (val < 0) {
1438 ret = -EINVAL;
1439 break;
1440 }
1441
1442 delta = val - sk->sk_reserved_mem;
1443 if (delta < 0)
1444 sock_release_reserved_memory(sk, -delta);
1445 else
1446 ret = sock_reserve_memory(sk, delta);
1447 break;
1448 }
1449
1450 default:
1451 ret = -ENOPROTOOPT;
1452 break;
1453 }
1454 release_sock(sk);
1455 return ret;
1456}
1457EXPORT_SYMBOL(sock_setsockopt);
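
/*
 * Example (illustrative, userspace side): the SO_LINGER branch above
 * expects a struct linger; enabling a five second linger-on-close looks
 * like this.
 *
 *	struct linger ling = { .l_onoff = 1, .l_linger = 5 };
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &ling, sizeof(ling));
 */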
1458
1459static const struct cred *sk_get_peer_cred(struct sock *sk)
1460{
1461 const struct cred *cred;
1462
1463 spin_lock(&sk->sk_peer_lock);
1464 cred = get_cred(sk->sk_peer_cred);
1465 spin_unlock(&sk->sk_peer_lock);
1466
1467 return cred;
1468}
1469
1470static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1471 struct ucred *ucred)
1472{
1473 ucred->pid = pid_vnr(pid);
1474 ucred->uid = ucred->gid = -1;
1475 if (cred) {
1476 struct user_namespace *current_ns = current_user_ns();
1477
1478 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1479 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1480 }
1481}
1482
1483static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1484{
1485 struct user_namespace *user_ns = current_user_ns();
1486 int i;
1487
1488 for (i = 0; i < src->ngroups; i++)
1489 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1490 return -EFAULT;
1491
1492 return 0;
1493}
1494
1495int sock_getsockopt(struct socket *sock, int level, int optname,
1496 char __user *optval, int __user *optlen)
1497{
1498 struct sock *sk = sock->sk;
1499
1500 union {
1501 int val;
1502 u64 val64;
1503 unsigned long ulval;
1504 struct linger ling;
1505 struct old_timeval32 tm32;
1506 struct __kernel_old_timeval tm;
1507 struct __kernel_sock_timeval stm;
1508 struct sock_txtime txtime;
1509 struct so_timestamping timestamping;
1510 } v;
1511
1512 int lv = sizeof(int);
1513 int len;
1514
1515 if (get_user(len, optlen))
1516 return -EFAULT;
1517 if (len < 0)
1518 return -EINVAL;
1519
1520 memset(&v, 0, sizeof(v));
1521
1522 switch (optname) {
1523 case SO_DEBUG:
1524 v.val = sock_flag(sk, SOCK_DBG);
1525 break;
1526
1527 case SO_DONTROUTE:
1528 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1529 break;
1530
1531 case SO_BROADCAST:
1532 v.val = sock_flag(sk, SOCK_BROADCAST);
1533 break;
1534
1535 case SO_SNDBUF:
1536 v.val = sk->sk_sndbuf;
1537 break;
1538
1539 case SO_RCVBUF:
1540 v.val = sk->sk_rcvbuf;
1541 break;
1542
1543 case SO_REUSEADDR:
1544 v.val = sk->sk_reuse;
1545 break;
1546
1547 case SO_REUSEPORT:
1548 v.val = sk->sk_reuseport;
1549 break;
1550
1551 case SO_KEEPALIVE:
1552 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1553 break;
1554
1555 case SO_TYPE:
1556 v.val = sk->sk_type;
1557 break;
1558
1559 case SO_PROTOCOL:
1560 v.val = sk->sk_protocol;
1561 break;
1562
1563 case SO_DOMAIN:
1564 v.val = sk->sk_family;
1565 break;
1566
1567 case SO_ERROR:
1568 v.val = -sock_error(sk);
1569 if (v.val == 0)
1570 v.val = xchg(&sk->sk_err_soft, 0);
1571 break;
1572
1573 case SO_OOBINLINE:
1574 v.val = sock_flag(sk, SOCK_URGINLINE);
1575 break;
1576
1577 case SO_NO_CHECK:
1578 v.val = sk->sk_no_check_tx;
1579 break;
1580
1581 case SO_PRIORITY:
1582 v.val = sk->sk_priority;
1583 break;
1584
1585 case SO_LINGER:
1586 lv = sizeof(v.ling);
1587 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
1588 v.ling.l_linger = sk->sk_lingertime / HZ;
1589 break;
1590
1591 case SO_BSDCOMPAT:
1592 break;
1593
1594 case SO_TIMESTAMP_OLD:
1595 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1596 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1597 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1598 break;
1599
1600 case SO_TIMESTAMPNS_OLD:
1601 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1602 break;
1603
1604 case SO_TIMESTAMP_NEW:
1605 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1606 break;
1607
1608 case SO_TIMESTAMPNS_NEW:
1609 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1610 break;
1611
1612 case SO_TIMESTAMPING_OLD:
1613 lv = sizeof(v.timestamping);
1614 v.timestamping.flags = sk->sk_tsflags;
1615 v.timestamping.bind_phc = sk->sk_bind_phc;
1616 break;
1617
1618 case SO_RCVTIMEO_OLD:
1619 case SO_RCVTIMEO_NEW:
1620 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1621 break;
1622
1623 case SO_SNDTIMEO_OLD:
1624 case SO_SNDTIMEO_NEW:
1625 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1626 break;
1627
1628 case SO_RCVLOWAT:
1629 v.val = sk->sk_rcvlowat;
1630 break;
1631
1632 case SO_SNDLOWAT:
1633 v.val = 1;
1634 break;
1635
1636 case SO_PASSCRED:
1637 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1638 break;
1639
1640 case SO_PEERCRED:
1641 {
1642 struct ucred peercred;
1643 if (len > sizeof(peercred))
1644 len = sizeof(peercred);
1645
1646 spin_lock(&sk->sk_peer_lock);
1647 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1648 spin_unlock(&sk->sk_peer_lock);
1649
1650 if (copy_to_user(optval, &peercred, len))
1651 return -EFAULT;
1652 goto lenout;
1653 }
1654
1655 case SO_PEERGROUPS:
1656 {
1657 const struct cred *cred;
1658 int ret, n;
1659
1660 cred = sk_get_peer_cred(sk);
1661 if (!cred)
1662 return -ENODATA;
1663
1664 n = cred->group_info->ngroups;
1665 if (len < n * sizeof(gid_t)) {
1666 len = n * sizeof(gid_t);
1667 put_cred(cred);
1668 return put_user(len, optlen) ? -EFAULT : -ERANGE;
1669 }
1670 len = n * sizeof(gid_t);
1671
1672 ret = groups_to_user((gid_t __user *)optval, cred->group_info);
1673 put_cred(cred);
1674 if (ret)
1675 return ret;
1676 goto lenout;
1677 }
1678
1679 case SO_PEERNAME:
1680 {
1681 char address[128];
1682
1683 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1684 if (lv < 0)
1685 return -ENOTCONN;
1686 if (lv < len)
1687 return -EINVAL;
1688 if (copy_to_user(optval, address, len))
1689 return -EFAULT;
1690 goto lenout;
1691 }
1692
1693 /* Dubious BSD thing... Probably nobody even uses it, but
1694 * the UNIX standard wants it for whatever reason... -DaveM
1695 */
1696 case SO_ACCEPTCONN:
1697 v.val = sk->sk_state == TCP_LISTEN;
1698 break;
1699
1700 case SO_PASSSEC:
1701 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1702 break;
1703
1704 case SO_PEERSEC:
1705 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1706
1707 case SO_MARK:
1708 v.val = sk->sk_mark;
1709 break;
1710
1711 case SO_RXQ_OVFL:
1712 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1713 break;
1714
1715 case SO_WIFI_STATUS:
1716 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1717 break;
1718
1719 case SO_PEEK_OFF:
1720 if (!sock->ops->set_peek_off)
1721 return -EOPNOTSUPP;
1722
1723 v.val = sk->sk_peek_off;
1724 break;
1725 case SO_NOFCS:
1726 v.val = sock_flag(sk, SOCK_NOFCS);
1727 break;
1728
1729 case SO_BINDTODEVICE:
1730 return sock_getbindtodevice(sk, optval, optlen, len);
1731
1732 case SO_GET_FILTER:
1733 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1734 if (len < 0)
1735 return len;
1736
1737 goto lenout;
1738
1739 case SO_LOCK_FILTER:
1740 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1741 break;
1742
1743 case SO_BPF_EXTENSIONS:
1744 v.val = bpf_tell_extensions();
1745 break;
1746
1747 case SO_SELECT_ERR_QUEUE:
1748 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1749 break;
1750
1751#ifdef CONFIG_NET_RX_BUSY_POLL
1752 case SO_BUSY_POLL:
1753 v.val = sk->sk_ll_usec;
1754 break;
1755 case SO_PREFER_BUSY_POLL:
1756 v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1757 break;
1758#endif
1759
1760 case SO_MAX_PACING_RATE:
1761 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1762 lv = sizeof(v.ulval);
1763 v.ulval = sk->sk_max_pacing_rate;
1764 } else {
1765 /* 32bit version */
1766 v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1767 }
1768 break;
1769
1770 case SO_INCOMING_CPU:
1771 v.val = READ_ONCE(sk->sk_incoming_cpu);
1772 break;
1773
1774 case SO_MEMINFO:
1775 {
1776 u32 meminfo[SK_MEMINFO_VARS];
1777
1778 sk_get_meminfo(sk, meminfo);
1779
1780 len = min_t(unsigned int, len, sizeof(meminfo));
1781 if (copy_to_user(optval, &meminfo, len))
1782 return -EFAULT;
1783
1784 goto lenout;
1785 }
1786
1787#ifdef CONFIG_NET_RX_BUSY_POLL
1788 case SO_INCOMING_NAPI_ID:
1789 v.val = READ_ONCE(sk->sk_napi_id);
1790
1791 /* aggregate non-NAPI IDs down to 0 */
1792 if (v.val < MIN_NAPI_ID)
1793 v.val = 0;
1794
1795 break;
1796#endif
1797
1798 case SO_COOKIE:
1799 lv = sizeof(u64);
1800 if (len < lv)
1801 return -EINVAL;
1802 v.val64 = sock_gen_cookie(sk);
1803 break;
1804
1805 case SO_ZEROCOPY:
1806 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1807 break;
1808
1809 case SO_TXTIME:
1810 lv = sizeof(v.txtime);
1811 v.txtime.clockid = sk->sk_clockid;
1812 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1813 SOF_TXTIME_DEADLINE_MODE : 0;
1814 v.txtime.flags |= sk->sk_txtime_report_errors ?
1815 SOF_TXTIME_REPORT_ERRORS : 0;
1816 break;
1817
1818 case SO_BINDTOIFINDEX:
1819 v.val = sk->sk_bound_dev_if;
1820 break;
1821
1822 case SO_NETNS_COOKIE:
1823 lv = sizeof(u64);
1824 if (len != lv)
1825 return -EINVAL;
1826 v.val64 = sock_net(sk)->net_cookie;
1827 break;
1828
1829 case SO_BUF_LOCK:
1830 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1831 break;
1832
1833 case SO_RESERVE_MEM:
1834 v.val = sk->sk_reserved_mem;
1835 break;
1836
1837 default:
1838 /* We implement the SO_SNDLOWAT etc to not be settable
1839 * (1003.1g 7).
1840 */
1841 return -ENOPROTOOPT;
1842 }
1843
1844 if (len > lv)
1845 len = lv;
1846 if (copy_to_user(optval, &v, len))
1847 return -EFAULT;
1848lenout:
1849 if (put_user(len, optlen))
1850 return -EFAULT;
1851 return 0;
1852}
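
/*
 * Example (illustrative, userspace side): reading the peer credentials of
 * a connected AF_UNIX socket through the SO_PEERCRED branch above.
 *
 *	struct ucred peer;
 *	socklen_t len = sizeof(peer);
 *	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &peer, &len) == 0)
 *		printf("pid=%d uid=%u gid=%u\n", peer.pid, peer.uid, peer.gid);
 */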
1853
1854/*
1855 * Initialize an sk_lock.
1856 *
1857 * (We also register the sk_lock with the lock validator.)
1858 */
1859static inline void sock_lock_init(struct sock *sk)
1860{
1861 if (sk->sk_kern_sock)
1862 sock_lock_init_class_and_name(
1863 sk,
1864 af_family_kern_slock_key_strings[sk->sk_family],
1865 af_family_kern_slock_keys + sk->sk_family,
1866 af_family_kern_key_strings[sk->sk_family],
1867 af_family_kern_keys + sk->sk_family);
1868 else
1869 sock_lock_init_class_and_name(
1870 sk,
1871 af_family_slock_key_strings[sk->sk_family],
1872 af_family_slock_keys + sk->sk_family,
1873 af_family_key_strings[sk->sk_family],
1874 af_family_keys + sk->sk_family);
1875}
1876
1877/*
1878 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1879 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1880 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1881 */
1882static void sock_copy(struct sock *nsk, const struct sock *osk)
1883{
1884 const struct proto *prot = READ_ONCE(osk->sk_prot);
1885#ifdef CONFIG_SECURITY_NETWORK
1886 void *sptr = nsk->sk_security;
1887#endif
1888
1889 /* If we move sk_tx_queue_mapping out of the private section,
1890 * we must check if sk_tx_queue_clear() is called after
1891 * sock_copy() in sk_clone_lock().
1892 */
1893 BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
1894 offsetof(struct sock, sk_dontcopy_begin) ||
1895 offsetof(struct sock, sk_tx_queue_mapping) >=
1896 offsetof(struct sock, sk_dontcopy_end));
1897
1898 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1899
1900 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1901 prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1902
1903#ifdef CONFIG_SECURITY_NETWORK
1904 nsk->sk_security = sptr;
1905 security_sk_clone(osk, nsk);
1906#endif
1907}
1908
1909static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1910 int family)
1911{
1912 struct sock *sk;
1913 struct kmem_cache *slab;
1914
1915 slab = prot->slab;
1916 if (slab != NULL) {
1917 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1918 if (!sk)
1919 return sk;
1920 if (want_init_on_alloc(priority))
1921 sk_prot_clear_nulls(sk, prot->obj_size);
1922 } else
1923 sk = kmalloc(prot->obj_size, priority);
1924
1925 if (sk != NULL) {
1926 if (security_sk_alloc(sk, family, priority))
1927 goto out_free;
1928
1929 if (!try_module_get(prot->owner))
1930 goto out_free_sec;
1931 }
1932
1933 return sk;
1934
1935out_free_sec:
1936 security_sk_free(sk);
1937out_free:
1938 if (slab != NULL)
1939 kmem_cache_free(slab, sk);
1940 else
1941 kfree(sk);
1942 return NULL;
1943}
1944
1945static void sk_prot_free(struct proto *prot, struct sock *sk)
1946{
1947 struct kmem_cache *slab;
1948 struct module *owner;
1949
1950 owner = prot->owner;
1951 slab = prot->slab;
1952
1953 cgroup_sk_free(&sk->sk_cgrp_data);
1954 mem_cgroup_sk_free(sk);
1955 security_sk_free(sk);
1956 if (slab != NULL)
1957 kmem_cache_free(slab, sk);
1958 else
1959 kfree(sk);
1960 module_put(owner);
1961}
1962
1963/**
1964 * sk_alloc - All socket objects are allocated here
1965 * @net: the applicable net namespace
1966 * @family: protocol family
1967 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1968 * @prot: struct proto associated with this new sock instance
1969 * @kern: is this to be a kernel socket?
1970 */
1971struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1972 struct proto *prot, int kern)
1973{
1974 struct sock *sk;
1975
1976 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1977 if (sk) {
1978 sk->sk_family = family;
1979 /*
1980 * See comment in struct sock definition to understand
1981 * why we need sk_prot_creator -acme
1982 */
1983 sk->sk_prot = sk->sk_prot_creator = prot;
1984 sk->sk_kern_sock = kern;
1985 sock_lock_init(sk);
1986 sk->sk_net_refcnt = kern ? 0 : 1;
1987 if (likely(sk->sk_net_refcnt)) {
1988 get_net_track(net, &sk->ns_tracker, priority);
1989 sock_inuse_add(net, 1);
1990 }
1991
1992 sock_net_set(sk, net);
1993 refcount_set(&sk->sk_wmem_alloc, 1);
1994
1995 mem_cgroup_sk_alloc(sk);
1996 cgroup_sk_alloc(&sk->sk_cgrp_data);
1997 sock_update_classid(&sk->sk_cgrp_data);
1998 sock_update_netprioidx(&sk->sk_cgrp_data);
1999 sk_tx_queue_clear(sk);
2000 }
2001
2002 return sk;
2003}
2004EXPORT_SYMBOL(sk_alloc);
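
/*
 * Example (illustrative sketch, not part of this file): a protocol
 * family's ->create() handler typically allocates its sock with sk_alloc()
 * and then performs the generic initialisation. "PF_EXAMPLE" and
 * "example_proto" are hypothetical; a real family passes its own protocol
 * family constant and its registered struct proto.
 */
static int example_family_create(struct net *net, struct socket *sock,
				 int protocol, int kern)
{
	struct sock *sk;

	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto, kern);
	if (!sk)
		return -ENOMEM;

	sock_init_data(sock, sk);
	return 0;
}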
2005
2006/* Sockets having SOCK_RCU_FREE will call this function after one RCU
2007 * grace period. This is the case for UDP sockets and TCP listeners.
2008 */
2009static void __sk_destruct(struct rcu_head *head)
2010{
2011 struct sock *sk = container_of(head, struct sock, sk_rcu);
2012 struct sk_filter *filter;
2013
2014 if (sk->sk_destruct)
2015 sk->sk_destruct(sk);
2016
2017 filter = rcu_dereference_check(sk->sk_filter,
2018 refcount_read(&sk->sk_wmem_alloc) == 0);
2019 if (filter) {
2020 sk_filter_uncharge(sk, filter);
2021 RCU_INIT_POINTER(sk->sk_filter, NULL);
2022 }
2023
2024 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2025
2026#ifdef CONFIG_BPF_SYSCALL
2027 bpf_sk_storage_free(sk);
2028#endif
2029
2030 if (atomic_read(&sk->sk_omem_alloc))
2031 pr_debug("%s: optmem leakage (%d bytes) detected\n",
2032 __func__, atomic_read(&sk->sk_omem_alloc));
2033
2034 if (sk->sk_frag.page) {
2035 put_page(sk->sk_frag.page);
2036 sk->sk_frag.page = NULL;
2037 }
2038
2039 /* We do not need to acquire sk->sk_peer_lock; we are the last user. */
2040 put_cred(sk->sk_peer_cred);
2041 put_pid(sk->sk_peer_pid);
2042
2043 if (likely(sk->sk_net_refcnt))
2044 put_net_track(sock_net(sk), &sk->ns_tracker);
2045 sk_prot_free(sk->sk_prot_creator, sk);
2046}
2047
2048void sk_destruct(struct sock *sk)
2049{
2050 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2051
2052 WARN_ON_ONCE(!llist_empty(&sk->defer_list));
2053 sk_defer_free_flush(sk);
2054
2055 if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2056 reuseport_detach_sock(sk);
2057 use_call_rcu = true;
2058 }
2059
2060 if (use_call_rcu)
2061 call_rcu(&sk->sk_rcu, __sk_destruct);
2062 else
2063 __sk_destruct(&sk->sk_rcu);
2064}
2065
2066static void __sk_free(struct sock *sk)
2067{
2068 if (likely(sk->sk_net_refcnt))
2069 sock_inuse_add(sock_net(sk), -1);
2070
2071 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2072 sock_diag_broadcast_destroy(sk);
2073 else
2074 sk_destruct(sk);
2075}
2076
2077void sk_free(struct sock *sk)
2078{
2079 /*
2080 * We subtract one from sk_wmem_alloc; if the result is not zero,
2081 * some packets are still in some tx queue and sock_wfree()
2082 * will call __sk_free(sk) later.
2083 */
2084 if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2085 __sk_free(sk);
2086}
2087EXPORT_SYMBOL(sk_free);
2088
2089static void sk_init_common(struct sock *sk)
2090{
2091 skb_queue_head_init(&sk->sk_receive_queue);
2092 skb_queue_head_init(&sk->sk_write_queue);
2093 skb_queue_head_init(&sk->sk_error_queue);
2094
2095 rwlock_init(&sk->sk_callback_lock);
2096 lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2097 af_rlock_keys + sk->sk_family,
2098 af_family_rlock_key_strings[sk->sk_family]);
2099 lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2100 af_wlock_keys + sk->sk_family,
2101 af_family_wlock_key_strings[sk->sk_family]);
2102 lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2103 af_elock_keys + sk->sk_family,
2104 af_family_elock_key_strings[sk->sk_family]);
2105 lockdep_set_class_and_name(&sk->sk_callback_lock,
2106 af_callback_keys + sk->sk_family,
2107 af_family_clock_key_strings[sk->sk_family]);
2108}
2109
2110/**
2111 * sk_clone_lock - clone a socket, and lock its clone
2112 * @sk: the socket to clone
2113 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2114 *
2115 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2116 */
2117struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2118{
2119 struct proto *prot = READ_ONCE(sk->sk_prot);
2120 struct sk_filter *filter;
2121 bool is_charged = true;
2122 struct sock *newsk;
2123
2124 newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2125 if (!newsk)
2126 goto out;
2127
2128 sock_copy(newsk, sk);
2129
2130 newsk->sk_prot_creator = prot;
2131
2132 /* SANITY */
2133 if (likely(newsk->sk_net_refcnt)) {
2134 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2135 sock_inuse_add(sock_net(newsk), 1);
2136 }
2137 sk_node_init(&newsk->sk_node);
2138 sock_lock_init(newsk);
2139 bh_lock_sock(newsk);
2140 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
2141 newsk->sk_backlog.len = 0;
2142
2143 atomic_set(&newsk->sk_rmem_alloc, 0);
2144
2145 /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2146 refcount_set(&newsk->sk_wmem_alloc, 1);
2147
2148 atomic_set(&newsk->sk_omem_alloc, 0);
2149 sk_init_common(newsk);
2150
2151 newsk->sk_dst_cache = NULL;
2152 newsk->sk_dst_pending_confirm = 0;
2153 newsk->sk_wmem_queued = 0;
2154 newsk->sk_forward_alloc = 0;
2155 newsk->sk_reserved_mem = 0;
2156 atomic_set(&newsk->sk_drops, 0);
2157 newsk->sk_send_head = NULL;
2158 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2159 atomic_set(&newsk->sk_zckey, 0);
2160
2161 sock_reset_flag(newsk, SOCK_DONE);
2162
2163 /* sk->sk_memcg will be populated at accept() time */
2164 newsk->sk_memcg = NULL;
2165
2166 cgroup_sk_clone(&newsk->sk_cgrp_data);
2167
2168 rcu_read_lock();
2169 filter = rcu_dereference(sk->sk_filter);
2170 if (filter != NULL)
2171 /* though it's an empty new sock, the charging may fail
2172 * if sysctl_optmem_max was changed between the creation
2173 * of the original socket and the cloning
2174 */
2175 is_charged = sk_filter_charge(newsk, filter);
2176 RCU_INIT_POINTER(newsk->sk_filter, filter);
2177 rcu_read_unlock();
2178
2179 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2180 /* We need to make sure that we don't uncharge the new
2181 * socket if we couldn't charge it in the first place
2182 * as otherwise we uncharge the parent's filter.
2183 */
2184 if (!is_charged)
2185 RCU_INIT_POINTER(newsk->sk_filter, NULL);
2186 sk_free_unlock_clone(newsk);
2187 newsk = NULL;
2188 goto out;
2189 }
2190 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2191
2192 if (bpf_sk_storage_clone(sk, newsk)) {
2193 sk_free_unlock_clone(newsk);
2194 newsk = NULL;
2195 goto out;
2196 }
2197
2198 /* Clear sk_user_data if parent had the pointer tagged
2199 * as not suitable for copying when cloning.
2200 */
2201 if (sk_user_data_is_nocopy(newsk))
2202 newsk->sk_user_data = NULL;
2203
2204 newsk->sk_err = 0;
2205 newsk->sk_err_soft = 0;
2206 newsk->sk_priority = 0;
2207 newsk->sk_incoming_cpu = raw_smp_processor_id();
2208
2209 /* Before updating sk_refcnt, we must commit prior changes to memory
2210 * (Documentation/RCU/rculist_nulls.rst for details)
2211 */
2212 smp_wmb();
2213 refcount_set(&newsk->sk_refcnt, 2);
2214
2215 /* Increment the counter in the same struct proto as the master
2216 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, which
2217 * is the same as sk->sk_prot->socks, as this field was copied
2218 * with memcpy).
2219 *
2220 * This _changes_ the previous behaviour, where
2221 * tcp_create_openreq_child was always incrementing the
2222 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
2223 * to be taken into account in all callers. -acme
2224 */
2225 sk_refcnt_debug_inc(newsk);
2226 sk_set_socket(newsk, NULL);
2227 sk_tx_queue_clear(newsk);
2228 RCU_INIT_POINTER(newsk->sk_wq, NULL);
2229
2230 if (newsk->sk_prot->sockets_allocated)
2231 sk_sockets_allocated_inc(newsk);
2232
2233 if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2234 net_enable_timestamp();
2235out:
2236 return newsk;
2237}
2238EXPORT_SYMBOL_GPL(sk_clone_lock);
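
/*
 * A minimal usage sketch for sk_clone_lock(); the protocol-specific
 * setup step below is a placeholder, not code from this file:
 *
 *	struct sock *child = sk_clone_lock(parent, GFP_ATOMIC);
 *
 *	if (!child)
 *		return NULL;
 *	(do protocol-specific setup on the locked clone; on failure use
 *	 sk_free_unlock_clone(child) instead of a plain sk_free())
 *	bh_unlock_sock(child);
 *	return child;
 *
 * The clone is returned with its bh lock held and sk_refcnt set to 2,
 * so the caller must unlock it on every path.
 */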
2239
2240void sk_free_unlock_clone(struct sock *sk)
2241{
2242 /* It is still a raw copy of the parent, so invalidate
2243 * the destructor and do a plain sk_free() */
2244 sk->sk_destruct = NULL;
2245 bh_unlock_sock(sk);
2246 sk_free(sk);
2247}
2248EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2249
2250void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2251{
2252 u32 max_segs = 1;
2253
2254 sk_dst_set(sk, dst);
2255 sk->sk_route_caps = dst->dev->features;
2256 if (sk_is_tcp(sk))
2257 sk->sk_route_caps |= NETIF_F_GSO;
2258 if (sk->sk_route_caps & NETIF_F_GSO)
2259 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2260 if (unlikely(sk->sk_gso_disabled))
2261 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2262 if (sk_can_gso(sk)) {
2263 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2264 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2265 } else {
2266 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2267 /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
2268 sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
2269 /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2270 max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2271 }
2272 }
2273 sk->sk_gso_max_segs = max_segs;
2274}
2275EXPORT_SYMBOL_GPL(sk_setup_caps);
2276
2277/*
2278 * Simple resource managers for sockets.
2279 */
2280
2281
2282/*
2283 * Write buffer destructor automatically called from kfree_skb.
2284 */
2285void sock_wfree(struct sk_buff *skb)
2286{
2287 struct sock *sk = skb->sk;
2288 unsigned int len = skb->truesize;
2289
2290 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2291 /*
2292 * Keep a reference on sk_wmem_alloc; it will be released
2293 * after the sk_write_space() call
2294 */
2295 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2296 sk->sk_write_space(sk);
2297 len = 1;
2298 }
2299 /*
2300 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2301 * could not do because of in-flight packets
2302 */
2303 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2304 __sk_free(sk);
2305}
2306EXPORT_SYMBOL(sock_wfree);
2307
2308/* This variant of sock_wfree() is used by TCP,
2309 * since it sets SOCK_USE_WRITE_QUEUE.
2310 */
2311void __sock_wfree(struct sk_buff *skb)
2312{
2313 struct sock *sk = skb->sk;
2314
2315 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2316 __sk_free(sk);
2317}
2318
2319void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2320{
2321 skb_orphan(skb);
2322 skb->sk = sk;
2323#ifdef CONFIG_INET
2324 if (unlikely(!sk_fullsock(sk))) {
2325 skb->destructor = sock_edemux;
2326 sock_hold(sk);
2327 return;
2328 }
2329#endif
2330 skb->destructor = sock_wfree;
2331 skb_set_hash_from_sk(skb, sk);
2332 /*
2333 * We used to take a refcount on sk, but the following operation
2334 * is enough to guarantee sk_free() won't free this sock until
2335 * all in-flight packets are completed
2336 */
2337 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2338}
2339EXPORT_SYMBOL(skb_set_owner_w);
2340
2341static bool can_skb_orphan_partial(const struct sk_buff *skb)
2342{
2343#ifdef CONFIG_TLS_DEVICE
2344 /* Drivers depend on in-order delivery for crypto offload;
2345 * a partial orphan breaks the out-of-order-OK logic.
2346 */
2347 if (skb->decrypted)
2348 return false;
2349#endif
2350 return (skb->destructor == sock_wfree ||
2351 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2352}
2353
2354/* This helper is used by netem, as it can hold packets in its
2355 * delay queue. We want to allow the owner socket to send more
2356 * packets, as if they were already TX completed by a typical driver.
2357 * But we also want to keep skb->sk set because some packet schedulers
2358 * rely on it (sch_fq for example).
2359 */
2360void skb_orphan_partial(struct sk_buff *skb)
2361{
2362 if (skb_is_tcp_pure_ack(skb))
2363 return;
2364
2365 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2366 return;
2367
2368 skb_orphan(skb);
2369}
2370EXPORT_SYMBOL(skb_orphan_partial);
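
/*
 * Hedged sketch of the intended call site (a qdisc that delays
 * packets; the enqueue step is a placeholder, not a real qdisc):
 *
 *	skb_orphan_partial(skb);
 *	(enqueue skb in the delay queue; skb->sk stays set, so sch_fq
 *	 style schedulers keep working, but the sending socket is no
 *	 longer blocked on this skb's wmem charge)
 */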
2371
2372/*
2373 * Read buffer destructor automatically called from kfree_skb.
2374 */
2375void sock_rfree(struct sk_buff *skb)
2376{
2377 struct sock *sk = skb->sk;
2378 unsigned int len = skb->truesize;
2379
2380 atomic_sub(len, &sk->sk_rmem_alloc);
2381 sk_mem_uncharge(sk, len);
2382}
2383EXPORT_SYMBOL(sock_rfree);
2384
2385/*
2386 * Buffer destructor for skbs that are not used directly in read or write
2387 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2388 */
2389void sock_efree(struct sk_buff *skb)
2390{
2391 sock_put(skb->sk);
2392}
2393EXPORT_SYMBOL(sock_efree);
2394
2395/* Buffer destructor for prefetch/receive path where reference count may
2396 * not be held, e.g. for listen sockets.
2397 */
2398#ifdef CONFIG_INET
2399void sock_pfree(struct sk_buff *skb)
2400{
2401 if (sk_is_refcounted(skb->sk))
2402 sock_gen_put(skb->sk);
2403}
2404EXPORT_SYMBOL(sock_pfree);
2405#endif /* CONFIG_INET */
2406
2407kuid_t sock_i_uid(struct sock *sk)
2408{
2409 kuid_t uid;
2410
2411 read_lock_bh(&sk->sk_callback_lock);
2412 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2413 read_unlock_bh(&sk->sk_callback_lock);
2414 return uid;
2415}
2416EXPORT_SYMBOL(sock_i_uid);
2417
2418unsigned long sock_i_ino(struct sock *sk)
2419{
2420 unsigned long ino;
2421
2422 read_lock_bh(&sk->sk_callback_lock);
2423 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2424 read_unlock_bh(&sk->sk_callback_lock);
2425 return ino;
2426}
2427EXPORT_SYMBOL(sock_i_ino);
2428
2429/*
2430 * Allocate a skb from the socket's send buffer.
2431 */
2432struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2433 gfp_t priority)
2434{
2435 if (force ||
2436 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2437 struct sk_buff *skb = alloc_skb(size, priority);
2438
2439 if (skb) {
2440 skb_set_owner_w(skb, sk);
2441 return skb;
2442 }
2443 }
2444 return NULL;
2445}
2446EXPORT_SYMBOL(sock_wmalloc);
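
/*
 * A small usage sketch (the length and headroom values are made up):
 *
 *	struct sk_buff *skb;
 *
 *	skb = sock_wmalloc(sk, len + 128, 0, GFP_KERNEL);
 *	if (!skb)
 *		return -ENOBUFS;
 *	(build the packet; skb is charged to sk->sk_wmem_alloc and its
 *	 destructor is sock_wfree, so kfree_skb() or TX completion will
 *	 return the charge and possibly wake the writer)
 */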
2447
2448static void sock_ofree(struct sk_buff *skb)
2449{
2450 struct sock *sk = skb->sk;
2451
2452 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2453}
2454
2455struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2456 gfp_t priority)
2457{
2458 struct sk_buff *skb;
2459
2460 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2461 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2462 sysctl_optmem_max)
2463 return NULL;
2464
2465 skb = alloc_skb(size, priority);
2466 if (!skb)
2467 return NULL;
2468
2469 atomic_add(skb->truesize, &sk->sk_omem_alloc);
2470 skb->sk = sk;
2471 skb->destructor = sock_ofree;
2472 return skb;
2473}
2474
2475/*
2476 * Allocate a memory block from the socket's option memory buffer.
2477 */
2478void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2479{
2480 if ((unsigned int)size <= sysctl_optmem_max &&
2481 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2482 void *mem;
2483 /* First do the add, to avoid a race in case
2484 * kmalloc sleeps.
2485 */
2486 atomic_add(size, &sk->sk_omem_alloc);
2487 mem = kmalloc(size, priority);
2488 if (mem)
2489 return mem;
2490 atomic_sub(size, &sk->sk_omem_alloc);
2491 }
2492 return NULL;
2493}
2494EXPORT_SYMBOL(sock_kmalloc);
2495
2496/* Free an option memory block. Note, we actually want the inline
2497 * here as this allows gcc to detect the nullify and fold away the
2498 * condition entirely.
2499 */
2500static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2501 const bool nullify)
2502{
2503 if (WARN_ON_ONCE(!mem))
2504 return;
2505 if (nullify)
2506 kfree_sensitive(mem);
2507 else
2508 kfree(mem);
2509 atomic_sub(size, &sk->sk_omem_alloc);
2510}
2511
2512void sock_kfree_s(struct sock *sk, void *mem, int size)
2513{
2514 __sock_kfree_s(sk, mem, size, false);
2515}
2516EXPORT_SYMBOL(sock_kfree_s);
2517
2518void sock_kzfree_s(struct sock *sk, void *mem, int size)
2519{
2520 __sock_kfree_s(sk, mem, size, true);
2521}
2522EXPORT_SYMBOL(sock_kzfree_s);
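
/*
 * Typical pairing, sketched for a hypothetical option holding key
 * material (the "key"/"keylen" names are illustrative):
 *
 *	u8 *key = sock_kmalloc(sk, keylen, GFP_KERNEL);
 *
 *	if (!key)
 *		return -ENOMEM;
 *	(copy the option in, use it, then release it with the same size)
 *	sock_kzfree_s(sk, key, keylen);
 *
 * sock_kzfree_s() zeroes the buffer before freeing, so it suits
 * sensitive data; sock_kfree_s() is the plain variant.
 */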
2523
2524/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2525 I think these locks should be removed for datagram sockets.
2526 */
2527static long sock_wait_for_wmem(struct sock *sk, long timeo)
2528{
2529 DEFINE_WAIT(wait);
2530
2531 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2532 for (;;) {
2533 if (!timeo)
2534 break;
2535 if (signal_pending(current))
2536 break;
2537 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2538 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2539 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2540 break;
2541 if (sk->sk_shutdown & SEND_SHUTDOWN)
2542 break;
2543 if (sk->sk_err)
2544 break;
2545 timeo = schedule_timeout(timeo);
2546 }
2547 finish_wait(sk_sleep(sk), &wait);
2548 return timeo;
2549}
2550
2551
2552/*
2553 * Generic send/receive buffer handlers
2554 */
2555
2556struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2557 unsigned long data_len, int noblock,
2558 int *errcode, int max_page_order)
2559{
2560 struct sk_buff *skb;
2561 long timeo;
2562 int err;
2563
2564 timeo = sock_sndtimeo(sk, noblock);
2565 for (;;) {
2566 err = sock_error(sk);
2567 if (err != 0)
2568 goto failure;
2569
2570 err = -EPIPE;
2571 if (sk->sk_shutdown & SEND_SHUTDOWN)
2572 goto failure;
2573
2574 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2575 break;
2576
2577 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2578 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2579 err = -EAGAIN;
2580 if (!timeo)
2581 goto failure;
2582 if (signal_pending(current))
2583 goto interrupted;
2584 timeo = sock_wait_for_wmem(sk, timeo);
2585 }
2586 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2587 errcode, sk->sk_allocation);
2588 if (skb)
2589 skb_set_owner_w(skb, sk);
2590 return skb;
2591
2592interrupted:
2593 err = sock_intr_errno(timeo);
2594failure:
2595 *errcode = err;
2596 return NULL;
2597}
2598EXPORT_SYMBOL(sock_alloc_send_pskb);
2599
2600struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2601 int noblock, int *errcode)
2602{
2603 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2604}
2605EXPORT_SYMBOL(sock_alloc_send_skb);
2606
2607int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2608 struct sockcm_cookie *sockc)
2609{
2610 u32 tsflags;
2611
2612 switch (cmsg->cmsg_type) {
2613 case SO_MARK:
2614 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2615 return -EPERM;
2616 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2617 return -EINVAL;
2618 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2619 break;
2620 case SO_TIMESTAMPING_OLD:
2621 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2622 return -EINVAL;
2623
2624 tsflags = *(u32 *)CMSG_DATA(cmsg);
2625 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2626 return -EINVAL;
2627
2628 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2629 sockc->tsflags |= tsflags;
2630 break;
2631 case SCM_TXTIME:
2632 if (!sock_flag(sk, SOCK_TXTIME))
2633 return -EINVAL;
2634 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2635 return -EINVAL;
2636 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2637 break;
2638 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2639 case SCM_RIGHTS:
2640 case SCM_CREDENTIALS:
2641 break;
2642 default:
2643 return -EINVAL;
2644 }
2645 return 0;
2646}
2647EXPORT_SYMBOL(__sock_cmsg_send);
2648
2649int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2650 struct sockcm_cookie *sockc)
2651{
2652 struct cmsghdr *cmsg;
2653 int ret;
2654
2655 for_each_cmsghdr(cmsg, msg) {
2656 if (!CMSG_OK(msg, cmsg))
2657 return -EINVAL;
2658 if (cmsg->cmsg_level != SOL_SOCKET)
2659 continue;
2660 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2661 if (ret)
2662 return ret;
2663 }
2664 return 0;
2665}
2666EXPORT_SYMBOL(sock_cmsg_send);
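
/*
 * Sketch of how a sendmsg() implementation typically consumes these
 * cookies (assuming the sockcm_init() helper from include/net/sock.h;
 * the surrounding sendmsg body is omitted):
 *
 *	struct sockcm_cookie sockc;
 *	int err;
 *
 *	sockcm_init(&sockc, sk);
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (err)
 *			return err;
 *	}
 *	(sockc.mark, sockc.tsflags and sockc.transmit_time are then used
 *	 while building the outgoing packet)
 */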
2667
2668static void sk_enter_memory_pressure(struct sock *sk)
2669{
2670 if (!sk->sk_prot->enter_memory_pressure)
2671 return;
2672
2673 sk->sk_prot->enter_memory_pressure(sk);
2674}
2675
2676static void sk_leave_memory_pressure(struct sock *sk)
2677{
2678 if (sk->sk_prot->leave_memory_pressure) {
2679 sk->sk_prot->leave_memory_pressure(sk);
2680 } else {
2681 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2682
2683 if (memory_pressure && READ_ONCE(*memory_pressure))
2684 WRITE_ONCE(*memory_pressure, 0);
2685 }
2686}
2687
2688DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2689
2690/**
2691 * skb_page_frag_refill - check that a page_frag contains enough room
2692 * @sz: minimum size of the fragment we want to get
2693 * @pfrag: pointer to page_frag
2694 * @gfp: priority for memory allocation
2695 *
2696 * Note: While this allocator tries to use high order pages, there is
2697 * no guarantee that allocations succeed. Therefore, @sz MUST be
2698 * less than or equal to PAGE_SIZE.
2699 */
2700bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2701{
2702 if (pfrag->page) {
2703 if (page_ref_count(pfrag->page) == 1) {
2704 pfrag->offset = 0;
2705 return true;
2706 }
2707 if (pfrag->offset + sz <= pfrag->size)
2708 return true;
2709 put_page(pfrag->page);
2710 }
2711
2712 pfrag->offset = 0;
2713 if (SKB_FRAG_PAGE_ORDER &&
2714 !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2715 /* Avoid direct reclaim but allow kswapd to wake */
2716 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2717 __GFP_COMP | __GFP_NOWARN |
2718 __GFP_NORETRY,
2719 SKB_FRAG_PAGE_ORDER);
2720 if (likely(pfrag->page)) {
2721 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2722 return true;
2723 }
2724 }
2725 pfrag->page = alloc_page(gfp);
2726 if (likely(pfrag->page)) {
2727 pfrag->size = PAGE_SIZE;
2728 return true;
2729 }
2730 return false;
2731}
2732EXPORT_SYMBOL(skb_page_frag_refill);
2733
2734bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2735{
2736 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2737 return true;
2738
2739 sk_enter_memory_pressure(sk);
2740 sk_stream_moderate_sndbuf(sk);
2741 return false;
2742}
2743EXPORT_SYMBOL(sk_page_frag_refill);
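
/*
 * Rough shape of a sendmsg fast path built on these helpers (a sketch;
 * the copy step is elided and sk_page_frag() comes from
 * include/net/sock.h):
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *	int copy;
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		return -ENOBUFS;	(or wait for memory instead)
 *	copy = min_t(int, len, pfrag->size - pfrag->offset);
 *	(copy "copy" bytes into pfrag->page at pfrag->offset, attach the
 *	 page to the skb, then advance pfrag->offset += copy)
 */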
2744
2745void __lock_sock(struct sock *sk)
2746 __releases(&sk->sk_lock.slock)
2747 __acquires(&sk->sk_lock.slock)
2748{
2749 DEFINE_WAIT(wait);
2750
2751 for (;;) {
2752 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2753 TASK_UNINTERRUPTIBLE);
2754 spin_unlock_bh(&sk->sk_lock.slock);
2755 schedule();
2756 spin_lock_bh(&sk->sk_lock.slock);
2757 if (!sock_owned_by_user(sk))
2758 break;
2759 }
2760 finish_wait(&sk->sk_lock.wq, &wait);
2761}
2762
2763void __release_sock(struct sock *sk)
2764 __releases(&sk->sk_lock.slock)
2765 __acquires(&sk->sk_lock.slock)
2766{
2767 struct sk_buff *skb, *next;
2768
2769 while ((skb = sk->sk_backlog.head) != NULL) {
2770 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2771
2772 spin_unlock_bh(&sk->sk_lock.slock);
2773
2774 do {
2775 next = skb->next;
2776 prefetch(next);
2777 WARN_ON_ONCE(skb_dst_is_noref(skb));
2778 skb_mark_not_on_list(skb);
2779 sk_backlog_rcv(sk, skb);
2780
2781 cond_resched();
2782
2783 skb = next;
2784 } while (skb != NULL);
2785
2786 spin_lock_bh(&sk->sk_lock.slock);
2787 }
2788
2789 /*
2790 * Doing the zeroing here guarantees we cannot loop forever
2791 * while a wild producer attempts to flood us.
2792 */
2793 sk->sk_backlog.len = 0;
2794}
2795
2796void __sk_flush_backlog(struct sock *sk)
2797{
2798 spin_lock_bh(&sk->sk_lock.slock);
2799 __release_sock(sk);
2800 spin_unlock_bh(&sk->sk_lock.slock);
2801}
2802
2803/**
2804 * sk_wait_data - wait for data to arrive at sk_receive_queue
2805 * @sk: sock to wait on
2806 * @timeo: for how long
2807 * @skb: last skb seen on sk_receive_queue
2808 *
2809 * Socket state, including sk->sk_err, is changed only under the socket
2810 * lock, hence we may omit checks after joining the wait queue.
2811 * We check the receive queue before schedule() only as an optimization;
2812 * it is very likely that release_sock() added new data.
2813 */
2814int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2815{
2816 DEFINE_WAIT_FUNC(wait, woken_wake_function);
2817 int rc;
2818
2819 add_wait_queue(sk_sleep(sk), &wait);
2820 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2821 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2822 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2823 remove_wait_queue(sk_sleep(sk), &wait);
2824 return rc;
2825}
2826EXPORT_SYMBOL(sk_wait_data);
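
/*
 * Canonical receive-side wait loop, sketched with error handling
 * trimmed ("flags" is the recvmsg flags argument; the socket lock is
 * held, and sk_wait_data() drops and retakes it while sleeping):
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	struct sk_buff *skb;
 *
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 */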
2827
2828/**
2829 * __sk_mem_raise_allocated - increase memory_allocated
2830 * @sk: socket
2831 * @size: memory size to allocate
2832 * @amt: pages to allocate
2833 * @kind: allocation type
2834 *
2835 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2836 */
2837int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2838{
2839 struct proto *prot = sk->sk_prot;
2840 long allocated = sk_memory_allocated_add(sk, amt);
2841 bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
2842 bool charged = true;
2843
2844 if (memcg_charge &&
2845 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2846 gfp_memcg_charge())))
2847 goto suppress_allocation;
2848
2849 /* Under limit. */
2850 if (allocated <= sk_prot_mem_limits(sk, 0)) {
2851 sk_leave_memory_pressure(sk);
2852 return 1;
2853 }
2854
2855 /* Under pressure. */
2856 if (allocated > sk_prot_mem_limits(sk, 1))
2857 sk_enter_memory_pressure(sk);
2858
2859 /* Over hard limit. */
2860 if (allocated > sk_prot_mem_limits(sk, 2))
2861 goto suppress_allocation;
2862
2863 /* guarantee minimum buffer size under pressure */
2864 if (kind == SK_MEM_RECV) {
2865 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2866 return 1;
2867
2868 } else { /* SK_MEM_SEND */
2869 int wmem0 = sk_get_wmem0(sk, prot);
2870
2871 if (sk->sk_type == SOCK_STREAM) {
2872 if (sk->sk_wmem_queued < wmem0)
2873 return 1;
2874 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2875 return 1;
2876 }
2877 }
2878
2879 if (sk_has_memory_pressure(sk)) {
2880 u64 alloc;
2881
2882 if (!sk_under_memory_pressure(sk))
2883 return 1;
2884 alloc = sk_sockets_allocated_read_positive(sk);
2885 if (sk_prot_mem_limits(sk, 2) > alloc *
2886 sk_mem_pages(sk->sk_wmem_queued +
2887 atomic_read(&sk->sk_rmem_alloc) +
2888 sk->sk_forward_alloc))
2889 return 1;
2890 }
2891
2892suppress_allocation:
2893
2894 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2895 sk_stream_moderate_sndbuf(sk);
2896
2897 /* Fail only if the socket is _under_ its sndbuf.
2898 * In this case we cannot block, so we have to fail.
2899 */
2900 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
2901 /* Force charge with __GFP_NOFAIL */
2902 if (memcg_charge && !charged) {
2903 mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2904 gfp_memcg_charge() | __GFP_NOFAIL);
2905 }
2906 return 1;
2907 }
2908 }
2909
2910 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2911 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2912
2913 sk_memory_allocated_sub(sk, amt);
2914
2915 if (memcg_charge && charged)
2916 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2917
2918 return 0;
2919}
2920EXPORT_SYMBOL(__sk_mem_raise_allocated);
2921
2922/**
2923 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2924 * @sk: socket
2925 * @size: memory size to allocate
2926 * @kind: allocation type
2927 *
2928 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2929 * rmem allocation. This function assumes that protocols which have
2930 * memory_pressure use sk_wmem_queued as write buffer accounting.
2931 */
2932int __sk_mem_schedule(struct sock *sk, int size, int kind)
2933{
2934 int ret, amt = sk_mem_pages(size);
2935
2936 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2937 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2938 if (!ret)
2939 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2940 return ret;
2941}
2942EXPORT_SYMBOL(__sk_mem_schedule);
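
/*
 * How a protocol typically drives this accounting, in sketch form
 * (most callers use the sk_rmem_schedule()/sk_mem_charge() wrappers
 * from include/net/sock.h rather than calling this directly):
 *
 *	if (!__sk_mem_schedule(sk, skb->truesize, SK_MEM_RECV))
 *		return -ENOBUFS;	(global/memcg limits refused it)
 *	sk_mem_charge(sk, skb->truesize);
 *	(later, when freeing, sk_mem_uncharge() gives the bytes back to
 *	 sk_forward_alloc, and sk_mem_reclaim() eventually returns whole
 *	 quanta via __sk_mem_reduce_allocated())
 */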
2943
2944/**
2945 * __sk_mem_reduce_allocated - reclaim memory_allocated
2946 * @sk: socket
2947 * @amount: number of quanta
2948 *
2949 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2950 */
2951void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2952{
2953 sk_memory_allocated_sub(sk, amount);
2954
2955 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2956 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2957
2958 if (sk_under_memory_pressure(sk) &&
2959 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2960 sk_leave_memory_pressure(sk);
2961}
2962EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2963
2964/**
2965 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2966 * @sk: socket
2967 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2968 */
2969void __sk_mem_reclaim(struct sock *sk, int amount)
2970{
2971 amount >>= SK_MEM_QUANTUM_SHIFT;
2972 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2973 __sk_mem_reduce_allocated(sk, amount);
2974}
2975EXPORT_SYMBOL(__sk_mem_reclaim);
2976
2977int sk_set_peek_off(struct sock *sk, int val)
2978{
2979 sk->sk_peek_off = val;
2980 return 0;
2981}
2982EXPORT_SYMBOL_GPL(sk_set_peek_off);
2983
2984/*
2985 * Set of default routines for initialising struct proto_ops when
2986 * the protocol does not support a particular function. In certain
2987 * cases where it makes no sense for a protocol to have a "do nothing"
2988 * function, some default processing is provided.
2989 */
2990
2991int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2992{
2993 return -EOPNOTSUPP;
2994}
2995EXPORT_SYMBOL(sock_no_bind);
2996
2997int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2998 int len, int flags)
2999{
3000 return -EOPNOTSUPP;
3001}
3002EXPORT_SYMBOL(sock_no_connect);
3003
3004int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3005{
3006 return -EOPNOTSUPP;
3007}
3008EXPORT_SYMBOL(sock_no_socketpair);
3009
3010int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
3011 bool kern)
3012{
3013 return -EOPNOTSUPP;
3014}
3015EXPORT_SYMBOL(sock_no_accept);
3016
3017int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3018 int peer)
3019{
3020 return -EOPNOTSUPP;
3021}
3022EXPORT_SYMBOL(sock_no_getname);
3023
3024int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3025{
3026 return -EOPNOTSUPP;
3027}
3028EXPORT_SYMBOL(sock_no_ioctl);
3029
3030int sock_no_listen(struct socket *sock, int backlog)
3031{
3032 return -EOPNOTSUPP;
3033}
3034EXPORT_SYMBOL(sock_no_listen);
3035
3036int sock_no_shutdown(struct socket *sock, int how)
3037{
3038 return -EOPNOTSUPP;
3039}
3040EXPORT_SYMBOL(sock_no_shutdown);
3041
3042int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3043{
3044 return -EOPNOTSUPP;
3045}
3046EXPORT_SYMBOL(sock_no_sendmsg);
3047
3048int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3049{
3050 return -EOPNOTSUPP;
3051}
3052EXPORT_SYMBOL(sock_no_sendmsg_locked);
3053
3054int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3055 int flags)
3056{
3057 return -EOPNOTSUPP;
3058}
3059EXPORT_SYMBOL(sock_no_recvmsg);
3060
3061int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3062{
3063 /* Mirror missing mmap method error code */
3064 return -ENODEV;
3065}
3066EXPORT_SYMBOL(sock_no_mmap);
3067
3068/*
3069 * When a file is received (via SCM_RIGHTS, etc), we must bump the
3070 * various sock-based usage counts.
3071 */
3072void __receive_sock(struct file *file)
3073{
3074 struct socket *sock;
3075
3076 sock = sock_from_file(file);
3077 if (sock) {
3078 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3079 sock_update_classid(&sock->sk->sk_cgrp_data);
3080 }
3081}
3082
3083ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
3084{
3085 ssize_t res;
3086 struct msghdr msg = {.msg_flags = flags};
3087 struct kvec iov;
3088 char *kaddr = kmap(page);
3089 iov.iov_base = kaddr + offset;
3090 iov.iov_len = size;
3091 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
3092 kunmap(page);
3093 return res;
3094}
3095EXPORT_SYMBOL(sock_no_sendpage);
3096
3097ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
3098 int offset, size_t size, int flags)
3099{
3100 ssize_t res;
3101 struct msghdr msg = {.msg_flags = flags};
3102 struct kvec iov;
3103 char *kaddr = kmap(page);
3104
3105 iov.iov_base = kaddr + offset;
3106 iov.iov_len = size;
3107 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
3108 kunmap(page);
3109 return res;
3110}
3111EXPORT_SYMBOL(sock_no_sendpage_locked);
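
/*
 * Sketch of how a protocol wires these stubs into its proto_ops (the
 * "example" family and handlers are made up for illustration):
 *
 *	static const struct proto_ops example_dgram_ops = {
 *		.family		= PF_EXAMPLE,
 *		.owner		= THIS_MODULE,
 *		.bind		= example_bind,
 *		.sendmsg	= example_sendmsg,
 *		.recvmsg	= example_recvmsg,
 *		.connect	= sock_no_connect,
 *		.socketpair	= sock_no_socketpair,
 *		.accept		= sock_no_accept,
 *		.listen		= sock_no_listen,
 *		.shutdown	= sock_no_shutdown,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};
 */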
3112
3113/*
3114 * Default Socket Callbacks
3115 */
3116
3117static void sock_def_wakeup(struct sock *sk)
3118{
3119 struct socket_wq *wq;
3120
3121 rcu_read_lock();
3122 wq = rcu_dereference(sk->sk_wq);
3123 if (skwq_has_sleeper(wq))
3124 wake_up_interruptible_all(&wq->wait);
3125 rcu_read_unlock();
3126}
3127
3128static void sock_def_error_report(struct sock *sk)
3129{
3130 struct socket_wq *wq;
3131
3132 rcu_read_lock();
3133 wq = rcu_dereference(sk->sk_wq);
3134 if (skwq_has_sleeper(wq))
3135 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3136 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3137 rcu_read_unlock();
3138}
3139
3140void sock_def_readable(struct sock *sk)
3141{
3142 struct socket_wq *wq;
3143
3144 rcu_read_lock();
3145 wq = rcu_dereference(sk->sk_wq);
3146 if (skwq_has_sleeper(wq))
3147 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3148 EPOLLRDNORM | EPOLLRDBAND);
3149 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3150 rcu_read_unlock();
3151}
3152
3153static void sock_def_write_space(struct sock *sk)
3154{
3155 struct socket_wq *wq;
3156
3157 rcu_read_lock();
3158
3159 /* Do not wake up a writer until he can make "significant"
3160 * progress. --DaveM
3161 */
3162 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
3163 wq = rcu_dereference(sk->sk_wq);
3164 if (skwq_has_sleeper(wq))
3165 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3166 EPOLLWRNORM | EPOLLWRBAND);
3167
3168 /* Should agree with poll, otherwise some programs break */
3169 if (sock_writeable(sk))
3170 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3171 }
3172
3173 rcu_read_unlock();
3174}
3175
3176static void sock_def_destruct(struct sock *sk)
3177{
3178}
3179
3180void sk_send_sigurg(struct sock *sk)
3181{
3182 if (sk->sk_socket && sk->sk_socket->file)
3183 if (send_sigurg(&sk->sk_socket->file->f_owner))
3184 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3185}
3186EXPORT_SYMBOL(sk_send_sigurg);
3187
3188void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3189 unsigned long expires)
3190{
3191 if (!mod_timer(timer, expires))
3192 sock_hold(sk);
3193}
3194EXPORT_SYMBOL(sk_reset_timer);
3195
3196void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3197{
3198 if (del_timer(timer))
3199 __sock_put(sk);
3200}
3201EXPORT_SYMBOL(sk_stop_timer);
3202
3203void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3204{
3205 if (del_timer_sync(timer))
3206 __sock_put(sk);
3207}
3208EXPORT_SYMBOL(sk_stop_timer_sync);
3209
3210void sock_init_data(struct socket *sock, struct sock *sk)
3211{
3212 sk_init_common(sk);
3213 sk->sk_send_head = NULL;
3214
3215 timer_setup(&sk->sk_timer, NULL, 0);
3216
3217 sk->sk_allocation = GFP_KERNEL;
3218 sk->sk_rcvbuf = sysctl_rmem_default;
3219 sk->sk_sndbuf = sysctl_wmem_default;
3220 sk->sk_state = TCP_CLOSE;
3221 sk_set_socket(sk, sock);
3222
3223 sock_set_flag(sk, SOCK_ZAPPED);
3224
3225 if (sock) {
3226 sk->sk_type = sock->type;
3227 RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3228 sock->sk = sk;
3229 sk->sk_uid = SOCK_INODE(sock)->i_uid;
3230 } else {
3231 RCU_INIT_POINTER(sk->sk_wq, NULL);
3232 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
3233 }
3234
3235 rwlock_init(&sk->sk_callback_lock);
3236 if (sk->sk_kern_sock)
3237 lockdep_set_class_and_name(
3238 &sk->sk_callback_lock,
3239 af_kern_callback_keys + sk->sk_family,
3240 af_family_kern_clock_key_strings[sk->sk_family]);
3241 else
3242 lockdep_set_class_and_name(
3243 &sk->sk_callback_lock,
3244 af_callback_keys + sk->sk_family,
3245 af_family_clock_key_strings[sk->sk_family]);
3246
3247 sk->sk_state_change = sock_def_wakeup;
3248 sk->sk_data_ready = sock_def_readable;
3249 sk->sk_write_space = sock_def_write_space;
3250 sk->sk_error_report = sock_def_error_report;
3251 sk->sk_destruct = sock_def_destruct;
3252
3253 sk->sk_frag.page = NULL;
3254 sk->sk_frag.offset = 0;
3255 sk->sk_peek_off = -1;
3256
3257 sk->sk_peer_pid = NULL;
3258 sk->sk_peer_cred = NULL;
3259 spin_lock_init(&sk->sk_peer_lock);
3260
3261 sk->sk_write_pending = 0;
3262 sk->sk_rcvlowat = 1;
3263 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
3264 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
3265
3266 sk->sk_stamp = SK_DEFAULT_STAMP;
3267#if BITS_PER_LONG==32
3268 seqlock_init(&sk->sk_stamp_seq);
3269#endif
3270 atomic_set(&sk->sk_zckey, 0);
3271
3272#ifdef CONFIG_NET_RX_BUSY_POLL
3273 sk->sk_napi_id = 0;
3274 sk->sk_ll_usec = sysctl_net_busy_read;
3275#endif
3276
3277 sk->sk_max_pacing_rate = ~0UL;
3278 sk->sk_pacing_rate = ~0UL;
3279 WRITE_ONCE(sk->sk_pacing_shift, 10);
3280 sk->sk_incoming_cpu = -1;
3281
3282 sk_rx_queue_clear(sk);
3283 /*
3284 * Before updating sk_refcnt, we must commit prior changes to memory
3285 * (Documentation/RCU/rculist_nulls.rst for details)
3286 */
3287 smp_wmb();
3288 refcount_set(&sk->sk_refcnt, 1);
3289 atomic_set(&sk->sk_drops, 0);
3290}
3291EXPORT_SYMBOL(sock_init_data);
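
/*
 * A protocol's init path usually calls sock_init_data() and then
 * overrides the few callbacks it cares about, e.g. (the "example_*"
 * handlers are placeholders):
 *
 *	sock_init_data(sock, sk);
 *	sk->sk_data_ready = example_data_ready;
 *	sk->sk_write_space = example_write_space;
 *	sk->sk_destruct = example_destruct;
 *
 * Anything not overridden keeps the sock_def_*() behaviour set above.
 */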
3292
3293void lock_sock_nested(struct sock *sk, int subclass)
3294{
3295 /* The sk_lock has mutex_lock() semantics here. */
3296 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3297
3298 might_sleep();
3299 spin_lock_bh(&sk->sk_lock.slock);
3300 if (sock_owned_by_user_nocheck(sk))
3301 __lock_sock(sk);
3302 sk->sk_lock.owned = 1;
3303 spin_unlock_bh(&sk->sk_lock.slock);
3304}
3305EXPORT_SYMBOL(lock_sock_nested);
3306
3307void release_sock(struct sock *sk)
3308{
3309 spin_lock_bh(&sk->sk_lock.slock);
3310 if (sk->sk_backlog.tail)
3311 __release_sock(sk);
3312
3313 /* Warning: release_cb() might need to release sk ownership,
3314 * i.e. call sock_release_ownership(sk) before us.
3315 */
3316 if (sk->sk_prot->release_cb)
3317 sk->sk_prot->release_cb(sk);
3318
3319 sock_release_ownership(sk);
3320 if (waitqueue_active(&sk->sk_lock.wq))
3321 wake_up(&sk->sk_lock.wq);
3322 spin_unlock_bh(&sk->sk_lock.slock);
3323}
3324EXPORT_SYMBOL(release_sock);
3325
3326bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3327{
3328 might_sleep();
3329 spin_lock_bh(&sk->sk_lock.slock);
3330
3331 if (!sock_owned_by_user_nocheck(sk)) {
3332 /*
3333 * Fast path return with bottom halves disabled and
3334 * sock::sk_lock.slock held.
3335 *
3336 * The 'mutex' is not contended and holding
3337 * sock::sk_lock.slock prevents all other lockers from
3338 * proceeding, so the corresponding unlock_sock_fast() can
3339 * avoid the slow path of release_sock() completely and
3340 * just release slock.
3341 *
3342 * From a semantic POV this is equivalent to 'acquiring'
3343 * the 'mutex', hence the corresponding lockdep
3344 * mutex_release() has to happen in the fast path of
3345 * unlock_sock_fast().
3346 */
3347 return false;
3348 }
3349
3350 __lock_sock(sk);
3351 sk->sk_lock.owned = 1;
3352 __acquire(&sk->sk_lock.slock);
3353 spin_unlock_bh(&sk->sk_lock.slock);
3354 return true;
3355}
3356EXPORT_SYMBOL(__lock_sock_fast);
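
/*
 * Callers use the lock_sock_fast()/unlock_sock_fast() wrappers from
 * include/net/sock.h, roughly like this (sketch; the critical section
 * is a placeholder):
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	(short critical section touching socket state)
 *	unlock_sock_fast(sk, slow);
 *
 * When the fast path is taken, only sk_lock.slock was held and the
 * unlock is just a spin_unlock_bh(); otherwise it behaves like
 * release_sock().
 */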
3357
3358int sock_gettstamp(struct socket *sock, void __user *userstamp,
3359 bool timeval, bool time32)
3360{
3361 struct sock *sk = sock->sk;
3362 struct timespec64 ts;
3363
3364 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3365 ts = ktime_to_timespec64(sock_read_timestamp(sk));
3366 if (ts.tv_sec == -1)
3367 return -ENOENT;
3368 if (ts.tv_sec == 0) {
3369 ktime_t kt = ktime_get_real();
3370 sock_write_timestamp(sk, kt);
3371 ts = ktime_to_timespec64(kt);
3372 }
3373
3374 if (timeval)
3375 ts.tv_nsec /= 1000;
3376
3377#ifdef CONFIG_COMPAT_32BIT_TIME
3378 if (time32)
3379 return put_old_timespec32(&ts, userstamp);
3380#endif
3381#ifdef CONFIG_SPARC64
3382 /* beware of padding in sparc64 timeval */
3383 if (timeval && !in_compat_syscall()) {
3384 struct __kernel_old_timeval __user tv = {
3385 .tv_sec = ts.tv_sec,
3386 .tv_usec = ts.tv_nsec,
3387 };
3388 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3389 return -EFAULT;
3390 return 0;
3391 }
3392#endif
3393 return put_timespec64(&ts, userstamp);
3394}
3395EXPORT_SYMBOL(sock_gettstamp);
3396
3397void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3398{
3399 if (!sock_flag(sk, flag)) {
3400 unsigned long previous_flags = sk->sk_flags;
3401
3402 sock_set_flag(sk, flag);
3403 /*
3404 * we just set one of the two flags which require net
3405 * time stamping, but time stamping might have been on
3406 * already because of the other one
3407 */
3408 if (sock_needs_netstamp(sk) &&
3409 !(previous_flags & SK_FLAGS_TIMESTAMP))
3410 net_enable_timestamp();
3411 }
3412}
3413
3414int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3415 int level, int type)
3416{
3417 struct sock_exterr_skb *serr;
3418 struct sk_buff *skb;
3419 int copied, err;
3420
3421 err = -EAGAIN;
3422 skb = sock_dequeue_err_skb(sk);
3423 if (skb == NULL)
3424 goto out;
3425
3426 copied = skb->len;
3427 if (copied > len) {
3428 msg->msg_flags |= MSG_TRUNC;
3429 copied = len;
3430 }
3431 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3432 if (err)
3433 goto out_free_skb;
3434
3435 sock_recv_timestamp(msg, sk, skb);
3436
3437 serr = SKB_EXT_ERR(skb);
3438 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3439
3440 msg->msg_flags |= MSG_ERRQUEUE;
3441 err = copied;
3442
3443out_free_skb:
3444 kfree_skb(skb);
3445out:
3446 return err;
3447}
3448EXPORT_SYMBOL(sock_recv_errqueue);
3449
3450/*
3451 * Get a socket option on a socket.
3452 *
3453 * FIX: POSIX 1003.1g is very ambiguous here. It states that
3454 * asynchronous errors should be reported by getsockopt. We assume
3455 * this means if you specify SO_ERROR (otherwise what's the point of it).
3456 */
3457int sock_common_getsockopt(struct socket *sock, int level, int optname,
3458 char __user *optval, int __user *optlen)
3459{
3460 struct sock *sk = sock->sk;
3461
3462 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3463}
3464EXPORT_SYMBOL(sock_common_getsockopt);
3465
3466int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3467 int flags)
3468{
3469 struct sock *sk = sock->sk;
3470 int addr_len = 0;
3471 int err;
3472
3473 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3474 flags & ~MSG_DONTWAIT, &addr_len);
3475 if (err >= 0)
3476 msg->msg_namelen = addr_len;
3477 return err;
3478}
3479EXPORT_SYMBOL(sock_common_recvmsg);
3480
3481/*
3482 * Set socket options on an inet socket.
3483 */
3484int sock_common_setsockopt(struct socket *sock, int level, int optname,
3485 sockptr_t optval, unsigned int optlen)
3486{
3487 struct sock *sk = sock->sk;
3488
3489 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3490}
3491EXPORT_SYMBOL(sock_common_setsockopt);
3492
3493void sk_common_release(struct sock *sk)
3494{
3495 if (sk->sk_prot->destroy)
3496 sk->sk_prot->destroy(sk);
3497
3498 /*
3499 * Observation: when sk_common_release is called, processes have
3500 * no access to the socket, but the network stack still does.
3501 * Step one: detach it from networking:
3502 *
3503 * A. Remove it from the hash tables.
3504 */
3505
3506 sk->sk_prot->unhash(sk);
3507
3508 /*
3509 * At this point the socket cannot receive new packets, but it is possible
3510 * that some packets are still in flight because some CPU is running the
3511 * receiver and did the hash table lookup before we unhashed the socket.
3512 * They will reach the receive queue and be purged by the socket destructor.
3513 *
3514 * Also, we still have packets pending on the receive queue and probably
3515 * our own packets waiting in device queues. sock_destroy will drain the
3516 * receive queue, but transmitted packets will delay socket destruction
3517 * until the last reference is released.
3518 */
3519
3520 sock_orphan(sk);
3521
3522 xfrm_sk_free_policy(sk);
3523
3524 sk_refcnt_debug_release(sk);
3525
3526 sock_put(sk);
3527}
3528EXPORT_SYMBOL(sk_common_release);
3529
3530void sk_get_meminfo(const struct sock *sk, u32 *mem)
3531{
3532 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3533
3534 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3535 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3536 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3537 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3538 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3539 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3540 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3541 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3542 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3543}
3544
3545#ifdef CONFIG_PROC_FS
3546static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3547
3548int sock_prot_inuse_get(struct net *net, struct proto *prot)
3549{
3550 int cpu, idx = prot->inuse_idx;
3551 int res = 0;
3552
3553 for_each_possible_cpu(cpu)
3554 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3555
3556 return res >= 0 ? res : 0;
3557}
3558EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3559
3560int sock_inuse_get(struct net *net)
3561{
3562 int cpu, res = 0;
3563
3564 for_each_possible_cpu(cpu)
3565 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3566
3567 return res;
3568}
3569
3570EXPORT_SYMBOL_GPL(sock_inuse_get);
3571
3572static int __net_init sock_inuse_init_net(struct net *net)
3573{
3574 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3575 if (net->core.prot_inuse == NULL)
3576 return -ENOMEM;
3577 return 0;
3578}
3579
3580static void __net_exit sock_inuse_exit_net(struct net *net)
3581{
3582 free_percpu(net->core.prot_inuse);
3583}
3584
3585static struct pernet_operations net_inuse_ops = {
3586 .init = sock_inuse_init_net,
3587 .exit = sock_inuse_exit_net,
3588};
3589
3590static __init int net_inuse_init(void)
3591{
3592 if (register_pernet_subsys(&net_inuse_ops))
3593 panic("Cannot initialize net inuse counters");
3594
3595 return 0;
3596}
3597
3598core_initcall(net_inuse_init);
3599
3600static int assign_proto_idx(struct proto *prot)
3601{
3602 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3603
3604 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3605 pr_err("PROTO_INUSE_NR exhausted\n");
3606 return -ENOSPC;
3607 }
3608
3609 set_bit(prot->inuse_idx, proto_inuse_idx);
3610 return 0;
3611}
3612
3613static void release_proto_idx(struct proto *prot)
3614{
3615 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3616 clear_bit(prot->inuse_idx, proto_inuse_idx);
3617}
3618#else
3619static inline int assign_proto_idx(struct proto *prot)
3620{
3621 return 0;
3622}
3623
3624static inline void release_proto_idx(struct proto *prot)
3625{
3626}
3627
3628#endif
3629
3630static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3631{
3632 if (!twsk_prot)
3633 return;
3634 kfree(twsk_prot->twsk_slab_name);
3635 twsk_prot->twsk_slab_name = NULL;
3636 kmem_cache_destroy(twsk_prot->twsk_slab);
3637 twsk_prot->twsk_slab = NULL;
3638}
3639
3640static int tw_prot_init(const struct proto *prot)
3641{
3642 struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3643
3644 if (!twsk_prot)
3645 return 0;
3646
3647 twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3648 prot->name);
3649 if (!twsk_prot->twsk_slab_name)
3650 return -ENOMEM;
3651
3652 twsk_prot->twsk_slab =
3653 kmem_cache_create(twsk_prot->twsk_slab_name,
3654 twsk_prot->twsk_obj_size, 0,
3655 SLAB_ACCOUNT | prot->slab_flags,
3656 NULL);
3657 if (!twsk_prot->twsk_slab) {
3658 pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3659 prot->name);
3660 return -ENOMEM;
3661 }
3662
3663 return 0;
3664}
3665
3666static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3667{
3668 if (!rsk_prot)
3669 return;
3670 kfree(rsk_prot->slab_name);
3671 rsk_prot->slab_name = NULL;
3672 kmem_cache_destroy(rsk_prot->slab);
3673 rsk_prot->slab = NULL;
3674}
3675
3676static int req_prot_init(const struct proto *prot)
3677{
3678 struct request_sock_ops *rsk_prot = prot->rsk_prot;
3679
3680 if (!rsk_prot)
3681 return 0;
3682
3683 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3684 prot->name);
3685 if (!rsk_prot->slab_name)
3686 return -ENOMEM;
3687
3688 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3689 rsk_prot->obj_size, 0,
3690 SLAB_ACCOUNT | prot->slab_flags,
3691 NULL);
3692
3693 if (!rsk_prot->slab) {
3694 pr_crit("%s: Can't create request sock SLAB cache!\n",
3695 prot->name);
3696 return -ENOMEM;
3697 }
3698 return 0;
3699}
3700
3701int proto_register(struct proto *prot, int alloc_slab)
3702{
3703 int ret = -ENOBUFS;
3704
3705 if (alloc_slab) {
3706 prot->slab = kmem_cache_create_usercopy(prot->name,
3707 prot->obj_size, 0,
3708 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3709 prot->slab_flags,
3710 prot->useroffset, prot->usersize,
3711 NULL);
3712
3713 if (prot->slab == NULL) {
3714 pr_crit("%s: Can't create sock SLAB cache!\n",
3715 prot->name);
3716 goto out;
3717 }
3718
3719 if (req_prot_init(prot))
3720 goto out_free_request_sock_slab;
3721
3722 if (tw_prot_init(prot))
3723 goto out_free_timewait_sock_slab;
3724 }
3725
3726 mutex_lock(&proto_list_mutex);
3727 ret = assign_proto_idx(prot);
3728 if (ret) {
3729 mutex_unlock(&proto_list_mutex);
3730 goto out_free_timewait_sock_slab;
3731 }
3732 list_add(&prot->node, &proto_list);
3733 mutex_unlock(&proto_list_mutex);
3734 return ret;
3735
3736out_free_timewait_sock_slab:
3737 if (alloc_slab)
3738 tw_prot_cleanup(prot->twsk_prot);
3739out_free_request_sock_slab:
3740 if (alloc_slab) {
3741 req_prot_cleanup(prot->rsk_prot);
3742
3743 kmem_cache_destroy(prot->slab);
3744 prot->slab = NULL;
3745 }
3746out:
3747 return ret;
3748}
3749EXPORT_SYMBOL(proto_register);
3750
3751void proto_unregister(struct proto *prot)
3752{
3753 mutex_lock(&proto_list_mutex);
3754 release_proto_idx(prot);
3755 list_del(&prot->node);
3756 mutex_unlock(&proto_list_mutex);
3757
3758 kmem_cache_destroy(prot->slab);
3759 prot->slab = NULL;
3760
3761 req_prot_cleanup(prot->rsk_prot);
3762 tw_prot_cleanup(prot->twsk_prot);
3763}
3764EXPORT_SYMBOL(proto_unregister);
3765
3766int sock_load_diag_module(int family, int protocol)
3767{
3768 if (!protocol) {
3769 if (!sock_is_registered(family))
3770 return -ENOENT;
3771
3772 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3773 NETLINK_SOCK_DIAG, family);
3774 }
3775
3776#ifdef CONFIG_INET
3777 if (family == AF_INET &&
3778 protocol != IPPROTO_RAW &&
3779 protocol < MAX_INET_PROTOS &&
3780 !rcu_access_pointer(inet_protos[protocol]))
3781 return -ENOENT;
3782#endif
3783
3784 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3785 NETLINK_SOCK_DIAG, family, protocol);
3786}
3787EXPORT_SYMBOL(sock_load_diag_module);
3788
3789#ifdef CONFIG_PROC_FS
3790static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3791 __acquires(proto_list_mutex)
3792{
3793 mutex_lock(&proto_list_mutex);
3794 return seq_list_start_head(&proto_list, *pos);
3795}
3796
3797static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3798{
3799 return seq_list_next(v, &proto_list, pos);
3800}
3801
3802static void proto_seq_stop(struct seq_file *seq, void *v)
3803 __releases(proto_list_mutex)
3804{
3805 mutex_unlock(&proto_list_mutex);
3806}
3807
3808static char proto_method_implemented(const void *method)
3809{
3810 return method == NULL ? 'n' : 'y';
3811}
3812static long sock_prot_memory_allocated(struct proto *proto)
3813{
3814 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3815}
3816
3817static const char *sock_prot_memory_pressure(struct proto *proto)
3818{
3819 return proto->memory_pressure != NULL ?
3820 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3821}
3822
3823static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3824{
3825
3826 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
3827 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3828 proto->name,
3829 proto->obj_size,
3830 sock_prot_inuse_get(seq_file_net(seq), proto),
3831 sock_prot_memory_allocated(proto),
3832 sock_prot_memory_pressure(proto),
3833 proto->max_header,
3834 proto->slab == NULL ? "no" : "yes",
3835 module_name(proto->owner),
3836 proto_method_implemented(proto->close),
3837 proto_method_implemented(proto->connect),
3838 proto_method_implemented(proto->disconnect),
3839 proto_method_implemented(proto->accept),
3840 proto_method_implemented(proto->ioctl),
3841 proto_method_implemented(proto->init),
3842 proto_method_implemented(proto->destroy),
3843 proto_method_implemented(proto->shutdown),
3844 proto_method_implemented(proto->setsockopt),
3845 proto_method_implemented(proto->getsockopt),
3846 proto_method_implemented(proto->sendmsg),
3847 proto_method_implemented(proto->recvmsg),
3848 proto_method_implemented(proto->sendpage),
3849 proto_method_implemented(proto->bind),
3850 proto_method_implemented(proto->backlog_rcv),
3851 proto_method_implemented(proto->hash),
3852 proto_method_implemented(proto->unhash),
3853 proto_method_implemented(proto->get_port),
3854 proto_method_implemented(proto->enter_memory_pressure));
3855}
3856
3857static int proto_seq_show(struct seq_file *seq, void *v)
3858{
3859 if (v == &proto_list)
3860 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3861 "protocol",
3862 "size",
3863 "sockets",
3864 "memory",
3865 "press",
3866 "maxhdr",
3867 "slab",
3868 "module",
3869 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3870 else
3871 proto_seq_printf(seq, list_entry(v, struct proto, node));
3872 return 0;
3873}
3874
3875static const struct seq_operations proto_seq_ops = {
3876 .start = proto_seq_start,
3877 .next = proto_seq_next,
3878 .stop = proto_seq_stop,
3879 .show = proto_seq_show,
3880};
3881
3882static __net_init int proto_init_net(struct net *net)
3883{
3884 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3885 sizeof(struct seq_net_private)))
3886 return -ENOMEM;
3887
3888 return 0;
3889}
3890
3891static __net_exit void proto_exit_net(struct net *net)
3892{
3893 remove_proc_entry("protocols", net->proc_net);
3894}
3895
3896
3897static __net_initdata struct pernet_operations proto_net_ops = {
3898 .init = proto_init_net,
3899 .exit = proto_exit_net,
3900};
3901
3902static int __init proto_init(void)
3903{
3904 return register_pernet_subsys(&proto_net_ops);
3905}
3906
3907subsys_initcall(proto_init);
3908
3909#endif /* PROC_FS */
3910
3911#ifdef CONFIG_NET_RX_BUSY_POLL
3912bool sk_busy_loop_end(void *p, unsigned long start_time)
3913{
3914 struct sock *sk = p;
3915
3916 return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3917 sk_busy_loop_timeout(sk, start_time);
3918}
3919EXPORT_SYMBOL(sk_busy_loop_end);
3920#endif /* CONFIG_NET_RX_BUSY_POLL */
3921
3922int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
3923{
3924 if (!sk->sk_prot->bind_add)
3925 return -EOPNOTSUPP;
3926 return sk->sk_prot->bind_add(sk, addr, addr_len);
3927}
3928EXPORT_SYMBOL(sock_bind_add);