1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Generic socket support routines. Memory allocators, socket lock/release
8 * handler for protocols to use and generic option handler.
9 *
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 */
85
86#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87
88#include <asm/unaligned.h>
89#include <linux/capability.h>
90#include <linux/errno.h>
91#include <linux/errqueue.h>
92#include <linux/types.h>
93#include <linux/socket.h>
94#include <linux/in.h>
95#include <linux/kernel.h>
96#include <linux/module.h>
97#include <linux/proc_fs.h>
98#include <linux/seq_file.h>
99#include <linux/sched.h>
100#include <linux/sched/mm.h>
101#include <linux/timer.h>
102#include <linux/string.h>
103#include <linux/sockios.h>
104#include <linux/net.h>
105#include <linux/mm.h>
106#include <linux/slab.h>
107#include <linux/interrupt.h>
108#include <linux/poll.h>
109#include <linux/tcp.h>
110#include <linux/udp.h>
111#include <linux/init.h>
112#include <linux/highmem.h>
113#include <linux/user_namespace.h>
114#include <linux/static_key.h>
115#include <linux/memcontrol.h>
116#include <linux/prefetch.h>
117#include <linux/compat.h>
118#include <linux/mroute.h>
119#include <linux/mroute6.h>
120#include <linux/icmpv6.h>
121
122#include <linux/uaccess.h>
123
124#include <linux/netdevice.h>
125#include <net/protocol.h>
126#include <linux/skbuff.h>
127#include <net/net_namespace.h>
128#include <net/request_sock.h>
129#include <net/sock.h>
130#include <net/proto_memory.h>
131#include <linux/net_tstamp.h>
132#include <net/xfrm.h>
133#include <linux/ipsec.h>
134#include <net/cls_cgroup.h>
135#include <net/netprio_cgroup.h>
136#include <linux/sock_diag.h>
137
138#include <linux/filter.h>
139#include <net/sock_reuseport.h>
140#include <net/bpf_sk_storage.h>
141
142#include <trace/events/sock.h>
143
144#include <net/tcp.h>
145#include <net/busy_poll.h>
146#include <net/phonet/phonet.h>
147
148#include <linux/ethtool.h>
149
150#include "dev.h"
151
152static DEFINE_MUTEX(proto_list_mutex);
153static LIST_HEAD(proto_list);
154
155static void sock_def_write_space_wfree(struct sock *sk);
156static void sock_def_write_space(struct sock *sk);
157
158/**
159 * sk_ns_capable - General socket capability test
160 * @sk: Socket to use a capability on or through
161 * @user_ns: The user namespace of the capability to use
162 * @cap: The capability to use
163 *
164 * Test to see if the opener of the socket had the capability @cap in the
165 * user namespace @user_ns when the socket was created, and that the
166 * current process has it as well.
167 */
168bool sk_ns_capable(const struct sock *sk,
169 struct user_namespace *user_ns, int cap)
170{
171 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
172 ns_capable(user_ns, cap);
173}
174EXPORT_SYMBOL(sk_ns_capable);
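/*
 * Both halves of the test must pass: file_ns_capable() checks the
 * credentials recorded in sk->sk_socket->file->f_cred when the socket was
 * opened, while ns_capable() checks the task making the request now.  A
 * socket fd inherited from a privileged parent therefore does not by itself
 * grant the capability to an unprivileged caller.
 */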
175
176/**
177 * sk_capable - Socket global capability test
178 * @sk: Socket to use a capability on or through
179 * @cap: The global capability to use
180 *
181 * Test to see if the opener of the socket had the capability @cap in all
182 * user namespaces when the socket was created, and that the current
183 * process has it as well.
184 */
185bool sk_capable(const struct sock *sk, int cap)
186{
187 return sk_ns_capable(sk, &init_user_ns, cap);
188}
189EXPORT_SYMBOL(sk_capable);
190
191/**
192 * sk_net_capable - Network namespace socket capability test
193 * @sk: Socket to use a capability on or through
194 * @cap: The capability to use
195 *
196 * Test to see if the opener of the socket had the capability @cap over the
197 * network namespace the socket is a member of when the socket was created,
198 * and that the current process has it as well.
199 */
200bool sk_net_capable(const struct sock *sk, int cap)
201{
202 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
203}
204EXPORT_SYMBOL(sk_net_capable);
205
206/*
207 * Each address family might have different locking rules, so we have
208 * one slock key per address family and separate keys for internal and
209 * userspace sockets.
210 */
211static struct lock_class_key af_family_keys[AF_MAX];
212static struct lock_class_key af_family_kern_keys[AF_MAX];
213static struct lock_class_key af_family_slock_keys[AF_MAX];
214static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
215
216/*
217 * Make lock validator output more readable. (We pre-construct these
218 * strings at build time, so that runtime initialization of socket
219 * locks stays fast.)
220 */
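/*
 * The resulting lockdep class names look like "sk_lock-AF_INET" and
 * "slock-AF_INET" for user sockets, or "k-sk_lock-AF_INET" and
 * "k-slock-AF_INET" for kernel sockets, so a lockdep report identifies the
 * address family at a glance.
 */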
221
222#define _sock_locks(x) \
223 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
224 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
225 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
226 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
227 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
228 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
229 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
230 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
231 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
232 x "27" , x "28" , x "AF_CAN" , \
233 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
234 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
235 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
236 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
237 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
238 x "AF_MCTP" , \
239 x "AF_MAX"
240
241static const char *const af_family_key_strings[AF_MAX+1] = {
242 _sock_locks("sk_lock-")
243};
244static const char *const af_family_slock_key_strings[AF_MAX+1] = {
245 _sock_locks("slock-")
246};
247static const char *const af_family_clock_key_strings[AF_MAX+1] = {
248 _sock_locks("clock-")
249};
250
251static const char *const af_family_kern_key_strings[AF_MAX+1] = {
252 _sock_locks("k-sk_lock-")
253};
254static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
255 _sock_locks("k-slock-")
256};
257static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
258 _sock_locks("k-clock-")
259};
260static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
261 _sock_locks("rlock-")
262};
263static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
264 _sock_locks("wlock-")
265};
266static const char *const af_family_elock_key_strings[AF_MAX+1] = {
267 _sock_locks("elock-")
268};
269
270/*
271 * Locking rules for sk_callback_lock and the sk queues are per-address-family,
272 * so split the lock classes by using a per-AF key:
273 */
274static struct lock_class_key af_callback_keys[AF_MAX];
275static struct lock_class_key af_rlock_keys[AF_MAX];
276static struct lock_class_key af_wlock_keys[AF_MAX];
277static struct lock_class_key af_elock_keys[AF_MAX];
278static struct lock_class_key af_kern_callback_keys[AF_MAX];
279
280/* Run time adjustable parameters. */
281__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
282EXPORT_SYMBOL(sysctl_wmem_max);
283__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
284EXPORT_SYMBOL(sysctl_rmem_max);
285__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
286__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
287
288int sysctl_tstamp_allow_data __read_mostly = 1;
289
290DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
291EXPORT_SYMBOL_GPL(memalloc_socks_key);
292
293/**
294 * sk_set_memalloc - sets %SOCK_MEMALLOC
295 * @sk: socket to set it on
296 *
297 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
298 * It's the responsibility of the admin to adjust min_free_kbytes
299 * to meet the requirements.
300 */
301void sk_set_memalloc(struct sock *sk)
302{
303 sock_set_flag(sk, SOCK_MEMALLOC);
304 sk->sk_allocation |= __GFP_MEMALLOC;
305 static_branch_inc(&memalloc_socks_key);
306}
307EXPORT_SYMBOL_GPL(sk_set_memalloc);
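/*
 * Typically paired with sk_clear_memalloc() below.  Users are kernel
 * sockets that sit on the memory-reclaim path (for instance swap over a
 * network block device), which must keep making progress even while the
 * rest of the system is under memory pressure.
 */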
308
309void sk_clear_memalloc(struct sock *sk)
310{
311 sock_reset_flag(sk, SOCK_MEMALLOC);
312 sk->sk_allocation &= ~__GFP_MEMALLOC;
313 static_branch_dec(&memalloc_socks_key);
314
315 /*
316 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
317 * progress of swapping. SOCK_MEMALLOC may be cleared while
318 * it has rmem allocations due to the last swapfile being deactivated
319 * but there is a risk that the socket is unusable due to exceeding
320 * the rmem limits. Reclaim the reserves and obey rmem limits again.
321 */
322 sk_mem_reclaim(sk);
323}
324EXPORT_SYMBOL_GPL(sk_clear_memalloc);
325
326int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
327{
328 int ret;
329 unsigned int noreclaim_flag;
330
331 /* these should have been dropped before queueing */
332 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
333
334 noreclaim_flag = memalloc_noreclaim_save();
335 ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
336 tcp_v6_do_rcv,
337 tcp_v4_do_rcv,
338 sk, skb);
339 memalloc_noreclaim_restore(noreclaim_flag);
340
341 return ret;
342}
343EXPORT_SYMBOL(__sk_backlog_rcv);
344
345void sk_error_report(struct sock *sk)
346{
347 sk->sk_error_report(sk);
348
349 switch (sk->sk_family) {
350 case AF_INET:
351 fallthrough;
352 case AF_INET6:
353 trace_inet_sk_error_report(sk);
354 break;
355 default:
356 break;
357 }
358}
359EXPORT_SYMBOL(sk_error_report);
360
361int sock_get_timeout(long timeo, void *optval, bool old_timeval)
362{
363 struct __kernel_sock_timeval tv;
364
365 if (timeo == MAX_SCHEDULE_TIMEOUT) {
366 tv.tv_sec = 0;
367 tv.tv_usec = 0;
368 } else {
369 tv.tv_sec = timeo / HZ;
370 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
371 }
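	/* Illustration: with HZ=1000 and timeo=2500 jiffies, this yields
	 * tv_sec=2 and tv_usec=500000.
	 */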
372
373 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
374 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
375 *(struct old_timeval32 *)optval = tv32;
376 return sizeof(tv32);
377 }
378
379 if (old_timeval) {
380 struct __kernel_old_timeval old_tv;
381 old_tv.tv_sec = tv.tv_sec;
382 old_tv.tv_usec = tv.tv_usec;
383 *(struct __kernel_old_timeval *)optval = old_tv;
384 return sizeof(old_tv);
385 }
386
387 *(struct __kernel_sock_timeval *)optval = tv;
388 return sizeof(tv);
389}
390EXPORT_SYMBOL(sock_get_timeout);
391
392int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
393 sockptr_t optval, int optlen, bool old_timeval)
394{
395 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
396 struct old_timeval32 tv32;
397
398 if (optlen < sizeof(tv32))
399 return -EINVAL;
400
401 if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
402 return -EFAULT;
403 tv->tv_sec = tv32.tv_sec;
404 tv->tv_usec = tv32.tv_usec;
405 } else if (old_timeval) {
406 struct __kernel_old_timeval old_tv;
407
408 if (optlen < sizeof(old_tv))
409 return -EINVAL;
410 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
411 return -EFAULT;
412 tv->tv_sec = old_tv.tv_sec;
413 tv->tv_usec = old_tv.tv_usec;
414 } else {
415 if (optlen < sizeof(*tv))
416 return -EINVAL;
417 if (copy_from_sockptr(tv, optval, sizeof(*tv)))
418 return -EFAULT;
419 }
420
421 return 0;
422}
423EXPORT_SYMBOL(sock_copy_user_timeval);
424
425static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
426 bool old_timeval)
427{
428 struct __kernel_sock_timeval tv;
429 int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
430 long val;
431
432 if (err)
433 return err;
434
435 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
436 return -EDOM;
437
438 if (tv.tv_sec < 0) {
439 static int warned __read_mostly;
440
441 WRITE_ONCE(*timeo_p, 0);
442 if (warned < 10 && net_ratelimit()) {
443 warned++;
444 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
445 __func__, current->comm, task_pid_nr(current));
446 }
447 return 0;
448 }
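	/* Convert back to jiffies, rounding the microseconds up so that a
	 * non-zero timeout never truncates to zero (e.g. tv = {0, 1} with
	 * HZ=100 becomes one jiffy, not zero).
	 */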
449 val = MAX_SCHEDULE_TIMEOUT;
450 if ((tv.tv_sec || tv.tv_usec) &&
451 (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
452 val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
453 USEC_PER_SEC / HZ);
454 WRITE_ONCE(*timeo_p, val);
455 return 0;
456}
457
458static bool sock_needs_netstamp(const struct sock *sk)
459{
460 switch (sk->sk_family) {
461 case AF_UNSPEC:
462 case AF_UNIX:
463 return false;
464 default:
465 return true;
466 }
467}
468
469static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
470{
471 if (sk->sk_flags & flags) {
472 sk->sk_flags &= ~flags;
473 if (sock_needs_netstamp(sk) &&
474 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
475 net_disable_timestamp();
476 }
477}
478
479
480int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
481{
482 unsigned long flags;
483 struct sk_buff_head *list = &sk->sk_receive_queue;
484
485 if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
486 atomic_inc(&sk->sk_drops);
487 trace_sock_rcvqueue_full(sk, skb);
488 return -ENOMEM;
489 }
490
491 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
492 atomic_inc(&sk->sk_drops);
493 return -ENOBUFS;
494 }
495
496 skb->dev = NULL;
497 skb_set_owner_r(skb, sk);
498
499 /* we escape from the RCU-protected region, make sure we don't leak
500 * a non-refcounted dst
501 */
502 skb_dst_force(skb);
503
504 spin_lock_irqsave(&list->lock, flags);
505 sock_skb_set_dropcount(sk, skb);
506 __skb_queue_tail(list, skb);
507 spin_unlock_irqrestore(&list->lock, flags);
508
509 if (!sock_flag(sk, SOCK_DEAD))
510 sk->sk_data_ready(sk);
511 return 0;
512}
513EXPORT_SYMBOL(__sock_queue_rcv_skb);
514
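/*
 * Like __sock_queue_rcv_skb(), but also reports why an skb was not queued:
 * SKB_DROP_REASON_SOCKET_FILTER when sk_filter() rejects it, and -ENOMEM /
 * -ENOBUFS from __sock_queue_rcv_skb() mapped to SOCKET_RCVBUFF (receive
 * buffer full) and PROTO_MEM (protocol memory accounting) respectively.
 */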
515int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
516 enum skb_drop_reason *reason)
517{
518 enum skb_drop_reason drop_reason;
519 int err;
520
521 err = sk_filter(sk, skb);
522 if (err) {
523 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
524 goto out;
525 }
526 err = __sock_queue_rcv_skb(sk, skb);
527 switch (err) {
528 case -ENOMEM:
529 drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
530 break;
531 case -ENOBUFS:
532 drop_reason = SKB_DROP_REASON_PROTO_MEM;
533 break;
534 default:
535 drop_reason = SKB_NOT_DROPPED_YET;
536 break;
537 }
538out:
539 if (reason)
540 *reason = drop_reason;
541 return err;
542}
543EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
544
545int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
546 const int nested, unsigned int trim_cap, bool refcounted)
547{
548 int rc = NET_RX_SUCCESS;
549
550 if (sk_filter_trim_cap(sk, skb, trim_cap))
551 goto discard_and_relse;
552
553 skb->dev = NULL;
554
555 if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
556 atomic_inc(&sk->sk_drops);
557 goto discard_and_relse;
558 }
559 if (nested)
560 bh_lock_sock_nested(sk);
561 else
562 bh_lock_sock(sk);
563 if (!sock_owned_by_user(sk)) {
564 /*
565 * trylock + unlock semantics:
566 */
567 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
568
569 rc = sk_backlog_rcv(sk, skb);
570
571 mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
572 } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
573 bh_unlock_sock(sk);
574 atomic_inc(&sk->sk_drops);
575 goto discard_and_relse;
576 }
577
578 bh_unlock_sock(sk);
579out:
580 if (refcounted)
581 sock_put(sk);
582 return rc;
583discard_and_relse:
584 kfree_skb(skb);
585 goto out;
586}
587EXPORT_SYMBOL(__sk_receive_skb);
588
589INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
590 u32));
591INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
592 u32));
593struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
594{
595 struct dst_entry *dst = __sk_dst_get(sk);
596
597 if (dst && dst->obsolete &&
598 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
599 dst, cookie) == NULL) {
600 sk_tx_queue_clear(sk);
601 WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
602 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
603 dst_release(dst);
604 return NULL;
605 }
606
607 return dst;
608}
609EXPORT_SYMBOL(__sk_dst_check);
610
611struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
612{
613 struct dst_entry *dst = sk_dst_get(sk);
614
615 if (dst && dst->obsolete &&
616 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
617 dst, cookie) == NULL) {
618 sk_dst_reset(sk);
619 dst_release(dst);
620 return NULL;
621 }
622
623 return dst;
624}
625EXPORT_SYMBOL(sk_dst_check);
626
627static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
628{
629 int ret = -ENOPROTOOPT;
630#ifdef CONFIG_NETDEVICES
631 struct net *net = sock_net(sk);
632
633 /* Sorry... */
634 ret = -EPERM;
635 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
636 goto out;
637
638 ret = -EINVAL;
639 if (ifindex < 0)
640 goto out;
641
642 /* Paired with all READ_ONCE() done locklessly. */
643 WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
644
645 if (sk->sk_prot->rehash)
646 sk->sk_prot->rehash(sk);
647 sk_dst_reset(sk);
648
649 ret = 0;
650
651out:
652#endif
653
654 return ret;
655}
656
657int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
658{
659 int ret;
660
661 if (lock_sk)
662 lock_sock(sk);
663 ret = sock_bindtoindex_locked(sk, ifindex);
664 if (lock_sk)
665 release_sock(sk);
666
667 return ret;
668}
669EXPORT_SYMBOL(sock_bindtoindex);
670
671static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
672{
673 int ret = -ENOPROTOOPT;
674#ifdef CONFIG_NETDEVICES
675 struct net *net = sock_net(sk);
676 char devname[IFNAMSIZ];
677 int index;
678
679 ret = -EINVAL;
680 if (optlen < 0)
681 goto out;
682
683 /* Bind this socket to a particular device like "eth0",
684 * as specified in the passed interface name. If the
685 * name is "" or the option length is zero the socket
686 * is not bound.
687 */
688 if (optlen > IFNAMSIZ - 1)
689 optlen = IFNAMSIZ - 1;
690 memset(devname, 0, sizeof(devname));
691
692 ret = -EFAULT;
693 if (copy_from_sockptr(devname, optval, optlen))
694 goto out;
695
696 index = 0;
697 if (devname[0] != '\0') {
698 struct net_device *dev;
699
700 rcu_read_lock();
701 dev = dev_get_by_name_rcu(net, devname);
702 if (dev)
703 index = dev->ifindex;
704 rcu_read_unlock();
705 ret = -ENODEV;
706 if (!dev)
707 goto out;
708 }
709
710 sockopt_lock_sock(sk);
711 ret = sock_bindtoindex_locked(sk, index);
712 sockopt_release_sock(sk);
713out:
714#endif
715
716 return ret;
717}
718
719static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
720 sockptr_t optlen, int len)
721{
722 int ret = -ENOPROTOOPT;
723#ifdef CONFIG_NETDEVICES
724 int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
725 struct net *net = sock_net(sk);
726 char devname[IFNAMSIZ];
727
728 if (bound_dev_if == 0) {
729 len = 0;
730 goto zero;
731 }
732
733 ret = -EINVAL;
734 if (len < IFNAMSIZ)
735 goto out;
736
737 ret = netdev_get_name(net, devname, bound_dev_if);
738 if (ret)
739 goto out;
740
741 len = strlen(devname) + 1;
742
743 ret = -EFAULT;
744 if (copy_to_sockptr(optval, devname, len))
745 goto out;
746
747zero:
748 ret = -EFAULT;
749 if (copy_to_sockptr(optlen, &len, sizeof(int)))
750 goto out;
751
752 ret = 0;
753
754out:
755#endif
756
757 return ret;
758}
759
760bool sk_mc_loop(const struct sock *sk)
761{
762 if (dev_recursion_level())
763 return false;
764 if (!sk)
765 return true;
766 /* IPV6_ADDRFORM can change sk->sk_family under us. */
767 switch (READ_ONCE(sk->sk_family)) {
768 case AF_INET:
769 return inet_test_bit(MC_LOOP, sk);
770#if IS_ENABLED(CONFIG_IPV6)
771 case AF_INET6:
772 return inet6_test_bit(MC6_LOOP, sk);
773#endif
774 }
775 WARN_ON_ONCE(1);
776 return true;
777}
778EXPORT_SYMBOL(sk_mc_loop);
779
780void sock_set_reuseaddr(struct sock *sk)
781{
782 lock_sock(sk);
783 sk->sk_reuse = SK_CAN_REUSE;
784 release_sock(sk);
785}
786EXPORT_SYMBOL(sock_set_reuseaddr);
787
788void sock_set_reuseport(struct sock *sk)
789{
790 lock_sock(sk);
791 sk->sk_reuseport = true;
792 release_sock(sk);
793}
794EXPORT_SYMBOL(sock_set_reuseport);
795
796void sock_no_linger(struct sock *sk)
797{
798 lock_sock(sk);
799 WRITE_ONCE(sk->sk_lingertime, 0);
800 sock_set_flag(sk, SOCK_LINGER);
801 release_sock(sk);
802}
803EXPORT_SYMBOL(sock_no_linger);
804
805void sock_set_priority(struct sock *sk, u32 priority)
806{
807 WRITE_ONCE(sk->sk_priority, priority);
808}
809EXPORT_SYMBOL(sock_set_priority);
810
811void sock_set_sndtimeo(struct sock *sk, s64 secs)
812{
813 lock_sock(sk);
814 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
815 WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
816 else
817 WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
818 release_sock(sk);
819}
820EXPORT_SYMBOL(sock_set_sndtimeo);
821
822static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
823{
824 if (val) {
825 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
826 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
827 sock_set_flag(sk, SOCK_RCVTSTAMP);
828 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
829 } else {
830 sock_reset_flag(sk, SOCK_RCVTSTAMP);
831 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
832 }
833}
834
835void sock_enable_timestamps(struct sock *sk)
836{
837 lock_sock(sk);
838 __sock_set_timestamps(sk, true, false, true);
839 release_sock(sk);
840}
841EXPORT_SYMBOL(sock_enable_timestamps);
842
843void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
844{
845 switch (optname) {
846 case SO_TIMESTAMP_OLD:
847 __sock_set_timestamps(sk, valbool, false, false);
848 break;
849 case SO_TIMESTAMP_NEW:
850 __sock_set_timestamps(sk, valbool, true, false);
851 break;
852 case SO_TIMESTAMPNS_OLD:
853 __sock_set_timestamps(sk, valbool, false, true);
854 break;
855 case SO_TIMESTAMPNS_NEW:
856 __sock_set_timestamps(sk, valbool, true, true);
857 break;
858 }
859}
860
861static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
862{
863 struct net *net = sock_net(sk);
864 struct net_device *dev = NULL;
865 bool match = false;
866 int *vclock_index;
867 int i, num;
868
869 if (sk->sk_bound_dev_if)
870 dev = dev_get_by_index(net, sk->sk_bound_dev_if);
871
872 if (!dev) {
873 pr_err("%s: socket is not bound to a device\n", __func__);
874 return -EOPNOTSUPP;
875 }
876
877 num = ethtool_get_phc_vclocks(dev, &vclock_index);
878 dev_put(dev);
879
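	/* Accept phc_index only if it matches one of the PHC virtual clocks
	 * exposed by the device this socket is bound to.
	 */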
880 for (i = 0; i < num; i++) {
881 if (*(vclock_index + i) == phc_index) {
882 match = true;
883 break;
884 }
885 }
886
887 if (num > 0)
888 kfree(vclock_index);
889
890 if (!match)
891 return -EINVAL;
892
893 WRITE_ONCE(sk->sk_bind_phc, phc_index);
894
895 return 0;
896}
897
898int sock_set_timestamping(struct sock *sk, int optname,
899 struct so_timestamping timestamping)
900{
901 int val = timestamping.flags;
902 int ret;
903
904 if (val & ~SOF_TIMESTAMPING_MASK)
905 return -EINVAL;
906
907 if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
908 !(val & SOF_TIMESTAMPING_OPT_ID))
909 return -EINVAL;
910
911 if (val & SOF_TIMESTAMPING_OPT_ID &&
912 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
913 if (sk_is_tcp(sk)) {
914 if ((1 << sk->sk_state) &
915 (TCPF_CLOSE | TCPF_LISTEN))
916 return -EINVAL;
917 if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
918 atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
919 else
920 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
921 } else {
922 atomic_set(&sk->sk_tskey, 0);
923 }
924 }
925
926 if (val & SOF_TIMESTAMPING_OPT_STATS &&
927 !(val & SOF_TIMESTAMPING_OPT_TSONLY))
928 return -EINVAL;
929
930 if (val & SOF_TIMESTAMPING_BIND_PHC) {
931 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
932 if (ret)
933 return ret;
934 }
935
936 WRITE_ONCE(sk->sk_tsflags, val);
937 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
938
939 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
940 sock_enable_timestamp(sk,
941 SOCK_TIMESTAMPING_RX_SOFTWARE);
942 else
943 sock_disable_timestamp(sk,
944 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
945 return 0;
946}
947
948void sock_set_keepalive(struct sock *sk)
949{
950 lock_sock(sk);
951 if (sk->sk_prot->keepalive)
952 sk->sk_prot->keepalive(sk, true);
953 sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
954 release_sock(sk);
955}
956EXPORT_SYMBOL(sock_set_keepalive);
957
958static void __sock_set_rcvbuf(struct sock *sk, int val)
959{
960 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
961 * as a negative value.
962 */
963 val = min_t(int, val, INT_MAX / 2);
964 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
965
966 /* We double it on the way in to account for "struct sk_buff" etc.
967 * overhead. Applications assume that the SO_RCVBUF setting they make
968 * will allow that much actual data to be received on that socket.
969 *
970 * Applications are unaware that "struct sk_buff" and other overheads
971 * allocate from the receive buffer during socket buffer allocation.
972 *
973 * And after considering the possible alternatives, returning the value
974 * we actually used in getsockopt is the most desirable behavior.
975 */
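	/* Illustration: setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &(int){ 65536 },
	 * sizeof(int)) stores 131072 in sk->sk_rcvbuf, and a subsequent
	 * getsockopt(SO_RCVBUF) reports 131072.
	 */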
976 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
977}
978
979void sock_set_rcvbuf(struct sock *sk, int val)
980{
981 lock_sock(sk);
982 __sock_set_rcvbuf(sk, val);
983 release_sock(sk);
984}
985EXPORT_SYMBOL(sock_set_rcvbuf);
986
987static void __sock_set_mark(struct sock *sk, u32 val)
988{
989 if (val != sk->sk_mark) {
990 WRITE_ONCE(sk->sk_mark, val);
991 sk_dst_reset(sk);
992 }
993}
994
995void sock_set_mark(struct sock *sk, u32 val)
996{
997 lock_sock(sk);
998 __sock_set_mark(sk, val);
999 release_sock(sk);
1000}
1001EXPORT_SYMBOL(sock_set_mark);
1002
1003static void sock_release_reserved_memory(struct sock *sk, int bytes)
1004{
1005 /* Round down bytes to multiple of pages */
1006 bytes = round_down(bytes, PAGE_SIZE);
1007
1008 WARN_ON(bytes > sk->sk_reserved_mem);
1009 WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1010 sk_mem_reclaim(sk);
1011}
1012
1013static int sock_reserve_memory(struct sock *sk, int bytes)
1014{
1015 long allocated;
1016 bool charged;
1017 int pages;
1018
1019 if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1020 return -EOPNOTSUPP;
1021
1022 if (!bytes)
1023 return 0;
1024
1025 pages = sk_mem_pages(bytes);
1026
1027 /* pre-charge to memcg */
1028 charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1029 GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1030 if (!charged)
1031 return -ENOMEM;
1032
1033 /* pre-charge to forward_alloc */
1034 sk_memory_allocated_add(sk, pages);
1035 allocated = sk_memory_allocated(sk);
1036 /* If the system goes into memory pressure with this
1037 * precharge, give up and return error.
1038 */
1039 if (allocated > sk_prot_mem_limits(sk, 1)) {
1040 sk_memory_allocated_sub(sk, pages);
1041 mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1042 return -ENOMEM;
1043 }
1044 sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1045
1046 WRITE_ONCE(sk->sk_reserved_mem,
1047 sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1048
1049 return 0;
1050}
1051
1052void sockopt_lock_sock(struct sock *sk)
1053{
1054 /* When current->bpf_ctx is set, setsockopt() is being called from
1055 * a BPF program, and BPF has already ensured that the sk lock is
1056 * held before calling setsockopt().
1057 */
1058 if (has_current_bpf_ctx())
1059 return;
1060
1061 lock_sock(sk);
1062}
1063EXPORT_SYMBOL(sockopt_lock_sock);
1064
1065void sockopt_release_sock(struct sock *sk)
1066{
1067 if (has_current_bpf_ctx())
1068 return;
1069
1070 release_sock(sk);
1071}
1072EXPORT_SYMBOL(sockopt_release_sock);
1073
1074bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1075{
1076 return has_current_bpf_ctx() || ns_capable(ns, cap);
1077}
1078EXPORT_SYMBOL(sockopt_ns_capable);
1079
1080bool sockopt_capable(int cap)
1081{
1082 return has_current_bpf_ctx() || capable(cap);
1083}
1084EXPORT_SYMBOL(sockopt_capable);
1085
1086static int sockopt_validate_clockid(__kernel_clockid_t value)
1087{
1088 switch (value) {
1089 case CLOCK_REALTIME:
1090 case CLOCK_MONOTONIC:
1091 case CLOCK_TAI:
1092 return 0;
1093 }
1094 return -EINVAL;
1095}
1096
1097/*
1098 * This is meant for all protocols to use and covers goings on
1099 * at the socket level. Everything here is generic.
1100 */
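/*
 * Reached from userspace via setsockopt(2) with level SOL_SOCKET, e.g.
 * (illustrative):
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
 *
 * In-kernel callers go through sock_setsockopt() below, which simply
 * forwards to sk_setsockopt().
 */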
1101
1102int sk_setsockopt(struct sock *sk, int level, int optname,
1103 sockptr_t optval, unsigned int optlen)
1104{
1105 struct so_timestamping timestamping;
1106 struct socket *sock = sk->sk_socket;
1107 struct sock_txtime sk_txtime;
1108 int val;
1109 int valbool;
1110 struct linger ling;
1111 int ret = 0;
1112
1113 /*
1114 * Options without arguments
1115 */
1116
1117 if (optname == SO_BINDTODEVICE)
1118 return sock_setbindtodevice(sk, optval, optlen);
1119
1120 if (optlen < sizeof(int))
1121 return -EINVAL;
1122
1123 if (copy_from_sockptr(&val, optval, sizeof(val)))
1124 return -EFAULT;
1125
1126 valbool = val ? 1 : 0;
1127
1128 /* handle options which do not require locking the socket. */
1129 switch (optname) {
1130 case SO_PRIORITY:
1131 if ((val >= 0 && val <= 6) ||
1132 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1133 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1134 sock_set_priority(sk, val);
1135 return 0;
1136 }
1137 return -EPERM;
1138 case SO_PASSSEC:
1139 assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
1140 return 0;
1141 case SO_PASSCRED:
1142 assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
1143 return 0;
1144 case SO_PASSPIDFD:
1145 assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
1146 return 0;
1147 case SO_TYPE:
1148 case SO_PROTOCOL:
1149 case SO_DOMAIN:
1150 case SO_ERROR:
1151 return -ENOPROTOOPT;
1152#ifdef CONFIG_NET_RX_BUSY_POLL
1153 case SO_BUSY_POLL:
1154 if (val < 0)
1155 return -EINVAL;
1156 WRITE_ONCE(sk->sk_ll_usec, val);
1157 return 0;
1158 case SO_PREFER_BUSY_POLL:
1159 if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1160 return -EPERM;
1161 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1162 return 0;
1163 case SO_BUSY_POLL_BUDGET:
1164 if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
1165 !sockopt_capable(CAP_NET_ADMIN))
1166 return -EPERM;
1167 if (val < 0 || val > U16_MAX)
1168 return -EINVAL;
1169 WRITE_ONCE(sk->sk_busy_poll_budget, val);
1170 return 0;
1171#endif
1172 case SO_MAX_PACING_RATE:
1173 {
1174 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1175 unsigned long pacing_rate;
1176
1177 if (sizeof(ulval) != sizeof(val) &&
1178 optlen >= sizeof(ulval) &&
1179 copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1180 return -EFAULT;
1181 }
1182 if (ulval != ~0UL)
1183 cmpxchg(&sk->sk_pacing_status,
1184 SK_PACING_NONE,
1185 SK_PACING_NEEDED);
1186 /* Pairs with READ_ONCE() from sk_getsockopt() */
1187 WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1188 pacing_rate = READ_ONCE(sk->sk_pacing_rate);
1189 if (ulval < pacing_rate)
1190 WRITE_ONCE(sk->sk_pacing_rate, ulval);
1191 return 0;
1192 }
1193 case SO_TXREHASH:
1194 if (val < -1 || val > 1)
1195 return -EINVAL;
1196 if ((u8)val == SOCK_TXREHASH_DEFAULT)
1197 val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1198 /* Paired with READ_ONCE() in tcp_rtx_synack()
1199 * and sk_getsockopt().
1200 */
1201 WRITE_ONCE(sk->sk_txrehash, (u8)val);
1202 return 0;
1203 case SO_PEEK_OFF:
1204 {
1205 int (*set_peek_off)(struct sock *sk, int val);
1206
1207 set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1208 if (set_peek_off)
1209 ret = set_peek_off(sk, val);
1210 else
1211 ret = -EOPNOTSUPP;
1212 return ret;
1213 }
1214 }
1215
1216 sockopt_lock_sock(sk);
1217
1218 switch (optname) {
1219 case SO_DEBUG:
1220 if (val && !sockopt_capable(CAP_NET_ADMIN))
1221 ret = -EACCES;
1222 else
1223 sock_valbool_flag(sk, SOCK_DBG, valbool);
1224 break;
1225 case SO_REUSEADDR:
1226 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1227 break;
1228 case SO_REUSEPORT:
1229 sk->sk_reuseport = valbool;
1230 break;
1231 case SO_DONTROUTE:
1232 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1233 sk_dst_reset(sk);
1234 break;
1235 case SO_BROADCAST:
1236 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1237 break;
1238 case SO_SNDBUF:
1239 /* Don't return an error here; BSD doesn't, and if you think
1240 * about it, this is right. Otherwise apps would have to
1241 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1242 * are treated in BSD as hints.
1243 */
1244 val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1245set_sndbuf:
1246 /* Ensure val * 2 fits into an int, to prevent max_t()
1247 * from treating it as a negative value.
1248 */
1249 val = min_t(int, val, INT_MAX / 2);
1250 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1251 WRITE_ONCE(sk->sk_sndbuf,
1252 max_t(int, val * 2, SOCK_MIN_SNDBUF));
1253 /* Wake up sending tasks if we upped the value. */
1254 sk->sk_write_space(sk);
1255 break;
1256
1257 case SO_SNDBUFFORCE:
1258 if (!sockopt_capable(CAP_NET_ADMIN)) {
1259 ret = -EPERM;
1260 break;
1261 }
1262
1263 /* No negative values (to prevent underflow, as val will be
1264 * multiplied by 2).
1265 */
1266 if (val < 0)
1267 val = 0;
1268 goto set_sndbuf;
1269
1270 case SO_RCVBUF:
1271 /* Don't return an error here; BSD doesn't, and if you think
1272 * about it, this is right. Otherwise apps would have to
1273 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1274 * are treated in BSD as hints.
1275 */
1276 __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1277 break;
1278
1279 case SO_RCVBUFFORCE:
1280 if (!sockopt_capable(CAP_NET_ADMIN)) {
1281 ret = -EPERM;
1282 break;
1283 }
1284
1285 /* No negative values (to prevent underflow, as val will be
1286 * multiplied by 2).
1287 */
1288 __sock_set_rcvbuf(sk, max(val, 0));
1289 break;
1290
1291 case SO_KEEPALIVE:
1292 if (sk->sk_prot->keepalive)
1293 sk->sk_prot->keepalive(sk, valbool);
1294 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1295 break;
1296
1297 case SO_OOBINLINE:
1298 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1299 break;
1300
1301 case SO_NO_CHECK:
1302 sk->sk_no_check_tx = valbool;
1303 break;
1304
1305 case SO_LINGER:
1306 if (optlen < sizeof(ling)) {
1307 ret = -EINVAL; /* 1003.1g */
1308 break;
1309 }
1310 if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1311 ret = -EFAULT;
1312 break;
1313 }
1314 if (!ling.l_onoff) {
1315 sock_reset_flag(sk, SOCK_LINGER);
1316 } else {
1317 unsigned long t_sec = ling.l_linger;
1318
1319 if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1320 WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1321 else
1322 WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1323 sock_set_flag(sk, SOCK_LINGER);
1324 }
1325 break;
1326
1327 case SO_BSDCOMPAT:
1328 break;
1329
1330 case SO_TIMESTAMP_OLD:
1331 case SO_TIMESTAMP_NEW:
1332 case SO_TIMESTAMPNS_OLD:
1333 case SO_TIMESTAMPNS_NEW:
1334 sock_set_timestamp(sk, optname, valbool);
1335 break;
1336
1337 case SO_TIMESTAMPING_NEW:
1338 case SO_TIMESTAMPING_OLD:
1339 if (optlen == sizeof(timestamping)) {
1340 if (copy_from_sockptr(&timestamping, optval,
1341 sizeof(timestamping))) {
1342 ret = -EFAULT;
1343 break;
1344 }
1345 } else {
1346 memset(&timestamping, 0, sizeof(timestamping));
1347 timestamping.flags = val;
1348 }
1349 ret = sock_set_timestamping(sk, optname, timestamping);
1350 break;
1351
1352 case SO_RCVLOWAT:
1353 {
1354 int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1355
1356 if (val < 0)
1357 val = INT_MAX;
1358 if (sock)
1359 set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1360 if (set_rcvlowat)
1361 ret = set_rcvlowat(sk, val);
1362 else
1363 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1364 break;
1365 }
1366 case SO_RCVTIMEO_OLD:
1367 case SO_RCVTIMEO_NEW:
1368 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1369 optlen, optname == SO_RCVTIMEO_OLD);
1370 break;
1371
1372 case SO_SNDTIMEO_OLD:
1373 case SO_SNDTIMEO_NEW:
1374 ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1375 optlen, optname == SO_SNDTIMEO_OLD);
1376 break;
1377
1378 case SO_ATTACH_FILTER: {
1379 struct sock_fprog fprog;
1380
1381 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1382 if (!ret)
1383 ret = sk_attach_filter(&fprog, sk);
1384 break;
1385 }
1386 case SO_ATTACH_BPF:
1387 ret = -EINVAL;
1388 if (optlen == sizeof(u32)) {
1389 u32 ufd;
1390
1391 ret = -EFAULT;
1392 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1393 break;
1394
1395 ret = sk_attach_bpf(ufd, sk);
1396 }
1397 break;
1398
1399 case SO_ATTACH_REUSEPORT_CBPF: {
1400 struct sock_fprog fprog;
1401
1402 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1403 if (!ret)
1404 ret = sk_reuseport_attach_filter(&fprog, sk);
1405 break;
1406 }
1407 case SO_ATTACH_REUSEPORT_EBPF:
1408 ret = -EINVAL;
1409 if (optlen == sizeof(u32)) {
1410 u32 ufd;
1411
1412 ret = -EFAULT;
1413 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1414 break;
1415
1416 ret = sk_reuseport_attach_bpf(ufd, sk);
1417 }
1418 break;
1419
1420 case SO_DETACH_REUSEPORT_BPF:
1421 ret = reuseport_detach_prog(sk);
1422 break;
1423
1424 case SO_DETACH_FILTER:
1425 ret = sk_detach_filter(sk);
1426 break;
1427
1428 case SO_LOCK_FILTER:
1429 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1430 ret = -EPERM;
1431 else
1432 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1433 break;
1434
1435 case SO_MARK:
1436 if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1437 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1438 ret = -EPERM;
1439 break;
1440 }
1441
1442 __sock_set_mark(sk, val);
1443 break;
1444 case SO_RCVMARK:
1445 sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1446 break;
1447
1448 case SO_RXQ_OVFL:
1449 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1450 break;
1451
1452 case SO_WIFI_STATUS:
1453 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1454 break;
1455
1456 case SO_NOFCS:
1457 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1458 break;
1459
1460 case SO_SELECT_ERR_QUEUE:
1461 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1462 break;
1463
1464
1465 case SO_INCOMING_CPU:
1466 reuseport_update_incoming_cpu(sk, val);
1467 break;
1468
1469 case SO_CNX_ADVICE:
1470 if (val == 1)
1471 dst_negative_advice(sk);
1472 break;
1473
1474 case SO_ZEROCOPY:
1475 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1476 if (!(sk_is_tcp(sk) ||
1477 (sk->sk_type == SOCK_DGRAM &&
1478 sk->sk_protocol == IPPROTO_UDP)))
1479 ret = -EOPNOTSUPP;
1480 } else if (sk->sk_family != PF_RDS) {
1481 ret = -EOPNOTSUPP;
1482 }
1483 if (!ret) {
1484 if (val < 0 || val > 1)
1485 ret = -EINVAL;
1486 else
1487 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1488 }
1489 break;
1490
1491 case SO_TXTIME:
1492 if (optlen != sizeof(struct sock_txtime)) {
1493 ret = -EINVAL;
1494 break;
1495 } else if (copy_from_sockptr(&sk_txtime, optval,
1496 sizeof(struct sock_txtime))) {
1497 ret = -EFAULT;
1498 break;
1499 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1500 ret = -EINVAL;
1501 break;
1502 }
1503 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1504 * scheduler has enough safeguards.
1505 */
1506 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1507 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1508 ret = -EPERM;
1509 break;
1510 }
1511
1512 ret = sockopt_validate_clockid(sk_txtime.clockid);
1513 if (ret)
1514 break;
1515
1516 sock_valbool_flag(sk, SOCK_TXTIME, true);
1517 sk->sk_clockid = sk_txtime.clockid;
1518 sk->sk_txtime_deadline_mode =
1519 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1520 sk->sk_txtime_report_errors =
1521 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1522 break;
1523
1524 case SO_BINDTOIFINDEX:
1525 ret = sock_bindtoindex_locked(sk, val);
1526 break;
1527
1528 case SO_BUF_LOCK:
1529 if (val & ~SOCK_BUF_LOCK_MASK) {
1530 ret = -EINVAL;
1531 break;
1532 }
1533 sk->sk_userlocks = val | (sk->sk_userlocks &
1534 ~SOCK_BUF_LOCK_MASK);
1535 break;
1536
1537 case SO_RESERVE_MEM:
1538 {
1539 int delta;
1540
1541 if (val < 0) {
1542 ret = -EINVAL;
1543 break;
1544 }
1545
1546 delta = val - sk->sk_reserved_mem;
1547 if (delta < 0)
1548 sock_release_reserved_memory(sk, -delta);
1549 else
1550 ret = sock_reserve_memory(sk, delta);
1551 break;
1552 }
1553
1554 default:
1555 ret = -ENOPROTOOPT;
1556 break;
1557 }
1558 sockopt_release_sock(sk);
1559 return ret;
1560}
1561
1562int sock_setsockopt(struct socket *sock, int level, int optname,
1563 sockptr_t optval, unsigned int optlen)
1564{
1565 return sk_setsockopt(sock->sk, level, optname,
1566 optval, optlen);
1567}
1568EXPORT_SYMBOL(sock_setsockopt);
1569
1570static const struct cred *sk_get_peer_cred(struct sock *sk)
1571{
1572 const struct cred *cred;
1573
1574 spin_lock(&sk->sk_peer_lock);
1575 cred = get_cred(sk->sk_peer_cred);
1576 spin_unlock(&sk->sk_peer_lock);
1577
1578 return cred;
1579}
1580
1581static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1582 struct ucred *ucred)
1583{
1584 ucred->pid = pid_vnr(pid);
1585 ucred->uid = ucred->gid = -1;
1586 if (cred) {
1587 struct user_namespace *current_ns = current_user_ns();
1588
1589 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1590 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1591 }
1592}
1593
1594static int groups_to_user(sockptr_t dst, const struct group_info *src)
1595{
1596 struct user_namespace *user_ns = current_user_ns();
1597 int i;
1598
1599 for (i = 0; i < src->ngroups; i++) {
1600 gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1601
1602 if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1603 return -EFAULT;
1604 }
1605
1606 return 0;
1607}
1608
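/*
 * Read side of the socket-level options, reached from userspace via
 * getsockopt(2) with level SOL_SOCKET.  The value pointed to by @optlen is
 * both an input (size of the user buffer) and an output (bytes written).
 */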
1609int sk_getsockopt(struct sock *sk, int level, int optname,
1610 sockptr_t optval, sockptr_t optlen)
1611{
1612 struct socket *sock = sk->sk_socket;
1613
1614 union {
1615 int val;
1616 u64 val64;
1617 unsigned long ulval;
1618 struct linger ling;
1619 struct old_timeval32 tm32;
1620 struct __kernel_old_timeval tm;
1621 struct __kernel_sock_timeval stm;
1622 struct sock_txtime txtime;
1623 struct so_timestamping timestamping;
1624 } v;
1625
1626 int lv = sizeof(int);
1627 int len;
1628
1629 if (copy_from_sockptr(&len, optlen, sizeof(int)))
1630 return -EFAULT;
1631 if (len < 0)
1632 return -EINVAL;
1633
1634 memset(&v, 0, sizeof(v));
1635
1636 switch (optname) {
1637 case SO_DEBUG:
1638 v.val = sock_flag(sk, SOCK_DBG);
1639 break;
1640
1641 case SO_DONTROUTE:
1642 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1643 break;
1644
1645 case SO_BROADCAST:
1646 v.val = sock_flag(sk, SOCK_BROADCAST);
1647 break;
1648
1649 case SO_SNDBUF:
1650 v.val = READ_ONCE(sk->sk_sndbuf);
1651 break;
1652
1653 case SO_RCVBUF:
1654 v.val = READ_ONCE(sk->sk_rcvbuf);
1655 break;
1656
1657 case SO_REUSEADDR:
1658 v.val = sk->sk_reuse;
1659 break;
1660
1661 case SO_REUSEPORT:
1662 v.val = sk->sk_reuseport;
1663 break;
1664
1665 case SO_KEEPALIVE:
1666 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1667 break;
1668
1669 case SO_TYPE:
1670 v.val = sk->sk_type;
1671 break;
1672
1673 case SO_PROTOCOL:
1674 v.val = sk->sk_protocol;
1675 break;
1676
1677 case SO_DOMAIN:
1678 v.val = sk->sk_family;
1679 break;
1680
1681 case SO_ERROR:
1682 v.val = -sock_error(sk);
1683 if (v.val == 0)
1684 v.val = xchg(&sk->sk_err_soft, 0);
1685 break;
1686
1687 case SO_OOBINLINE:
1688 v.val = sock_flag(sk, SOCK_URGINLINE);
1689 break;
1690
1691 case SO_NO_CHECK:
1692 v.val = sk->sk_no_check_tx;
1693 break;
1694
1695 case SO_PRIORITY:
1696 v.val = READ_ONCE(sk->sk_priority);
1697 break;
1698
1699 case SO_LINGER:
1700 lv = sizeof(v.ling);
1701 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
1702 v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ;
1703 break;
1704
1705 case SO_BSDCOMPAT:
1706 break;
1707
1708 case SO_TIMESTAMP_OLD:
1709 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1710 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1711 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1712 break;
1713
1714 case SO_TIMESTAMPNS_OLD:
1715 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1716 break;
1717
1718 case SO_TIMESTAMP_NEW:
1719 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1720 break;
1721
1722 case SO_TIMESTAMPNS_NEW:
1723 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1724 break;
1725
1726 case SO_TIMESTAMPING_OLD:
1727 case SO_TIMESTAMPING_NEW:
1728 lv = sizeof(v.timestamping);
1729 /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1730 * returning the flags when they were set through the same option.
1731 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1732 */
1733 if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1734 v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1735 v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1736 }
1737 break;
1738
1739 case SO_RCVTIMEO_OLD:
1740 case SO_RCVTIMEO_NEW:
1741 lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1742 SO_RCVTIMEO_OLD == optname);
1743 break;
1744
1745 case SO_SNDTIMEO_OLD:
1746 case SO_SNDTIMEO_NEW:
1747 lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1748 SO_SNDTIMEO_OLD == optname);
1749 break;
1750
1751 case SO_RCVLOWAT:
1752 v.val = READ_ONCE(sk->sk_rcvlowat);
1753 break;
1754
1755 case SO_SNDLOWAT:
1756 v.val = 1;
1757 break;
1758
1759 case SO_PASSCRED:
1760 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1761 break;
1762
1763 case SO_PASSPIDFD:
1764 v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1765 break;
1766
1767 case SO_PEERCRED:
1768 {
1769 struct ucred peercred;
1770 if (len > sizeof(peercred))
1771 len = sizeof(peercred);
1772
1773 spin_lock(&sk->sk_peer_lock);
1774 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1775 spin_unlock(&sk->sk_peer_lock);
1776
1777 if (copy_to_sockptr(optval, &peercred, len))
1778 return -EFAULT;
1779 goto lenout;
1780 }
1781
1782 case SO_PEERPIDFD:
1783 {
1784 struct pid *peer_pid;
1785 struct file *pidfd_file = NULL;
1786 int pidfd;
1787
1788 if (len > sizeof(pidfd))
1789 len = sizeof(pidfd);
1790
1791 spin_lock(&sk->sk_peer_lock);
1792 peer_pid = get_pid(sk->sk_peer_pid);
1793 spin_unlock(&sk->sk_peer_lock);
1794
1795 if (!peer_pid)
1796 return -ENODATA;
1797
1798 pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1799 put_pid(peer_pid);
1800 if (pidfd < 0)
1801 return pidfd;
1802
1803 if (copy_to_sockptr(optval, &pidfd, len) ||
1804 copy_to_sockptr(optlen, &len, sizeof(int))) {
1805 put_unused_fd(pidfd);
1806 fput(pidfd_file);
1807
1808 return -EFAULT;
1809 }
1810
1811 fd_install(pidfd, pidfd_file);
1812 return 0;
1813 }
1814
1815 case SO_PEERGROUPS:
1816 {
1817 const struct cred *cred;
1818 int ret, n;
1819
1820 cred = sk_get_peer_cred(sk);
1821 if (!cred)
1822 return -ENODATA;
1823
1824 n = cred->group_info->ngroups;
1825 if (len < n * sizeof(gid_t)) {
1826 len = n * sizeof(gid_t);
1827 put_cred(cred);
1828 return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1829 }
1830 len = n * sizeof(gid_t);
1831
1832 ret = groups_to_user(optval, cred->group_info);
1833 put_cred(cred);
1834 if (ret)
1835 return ret;
1836 goto lenout;
1837 }
1838
1839 case SO_PEERNAME:
1840 {
1841 struct sockaddr_storage address;
1842
1843 lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1844 if (lv < 0)
1845 return -ENOTCONN;
1846 if (lv < len)
1847 return -EINVAL;
1848 if (copy_to_sockptr(optval, &address, len))
1849 return -EFAULT;
1850 goto lenout;
1851 }
1852
1853 /* Dubious BSD thing... Probably nobody even uses it, but
1854 * the UNIX standard wants it for whatever reason... -DaveM
1855 */
1856 case SO_ACCEPTCONN:
1857 v.val = sk->sk_state == TCP_LISTEN;
1858 break;
1859
1860 case SO_PASSSEC:
1861 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1862 break;
1863
1864 case SO_PEERSEC:
1865 return security_socket_getpeersec_stream(sock,
1866 optval, optlen, len);
1867
1868 case SO_MARK:
1869 v.val = READ_ONCE(sk->sk_mark);
1870 break;
1871
1872 case SO_RCVMARK:
1873 v.val = sock_flag(sk, SOCK_RCVMARK);
1874 break;
1875
1876 case SO_RXQ_OVFL:
1877 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1878 break;
1879
1880 case SO_WIFI_STATUS:
1881 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1882 break;
1883
1884 case SO_PEEK_OFF:
1885 if (!READ_ONCE(sock->ops)->set_peek_off)
1886 return -EOPNOTSUPP;
1887
1888 v.val = READ_ONCE(sk->sk_peek_off);
1889 break;
1890 case SO_NOFCS:
1891 v.val = sock_flag(sk, SOCK_NOFCS);
1892 break;
1893
1894 case SO_BINDTODEVICE:
1895 return sock_getbindtodevice(sk, optval, optlen, len);
1896
1897 case SO_GET_FILTER:
1898 len = sk_get_filter(sk, optval, len);
1899 if (len < 0)
1900 return len;
1901
1902 goto lenout;
1903
1904 case SO_LOCK_FILTER:
1905 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1906 break;
1907
1908 case SO_BPF_EXTENSIONS:
1909 v.val = bpf_tell_extensions();
1910 break;
1911
1912 case SO_SELECT_ERR_QUEUE:
1913 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1914 break;
1915
1916#ifdef CONFIG_NET_RX_BUSY_POLL
1917 case SO_BUSY_POLL:
1918 v.val = READ_ONCE(sk->sk_ll_usec);
1919 break;
1920 case SO_PREFER_BUSY_POLL:
1921 v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1922 break;
1923#endif
1924
1925 case SO_MAX_PACING_RATE:
1926 /* The READ_ONCE() pairs with the WRITE_ONCE() in sk_setsockopt() */
1927 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1928 lv = sizeof(v.ulval);
1929 v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1930 } else {
1931 /* 32bit version */
1932 v.val = min_t(unsigned long, ~0U,
1933 READ_ONCE(sk->sk_max_pacing_rate));
1934 }
1935 break;
1936
1937 case SO_INCOMING_CPU:
1938 v.val = READ_ONCE(sk->sk_incoming_cpu);
1939 break;
1940
1941 case SO_MEMINFO:
1942 {
1943 u32 meminfo[SK_MEMINFO_VARS];
1944
1945 sk_get_meminfo(sk, meminfo);
1946
1947 len = min_t(unsigned int, len, sizeof(meminfo));
1948 if (copy_to_sockptr(optval, &meminfo, len))
1949 return -EFAULT;
1950
1951 goto lenout;
1952 }
1953
1954#ifdef CONFIG_NET_RX_BUSY_POLL
1955 case SO_INCOMING_NAPI_ID:
1956 v.val = READ_ONCE(sk->sk_napi_id);
1957
1958 /* aggregate non-NAPI IDs down to 0 */
1959 if (v.val < MIN_NAPI_ID)
1960 v.val = 0;
1961
1962 break;
1963#endif
1964
1965 case SO_COOKIE:
1966 lv = sizeof(u64);
1967 if (len < lv)
1968 return -EINVAL;
1969 v.val64 = sock_gen_cookie(sk);
1970 break;
1971
1972 case SO_ZEROCOPY:
1973 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1974 break;
1975
1976 case SO_TXTIME:
1977 lv = sizeof(v.txtime);
1978 v.txtime.clockid = sk->sk_clockid;
1979 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1980 SOF_TXTIME_DEADLINE_MODE : 0;
1981 v.txtime.flags |= sk->sk_txtime_report_errors ?
1982 SOF_TXTIME_REPORT_ERRORS : 0;
1983 break;
1984
1985 case SO_BINDTOIFINDEX:
1986 v.val = READ_ONCE(sk->sk_bound_dev_if);
1987 break;
1988
1989 case SO_NETNS_COOKIE:
1990 lv = sizeof(u64);
1991 if (len != lv)
1992 return -EINVAL;
1993 v.val64 = sock_net(sk)->net_cookie;
1994 break;
1995
1996 case SO_BUF_LOCK:
1997 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1998 break;
1999
2000 case SO_RESERVE_MEM:
2001 v.val = READ_ONCE(sk->sk_reserved_mem);
2002 break;
2003
2004 case SO_TXREHASH:
2005 /* Paired with WRITE_ONCE() in sk_setsockopt() */
2006 v.val = READ_ONCE(sk->sk_txrehash);
2007 break;
2008
2009 default:
2010 /* We implement the SO_SNDLOWAT etc to not be settable
2011 * (1003.1g 7).
2012 */
2013 return -ENOPROTOOPT;
2014 }
2015
2016 if (len > lv)
2017 len = lv;
2018 if (copy_to_sockptr(optval, &v, len))
2019 return -EFAULT;
2020lenout:
2021 if (copy_to_sockptr(optlen, &len, sizeof(int)))
2022 return -EFAULT;
2023 return 0;
2024}
2025
2026/*
2027 * Initialize an sk_lock.
2028 *
2029 * (We also register the sk_lock with the lock validator.)
2030 */
2031static inline void sock_lock_init(struct sock *sk)
2032{
2033 if (sk->sk_kern_sock)
2034 sock_lock_init_class_and_name(
2035 sk,
2036 af_family_kern_slock_key_strings[sk->sk_family],
2037 af_family_kern_slock_keys + sk->sk_family,
2038 af_family_kern_key_strings[sk->sk_family],
2039 af_family_kern_keys + sk->sk_family);
2040 else
2041 sock_lock_init_class_and_name(
2042 sk,
2043 af_family_slock_key_strings[sk->sk_family],
2044 af_family_slock_keys + sk->sk_family,
2045 af_family_key_strings[sk->sk_family],
2046 af_family_keys + sk->sk_family);
2047}
2048
2049/*
2050 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2051 * even temporarily, because of RCU lookups. sk_node should also be left as is.
2052 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
2053 */
2054static void sock_copy(struct sock *nsk, const struct sock *osk)
2055{
2056 const struct proto *prot = READ_ONCE(osk->sk_prot);
2057#ifdef CONFIG_SECURITY_NETWORK
2058 void *sptr = nsk->sk_security;
2059#endif
2060
2061 /* If we move sk_tx_queue_mapping out of the private section,
2062 * we must check if sk_tx_queue_clear() is called after
2063 * sock_copy() in sk_clone_lock().
2064 */
2065 BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2066 offsetof(struct sock, sk_dontcopy_begin) ||
2067 offsetof(struct sock, sk_tx_queue_mapping) >=
2068 offsetof(struct sock, sk_dontcopy_end));
2069
2070 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2071
2072 unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2073 prot->obj_size - offsetof(struct sock, sk_dontcopy_end),
2074 /* alloc is larger than struct, see sk_prot_alloc() */);
2075
2076#ifdef CONFIG_SECURITY_NETWORK
2077 nsk->sk_security = sptr;
2078 security_sk_clone(osk, nsk);
2079#endif
2080}
2081
2082static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2083 int family)
2084{
2085 struct sock *sk;
2086 struct kmem_cache *slab;
2087
2088 slab = prot->slab;
2089 if (slab != NULL) {
2090 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2091 if (!sk)
2092 return sk;
2093 if (want_init_on_alloc(priority))
2094 sk_prot_clear_nulls(sk, prot->obj_size);
2095 } else
2096 sk = kmalloc(prot->obj_size, priority);
2097
2098 if (sk != NULL) {
2099 if (security_sk_alloc(sk, family, priority))
2100 goto out_free;
2101
2102 if (!try_module_get(prot->owner))
2103 goto out_free_sec;
2104 }
2105
2106 return sk;
2107
2108out_free_sec:
2109 security_sk_free(sk);
2110out_free:
2111 if (slab != NULL)
2112 kmem_cache_free(slab, sk);
2113 else
2114 kfree(sk);
2115 return NULL;
2116}
2117
2118static void sk_prot_free(struct proto *prot, struct sock *sk)
2119{
2120 struct kmem_cache *slab;
2121 struct module *owner;
2122
2123 owner = prot->owner;
2124 slab = prot->slab;
2125
2126 cgroup_sk_free(&sk->sk_cgrp_data);
2127 mem_cgroup_sk_free(sk);
2128 security_sk_free(sk);
2129 if (slab != NULL)
2130 kmem_cache_free(slab, sk);
2131 else
2132 kfree(sk);
2133 module_put(owner);
2134}
2135
2136/**
2137 * sk_alloc - All socket objects are allocated here
2138 * @net: the applicable net namespace
2139 * @family: protocol family
2140 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2141 * @prot: struct proto associated with this new sock instance
2142 * @kern: is this to be a kernel socket?
2143 */
2144struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2145 struct proto *prot, int kern)
2146{
2147 struct sock *sk;
2148
2149 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2150 if (sk) {
2151 sk->sk_family = family;
2152 /*
2153 * See comment in struct sock definition to understand
2154 * why we need sk_prot_creator -acme
2155 */
2156 sk->sk_prot = sk->sk_prot_creator = prot;
2157 sk->sk_kern_sock = kern;
2158 sock_lock_init(sk);
2159 sk->sk_net_refcnt = kern ? 0 : 1;
2160 if (likely(sk->sk_net_refcnt)) {
2161 get_net_track(net, &sk->ns_tracker, priority);
2162 sock_inuse_add(net, 1);
2163 } else {
2164 __netns_tracker_alloc(net, &sk->ns_tracker,
2165 false, priority);
2166 }
2167
2168 sock_net_set(sk, net);
2169 refcount_set(&sk->sk_wmem_alloc, 1);
2170
2171 mem_cgroup_sk_alloc(sk);
2172 cgroup_sk_alloc(&sk->sk_cgrp_data);
2173 sock_update_classid(&sk->sk_cgrp_data);
2174 sock_update_netprioidx(&sk->sk_cgrp_data);
2175 sk_tx_queue_clear(sk);
2176 }
2177
2178 return sk;
2179}
2180EXPORT_SYMBOL(sk_alloc);
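
/*
 * Illustrative sketch (not from the upstream file): the usual pairing of
 * sk_alloc() and sock_init_data() in a protocol ->create() handler, in the
 * style of inet_create().  "example_prot" and the function name are
 * hypothetical placeholders supplied for illustration only.
 */
static struct sock *__maybe_unused example_sk_create(struct net *net,
						     struct socket *sock,
						     struct proto *example_prot,
						     int family, int kern)
{
	struct sock *sk;

	sk = sk_alloc(net, family, GFP_KERNEL, example_prot, kern);
	if (!sk)
		return NULL;

	/* attach @sk to @sock and install the generic defaults and callbacks */
	sock_init_data(sock, sk);
	return sk;
}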
2181
2182/* Sockets having SOCK_RCU_FREE will call this function after one RCU
2183 * grace period. This is the case for UDP sockets and TCP listeners.
2184 */
2185static void __sk_destruct(struct rcu_head *head)
2186{
2187 struct sock *sk = container_of(head, struct sock, sk_rcu);
2188 struct sk_filter *filter;
2189
2190 if (sk->sk_destruct)
2191 sk->sk_destruct(sk);
2192
2193 filter = rcu_dereference_check(sk->sk_filter,
2194 refcount_read(&sk->sk_wmem_alloc) == 0);
2195 if (filter) {
2196 sk_filter_uncharge(sk, filter);
2197 RCU_INIT_POINTER(sk->sk_filter, NULL);
2198 }
2199
2200 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2201
2202#ifdef CONFIG_BPF_SYSCALL
2203 bpf_sk_storage_free(sk);
2204#endif
2205
2206 if (atomic_read(&sk->sk_omem_alloc))
2207 pr_debug("%s: optmem leakage (%d bytes) detected\n",
2208 __func__, atomic_read(&sk->sk_omem_alloc));
2209
2210 if (sk->sk_frag.page) {
2211 put_page(sk->sk_frag.page);
2212 sk->sk_frag.page = NULL;
2213 }
2214
2215 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2216 put_cred(sk->sk_peer_cred);
2217 put_pid(sk->sk_peer_pid);
2218
2219 if (likely(sk->sk_net_refcnt))
2220 put_net_track(sock_net(sk), &sk->ns_tracker);
2221 else
2222 __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
2223
2224 sk_prot_free(sk->sk_prot_creator, sk);
2225}
2226
2227void sk_destruct(struct sock *sk)
2228{
2229 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2230
2231 if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2232 reuseport_detach_sock(sk);
2233 use_call_rcu = true;
2234 }
2235
2236 if (use_call_rcu)
2237 call_rcu(&sk->sk_rcu, __sk_destruct);
2238 else
2239 __sk_destruct(&sk->sk_rcu);
2240}
2241
2242static void __sk_free(struct sock *sk)
2243{
2244 if (likely(sk->sk_net_refcnt))
2245 sock_inuse_add(sock_net(sk), -1);
2246
2247 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2248 sock_diag_broadcast_destroy(sk);
2249 else
2250 sk_destruct(sk);
2251}
2252
2253void sk_free(struct sock *sk)
2254{
2255 /*
2256	 * We subtract one from sk_wmem_alloc so we can tell whether
2257	 * some packets are still in some tx queue.
2258	 * If the count is not yet zero, sock_wfree() will call __sk_free(sk) later
2259 */
2260 if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2261 __sk_free(sk);
2262}
2263EXPORT_SYMBOL(sk_free);
2264
2265static void sk_init_common(struct sock *sk)
2266{
2267 skb_queue_head_init(&sk->sk_receive_queue);
2268 skb_queue_head_init(&sk->sk_write_queue);
2269 skb_queue_head_init(&sk->sk_error_queue);
2270
2271 rwlock_init(&sk->sk_callback_lock);
2272 lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2273 af_rlock_keys + sk->sk_family,
2274 af_family_rlock_key_strings[sk->sk_family]);
2275 lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2276 af_wlock_keys + sk->sk_family,
2277 af_family_wlock_key_strings[sk->sk_family]);
2278 lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2279 af_elock_keys + sk->sk_family,
2280 af_family_elock_key_strings[sk->sk_family]);
2281 if (sk->sk_kern_sock)
2282 lockdep_set_class_and_name(&sk->sk_callback_lock,
2283 af_kern_callback_keys + sk->sk_family,
2284 af_family_kern_clock_key_strings[sk->sk_family]);
2285 else
2286 lockdep_set_class_and_name(&sk->sk_callback_lock,
2287 af_callback_keys + sk->sk_family,
2288 af_family_clock_key_strings[sk->sk_family]);
2289}
2290
2291/**
2292 * sk_clone_lock - clone a socket, and lock its clone
2293 * @sk: the socket to clone
2294 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2295 *
2296 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2297 */
2298struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2299{
2300 struct proto *prot = READ_ONCE(sk->sk_prot);
2301 struct sk_filter *filter;
2302 bool is_charged = true;
2303 struct sock *newsk;
2304
2305 newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2306 if (!newsk)
2307 goto out;
2308
2309 sock_copy(newsk, sk);
2310
2311 newsk->sk_prot_creator = prot;
2312
2313 /* SANITY */
2314 if (likely(newsk->sk_net_refcnt)) {
2315 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2316 sock_inuse_add(sock_net(newsk), 1);
2317 } else {
2318		/* Kernel sockets do not elevate the struct net refcount.
2319 * Instead, use a tracker to more easily detect if a layer
2320 * is not properly dismantling its kernel sockets at netns
2321 * destroy time.
2322 */
2323 __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2324 false, priority);
2325 }
2326 sk_node_init(&newsk->sk_node);
2327 sock_lock_init(newsk);
2328 bh_lock_sock(newsk);
2329 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
2330 newsk->sk_backlog.len = 0;
2331
2332 atomic_set(&newsk->sk_rmem_alloc, 0);
2333
2334 /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2335 refcount_set(&newsk->sk_wmem_alloc, 1);
2336
2337 atomic_set(&newsk->sk_omem_alloc, 0);
2338 sk_init_common(newsk);
2339
2340 newsk->sk_dst_cache = NULL;
2341 newsk->sk_dst_pending_confirm = 0;
2342 newsk->sk_wmem_queued = 0;
2343 newsk->sk_forward_alloc = 0;
2344 newsk->sk_reserved_mem = 0;
2345 atomic_set(&newsk->sk_drops, 0);
2346 newsk->sk_send_head = NULL;
2347 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2348 atomic_set(&newsk->sk_zckey, 0);
2349
2350 sock_reset_flag(newsk, SOCK_DONE);
2351
2352 /* sk->sk_memcg will be populated at accept() time */
2353 newsk->sk_memcg = NULL;
2354
2355 cgroup_sk_clone(&newsk->sk_cgrp_data);
2356
2357 rcu_read_lock();
2358 filter = rcu_dereference(sk->sk_filter);
2359 if (filter != NULL)
2360 /* though it's an empty new sock, the charging may fail
2361 * if sysctl_optmem_max was changed between creation of
2362		 * the original socket and cloning
2363 */
2364 is_charged = sk_filter_charge(newsk, filter);
2365 RCU_INIT_POINTER(newsk->sk_filter, filter);
2366 rcu_read_unlock();
2367
2368 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2369 /* We need to make sure that we don't uncharge the new
2370 * socket if we couldn't charge it in the first place
2371 * as otherwise we uncharge the parent's filter.
2372 */
2373 if (!is_charged)
2374 RCU_INIT_POINTER(newsk->sk_filter, NULL);
2375 sk_free_unlock_clone(newsk);
2376 newsk = NULL;
2377 goto out;
2378 }
2379 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2380
2381 if (bpf_sk_storage_clone(sk, newsk)) {
2382 sk_free_unlock_clone(newsk);
2383 newsk = NULL;
2384 goto out;
2385 }
2386
2387 /* Clear sk_user_data if parent had the pointer tagged
2388 * as not suitable for copying when cloning.
2389 */
2390 if (sk_user_data_is_nocopy(newsk))
2391 newsk->sk_user_data = NULL;
2392
2393 newsk->sk_err = 0;
2394 newsk->sk_err_soft = 0;
2395 newsk->sk_priority = 0;
2396 newsk->sk_incoming_cpu = raw_smp_processor_id();
2397
2398 /* Before updating sk_refcnt, we must commit prior changes to memory
2399 * (Documentation/RCU/rculist_nulls.rst for details)
2400 */
2401 smp_wmb();
2402 refcount_set(&newsk->sk_refcnt, 2);
2403
2404 sk_set_socket(newsk, NULL);
2405 sk_tx_queue_clear(newsk);
2406 RCU_INIT_POINTER(newsk->sk_wq, NULL);
2407
2408 if (newsk->sk_prot->sockets_allocated)
2409 sk_sockets_allocated_inc(newsk);
2410
2411 if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2412 net_enable_timestamp();
2413out:
2414 return newsk;
2415}
2416EXPORT_SYMBOL_GPL(sk_clone_lock);
2417
2418void sk_free_unlock_clone(struct sock *sk)
2419{
2420	/* It is still a raw copy of the parent, so invalidate
2421	 * the destructor and do a plain sk_free() */
2422 sk->sk_destruct = NULL;
2423 bh_unlock_sock(sk);
2424 sk_free(sk);
2425}
2426EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
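
/*
 * Illustrative sketch (not from the upstream file): the typical calling
 * convention around sk_clone_lock() -- the clone comes back bh-locked, and
 * the caller must bh_unlock_sock() it even when its own follow-up
 * initialisation fails.  The function name is a hypothetical placeholder.
 */
static struct sock *__maybe_unused example_clone(const struct sock *parent)
{
	struct sock *child;

	child = sk_clone_lock(parent, GFP_ATOMIC);
	if (!child)
		return NULL;

	/* ... protocol specific initialisation of @child would go here ... */

	bh_unlock_sock(child);
	return child;
}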
2427
2428static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2429{
2430 bool is_ipv6 = false;
2431 u32 max_size;
2432
2433#if IS_ENABLED(CONFIG_IPV6)
2434 is_ipv6 = (sk->sk_family == AF_INET6 &&
2435 !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2436#endif
2437 /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2438 max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2439 READ_ONCE(dst->dev->gso_ipv4_max_size);
2440 if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2441 max_size = GSO_LEGACY_MAX_SIZE;
2442
2443 return max_size - (MAX_TCP_HEADER + 1);
2444}
2445
2446void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2447{
2448 u32 max_segs = 1;
2449
2450 sk->sk_route_caps = dst->dev->features;
2451 if (sk_is_tcp(sk))
2452 sk->sk_route_caps |= NETIF_F_GSO;
2453 if (sk->sk_route_caps & NETIF_F_GSO)
2454 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2455 if (unlikely(sk->sk_gso_disabled))
2456 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2457 if (sk_can_gso(sk)) {
2458 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2459 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2460 } else {
2461 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2462 sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2463 /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2464 max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2465 }
2466 }
2467 sk->sk_gso_max_segs = max_segs;
2468 sk_dst_set(sk, dst);
2469}
2470EXPORT_SYMBOL_GPL(sk_setup_caps);
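
/*
 * Illustrative sketch (not from the upstream file): connection-oriented
 * protocols call sk_setup_caps() once a route has been resolved, so GSO and
 * checksum capabilities track the output device; the dst reference is then
 * kept by the socket via sk_dst_set().  The function name is hypothetical.
 */
static void __maybe_unused example_attach_route(struct sock *sk,
						struct dst_entry *dst)
{
	sk_setup_caps(sk, dst);
}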
2471
2472/*
2473 * Simple resource managers for sockets.
2474 */
2475
2476
2477/*
2478 * Write buffer destructor automatically called from kfree_skb.
2479 */
2480void sock_wfree(struct sk_buff *skb)
2481{
2482 struct sock *sk = skb->sk;
2483 unsigned int len = skb->truesize;
2484 bool free;
2485
2486 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2487 if (sock_flag(sk, SOCK_RCU_FREE) &&
2488 sk->sk_write_space == sock_def_write_space) {
2489 rcu_read_lock();
2490 free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2491 sock_def_write_space_wfree(sk);
2492 rcu_read_unlock();
2493 if (unlikely(free))
2494 __sk_free(sk);
2495 return;
2496 }
2497
2498 /*
2499	 * Keep a reference on sk_wmem_alloc; it will be released
2500	 * after the sk_write_space() call
2501 */
2502 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2503 sk->sk_write_space(sk);
2504 len = 1;
2505 }
2506 /*
2507 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2508 * could not do because of in-flight packets
2509 */
2510 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2511 __sk_free(sk);
2512}
2513EXPORT_SYMBOL(sock_wfree);
2514
2515/* This variant of sock_wfree() is used by TCP,
2516 * since it sets SOCK_USE_WRITE_QUEUE.
2517 */
2518void __sock_wfree(struct sk_buff *skb)
2519{
2520 struct sock *sk = skb->sk;
2521
2522 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2523 __sk_free(sk);
2524}
2525
2526void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2527{
2528 skb_orphan(skb);
2529 skb->sk = sk;
2530#ifdef CONFIG_INET
2531 if (unlikely(!sk_fullsock(sk))) {
2532 skb->destructor = sock_edemux;
2533 sock_hold(sk);
2534 return;
2535 }
2536#endif
2537 skb->destructor = sock_wfree;
2538 skb_set_hash_from_sk(skb, sk);
2539 /*
2540	 * We used to take a refcount on sk, but the following operation
2541	 * is enough to guarantee sk_free() won't free this sock until
2542 * all in-flight packets are completed
2543 */
2544 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2545}
2546EXPORT_SYMBOL(skb_set_owner_w);
2547
2548static bool can_skb_orphan_partial(const struct sk_buff *skb)
2549{
2550 /* Drivers depend on in-order delivery for crypto offload,
2551 * partial orphan breaks out-of-order-OK logic.
2552 */
2553 if (skb_is_decrypted(skb))
2554 return false;
2555
2556 return (skb->destructor == sock_wfree ||
2557 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2558}
2559
2560/* This helper is used by netem, as it can hold packets in its
2561 * delay queue. We want to allow the owner socket to send more
2562 * packets, as if they were already TX completed by a typical driver.
2563 * But we also want to keep skb->sk set because some packet schedulers
2564 * rely on it (sch_fq for example).
2565 */
2566void skb_orphan_partial(struct sk_buff *skb)
2567{
2568 if (skb_is_tcp_pure_ack(skb))
2569 return;
2570
2571 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2572 return;
2573
2574 skb_orphan(skb);
2575}
2576EXPORT_SYMBOL(skb_orphan_partial);
2577
2578/*
2579 * Read buffer destructor automatically called from kfree_skb.
2580 */
2581void sock_rfree(struct sk_buff *skb)
2582{
2583 struct sock *sk = skb->sk;
2584 unsigned int len = skb->truesize;
2585
2586 atomic_sub(len, &sk->sk_rmem_alloc);
2587 sk_mem_uncharge(sk, len);
2588}
2589EXPORT_SYMBOL(sock_rfree);
2590
2591/*
2592 * Buffer destructor for skbs that are not used directly in read or write
2593 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2594 */
2595void sock_efree(struct sk_buff *skb)
2596{
2597 sock_put(skb->sk);
2598}
2599EXPORT_SYMBOL(sock_efree);
2600
2601/* Buffer destructor for prefetch/receive path where reference count may
2602 * not be held, e.g. for listen sockets.
2603 */
2604#ifdef CONFIG_INET
2605void sock_pfree(struct sk_buff *skb)
2606{
2607 struct sock *sk = skb->sk;
2608
2609 if (!sk_is_refcounted(sk))
2610 return;
2611
2612 if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
2613 inet_reqsk(sk)->rsk_listener = NULL;
2614 reqsk_free(inet_reqsk(sk));
2615 return;
2616 }
2617
2618 sock_gen_put(sk);
2619}
2620EXPORT_SYMBOL(sock_pfree);
2621#endif /* CONFIG_INET */
2622
2623kuid_t sock_i_uid(struct sock *sk)
2624{
2625 kuid_t uid;
2626
2627 read_lock_bh(&sk->sk_callback_lock);
2628 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2629 read_unlock_bh(&sk->sk_callback_lock);
2630 return uid;
2631}
2632EXPORT_SYMBOL(sock_i_uid);
2633
2634unsigned long __sock_i_ino(struct sock *sk)
2635{
2636 unsigned long ino;
2637
2638 read_lock(&sk->sk_callback_lock);
2639 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2640 read_unlock(&sk->sk_callback_lock);
2641 return ino;
2642}
2643EXPORT_SYMBOL(__sock_i_ino);
2644
2645unsigned long sock_i_ino(struct sock *sk)
2646{
2647 unsigned long ino;
2648
2649 local_bh_disable();
2650 ino = __sock_i_ino(sk);
2651 local_bh_enable();
2652 return ino;
2653}
2654EXPORT_SYMBOL(sock_i_ino);
2655
2656/*
2657 * Allocate a skb from the socket's send buffer.
2658 */
2659struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2660 gfp_t priority)
2661{
2662 if (force ||
2663 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2664 struct sk_buff *skb = alloc_skb(size, priority);
2665
2666 if (skb) {
2667 skb_set_owner_w(skb, sk);
2668 return skb;
2669 }
2670 }
2671 return NULL;
2672}
2673EXPORT_SYMBOL(sock_wmalloc);
2674
2675static void sock_ofree(struct sk_buff *skb)
2676{
2677 struct sock *sk = skb->sk;
2678
2679 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2680}
2681
2682struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2683 gfp_t priority)
2684{
2685 struct sk_buff *skb;
2686
2687 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2688 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2689 READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
2690 return NULL;
2691
2692 skb = alloc_skb(size, priority);
2693 if (!skb)
2694 return NULL;
2695
2696 atomic_add(skb->truesize, &sk->sk_omem_alloc);
2697 skb->sk = sk;
2698 skb->destructor = sock_ofree;
2699 return skb;
2700}
2701
2702/*
2703 * Allocate a memory block from the socket's option memory buffer.
2704 */
2705void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2706{
2707 int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
2708
2709 if ((unsigned int)size <= optmem_max &&
2710 atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2711 void *mem;
2712 /* First do the add, to avoid the race if kmalloc
2713 * might sleep.
2714 */
2715 atomic_add(size, &sk->sk_omem_alloc);
2716 mem = kmalloc(size, priority);
2717 if (mem)
2718 return mem;
2719 atomic_sub(size, &sk->sk_omem_alloc);
2720 }
2721 return NULL;
2722}
2723EXPORT_SYMBOL(sock_kmalloc);
2724
2725/* Free an option memory block. Note, we actually want the inline
2726 * here as this allows gcc to detect the nullify and fold away the
2727 * condition entirely.
2728 */
2729static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2730 const bool nullify)
2731{
2732 if (WARN_ON_ONCE(!mem))
2733 return;
2734 if (nullify)
2735 kfree_sensitive(mem);
2736 else
2737 kfree(mem);
2738 atomic_sub(size, &sk->sk_omem_alloc);
2739}
2740
2741void sock_kfree_s(struct sock *sk, void *mem, int size)
2742{
2743 __sock_kfree_s(sk, mem, size, false);
2744}
2745EXPORT_SYMBOL(sock_kfree_s);
2746
2747void sock_kzfree_s(struct sock *sk, void *mem, int size)
2748{
2749 __sock_kfree_s(sk, mem, size, true);
2750}
2751EXPORT_SYMBOL(sock_kzfree_s);
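
/*
 * Illustrative sketch (not from the upstream file): option memory taken with
 * sock_kmalloc() must be returned with sock_kfree_s() -- or sock_kzfree_s()
 * for data that should not linger -- using the same size so the
 * sk_omem_alloc accounting stays balanced.  The 128-byte size and the
 * function name are arbitrary placeholders.
 */
static int __maybe_unused example_optmem_roundtrip(struct sock *sk)
{
	const int size = 128;
	void *mem;

	mem = sock_kmalloc(sk, size, GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	/* ... use @mem as a short-lived setsockopt()-style buffer ... */

	sock_kfree_s(sk, mem, size);
	return 0;
}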
2752
2753/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2754   I think these locks should be removed for datagram sockets.
2755 */
2756static long sock_wait_for_wmem(struct sock *sk, long timeo)
2757{
2758 DEFINE_WAIT(wait);
2759
2760 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2761 for (;;) {
2762 if (!timeo)
2763 break;
2764 if (signal_pending(current))
2765 break;
2766 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2767 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2768 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2769 break;
2770 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2771 break;
2772 if (READ_ONCE(sk->sk_err))
2773 break;
2774 timeo = schedule_timeout(timeo);
2775 }
2776 finish_wait(sk_sleep(sk), &wait);
2777 return timeo;
2778}
2779
2780
2781/*
2782 * Generic send/receive buffer handlers
2783 */
2784
2785struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2786 unsigned long data_len, int noblock,
2787 int *errcode, int max_page_order)
2788{
2789 struct sk_buff *skb;
2790 long timeo;
2791 int err;
2792
2793 timeo = sock_sndtimeo(sk, noblock);
2794 for (;;) {
2795 err = sock_error(sk);
2796 if (err != 0)
2797 goto failure;
2798
2799 err = -EPIPE;
2800 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2801 goto failure;
2802
2803 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2804 break;
2805
2806 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2807 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2808 err = -EAGAIN;
2809 if (!timeo)
2810 goto failure;
2811 if (signal_pending(current))
2812 goto interrupted;
2813 timeo = sock_wait_for_wmem(sk, timeo);
2814 }
2815 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2816 errcode, sk->sk_allocation);
2817 if (skb)
2818 skb_set_owner_w(skb, sk);
2819 return skb;
2820
2821interrupted:
2822 err = sock_intr_errno(timeo);
2823failure:
2824 *errcode = err;
2825 return NULL;
2826}
2827EXPORT_SYMBOL(sock_alloc_send_pskb);
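
/*
 * Illustrative sketch (not from the upstream file): how a datagram protocol
 * might use sock_alloc_send_pskb() -- everything in the linear area, no
 * high-order pages, with blocking governed by the socket's send timeout.
 * The function name is a hypothetical placeholder.
 */
static struct sk_buff *__maybe_unused example_alloc_dgram(struct sock *sk,
							  size_t len,
							  int noblock,
							  int *errcode)
{
	return sock_alloc_send_pskb(sk, len, 0, noblock, errcode, 0);
}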
2828
2829int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2830 struct sockcm_cookie *sockc)
2831{
2832 u32 tsflags;
2833
2834 switch (cmsg->cmsg_type) {
2835 case SO_MARK:
2836 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2837 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2838 return -EPERM;
2839 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2840 return -EINVAL;
2841 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2842 break;
2843 case SO_TIMESTAMPING_OLD:
2844 case SO_TIMESTAMPING_NEW:
2845 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2846 return -EINVAL;
2847
2848 tsflags = *(u32 *)CMSG_DATA(cmsg);
2849 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2850 return -EINVAL;
2851
2852 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2853 sockc->tsflags |= tsflags;
2854 break;
2855 case SCM_TXTIME:
2856 if (!sock_flag(sk, SOCK_TXTIME))
2857 return -EINVAL;
2858 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2859 return -EINVAL;
2860 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2861 break;
2862 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2863 case SCM_RIGHTS:
2864 case SCM_CREDENTIALS:
2865 break;
2866 default:
2867 return -EINVAL;
2868 }
2869 return 0;
2870}
2871EXPORT_SYMBOL(__sock_cmsg_send);
2872
2873int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2874 struct sockcm_cookie *sockc)
2875{
2876 struct cmsghdr *cmsg;
2877 int ret;
2878
2879 for_each_cmsghdr(cmsg, msg) {
2880 if (!CMSG_OK(msg, cmsg))
2881 return -EINVAL;
2882 if (cmsg->cmsg_level != SOL_SOCKET)
2883 continue;
2884 ret = __sock_cmsg_send(sk, cmsg, sockc);
2885 if (ret)
2886 return ret;
2887 }
2888 return 0;
2889}
2890EXPORT_SYMBOL(sock_cmsg_send);
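
/*
 * Illustrative sketch (not from the upstream file): the usual sendmsg()-side
 * pattern around sock_cmsg_send() -- seed a sockcm_cookie from the socket
 * defaults with sockcm_init(), then let SOL_SOCKET control messages override
 * it.  The function name is a hypothetical placeholder.
 */
static int __maybe_unused example_parse_cmsgs(struct sock *sk,
					      struct msghdr *msg,
					      struct sockcm_cookie *sockc)
{
	sockcm_init(sockc, sk);
	if (!msg->msg_controllen)
		return 0;

	return sock_cmsg_send(sk, msg, sockc);
}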
2891
2892static void sk_enter_memory_pressure(struct sock *sk)
2893{
2894 if (!sk->sk_prot->enter_memory_pressure)
2895 return;
2896
2897 sk->sk_prot->enter_memory_pressure(sk);
2898}
2899
2900static void sk_leave_memory_pressure(struct sock *sk)
2901{
2902 if (sk->sk_prot->leave_memory_pressure) {
2903 INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
2904 tcp_leave_memory_pressure, sk);
2905 } else {
2906 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2907
2908 if (memory_pressure && READ_ONCE(*memory_pressure))
2909 WRITE_ONCE(*memory_pressure, 0);
2910 }
2911}
2912
2913DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2914
2915/**
2916 * skb_page_frag_refill - check that a page_frag contains enough room
2917 * @sz: minimum size of the fragment we want to get
2918 * @pfrag: pointer to page_frag
2919 * @gfp: priority for memory allocation
2920 *
2921 * Note: While this allocator tries to use high order pages, there is
2922 * no guarantee that allocations succeed. Therefore, @sz MUST be
2923 * less than or equal to PAGE_SIZE.
2924 */
2925bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2926{
2927 if (pfrag->page) {
2928 if (page_ref_count(pfrag->page) == 1) {
2929 pfrag->offset = 0;
2930 return true;
2931 }
2932 if (pfrag->offset + sz <= pfrag->size)
2933 return true;
2934 put_page(pfrag->page);
2935 }
2936
2937 pfrag->offset = 0;
2938 if (SKB_FRAG_PAGE_ORDER &&
2939 !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2940 /* Avoid direct reclaim but allow kswapd to wake */
2941 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2942 __GFP_COMP | __GFP_NOWARN |
2943 __GFP_NORETRY,
2944 SKB_FRAG_PAGE_ORDER);
2945 if (likely(pfrag->page)) {
2946 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2947 return true;
2948 }
2949 }
2950 pfrag->page = alloc_page(gfp);
2951 if (likely(pfrag->page)) {
2952 pfrag->size = PAGE_SIZE;
2953 return true;
2954 }
2955 return false;
2956}
2957EXPORT_SYMBOL(skb_page_frag_refill);
2958
2959bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2960{
2961 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2962 return true;
2963
2964 sk_enter_memory_pressure(sk);
2965 sk_stream_moderate_sndbuf(sk);
2966 return false;
2967}
2968EXPORT_SYMBOL(sk_page_frag_refill);
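
/*
 * Illustrative sketch (not from the upstream file): the common pattern around
 * sk_page_frag_refill() when a stream protocol appends user data to the
 * socket's page fragment; the caller copies into pfrag->page at pfrag->offset
 * and then advances the offset by what was actually used.  The function name
 * is a hypothetical placeholder.
 */
static int __maybe_unused example_frag_room(struct sock *sk, int want)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOMEM;	/* caller enters its wait-for-memory path */

	return min_t(int, want, pfrag->size - pfrag->offset);
}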
2969
2970void __lock_sock(struct sock *sk)
2971 __releases(&sk->sk_lock.slock)
2972 __acquires(&sk->sk_lock.slock)
2973{
2974 DEFINE_WAIT(wait);
2975
2976 for (;;) {
2977 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2978 TASK_UNINTERRUPTIBLE);
2979 spin_unlock_bh(&sk->sk_lock.slock);
2980 schedule();
2981 spin_lock_bh(&sk->sk_lock.slock);
2982 if (!sock_owned_by_user(sk))
2983 break;
2984 }
2985 finish_wait(&sk->sk_lock.wq, &wait);
2986}
2987
2988void __release_sock(struct sock *sk)
2989 __releases(&sk->sk_lock.slock)
2990 __acquires(&sk->sk_lock.slock)
2991{
2992 struct sk_buff *skb, *next;
2993
2994 while ((skb = sk->sk_backlog.head) != NULL) {
2995 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2996
2997 spin_unlock_bh(&sk->sk_lock.slock);
2998
2999 do {
3000 next = skb->next;
3001 prefetch(next);
3002 DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
3003 skb_mark_not_on_list(skb);
3004 sk_backlog_rcv(sk, skb);
3005
3006 cond_resched();
3007
3008 skb = next;
3009 } while (skb != NULL);
3010
3011 spin_lock_bh(&sk->sk_lock.slock);
3012 }
3013
3014 /*
3015	 * Doing the zeroing here guarantees we cannot loop forever
3016	 * while a wild producer attempts to flood us.
3017 */
3018 sk->sk_backlog.len = 0;
3019}
3020
3021void __sk_flush_backlog(struct sock *sk)
3022{
3023 spin_lock_bh(&sk->sk_lock.slock);
3024 __release_sock(sk);
3025
3026 if (sk->sk_prot->release_cb)
3027 INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3028 tcp_release_cb, sk);
3029
3030 spin_unlock_bh(&sk->sk_lock.slock);
3031}
3032EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3033
3034/**
3035 * sk_wait_data - wait for data to arrive at sk_receive_queue
3036 * @sk: sock to wait on
3037 * @timeo: for how long
3038 * @skb: last skb seen on sk_receive_queue
3039 *
3040 * Now socket state including sk->sk_err is changed only under lock,
3041 * hence we may omit checks after joining the wait queue.
3042 * We check the receive queue before schedule() only as an optimization;
3043 * it is very likely that release_sock() added new data.
3044 */
3045int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3046{
3047 DEFINE_WAIT_FUNC(wait, woken_wake_function);
3048 int rc;
3049
3050 add_wait_queue(sk_sleep(sk), &wait);
3051 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3052 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3053 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3054 remove_wait_queue(sk_sleep(sk), &wait);
3055 return rc;
3056}
3057EXPORT_SYMBOL(sk_wait_data);
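
/*
 * Illustrative sketch (not from the upstream file): a ->recvmsg()-style wait
 * in the manner of tcp_recvmsg() -- the socket lock is held by the caller,
 * and the last skb seen is passed so sk_wait_data() only reports genuinely
 * new data.  The function name is a hypothetical placeholder.
 */
static int __maybe_unused example_wait_for_data(struct sock *sk, long *timeo)
{
	const struct sk_buff *last;

	last = skb_peek_tail(&sk->sk_receive_queue);
	return sk_wait_data(sk, timeo, last);
}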
3058
3059/**
3060 * __sk_mem_raise_allocated - increase memory_allocated
3061 * @sk: socket
3062 * @size: memory size to allocate
3063 * @amt: pages to allocate
3064 * @kind: allocation type
3065 *
3066 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
3067 *
3068 * Unlike the globally shared limits among the sockets under same protocol,
3069 * consuming the budget of a memcg won't have direct effect on other ones.
3070 * So be optimistic about memcg's tolerance, and leave the callers to decide
3071 * whether or not to raise allocated through sk_under_memory_pressure() or
3072 * its variants.
3073 */
3074int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3075{
3076 struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
3077 struct proto *prot = sk->sk_prot;
3078 bool charged = false;
3079 long allocated;
3080
3081 sk_memory_allocated_add(sk, amt);
3082 allocated = sk_memory_allocated(sk);
3083
3084 if (memcg) {
3085 if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()))
3086 goto suppress_allocation;
3087 charged = true;
3088 }
3089
3090 /* Under limit. */
3091 if (allocated <= sk_prot_mem_limits(sk, 0)) {
3092 sk_leave_memory_pressure(sk);
3093 return 1;
3094 }
3095
3096 /* Under pressure. */
3097 if (allocated > sk_prot_mem_limits(sk, 1))
3098 sk_enter_memory_pressure(sk);
3099
3100 /* Over hard limit. */
3101 if (allocated > sk_prot_mem_limits(sk, 2))
3102 goto suppress_allocation;
3103
3104 /* Guarantee minimum buffer size under pressure (either global
3105 * or memcg) to make sure features described in RFC 7323 (TCP
3106 * Extensions for High Performance) work properly.
3107 *
3108	 * This rule does NOT apply when usage exceeds the global or memcg hard
3109	 * limit, or else a DoS attack could take place by spawning
3110	 * lots of sockets whose usage is under the minimum buffer size.
3111 */
3112 if (kind == SK_MEM_RECV) {
3113 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3114 return 1;
3115
3116 } else { /* SK_MEM_SEND */
3117 int wmem0 = sk_get_wmem0(sk, prot);
3118
3119 if (sk->sk_type == SOCK_STREAM) {
3120 if (sk->sk_wmem_queued < wmem0)
3121 return 1;
3122 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3123 return 1;
3124 }
3125 }
3126
3127 if (sk_has_memory_pressure(sk)) {
3128 u64 alloc;
3129
3130 /* The following 'average' heuristic is within the
3131 * scope of global accounting, so it only makes
3132 * sense for global memory pressure.
3133 */
3134 if (!sk_under_global_memory_pressure(sk))
3135 return 1;
3136
3137 /* Try to be fair among all the sockets under global
3138		 * pressure by allowing the ones with below-average
3139		 * usage to raise their allocation.
3140 */
3141 alloc = sk_sockets_allocated_read_positive(sk);
3142 if (sk_prot_mem_limits(sk, 2) > alloc *
3143 sk_mem_pages(sk->sk_wmem_queued +
3144 atomic_read(&sk->sk_rmem_alloc) +
3145 sk->sk_forward_alloc))
3146 return 1;
3147 }
3148
3149suppress_allocation:
3150
3151 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3152 sk_stream_moderate_sndbuf(sk);
3153
3154 /* Fail only if socket is _under_ its sndbuf.
3155 * In this case we cannot block, so that we have to fail.
3156 */
3157 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3158 /* Force charge with __GFP_NOFAIL */
3159 if (memcg && !charged) {
3160 mem_cgroup_charge_skmem(memcg, amt,
3161 gfp_memcg_charge() | __GFP_NOFAIL);
3162 }
3163 return 1;
3164 }
3165 }
3166
3167 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
3168 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3169
3170 sk_memory_allocated_sub(sk, amt);
3171
3172 if (charged)
3173 mem_cgroup_uncharge_skmem(memcg, amt);
3174
3175 return 0;
3176}
3177
3178/**
3179 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3180 * @sk: socket
3181 * @size: memory size to allocate
3182 * @kind: allocation type
3183 *
3184 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3185 * rmem allocation. This function assumes that protocols which have
3186 * memory_pressure use sk_wmem_queued as write buffer accounting.
3187 */
3188int __sk_mem_schedule(struct sock *sk, int size, int kind)
3189{
3190 int ret, amt = sk_mem_pages(size);
3191
3192 sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3193 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3194 if (!ret)
3195 sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3196 return ret;
3197}
3198EXPORT_SYMBOL(__sk_mem_schedule);
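
/*
 * Illustrative sketch (not from the upstream file): charging receive memory
 * before queueing an skb, the way sock_queue_rcv_skb()-style paths do it --
 * sk_rmem_schedule() falls through to __sk_mem_schedule() when
 * sk_forward_alloc has no room left.  The function name is hypothetical.
 */
static bool __maybe_unused example_charge_rmem(struct sock *sk,
					       struct sk_buff *skb)
{
	if (!sk_rmem_schedule(sk, skb, skb->truesize))
		return false;	/* over limit: the caller drops the skb */

	sk_mem_charge(sk, skb->truesize);
	return true;
}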
3199
3200/**
3201 * __sk_mem_reduce_allocated - reclaim memory_allocated
3202 * @sk: socket
3203 * @amount: number of quanta
3204 *
3205 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3206 */
3207void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3208{
3209 sk_memory_allocated_sub(sk, amount);
3210
3211 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3212 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3213
3214 if (sk_under_global_memory_pressure(sk) &&
3215 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3216 sk_leave_memory_pressure(sk);
3217}
3218
3219/**
3220 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3221 * @sk: socket
3222 * @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3223 */
3224void __sk_mem_reclaim(struct sock *sk, int amount)
3225{
3226 amount >>= PAGE_SHIFT;
3227 sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3228 __sk_mem_reduce_allocated(sk, amount);
3229}
3230EXPORT_SYMBOL(__sk_mem_reclaim);
3231
3232int sk_set_peek_off(struct sock *sk, int val)
3233{
3234 WRITE_ONCE(sk->sk_peek_off, val);
3235 return 0;
3236}
3237EXPORT_SYMBOL_GPL(sk_set_peek_off);
3238
3239/*
3240 * Set of default routines for initialising struct proto_ops when
3241 * the protocol does not support a particular function. In certain
3242 * cases where it makes no sense for a protocol to have a "do nothing"
3243 * function, some default processing is provided.
3244 */
3245
3246int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3247{
3248 return -EOPNOTSUPP;
3249}
3250EXPORT_SYMBOL(sock_no_bind);
3251
3252int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3253 int len, int flags)
3254{
3255 return -EOPNOTSUPP;
3256}
3257EXPORT_SYMBOL(sock_no_connect);
3258
3259int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3260{
3261 return -EOPNOTSUPP;
3262}
3263EXPORT_SYMBOL(sock_no_socketpair);
3264
3265int sock_no_accept(struct socket *sock, struct socket *newsock,
3266 struct proto_accept_arg *arg)
3267{
3268 return -EOPNOTSUPP;
3269}
3270EXPORT_SYMBOL(sock_no_accept);
3271
3272int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3273 int peer)
3274{
3275 return -EOPNOTSUPP;
3276}
3277EXPORT_SYMBOL(sock_no_getname);
3278
3279int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3280{
3281 return -EOPNOTSUPP;
3282}
3283EXPORT_SYMBOL(sock_no_ioctl);
3284
3285int sock_no_listen(struct socket *sock, int backlog)
3286{
3287 return -EOPNOTSUPP;
3288}
3289EXPORT_SYMBOL(sock_no_listen);
3290
3291int sock_no_shutdown(struct socket *sock, int how)
3292{
3293 return -EOPNOTSUPP;
3294}
3295EXPORT_SYMBOL(sock_no_shutdown);
3296
3297int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3298{
3299 return -EOPNOTSUPP;
3300}
3301EXPORT_SYMBOL(sock_no_sendmsg);
3302
3303int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3304{
3305 return -EOPNOTSUPP;
3306}
3307EXPORT_SYMBOL(sock_no_sendmsg_locked);
3308
3309int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3310 int flags)
3311{
3312 return -EOPNOTSUPP;
3313}
3314EXPORT_SYMBOL(sock_no_recvmsg);
3315
3316int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3317{
3318 /* Mirror missing mmap method error code */
3319 return -ENODEV;
3320}
3321EXPORT_SYMBOL(sock_no_mmap);
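
/*
 * Illustrative sketch (not from the upstream file): how a protocol family
 * typically plugs the sock_no_*() stubs above into its proto_ops table for
 * operations it does not support.  "example_ops" is a hypothetical,
 * deliberately incomplete table shown for illustration only.
 */
static const struct proto_ops example_ops __maybe_unused = {
	.family		= PF_UNSPEC,
	.owner		= THIS_MODULE,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.mmap		= sock_no_mmap,
};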
3322
3323/*
3324 * When a file is received (via SCM_RIGHTS, etc), we must bump the
3325 * various sock-based usage counts.
3326 */
3327void __receive_sock(struct file *file)
3328{
3329 struct socket *sock;
3330
3331 sock = sock_from_file(file);
3332 if (sock) {
3333 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3334 sock_update_classid(&sock->sk->sk_cgrp_data);
3335 }
3336}
3337
3338/*
3339 * Default Socket Callbacks
3340 */
3341
3342static void sock_def_wakeup(struct sock *sk)
3343{
3344 struct socket_wq *wq;
3345
3346 rcu_read_lock();
3347 wq = rcu_dereference(sk->sk_wq);
3348 if (skwq_has_sleeper(wq))
3349 wake_up_interruptible_all(&wq->wait);
3350 rcu_read_unlock();
3351}
3352
3353static void sock_def_error_report(struct sock *sk)
3354{
3355 struct socket_wq *wq;
3356
3357 rcu_read_lock();
3358 wq = rcu_dereference(sk->sk_wq);
3359 if (skwq_has_sleeper(wq))
3360 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3361 sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
3362 rcu_read_unlock();
3363}
3364
3365void sock_def_readable(struct sock *sk)
3366{
3367 struct socket_wq *wq;
3368
3369 trace_sk_data_ready(sk);
3370
3371 rcu_read_lock();
3372 wq = rcu_dereference(sk->sk_wq);
3373 if (skwq_has_sleeper(wq))
3374 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3375 EPOLLRDNORM | EPOLLRDBAND);
3376 sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
3377 rcu_read_unlock();
3378}
3379
3380static void sock_def_write_space(struct sock *sk)
3381{
3382 struct socket_wq *wq;
3383
3384 rcu_read_lock();
3385
3386 /* Do not wake up a writer until he can make "significant"
3387 * progress. --DaveM
3388 */
3389 if (sock_writeable(sk)) {
3390 wq = rcu_dereference(sk->sk_wq);
3391 if (skwq_has_sleeper(wq))
3392 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3393 EPOLLWRNORM | EPOLLWRBAND);
3394
3395 /* Should agree with poll, otherwise some programs break */
3396 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3397 }
3398
3399 rcu_read_unlock();
3400}
3401
3402/* An optimised version of sock_def_write_space(); it should only be called
3403 * for SOCK_RCU_FREE sockets inside an RCU read-side section and after putting
3404 * ->sk_wmem_alloc.
3405 */
3406static void sock_def_write_space_wfree(struct sock *sk)
3407{
3408 /* Do not wake up a writer until he can make "significant"
3409 * progress. --DaveM
3410 */
3411 if (sock_writeable(sk)) {
3412 struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3413
3414 /* rely on refcount_sub from sock_wfree() */
3415 smp_mb__after_atomic();
3416 if (wq && waitqueue_active(&wq->wait))
3417 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3418 EPOLLWRNORM | EPOLLWRBAND);
3419
3420 /* Should agree with poll, otherwise some programs break */
3421 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3422 }
3423}
3424
3425static void sock_def_destruct(struct sock *sk)
3426{
3427}
3428
3429void sk_send_sigurg(struct sock *sk)
3430{
3431 if (sk->sk_socket && sk->sk_socket->file)
3432 if (send_sigurg(&sk->sk_socket->file->f_owner))
3433 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3434}
3435EXPORT_SYMBOL(sk_send_sigurg);
3436
3437void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3438 unsigned long expires)
3439{
3440 if (!mod_timer(timer, expires))
3441 sock_hold(sk);
3442}
3443EXPORT_SYMBOL(sk_reset_timer);
3444
3445void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3446{
3447 if (del_timer(timer))
3448 __sock_put(sk);
3449}
3450EXPORT_SYMBOL(sk_stop_timer);
3451
3452void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3453{
3454 if (del_timer_sync(timer))
3455 __sock_put(sk);
3456}
3457EXPORT_SYMBOL(sk_stop_timer_sync);
3458
3459void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3460{
3461 sk_init_common(sk);
3462 sk->sk_send_head = NULL;
3463
3464 timer_setup(&sk->sk_timer, NULL, 0);
3465
3466 sk->sk_allocation = GFP_KERNEL;
3467 sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default);
3468 sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
3469 sk->sk_state = TCP_CLOSE;
3470 sk->sk_use_task_frag = true;
3471 sk_set_socket(sk, sock);
3472
3473 sock_set_flag(sk, SOCK_ZAPPED);
3474
3475 if (sock) {
3476 sk->sk_type = sock->type;
3477 RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3478 sock->sk = sk;
3479 } else {
3480 RCU_INIT_POINTER(sk->sk_wq, NULL);
3481 }
3482 sk->sk_uid = uid;
3483
3484 sk->sk_state_change = sock_def_wakeup;
3485 sk->sk_data_ready = sock_def_readable;
3486 sk->sk_write_space = sock_def_write_space;
3487 sk->sk_error_report = sock_def_error_report;
3488 sk->sk_destruct = sock_def_destruct;
3489
3490 sk->sk_frag.page = NULL;
3491 sk->sk_frag.offset = 0;
3492 sk->sk_peek_off = -1;
3493
3494 sk->sk_peer_pid = NULL;
3495 sk->sk_peer_cred = NULL;
3496 spin_lock_init(&sk->sk_peer_lock);
3497
3498 sk->sk_write_pending = 0;
3499 sk->sk_rcvlowat = 1;
3500 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
3501 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
3502
3503 sk->sk_stamp = SK_DEFAULT_STAMP;
3504#if BITS_PER_LONG==32
3505 seqlock_init(&sk->sk_stamp_seq);
3506#endif
3507 atomic_set(&sk->sk_zckey, 0);
3508
3509#ifdef CONFIG_NET_RX_BUSY_POLL
3510 sk->sk_napi_id = 0;
3511 sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read);
3512#endif
3513
3514 sk->sk_max_pacing_rate = ~0UL;
3515 sk->sk_pacing_rate = ~0UL;
3516 WRITE_ONCE(sk->sk_pacing_shift, 10);
3517 sk->sk_incoming_cpu = -1;
3518
3519 sk_rx_queue_clear(sk);
3520 /*
3521 * Before updating sk_refcnt, we must commit prior changes to memory
3522 * (Documentation/RCU/rculist_nulls.rst for details)
3523 */
3524 smp_wmb();
3525 refcount_set(&sk->sk_refcnt, 1);
3526 atomic_set(&sk->sk_drops, 0);
3527}
3528EXPORT_SYMBOL(sock_init_data_uid);
3529
3530void sock_init_data(struct socket *sock, struct sock *sk)
3531{
3532 kuid_t uid = sock ?
3533 SOCK_INODE(sock)->i_uid :
3534 make_kuid(sock_net(sk)->user_ns, 0);
3535
3536 sock_init_data_uid(sock, sk, uid);
3537}
3538EXPORT_SYMBOL(sock_init_data);
3539
3540void lock_sock_nested(struct sock *sk, int subclass)
3541{
3542 /* The sk_lock has mutex_lock() semantics here. */
3543 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3544
3545 might_sleep();
3546 spin_lock_bh(&sk->sk_lock.slock);
3547 if (sock_owned_by_user_nocheck(sk))
3548 __lock_sock(sk);
3549 sk->sk_lock.owned = 1;
3550 spin_unlock_bh(&sk->sk_lock.slock);
3551}
3552EXPORT_SYMBOL(lock_sock_nested);
3553
3554void release_sock(struct sock *sk)
3555{
3556 spin_lock_bh(&sk->sk_lock.slock);
3557 if (sk->sk_backlog.tail)
3558 __release_sock(sk);
3559
3560 if (sk->sk_prot->release_cb)
3561 INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3562 tcp_release_cb, sk);
3563
3564 sock_release_ownership(sk);
3565 if (waitqueue_active(&sk->sk_lock.wq))
3566 wake_up(&sk->sk_lock.wq);
3567 spin_unlock_bh(&sk->sk_lock.slock);
3568}
3569EXPORT_SYMBOL(release_sock);
3570
3571bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3572{
3573 might_sleep();
3574 spin_lock_bh(&sk->sk_lock.slock);
3575
3576 if (!sock_owned_by_user_nocheck(sk)) {
3577 /*
3578 * Fast path return with bottom halves disabled and
3579 * sock::sk_lock.slock held.
3580 *
3581 * The 'mutex' is not contended and holding
3582		 * sock::sk_lock.slock prevents all other lockers from
3583		 * proceeding, so the corresponding unlock_sock_fast() can
3584 * avoid the slow path of release_sock() completely and
3585 * just release slock.
3586 *
3587		 * From a semantic POV this is equivalent to 'acquiring'
3588 * the 'mutex', hence the corresponding lockdep
3589 * mutex_release() has to happen in the fast path of
3590 * unlock_sock_fast().
3591 */
3592 return false;
3593 }
3594
3595 __lock_sock(sk);
3596 sk->sk_lock.owned = 1;
3597 __acquire(&sk->sk_lock.slock);
3598 spin_unlock_bh(&sk->sk_lock.slock);
3599 return true;
3600}
3601EXPORT_SYMBOL(__lock_sock_fast);
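
/*
 * Illustrative sketch (not from the upstream file): the two owner-lock idioms
 * built on the helpers above -- lock_sock()/release_sock() for sections that
 * may sleep, and lock_sock_fast()/unlock_sock_fast() for short non-sleeping
 * sections that usually avoid the slow path entirely.  The function name is
 * a hypothetical placeholder.
 */
static void __maybe_unused example_lock_idioms(struct sock *sk)
{
	bool slow;

	lock_sock(sk);
	/* ... possibly sleeping work under the socket owner lock ... */
	release_sock(sk);

	slow = lock_sock_fast(sk);
	/* ... short critical section ... */
	unlock_sock_fast(sk, slow);
}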
3602
3603int sock_gettstamp(struct socket *sock, void __user *userstamp,
3604 bool timeval, bool time32)
3605{
3606 struct sock *sk = sock->sk;
3607 struct timespec64 ts;
3608
3609 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3610 ts = ktime_to_timespec64(sock_read_timestamp(sk));
3611 if (ts.tv_sec == -1)
3612 return -ENOENT;
3613 if (ts.tv_sec == 0) {
3614 ktime_t kt = ktime_get_real();
3615 sock_write_timestamp(sk, kt);
3616 ts = ktime_to_timespec64(kt);
3617 }
3618
3619 if (timeval)
3620 ts.tv_nsec /= 1000;
3621
3622#ifdef CONFIG_COMPAT_32BIT_TIME
3623 if (time32)
3624 return put_old_timespec32(&ts, userstamp);
3625#endif
3626#ifdef CONFIG_SPARC64
3627 /* beware of padding in sparc64 timeval */
3628 if (timeval && !in_compat_syscall()) {
3629 struct __kernel_old_timeval __user tv = {
3630 .tv_sec = ts.tv_sec,
3631 .tv_usec = ts.tv_nsec,
3632 };
3633 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3634 return -EFAULT;
3635 return 0;
3636 }
3637#endif
3638 return put_timespec64(&ts, userstamp);
3639}
3640EXPORT_SYMBOL(sock_gettstamp);
3641
3642void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3643{
3644 if (!sock_flag(sk, flag)) {
3645 unsigned long previous_flags = sk->sk_flags;
3646
3647 sock_set_flag(sk, flag);
3648 /*
3649 * we just set one of the two flags which require net
3650 * time stamping, but time stamping might have been on
3651 * already because of the other one
3652 */
3653 if (sock_needs_netstamp(sk) &&
3654 !(previous_flags & SK_FLAGS_TIMESTAMP))
3655 net_enable_timestamp();
3656 }
3657}
3658
3659int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3660 int level, int type)
3661{
3662 struct sock_exterr_skb *serr;
3663 struct sk_buff *skb;
3664 int copied, err;
3665
3666 err = -EAGAIN;
3667 skb = sock_dequeue_err_skb(sk);
3668 if (skb == NULL)
3669 goto out;
3670
3671 copied = skb->len;
3672 if (copied > len) {
3673 msg->msg_flags |= MSG_TRUNC;
3674 copied = len;
3675 }
3676 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3677 if (err)
3678 goto out_free_skb;
3679
3680 sock_recv_timestamp(msg, sk, skb);
3681
3682 serr = SKB_EXT_ERR(skb);
3683 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3684
3685 msg->msg_flags |= MSG_ERRQUEUE;
3686 err = copied;
3687
3688out_free_skb:
3689 kfree_skb(skb);
3690out:
3691 return err;
3692}
3693EXPORT_SYMBOL(sock_recv_errqueue);
3694
3695/*
3696 * Get a socket option on a socket.
3697 *
3698 * FIX: POSIX 1003.1g is very ambiguous here. It states that
3699 * asynchronous errors should be reported by getsockopt. We assume
3700 * this means if you specify SO_ERROR (otherwise what's the point of it).
3701 */
3702int sock_common_getsockopt(struct socket *sock, int level, int optname,
3703 char __user *optval, int __user *optlen)
3704{
3705 struct sock *sk = sock->sk;
3706
3707 /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3708 return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3709}
3710EXPORT_SYMBOL(sock_common_getsockopt);
3711
3712int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3713 int flags)
3714{
3715 struct sock *sk = sock->sk;
3716 int addr_len = 0;
3717 int err;
3718
3719 err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3720 if (err >= 0)
3721 msg->msg_namelen = addr_len;
3722 return err;
3723}
3724EXPORT_SYMBOL(sock_common_recvmsg);
3725
3726/*
3727 * Set socket options on an inet socket.
3728 */
3729int sock_common_setsockopt(struct socket *sock, int level, int optname,
3730 sockptr_t optval, unsigned int optlen)
3731{
3732 struct sock *sk = sock->sk;
3733
3734 /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3735 return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3736}
3737EXPORT_SYMBOL(sock_common_setsockopt);
3738
3739void sk_common_release(struct sock *sk)
3740{
3741 if (sk->sk_prot->destroy)
3742 sk->sk_prot->destroy(sk);
3743
3744 /*
3745	 * Observation: when sk_common_release is called, processes have
3746	 * no access to the socket, but the network stack still does.
3747 * Step one, detach it from networking:
3748 *
3749 * A. Remove from hash tables.
3750 */
3751
3752 sk->sk_prot->unhash(sk);
3753
3754 if (sk->sk_socket)
3755 sk->sk_socket->sk = NULL;
3756
3757 /*
3758	 * At this point the socket cannot receive new packets, but it is possible
3759	 * that some packets are in flight, because some CPU is running the receiver
3760	 * and did a hash table lookup before we unhashed the socket. They will reach
3761	 * the receive queue and will be purged by the socket destructor.
3762	 *
3763	 * Also, we still have packets pending on the receive queue and probably
3764	 * our own packets waiting in device queues. sock_destroy will drain the
3765	 * receive queue, but transmitted packets will delay socket destruction
3766	 * until the last reference is released.
3767 */
3768
3769 sock_orphan(sk);
3770
3771 xfrm_sk_free_policy(sk);
3772
3773 sock_put(sk);
3774}
3775EXPORT_SYMBOL(sk_common_release);
3776
3777void sk_get_meminfo(const struct sock *sk, u32 *mem)
3778{
3779 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3780
3781 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3782 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3783 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3784 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3785 mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
3786 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3787 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3788 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3789 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3790}
3791
3792#ifdef CONFIG_PROC_FS
3793static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3794
3795int sock_prot_inuse_get(struct net *net, struct proto *prot)
3796{
3797 int cpu, idx = prot->inuse_idx;
3798 int res = 0;
3799
3800 for_each_possible_cpu(cpu)
3801 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3802
3803 return res >= 0 ? res : 0;
3804}
3805EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3806
3807int sock_inuse_get(struct net *net)
3808{
3809 int cpu, res = 0;
3810
3811 for_each_possible_cpu(cpu)
3812 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3813
3814 return res;
3815}
3816
3817EXPORT_SYMBOL_GPL(sock_inuse_get);
3818
3819static int __net_init sock_inuse_init_net(struct net *net)
3820{
3821 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3822 if (net->core.prot_inuse == NULL)
3823 return -ENOMEM;
3824 return 0;
3825}
3826
3827static void __net_exit sock_inuse_exit_net(struct net *net)
3828{
3829 free_percpu(net->core.prot_inuse);
3830}
3831
3832static struct pernet_operations net_inuse_ops = {
3833 .init = sock_inuse_init_net,
3834 .exit = sock_inuse_exit_net,
3835};
3836
3837static __init int net_inuse_init(void)
3838{
3839 if (register_pernet_subsys(&net_inuse_ops))
3840 panic("Cannot initialize net inuse counters");
3841
3842 return 0;
3843}
3844
3845core_initcall(net_inuse_init);
3846
3847static int assign_proto_idx(struct proto *prot)
3848{
3849 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3850
3851 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3852 pr_err("PROTO_INUSE_NR exhausted\n");
3853 return -ENOSPC;
3854 }
3855
3856 set_bit(prot->inuse_idx, proto_inuse_idx);
3857 return 0;
3858}
3859
3860static void release_proto_idx(struct proto *prot)
3861{
3862 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3863 clear_bit(prot->inuse_idx, proto_inuse_idx);
3864}
3865#else
3866static inline int assign_proto_idx(struct proto *prot)
3867{
3868 return 0;
3869}
3870
3871static inline void release_proto_idx(struct proto *prot)
3872{
3873}
3874
3875#endif
3876
3877static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3878{
3879 if (!twsk_prot)
3880 return;
3881 kfree(twsk_prot->twsk_slab_name);
3882 twsk_prot->twsk_slab_name = NULL;
3883 kmem_cache_destroy(twsk_prot->twsk_slab);
3884 twsk_prot->twsk_slab = NULL;
3885}
3886
3887static int tw_prot_init(const struct proto *prot)
3888{
3889 struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3890
3891 if (!twsk_prot)
3892 return 0;
3893
3894 twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3895 prot->name);
3896 if (!twsk_prot->twsk_slab_name)
3897 return -ENOMEM;
3898
3899 twsk_prot->twsk_slab =
3900 kmem_cache_create(twsk_prot->twsk_slab_name,
3901 twsk_prot->twsk_obj_size, 0,
3902 SLAB_ACCOUNT | prot->slab_flags,
3903 NULL);
3904 if (!twsk_prot->twsk_slab) {
3905 pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3906 prot->name);
3907 return -ENOMEM;
3908 }
3909
3910 return 0;
3911}
3912
3913static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3914{
3915 if (!rsk_prot)
3916 return;
3917 kfree(rsk_prot->slab_name);
3918 rsk_prot->slab_name = NULL;
3919 kmem_cache_destroy(rsk_prot->slab);
3920 rsk_prot->slab = NULL;
3921}
3922
3923static int req_prot_init(const struct proto *prot)
3924{
3925 struct request_sock_ops *rsk_prot = prot->rsk_prot;
3926
3927 if (!rsk_prot)
3928 return 0;
3929
3930 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3931 prot->name);
3932 if (!rsk_prot->slab_name)
3933 return -ENOMEM;
3934
3935 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3936 rsk_prot->obj_size, 0,
3937 SLAB_ACCOUNT | prot->slab_flags,
3938 NULL);
3939
3940 if (!rsk_prot->slab) {
3941 pr_crit("%s: Can't create request sock SLAB cache!\n",
3942 prot->name);
3943 return -ENOMEM;
3944 }
3945 return 0;
3946}
3947
3948int proto_register(struct proto *prot, int alloc_slab)
3949{
3950 int ret = -ENOBUFS;
3951
3952 if (prot->memory_allocated && !prot->sysctl_mem) {
3953 pr_err("%s: missing sysctl_mem\n", prot->name);
3954 return -EINVAL;
3955 }
3956 if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
3957 pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
3958 return -EINVAL;
3959 }
3960 if (alloc_slab) {
3961 prot->slab = kmem_cache_create_usercopy(prot->name,
3962 prot->obj_size, 0,
3963 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3964 prot->slab_flags,
3965 prot->useroffset, prot->usersize,
3966 NULL);
3967
3968 if (prot->slab == NULL) {
3969 pr_crit("%s: Can't create sock SLAB cache!\n",
3970 prot->name);
3971 goto out;
3972 }
3973
3974 if (req_prot_init(prot))
3975 goto out_free_request_sock_slab;
3976
3977 if (tw_prot_init(prot))
3978 goto out_free_timewait_sock_slab;
3979 }
3980
3981 mutex_lock(&proto_list_mutex);
3982 ret = assign_proto_idx(prot);
3983 if (ret) {
3984 mutex_unlock(&proto_list_mutex);
3985 goto out_free_timewait_sock_slab;
3986 }
3987 list_add(&prot->node, &proto_list);
3988 mutex_unlock(&proto_list_mutex);
3989 return ret;
3990
3991out_free_timewait_sock_slab:
3992 if (alloc_slab)
3993 tw_prot_cleanup(prot->twsk_prot);
3994out_free_request_sock_slab:
3995 if (alloc_slab) {
3996 req_prot_cleanup(prot->rsk_prot);
3997
3998 kmem_cache_destroy(prot->slab);
3999 prot->slab = NULL;
4000 }
4001out:
4002 return ret;
4003}
4004EXPORT_SYMBOL(proto_register);
4005
4006void proto_unregister(struct proto *prot)
4007{
4008 mutex_lock(&proto_list_mutex);
4009 release_proto_idx(prot);
4010 list_del(&prot->node);
4011 mutex_unlock(&proto_list_mutex);
4012
4013 kmem_cache_destroy(prot->slab);
4014 prot->slab = NULL;
4015
4016 req_prot_cleanup(prot->rsk_prot);
4017 tw_prot_cleanup(prot->twsk_prot);
4018}
4019EXPORT_SYMBOL(proto_unregister);
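
/*
 * Illustrative sketch (not from the upstream file): the usual pairing of
 * proto_register() at module init and proto_unregister() at module exit.
 * "example_prot" stands in for a protocol's struct proto with at least
 * .name, .owner and .obj_size filled in; both function names are
 * hypothetical placeholders.
 */
static int __maybe_unused example_proto_module_init(struct proto *example_prot)
{
	/* second argument != 0: also create a dedicated kmem_cache */
	return proto_register(example_prot, 1);
}

static void __maybe_unused example_proto_module_exit(struct proto *example_prot)
{
	proto_unregister(example_prot);
}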
4020
4021int sock_load_diag_module(int family, int protocol)
4022{
4023 if (!protocol) {
4024 if (!sock_is_registered(family))
4025 return -ENOENT;
4026
4027 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
4028 NETLINK_SOCK_DIAG, family);
4029 }
4030
4031#ifdef CONFIG_INET
4032 if (family == AF_INET &&
4033 protocol != IPPROTO_RAW &&
4034 protocol < MAX_INET_PROTOS &&
4035 !rcu_access_pointer(inet_protos[protocol]))
4036 return -ENOENT;
4037#endif
4038
4039 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4040 NETLINK_SOCK_DIAG, family, protocol);
4041}
4042EXPORT_SYMBOL(sock_load_diag_module);
4043
4044#ifdef CONFIG_PROC_FS
4045static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
4046 __acquires(proto_list_mutex)
4047{
4048 mutex_lock(&proto_list_mutex);
4049 return seq_list_start_head(&proto_list, *pos);
4050}
4051
4052static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4053{
4054 return seq_list_next(v, &proto_list, pos);
4055}
4056
4057static void proto_seq_stop(struct seq_file *seq, void *v)
4058 __releases(proto_list_mutex)
4059{
4060 mutex_unlock(&proto_list_mutex);
4061}
4062
4063static char proto_method_implemented(const void *method)
4064{
4065 return method == NULL ? 'n' : 'y';
4066}
4067static long sock_prot_memory_allocated(struct proto *proto)
4068{
4069 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4070}
4071
4072static const char *sock_prot_memory_pressure(struct proto *proto)
4073{
4074 return proto->memory_pressure != NULL ?
4075 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4076}
4077
4078static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4079{
4080
4081 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
4082 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4083 proto->name,
4084 proto->obj_size,
4085 sock_prot_inuse_get(seq_file_net(seq), proto),
4086 sock_prot_memory_allocated(proto),
4087 sock_prot_memory_pressure(proto),
4088 proto->max_header,
4089 proto->slab == NULL ? "no" : "yes",
4090 module_name(proto->owner),
4091 proto_method_implemented(proto->close),
4092 proto_method_implemented(proto->connect),
4093 proto_method_implemented(proto->disconnect),
4094 proto_method_implemented(proto->accept),
4095 proto_method_implemented(proto->ioctl),
4096 proto_method_implemented(proto->init),
4097 proto_method_implemented(proto->destroy),
4098 proto_method_implemented(proto->shutdown),
4099 proto_method_implemented(proto->setsockopt),
4100 proto_method_implemented(proto->getsockopt),
4101 proto_method_implemented(proto->sendmsg),
4102 proto_method_implemented(proto->recvmsg),
4103 proto_method_implemented(proto->bind),
4104 proto_method_implemented(proto->backlog_rcv),
4105 proto_method_implemented(proto->hash),
4106 proto_method_implemented(proto->unhash),
4107 proto_method_implemented(proto->get_port),
4108 proto_method_implemented(proto->enter_memory_pressure));
4109}
4110
4111static int proto_seq_show(struct seq_file *seq, void *v)
4112{
4113 if (v == &proto_list)
4114 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4115 "protocol",
4116 "size",
4117 "sockets",
4118 "memory",
4119 "press",
4120 "maxhdr",
4121 "slab",
4122 "module",
4123 "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4124 else
4125 proto_seq_printf(seq, list_entry(v, struct proto, node));
4126 return 0;
4127}
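/*
 * Editor's note: the seq handlers above back /proc/net/protocols.  The header
 * row is printed by proto_seq_show() and one line per registered proto by
 * proto_seq_printf().  "memory" is -1 and "press" is "NI" (not implemented)
 * when the protocol does no memory accounting / pressure tracking, and the
 * trailing columns are 'y'/'n' flags, one per struct proto callback in the
 * order listed in the header.  The sample below is illustrative only; actual
 * sizes, counts and flags depend on the kernel build.
 *
 * protocol  size sockets  memory press maxhdr  slab module     cl co di ac ...
 * TCP       2368      24      1 no        320 yes  kernel      y  y  y  y  ...
 * UDP       1472       8      0 no          0 yes  kernel      y  y  y  n  ...
 */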
4128
4129static const struct seq_operations proto_seq_ops = {
4130 .start = proto_seq_start,
4131 .next = proto_seq_next,
4132 .stop = proto_seq_stop,
4133 .show = proto_seq_show,
4134};
4135
4136static __net_init int proto_init_net(struct net *net)
4137{
4138 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4139 sizeof(struct seq_net_private)))
4140 return -ENOMEM;
4141
4142 return 0;
4143}
4144
4145static __net_exit void proto_exit_net(struct net *net)
4146{
4147 remove_proc_entry("protocols", net->proc_net);
4148}
4149
4150
4151static __net_initdata struct pernet_operations proto_net_ops = {
4152 .init = proto_init_net,
4153 .exit = proto_exit_net,
4154};
4155
4156static int __init proto_init(void)
4157{
4158 return register_pernet_subsys(&proto_net_ops);
4159}
4160
4161subsys_initcall(proto_init);
4162
4163#endif /* CONFIG_PROC_FS */
4164
4165#ifdef CONFIG_NET_RX_BUSY_POLL
4166bool sk_busy_loop_end(void *p, unsigned long start_time)
4167{
4168 struct sock *sk = p;
4169
4170 if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
4171 return true;
4172
4173 if (sk_is_udp(sk) &&
4174 !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
4175 return true;
4176
4177 return sk_busy_loop_timeout(sk, start_time);
4178}
4179EXPORT_SYMBOL(sk_busy_loop_end);
4180#endif /* CONFIG_NET_RX_BUSY_POLL */
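/*
 * Editor's illustrative sketch: sk_busy_loop_end() is the loop-termination
 * callback handed to napi_busy_loop() by the socket receive path (see
 * sk_busy_loop() in include/net/busy_poll.h).  Polling stops as soon as the
 * callback reports queued data or an expired busy-poll timeout.  The helper
 * below is a simplified, hypothetical caller; the real one also honours
 * sk_prefer_busy_poll and sk_busy_poll_budget.
 */
#if 0
static void foo_busy_poll(struct sock *sk, bool nonblock)
{
        unsigned int napi_id = READ_ONCE(sk->sk_napi_id);

        if (napi_id >= MIN_NAPI_ID)
                napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end,
                               sk, false, BUSY_POLL_BUDGET);
}
#endif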
4181
4182int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4183{
4184 if (!sk->sk_prot->bind_add)
4185 return -EOPNOTSUPP;
4186 return sk->sk_prot->bind_add(sk, addr, addr_len);
4187}
4188EXPORT_SYMBOL(sock_bind_add);
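/*
 * Editor's illustrative sketch: sock_bind_add() simply forwards to an
 * optional ->bind_add() hook, meant for protocols that can attach additional
 * local addresses to an already-bound socket (SCTP implements this hook).
 * The handler and proto below are hypothetical.
 */
#if 0
static int foo_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
{
        if (addr_len < sizeof(struct sockaddr) ||
            addr->sa_family != sk->sk_family)
                return -EINVAL;

        /* ... validate and record one more local address for this socket ... */
        return 0;
}

static struct proto foo_prot = {
        .name           = "FOO",
        .owner          = THIS_MODULE,
        .obj_size       = sizeof(struct sock),
        .bind_add       = foo_bind_add,
};
#endif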
4189
4190/* Copy 'size' bytes in from userspace, run the ioctl and copy the updated 'size' bytes back out to userspace */
4191int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4192 void __user *arg, void *karg, size_t size)
4193{
4194 int ret;
4195
4196 if (copy_from_user(karg, arg, size))
4197 return -EFAULT;
4198
4199 ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4200 if (ret)
4201 return ret;
4202
4203 if (copy_to_user(arg, karg, size))
4204 return -EFAULT;
4205
4206 return 0;
4207}
4208EXPORT_SYMBOL(sock_ioctl_inout);
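/*
 * Editor's illustrative sketch: with sock_ioctl_inout() the protocol's
 * ->ioctl() callback only ever sees kernel memory (karg); copy_from_user()
 * and copy_to_user() are done once, here.  The request structure and the
 * af-level wrapper below are hypothetical and only demonstrate the calling
 * convention for an ioctl whose argument is both read and written.
 */
#if 0
struct foo_stats_req {
        __u32 id;       /* in:  object to query          */
        __u32 packets;  /* out: counter filled by kernel */
};

static int foo_af_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
{
        struct foo_stats_req req;

        /* Copies 'req' in, runs sk->sk_prot->ioctl() on it, copies it back. */
        return sock_ioctl_inout(sk, cmd, arg, &req, sizeof(req));
}
#endif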
4209
4210/* This is the most common ioctl prep function: the result (an int, 4 bytes)
4211 * is copied back to userspace if the ioctl handler returns successfully.
4212 * No input is copied in from userspace.
4213 */
4214static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4215{
4216 int ret, karg = 0;
4217
4218 ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4219 if (ret)
4220 return ret;
4221
4222 return put_user(karg, (int __user *)arg);
4223}
4224
4225/* A wrapper around socket ioctls that copies data in from userspace
4226 * (depending on the protocol/ioctl) and copies the result back to userspace.
4227 * The main motivation is to pass kernel memory to the protocol ioctl
4228 * callbacks instead of userspace memory.
4229 */
4230int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4231{
4232 int rc = 1; /* >0: not consumed here, fall back to sock_ioctl_out() */
4233
4234 if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4235 rc = ipmr_sk_ioctl(sk, cmd, arg);
4236 else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4237 rc = ip6mr_sk_ioctl(sk, cmd, arg);
4238 else if (sk_is_phonet(sk))
4239 rc = phonet_sk_ioctl(sk, cmd, arg);
4240
4241 /* If the ioctl was handled by one of the helpers above, return its value */
4242 if (rc <= 0)
4243 return rc;
4244
4245 /* Otherwise call the default handler */
4246 return sock_ioctl_out(sk, cmd, arg);
4247}
4248EXPORT_SYMBOL(sk_ioctl);
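/*
 * Editor's illustrative sketch: a family-level ioctl handler that defers
 * SIOCINQ/SIOCOUTQ-style commands to sk_ioctl().  Commands not claimed by
 * the multicast-routing / phonet helpers above end up in sock_ioctl_out(),
 * which copies the protocol's int result to userspace.  The function name
 * below is hypothetical.
 */
#if 0
static int foo_family_ioctl(struct socket *sock, unsigned int cmd,
                            unsigned long arg)
{
        struct sock *sk = sock->sk;

        switch (cmd) {
        case SIOCINQ:
        case SIOCOUTQ:
                return sk_ioctl(sk, cmd, (void __user *)arg);
        default:
                return -ENOIOCTLCMD;
        }
}
#endif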
4249
4250static int __init sock_struct_check(void)
4251{
4252 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
4253 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
4254 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
4255 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
4256 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);
4257
4258 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
4259 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
4260 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
4261 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
4262 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
4263 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
4264 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
4265 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
4266 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);
4267
4268 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
4269 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
4270 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
4271
4272 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
4273 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
4274 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
4275 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);
4276
4277 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
4279 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf);
4280 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
4281 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
4282 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
4283 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
4284 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
4285 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
4286 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
4287 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
4288 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
4289 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
4290 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
4291 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
4292 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);
4293
4294 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
4295 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
4296 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
4297 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
4298 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
4299 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
4300 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
4301 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
4302 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
4303 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
4304 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
4305 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
4306 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
4307 return 0;
4308}
4309
4310core_initcall(sock_struct_check);
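/*
 * Editor's illustrative sketch: CACHELINE_ASSERT_GROUP_MEMBER() pairs with
 * the __cacheline_group_begin()/__cacheline_group_end() markers placed
 * around the corresponding members of struct sock in include/net/sock.h
 * (see include/linux/cache.h).  The hypothetical struct below shows the
 * general pattern: the assertions turn an accidental move of a hot-path
 * field out of its cache-line group into a build failure.
 */
#if 0
struct foo {
        __cacheline_group_begin(foo_write_rx);
        u32     a;
        u32     b;
        __cacheline_group_end(foo_write_rx);
        u32     c;      /* intentionally outside the group */
};

static int __init foo_struct_check(void)
{
        CACHELINE_ASSERT_GROUP_MEMBER(struct foo, foo_write_rx, a);
        CACHELINE_ASSERT_GROUP_MEMBER(struct foo, foo_write_rx, b);
        return 0;
}
core_initcall(foo_struct_check);
#endif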