1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Generic socket support routines. Memory allocators, socket lock/release
8 * handler for protocols to use and generic option handler.
9 *
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 */
85
86#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87
88#include <linux/unaligned.h>
89#include <linux/capability.h>
90#include <linux/errno.h>
91#include <linux/errqueue.h>
92#include <linux/types.h>
93#include <linux/socket.h>
94#include <linux/in.h>
95#include <linux/kernel.h>
96#include <linux/module.h>
97#include <linux/proc_fs.h>
98#include <linux/seq_file.h>
99#include <linux/sched.h>
100#include <linux/sched/mm.h>
101#include <linux/timer.h>
102#include <linux/string.h>
103#include <linux/sockios.h>
104#include <linux/net.h>
105#include <linux/mm.h>
106#include <linux/slab.h>
107#include <linux/interrupt.h>
108#include <linux/poll.h>
109#include <linux/tcp.h>
110#include <linux/udp.h>
111#include <linux/init.h>
112#include <linux/highmem.h>
113#include <linux/user_namespace.h>
114#include <linux/static_key.h>
115#include <linux/memcontrol.h>
116#include <linux/prefetch.h>
117#include <linux/compat.h>
118#include <linux/mroute.h>
119#include <linux/mroute6.h>
120#include <linux/icmpv6.h>
121
122#include <linux/uaccess.h>
123
124#include <linux/netdevice.h>
125#include <net/protocol.h>
126#include <linux/skbuff.h>
127#include <linux/skbuff_ref.h>
128#include <net/net_namespace.h>
129#include <net/request_sock.h>
130#include <net/sock.h>
131#include <net/proto_memory.h>
132#include <linux/net_tstamp.h>
133#include <net/xfrm.h>
134#include <linux/ipsec.h>
135#include <net/cls_cgroup.h>
136#include <net/netprio_cgroup.h>
137#include <linux/sock_diag.h>
138
139#include <linux/filter.h>
140#include <net/sock_reuseport.h>
141#include <net/bpf_sk_storage.h>
142
143#include <trace/events/sock.h>
144
145#include <net/tcp.h>
146#include <net/busy_poll.h>
147#include <net/phonet/phonet.h>
148
149#include <linux/ethtool.h>
150
151#include <uapi/linux/pidfd.h>
152
153#include "dev.h"
154
155static DEFINE_MUTEX(proto_list_mutex);
156static LIST_HEAD(proto_list);
157
158static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc);
159static void sock_def_write_space(struct sock *sk);
160
161/**
162 * sk_ns_capable - General socket capability test
163 * @sk: Socket to use a capability on or through
164 * @user_ns: The user namespace of the capability to use
165 * @cap: The capability to use
166 *
167 * Test to see if the opener of the socket had the capability @cap when
168 * the socket was created and if the current process has the capability
169 * @cap in the user namespace @user_ns.
170 */
171bool sk_ns_capable(const struct sock *sk,
172 struct user_namespace *user_ns, int cap)
173{
174 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
175 ns_capable(user_ns, cap);
176}
177EXPORT_SYMBOL(sk_ns_capable);
178
179/**
180 * sk_capable - Socket global capability test
181 * @sk: Socket to use a capability on or through
182 * @cap: The global capability to use
183 *
184 * Test to see if the opener of the socket had the capability @cap when
185 * the socket was created and if the current process has the capability
186 * @cap in all user namespaces.
187 */
188bool sk_capable(const struct sock *sk, int cap)
189{
190 return sk_ns_capable(sk, &init_user_ns, cap);
191}
192EXPORT_SYMBOL(sk_capable);
193
194/**
195 * sk_net_capable - Network namespace socket capability test
196 * @sk: Socket to use a capability on or through
197 * @cap: The capability to use
198 *
199 * Test to see if the opener of the socket had the capability @cap when the
200 * socket was created and if the current process has the capability @cap over
201 * the network namespace the socket is a member of.
202 */
203bool sk_net_capable(const struct sock *sk, int cap)
204{
205 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
206}
207EXPORT_SYMBOL(sk_net_capable);
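
/* A minimal, hypothetical illustration of how a protocol might use the
 * helpers above (sketch only; the function below is made up and is not part
 * of this file): gate a privileged operation on the capability in the
 * socket's own network namespace.
 *
 *	static int example_priv_request(struct sock *sk)
 *	{
 *		// Opener and current task both need CAP_NET_ADMIN here.
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *		return 0;
 *	}
 *
 * sk_capable() is the stricter variant (initial user namespace), while
 * sk_ns_capable() lets the caller name the namespace explicitly.
 */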
208
209/*
210 * Each address family might have different locking rules, so we have
211 * one slock key per address family and separate keys for internal and
212 * userspace sockets.
213 */
214static struct lock_class_key af_family_keys[AF_MAX];
215static struct lock_class_key af_family_kern_keys[AF_MAX];
216static struct lock_class_key af_family_slock_keys[AF_MAX];
217static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
218
219/*
220 * Make lock validator output more readable. (We pre-construct these
221 * strings at build time, so that runtime initialization of socket
222 * locks is fast):
223 */
224
225#define _sock_locks(x) \
226 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
227 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
228 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
229 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
230 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
231 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
232 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
233 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
234 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
235 x "27" , x "28" , x "AF_CAN" , \
236 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
237 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
238 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
239 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
240 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
241 x "AF_MCTP" , \
242 x "AF_MAX"
243
244static const char *const af_family_key_strings[AF_MAX+1] = {
245 _sock_locks("sk_lock-")
246};
247static const char *const af_family_slock_key_strings[AF_MAX+1] = {
248 _sock_locks("slock-")
249};
250static const char *const af_family_clock_key_strings[AF_MAX+1] = {
251 _sock_locks("clock-")
252};
253
254static const char *const af_family_kern_key_strings[AF_MAX+1] = {
255 _sock_locks("k-sk_lock-")
256};
257static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
258 _sock_locks("k-slock-")
259};
260static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
261 _sock_locks("k-clock-")
262};
263static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
264 _sock_locks("rlock-")
265};
266static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
267 _sock_locks("wlock-")
268};
269static const char *const af_family_elock_key_strings[AF_MAX+1] = {
270 _sock_locks("elock-")
271};
272
273/*
274 * sk_callback_lock and sk queues locking rules are per-address-family,
275 * so split the lock classes by using a per-AF key:
276 */
277static struct lock_class_key af_callback_keys[AF_MAX];
278static struct lock_class_key af_rlock_keys[AF_MAX];
279static struct lock_class_key af_wlock_keys[AF_MAX];
280static struct lock_class_key af_elock_keys[AF_MAX];
281static struct lock_class_key af_kern_callback_keys[AF_MAX];
282
283/* Run time adjustable parameters. */
284__u32 sysctl_wmem_max __read_mostly = 4 << 20;
285EXPORT_SYMBOL(sysctl_wmem_max);
286__u32 sysctl_rmem_max __read_mostly = 4 << 20;
287EXPORT_SYMBOL(sysctl_rmem_max);
288__u32 sysctl_wmem_default __read_mostly = SK_WMEM_DEFAULT;
289__u32 sysctl_rmem_default __read_mostly = SK_RMEM_DEFAULT;
290
291DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
292EXPORT_SYMBOL_GPL(memalloc_socks_key);
293
294/**
295 * sk_set_memalloc - sets %SOCK_MEMALLOC
296 * @sk: socket to set it on
297 *
298 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
299 * It's the responsibility of the admin to adjust min_free_kbytes
300 * to meet the requirements.
301 */
302void sk_set_memalloc(struct sock *sk)
303{
304 sock_set_flag(sk, SOCK_MEMALLOC);
305 sk->sk_allocation |= __GFP_MEMALLOC;
306 static_branch_inc(&memalloc_socks_key);
307}
308EXPORT_SYMBOL_GPL(sk_set_memalloc);
309
310void sk_clear_memalloc(struct sock *sk)
311{
312 sock_reset_flag(sk, SOCK_MEMALLOC);
313 sk->sk_allocation &= ~__GFP_MEMALLOC;
314 static_branch_dec(&memalloc_socks_key);
315
316 /*
317 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
318 * progress of swapping. SOCK_MEMALLOC may be cleared while
319 * it has rmem allocations due to the last swapfile being deactivated
320 * but there is a risk that the socket is unusable due to exceeding
321 * the rmem limits. Reclaim the reserves and obey rmem limits again.
322 */
323 sk_mem_reclaim(sk);
324}
325EXPORT_SYMBOL_GPL(sk_clear_memalloc);
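
/* Hypothetical in-kernel usage sketch (not part of this file): a transport
 * that carries swap traffic, e.g. a swap-over-network block driver, would
 * flag its socket so it can still make progress under memory pressure, and
 * drop the flag once the last such swap file goes away.
 *
 *	static void example_swap_transport_start(struct sock *sk)
 *	{
 *		// Allow dipping into emergency reserves; the admin is
 *		// expected to raise min_free_kbytes accordingly.
 *		sk_set_memalloc(sk);
 *	}
 *
 *	static void example_swap_transport_stop(struct sock *sk)
 *	{
 *		sk_clear_memalloc(sk);	// also reclaims over-limit rmem
 *	}
 */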
326
327int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
328{
329 int ret;
330 unsigned int noreclaim_flag;
331
332 /* these should have been dropped before queueing */
333 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
334
335 noreclaim_flag = memalloc_noreclaim_save();
336 ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
337 tcp_v6_do_rcv,
338 tcp_v4_do_rcv,
339 sk, skb);
340 memalloc_noreclaim_restore(noreclaim_flag);
341
342 return ret;
343}
344EXPORT_SYMBOL(__sk_backlog_rcv);
345
346void sk_error_report(struct sock *sk)
347{
348 sk->sk_error_report(sk);
349
350 switch (sk->sk_family) {
351 case AF_INET:
352 fallthrough;
353 case AF_INET6:
354 trace_inet_sk_error_report(sk);
355 break;
356 default:
357 break;
358 }
359}
360EXPORT_SYMBOL(sk_error_report);
361
362int sock_get_timeout(long timeo, void *optval, bool old_timeval)
363{
364 struct __kernel_sock_timeval tv;
365
366 if (timeo == MAX_SCHEDULE_TIMEOUT) {
367 tv.tv_sec = 0;
368 tv.tv_usec = 0;
369 } else {
370 tv.tv_sec = timeo / HZ;
371 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
372 }
373
374 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
375 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
376 *(struct old_timeval32 *)optval = tv32;
377 return sizeof(tv32);
378 }
379
380 if (old_timeval) {
381 struct __kernel_old_timeval old_tv;
382 old_tv.tv_sec = tv.tv_sec;
383 old_tv.tv_usec = tv.tv_usec;
384 *(struct __kernel_old_timeval *)optval = old_tv;
385 return sizeof(old_tv);
386 }
387
388 *(struct __kernel_sock_timeval *)optval = tv;
389 return sizeof(tv);
390}
391EXPORT_SYMBOL(sock_get_timeout);
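
/* Worked example of the conversion above, assuming HZ == 1000 (HZ is a
 * build-time choice, so the numbers are illustrative only):
 * timeo = 2500 jiffies yields tv_sec = 2500 / 1000 = 2 and
 * tv_usec = ((2500 % 1000) * 1000000) / 1000 = 500000, i.e. 2.5 seconds.
 * MAX_SCHEDULE_TIMEOUT is reported as {0, 0}, which userspace reads back as
 * "no timeout".
 */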
392
393int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
394 sockptr_t optval, int optlen, bool old_timeval)
395{
396 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
397 struct old_timeval32 tv32;
398
399 if (optlen < sizeof(tv32))
400 return -EINVAL;
401
402 if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
403 return -EFAULT;
404 tv->tv_sec = tv32.tv_sec;
405 tv->tv_usec = tv32.tv_usec;
406 } else if (old_timeval) {
407 struct __kernel_old_timeval old_tv;
408
409 if (optlen < sizeof(old_tv))
410 return -EINVAL;
411 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
412 return -EFAULT;
413 tv->tv_sec = old_tv.tv_sec;
414 tv->tv_usec = old_tv.tv_usec;
415 } else {
416 if (optlen < sizeof(*tv))
417 return -EINVAL;
418 if (copy_from_sockptr(tv, optval, sizeof(*tv)))
419 return -EFAULT;
420 }
421
422 return 0;
423}
424EXPORT_SYMBOL(sock_copy_user_timeval);
425
426static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
427 bool old_timeval)
428{
429 struct __kernel_sock_timeval tv;
430 int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
431 long val;
432
433 if (err)
434 return err;
435
436 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
437 return -EDOM;
438
439 if (tv.tv_sec < 0) {
440 static int warned __read_mostly;
441
442 WRITE_ONCE(*timeo_p, 0);
443 if (warned < 10 && net_ratelimit()) {
444 warned++;
445 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
446 __func__, current->comm, task_pid_nr(current));
447 }
448 return 0;
449 }
450 val = MAX_SCHEDULE_TIMEOUT;
451 if ((tv.tv_sec || tv.tv_usec) &&
452 (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
453 val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
454 USEC_PER_SEC / HZ);
455 WRITE_ONCE(*timeo_p, val);
456 return 0;
457}
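
/* Userspace view of the parsing above (hypothetical snippet, not built as
 * part of the kernel): set a 2.5 second receive timeout using the classic
 * struct timeval form of the option.
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
 *		perror("setsockopt(SO_RCVTIMEO)");
 *
 * A zero timeval means "block forever" (MAX_SCHEDULE_TIMEOUT), a tv_usec
 * outside [0, USEC_PER_SEC) is rejected with EDOM, and a negative tv_sec is
 * clamped to a zero timeout with the rate-limited warning above.
 */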
458
459static bool sk_set_prio_allowed(const struct sock *sk, int val)
460{
461 return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) ||
462 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
463 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN));
464}
465
466static bool sock_needs_netstamp(const struct sock *sk)
467{
468 switch (sk->sk_family) {
469 case AF_UNSPEC:
470 case AF_UNIX:
471 return false;
472 default:
473 return true;
474 }
475}
476
477static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
478{
479 if (sk->sk_flags & flags) {
480 sk->sk_flags &= ~flags;
481 if (sock_needs_netstamp(sk) &&
482 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
483 net_disable_timestamp();
484 }
485}
486
487
488int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
489{
490 unsigned long flags;
491 struct sk_buff_head *list = &sk->sk_receive_queue;
492
493 if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
494 sk_drops_inc(sk);
495 trace_sock_rcvqueue_full(sk, skb);
496 return -ENOMEM;
497 }
498
499 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
500 sk_drops_inc(sk);
501 return -ENOBUFS;
502 }
503
504 skb->dev = NULL;
505 skb_set_owner_r(skb, sk);
506
507 /* We escape from the RCU protected region, make sure we don't leak
508 * a non-refcounted dst.
509 */
510 skb_dst_force(skb);
511
512 spin_lock_irqsave(&list->lock, flags);
513 sock_skb_set_dropcount(sk, skb);
514 __skb_queue_tail(list, skb);
515 spin_unlock_irqrestore(&list->lock, flags);
516
517 if (!sock_flag(sk, SOCK_DEAD))
518 sk->sk_data_ready(sk);
519 return 0;
520}
521EXPORT_SYMBOL(__sock_queue_rcv_skb);
522
523int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
524 enum skb_drop_reason *reason)
525{
526 enum skb_drop_reason drop_reason;
527 int err;
528
529 err = sk_filter_reason(sk, skb, &drop_reason);
530 if (err)
531 goto out;
532
533 err = __sock_queue_rcv_skb(sk, skb);
534 switch (err) {
535 case -ENOMEM:
536 drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
537 break;
538 case -ENOBUFS:
539 drop_reason = SKB_DROP_REASON_PROTO_MEM;
540 break;
541 default:
542 drop_reason = SKB_NOT_DROPPED_YET;
543 break;
544 }
545out:
546 if (reason)
547 *reason = drop_reason;
548 return err;
549}
550EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
551
552int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
553 const int nested, unsigned int trim_cap, bool refcounted)
554{
555 enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
556 int rc = NET_RX_SUCCESS;
557 int err;
558
559 if (sk_filter_trim_cap(sk, skb, trim_cap, &reason))
560 goto discard_and_relse;
561
562 skb->dev = NULL;
563
564 if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
565 sk_drops_inc(sk);
566 reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
567 goto discard_and_relse;
568 }
569 if (nested)
570 bh_lock_sock_nested(sk);
571 else
572 bh_lock_sock(sk);
573 if (!sock_owned_by_user(sk)) {
574 /*
575 * trylock + unlock semantics:
576 */
577 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
578
579 rc = sk_backlog_rcv(sk, skb);
580
581 mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
582 } else if ((err = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))) {
583 bh_unlock_sock(sk);
584 if (err == -ENOMEM)
585 reason = SKB_DROP_REASON_PFMEMALLOC;
586 if (err == -ENOBUFS)
587 reason = SKB_DROP_REASON_SOCKET_BACKLOG;
588 sk_drops_inc(sk);
589 goto discard_and_relse;
590 }
591
592 bh_unlock_sock(sk);
593out:
594 if (refcounted)
595 sock_put(sk);
596 return rc;
597discard_and_relse:
598 sk_skb_reason_drop(sk, skb, reason);
599 goto out;
600}
601EXPORT_SYMBOL(__sk_receive_skb);
602
603INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
604 u32));
605INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
606 u32));
607struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
608{
609 struct dst_entry *dst = __sk_dst_get(sk);
610
611 if (dst && READ_ONCE(dst->obsolete) &&
612 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
613 dst, cookie) == NULL) {
614 sk_tx_queue_clear(sk);
615 WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
616 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
617 dst_release(dst);
618 return NULL;
619 }
620
621 return dst;
622}
623EXPORT_SYMBOL(__sk_dst_check);
624
625struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
626{
627 struct dst_entry *dst = sk_dst_get(sk);
628
629 if (dst && READ_ONCE(dst->obsolete) &&
630 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
631 dst, cookie) == NULL) {
632 sk_dst_reset(sk);
633 dst_release(dst);
634 return NULL;
635 }
636
637 return dst;
638}
639EXPORT_SYMBOL(sk_dst_check);
640
641static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
642{
643 int ret = -ENOPROTOOPT;
644#ifdef CONFIG_NETDEVICES
645 struct net *net = sock_net(sk);
646
647 /* Sorry... */
648 ret = -EPERM;
649 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
650 goto out;
651
652 ret = -EINVAL;
653 if (ifindex < 0)
654 goto out;
655
656 /* Paired with all READ_ONCE() done locklessly. */
657 WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
658
659 if (sk->sk_prot->rehash)
660 sk->sk_prot->rehash(sk);
661 sk_dst_reset(sk);
662
663 ret = 0;
664
665out:
666#endif
667
668 return ret;
669}
670
671int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
672{
673 int ret;
674
675 if (lock_sk)
676 lock_sock(sk);
677 ret = sock_bindtoindex_locked(sk, ifindex);
678 if (lock_sk)
679 release_sock(sk);
680
681 return ret;
682}
683EXPORT_SYMBOL(sock_bindtoindex);
684
685static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
686{
687 int ret = -ENOPROTOOPT;
688#ifdef CONFIG_NETDEVICES
689 struct net *net = sock_net(sk);
690 char devname[IFNAMSIZ];
691 int index;
692
693 ret = -EINVAL;
694 if (optlen < 0)
695 goto out;
696
697 /* Bind this socket to a particular device like "eth0",
698 * as specified in the passed interface name. If the
699 * name is "" or the option length is zero the socket
700 * is not bound.
701 */
702 if (optlen > IFNAMSIZ - 1)
703 optlen = IFNAMSIZ - 1;
704 memset(devname, 0, sizeof(devname));
705
706 ret = -EFAULT;
707 if (copy_from_sockptr(devname, optval, optlen))
708 goto out;
709
710 index = 0;
711 if (devname[0] != '\0') {
712 struct net_device *dev;
713
714 rcu_read_lock();
715 dev = dev_get_by_name_rcu(net, devname);
716 if (dev)
717 index = dev->ifindex;
718 rcu_read_unlock();
719 ret = -ENODEV;
720 if (!dev)
721 goto out;
722 }
723
724 sockopt_lock_sock(sk);
725 ret = sock_bindtoindex_locked(sk, index);
726 sockopt_release_sock(sk);
727out:
728#endif
729
730 return ret;
731}
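
/* Hypothetical userspace counterpart of the helper above (sketch only):
 * bind the socket to "eth0" by name, then detach it again by passing an
 * empty name.
 *
 *	const char ifname[] = "eth0";
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
 *		       ifname, sizeof(ifname)) < 0)
 *		perror("setsockopt(SO_BINDTODEVICE)");
 *
 *	// Unbind: an empty name (or optlen == 0) clears sk_bound_dev_if.
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "", 0);
 *
 * Note that rebinding an already bound socket requires CAP_NET_RAW, as
 * sock_bindtoindex_locked() above enforces.
 */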
732
733static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
734 sockptr_t optlen, int len)
735{
736 int ret = -ENOPROTOOPT;
737#ifdef CONFIG_NETDEVICES
738 int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
739 struct net *net = sock_net(sk);
740 char devname[IFNAMSIZ];
741
742 if (bound_dev_if == 0) {
743 len = 0;
744 goto zero;
745 }
746
747 ret = -EINVAL;
748 if (len < IFNAMSIZ)
749 goto out;
750
751 ret = netdev_get_name(net, devname, bound_dev_if);
752 if (ret)
753 goto out;
754
755 len = strlen(devname) + 1;
756
757 ret = -EFAULT;
758 if (copy_to_sockptr(optval, devname, len))
759 goto out;
760
761zero:
762 ret = -EFAULT;
763 if (copy_to_sockptr(optlen, &len, sizeof(int)))
764 goto out;
765
766 ret = 0;
767
768out:
769#endif
770
771 return ret;
772}
773
774bool sk_mc_loop(const struct sock *sk)
775{
776 if (dev_recursion_level())
777 return false;
778 if (!sk)
779 return true;
780 /* IPV6_ADDRFORM can change sk->sk_family under us. */
781 switch (READ_ONCE(sk->sk_family)) {
782 case AF_INET:
783 return inet_test_bit(MC_LOOP, sk);
784#if IS_ENABLED(CONFIG_IPV6)
785 case AF_INET6:
786 return inet6_test_bit(MC6_LOOP, sk);
787#endif
788 }
789 WARN_ON_ONCE(1);
790 return true;
791}
792EXPORT_SYMBOL(sk_mc_loop);
793
794void sock_set_reuseaddr(struct sock *sk)
795{
796 lock_sock(sk);
797 sk->sk_reuse = SK_CAN_REUSE;
798 release_sock(sk);
799}
800EXPORT_SYMBOL(sock_set_reuseaddr);
801
802void sock_set_reuseport(struct sock *sk)
803{
804 lock_sock(sk);
805 sk->sk_reuseport = true;
806 release_sock(sk);
807}
808EXPORT_SYMBOL(sock_set_reuseport);
809
810void sock_no_linger(struct sock *sk)
811{
812 lock_sock(sk);
813 WRITE_ONCE(sk->sk_lingertime, 0);
814 sock_set_flag(sk, SOCK_LINGER);
815 release_sock(sk);
816}
817EXPORT_SYMBOL(sock_no_linger);
818
819void sock_set_priority(struct sock *sk, u32 priority)
820{
821 WRITE_ONCE(sk->sk_priority, priority);
822}
823EXPORT_SYMBOL(sock_set_priority);
824
825void sock_set_sndtimeo(struct sock *sk, s64 secs)
826{
827 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
828 WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
829 else
830 WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
831}
832EXPORT_SYMBOL(sock_set_sndtimeo);
833
834static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
835{
836 sock_valbool_flag(sk, SOCK_RCVTSTAMP, val);
837 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns);
838 if (val) {
839 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
840 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
841 }
842}
843
844void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
845{
846 switch (optname) {
847 case SO_TIMESTAMP_OLD:
848 __sock_set_timestamps(sk, valbool, false, false);
849 break;
850 case SO_TIMESTAMP_NEW:
851 __sock_set_timestamps(sk, valbool, true, false);
852 break;
853 case SO_TIMESTAMPNS_OLD:
854 __sock_set_timestamps(sk, valbool, false, true);
855 break;
856 case SO_TIMESTAMPNS_NEW:
857 __sock_set_timestamps(sk, valbool, true, true);
858 break;
859 }
860}
861
862static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
863{
864 struct net *net = sock_net(sk);
865 struct net_device *dev = NULL;
866 bool match = false;
867 int *vclock_index;
868 int i, num;
869
870 if (sk->sk_bound_dev_if)
871 dev = dev_get_by_index(net, sk->sk_bound_dev_if);
872
873 if (!dev) {
874 pr_err("%s: sock not bind to device\n", __func__);
875 return -EOPNOTSUPP;
876 }
877
878 num = ethtool_get_phc_vclocks(dev, &vclock_index);
879 dev_put(dev);
880
881 for (i = 0; i < num; i++) {
882 if (*(vclock_index + i) == phc_index) {
883 match = true;
884 break;
885 }
886 }
887
888 if (num > 0)
889 kfree(vclock_index);
890
891 if (!match)
892 return -EINVAL;
893
894 WRITE_ONCE(sk->sk_bind_phc, phc_index);
895
896 return 0;
897}
898
899int sock_set_timestamping(struct sock *sk, int optname,
900 struct so_timestamping timestamping)
901{
902 int val = timestamping.flags;
903 int ret;
904
905 if (val & ~SOF_TIMESTAMPING_MASK)
906 return -EINVAL;
907
908 if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
909 !(val & SOF_TIMESTAMPING_OPT_ID))
910 return -EINVAL;
911
912 if (val & SOF_TIMESTAMPING_OPT_ID &&
913 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
914 if (sk_is_tcp(sk)) {
915 if ((1 << sk->sk_state) &
916 (TCPF_CLOSE | TCPF_LISTEN))
917 return -EINVAL;
918 if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
919 atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
920 else
921 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
922 } else {
923 atomic_set(&sk->sk_tskey, 0);
924 }
925 }
926
927 if (val & SOF_TIMESTAMPING_OPT_STATS &&
928 !(val & SOF_TIMESTAMPING_OPT_TSONLY))
929 return -EINVAL;
930
931 if (val & SOF_TIMESTAMPING_BIND_PHC) {
932 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
933 if (ret)
934 return ret;
935 }
936
937 WRITE_ONCE(sk->sk_tsflags, val);
938 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
939 sock_valbool_flag(sk, SOCK_TIMESTAMPING_ANY, !!(val & TSFLAGS_ANY));
940
941 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
942 sock_enable_timestamp(sk,
943 SOCK_TIMESTAMPING_RX_SOFTWARE);
944 else
945 sock_disable_timestamp(sk,
946 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
947 return 0;
948}
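
/* Hypothetical userspace usage of the flags handled above (sketch only):
 * request software TX/RX timestamps plus OPT_ID so completions on the error
 * queue can be matched to individual send calls.
 *
 *	struct so_timestamping ts = {
 *		.flags = SOF_TIMESTAMPING_TX_SOFTWARE |
 *			 SOF_TIMESTAMPING_RX_SOFTWARE |
 *			 SOF_TIMESTAMPING_SOFTWARE |
 *			 SOF_TIMESTAMPING_OPT_ID,
 *	};
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &ts, sizeof(ts)) < 0)
 *		perror("setsockopt(SO_TIMESTAMPING)");
 *
 * Passing a plain int with the same flag bits is also accepted; the struct
 * form additionally carries bind_phc for SOF_TIMESTAMPING_BIND_PHC.
 */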
949
950#if defined(CONFIG_CGROUP_BPF)
951void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
952{
953 struct bpf_sock_ops_kern sock_ops;
954
955 memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
956 sock_ops.op = op;
957 sock_ops.is_fullsock = 1;
958 sock_ops.sk = sk;
959 bpf_skops_init_skb(&sock_ops, skb, 0);
960 __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS);
961}
962#endif
963
964void sock_set_keepalive(struct sock *sk)
965{
966 lock_sock(sk);
967 if (sk->sk_prot->keepalive)
968 sk->sk_prot->keepalive(sk, true);
969 sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
970 release_sock(sk);
971}
972EXPORT_SYMBOL(sock_set_keepalive);
973
974static void __sock_set_rcvbuf(struct sock *sk, int val)
975{
976 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
977 * as a negative value.
978 */
979 val = min_t(int, val, INT_MAX / 2);
980 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
981
982 /* We double it on the way in to account for "struct sk_buff" etc.
983 * overhead. Applications assume that the SO_RCVBUF setting they make
984 * will allow that much actual data to be received on that socket.
985 *
986 * Applications are unaware that "struct sk_buff" and other overheads
987 * allocate from the receive buffer during socket buffer allocation.
988 *
989 * And after considering the possible alternatives, returning the value
990 * we actually used in getsockopt is the most desirable behavior.
991 */
992 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
993}
994
995void sock_set_rcvbuf(struct sock *sk, int val)
996{
997 lock_sock(sk);
998 __sock_set_rcvbuf(sk, val);
999 release_sock(sk);
1000}
1001EXPORT_SYMBOL(sock_set_rcvbuf);
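
/* The doubling described above is visible from userspace (hypothetical
 * snippet): a request for 128 kB reads back as 256 kB, capped by
 * net.core.rmem_max unless SO_RCVBUFFORCE is used.
 *
 *	int req = 128 * 1024, got = 0;
 *	socklen_t len = sizeof(got);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &got, &len);
 *	// got is now 2 * req (262144), assuming rmem_max allows it.
 */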
1002
1003static void __sock_set_mark(struct sock *sk, u32 val)
1004{
1005 if (val != sk->sk_mark) {
1006 WRITE_ONCE(sk->sk_mark, val);
1007 sk_dst_reset(sk);
1008 }
1009}
1010
1011void sock_set_mark(struct sock *sk, u32 val)
1012{
1013 lock_sock(sk);
1014 __sock_set_mark(sk, val);
1015 release_sock(sk);
1016}
1017EXPORT_SYMBOL(sock_set_mark);
1018
1019static void sock_release_reserved_memory(struct sock *sk, int bytes)
1020{
1021 /* Round down bytes to multiple of pages */
1022 bytes = round_down(bytes, PAGE_SIZE);
1023
1024 WARN_ON(bytes > sk->sk_reserved_mem);
1025 WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1026 sk_mem_reclaim(sk);
1027}
1028
1029static int sock_reserve_memory(struct sock *sk, int bytes)
1030{
1031 long allocated;
1032 bool charged;
1033 int pages;
1034
1035 if (!mem_cgroup_sk_enabled(sk) || !sk_has_account(sk))
1036 return -EOPNOTSUPP;
1037
1038 if (!bytes)
1039 return 0;
1040
1041 pages = sk_mem_pages(bytes);
1042
1043 /* pre-charge to memcg */
1044 charged = mem_cgroup_sk_charge(sk, pages,
1045 GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1046 if (!charged)
1047 return -ENOMEM;
1048
1049 if (sk->sk_bypass_prot_mem)
1050 goto success;
1051
1052 /* pre-charge to forward_alloc */
1053 sk_memory_allocated_add(sk, pages);
1054 allocated = sk_memory_allocated(sk);
1055
1056 /* If the system goes into memory pressure with this
1057 * precharge, give up and return an error.
1058 */
1059 if (allocated > sk_prot_mem_limits(sk, 1)) {
1060 sk_memory_allocated_sub(sk, pages);
1061 mem_cgroup_sk_uncharge(sk, pages);
1062 return -ENOMEM;
1063 }
1064
1065success:
1066 sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1067
1068 WRITE_ONCE(sk->sk_reserved_mem,
1069 sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1070
1071 return 0;
1072}
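
/* Userspace side of SO_RESERVE_MEM (hypothetical sketch): pre-charge roughly
 * one megabyte of forward-alloc for this socket. The kernel rounds the
 * request up to whole pages and records it in sk_reserved_mem; sockets that
 * are not under a memory cgroup get EOPNOTSUPP.
 *
 *	int reserve = 1 << 20;
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_RESERVE_MEM,
 *		       &reserve, sizeof(reserve)) < 0)
 *		perror("setsockopt(SO_RESERVE_MEM)");
 *
 * Setting a smaller value later releases the difference, as the
 * SO_RESERVE_MEM case in sk_setsockopt() below shows.
 */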
1073
1074#ifdef CONFIG_PAGE_POOL
1075
1076/* This is the maximum number of tokens and frags that the user can pass to
1077 * SO_DEVMEM_DONTNEED in one syscall. The limit exists to bound the amount of
1078 * memory the kernel allocates to copy these tokens, and to prevent looping
1079 * over the frags for too long.
1080 */
1081#define MAX_DONTNEED_TOKENS 128
1082#define MAX_DONTNEED_FRAGS 1024
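
/* Hypothetical userspace usage (sketch only): hand two consumed devmem TCP
 * fragments back to the kernel. The token values come from the devmem
 * control messages delivered alongside the received data; frag_token below
 * is a made-up variable standing in for such a value.
 *
 *	struct dmabuf_token tok = {
 *		.token_start = frag_token,
 *		.token_count = 2,
 *	};
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_DEVMEM_DONTNEED,
 *		       &tok, sizeof(tok)) < 0)
 *		perror("setsockopt(SO_DEVMEM_DONTNEED)");
 *
 * sock_devmem_dontneed() below reports how many frags it actually freed, so
 * the caller must not assume every token was released.
 */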
1083
1084static noinline_for_stack int
1085sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
1086{
1087 unsigned int num_tokens, i, j, k, netmem_num = 0;
1088 struct dmabuf_token *tokens;
1089 int ret = 0, num_frags = 0;
1090 netmem_ref netmems[16];
1091
1092 if (!sk_is_tcp(sk))
1093 return -EBADF;
1094
1095 if (optlen % sizeof(*tokens) ||
1096 optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
1097 return -EINVAL;
1098
1099 num_tokens = optlen / sizeof(*tokens);
1100 tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
1101 if (!tokens)
1102 return -ENOMEM;
1103
1104 if (copy_from_sockptr(tokens, optval, optlen)) {
1105 kvfree(tokens);
1106 return -EFAULT;
1107 }
1108
1109 xa_lock_bh(&sk->sk_user_frags);
1110 for (i = 0; i < num_tokens; i++) {
1111 for (j = 0; j < tokens[i].token_count; j++) {
1112 if (++num_frags > MAX_DONTNEED_FRAGS)
1113 goto frag_limit_reached;
1114
1115 netmem_ref netmem = (__force netmem_ref)__xa_erase(
1116 &sk->sk_user_frags, tokens[i].token_start + j);
1117
1118 if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
1119 continue;
1120
1121 netmems[netmem_num++] = netmem;
1122 if (netmem_num == ARRAY_SIZE(netmems)) {
1123 xa_unlock_bh(&sk->sk_user_frags);
1124 for (k = 0; k < netmem_num; k++)
1125 WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1126 netmem_num = 0;
1127 xa_lock_bh(&sk->sk_user_frags);
1128 }
1129 ret++;
1130 }
1131 }
1132
1133frag_limit_reached:
1134 xa_unlock_bh(&sk->sk_user_frags);
1135 for (k = 0; k < netmem_num; k++)
1136 WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1137
1138 kvfree(tokens);
1139 return ret;
1140}
1141#endif
1142
1143void sockopt_lock_sock(struct sock *sk)
1144{
1145 /* When current->bpf_ctx is set, the setsockopt is called from
1146 * a bpf prog. bpf has ensured the sk lock has been
1147 * acquired before calling setsockopt().
1148 */
1149 if (has_current_bpf_ctx())
1150 return;
1151
1152 lock_sock(sk);
1153}
1154EXPORT_SYMBOL(sockopt_lock_sock);
1155
1156void sockopt_release_sock(struct sock *sk)
1157{
1158 if (has_current_bpf_ctx())
1159 return;
1160
1161 release_sock(sk);
1162}
1163EXPORT_SYMBOL(sockopt_release_sock);
1164
1165bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1166{
1167 return has_current_bpf_ctx() || ns_capable(ns, cap);
1168}
1169EXPORT_SYMBOL(sockopt_ns_capable);
1170
1171bool sockopt_capable(int cap)
1172{
1173 return has_current_bpf_ctx() || capable(cap);
1174}
1175EXPORT_SYMBOL(sockopt_capable);
1176
1177static int sockopt_validate_clockid(__kernel_clockid_t value)
1178{
1179 switch (value) {
1180 case CLOCK_REALTIME:
1181 case CLOCK_MONOTONIC:
1182 case CLOCK_TAI:
1183 return 0;
1184 }
1185 return -EINVAL;
1186}
1187
1188/*
1189 * This is meant for all protocols to use and covers goings on
1190 * at the socket level. Everything here is generic.
1191 */
1192
1193int sk_setsockopt(struct sock *sk, int level, int optname,
1194 sockptr_t optval, unsigned int optlen)
1195{
1196 struct so_timestamping timestamping;
1197 struct socket *sock = sk->sk_socket;
1198 struct sock_txtime sk_txtime;
1199 int val;
1200 int valbool;
1201 struct linger ling;
1202 int ret = 0;
1203
1204 /*
1205 * Options without arguments
1206 */
1207
1208 if (optname == SO_BINDTODEVICE)
1209 return sock_setbindtodevice(sk, optval, optlen);
1210
1211 if (optlen < sizeof(int))
1212 return -EINVAL;
1213
1214 if (copy_from_sockptr(&val, optval, sizeof(val)))
1215 return -EFAULT;
1216
1217 valbool = val ? 1 : 0;
1218
1219 /* handle options which do not require locking the socket. */
1220 switch (optname) {
1221 case SO_PRIORITY:
1222 if (sk_set_prio_allowed(sk, val)) {
1223 sock_set_priority(sk, val);
1224 return 0;
1225 }
1226 return -EPERM;
1227 case SO_TYPE:
1228 case SO_PROTOCOL:
1229 case SO_DOMAIN:
1230 case SO_ERROR:
1231 return -ENOPROTOOPT;
1232#ifdef CONFIG_NET_RX_BUSY_POLL
1233 case SO_BUSY_POLL:
1234 if (val < 0)
1235 return -EINVAL;
1236 WRITE_ONCE(sk->sk_ll_usec, val);
1237 return 0;
1238 case SO_PREFER_BUSY_POLL:
1239 if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1240 return -EPERM;
1241 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1242 return 0;
1243 case SO_BUSY_POLL_BUDGET:
1244 if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
1245 !sockopt_capable(CAP_NET_ADMIN))
1246 return -EPERM;
1247 if (val < 0 || val > U16_MAX)
1248 return -EINVAL;
1249 WRITE_ONCE(sk->sk_busy_poll_budget, val);
1250 return 0;
1251#endif
1252 case SO_MAX_PACING_RATE:
1253 {
1254 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1255 unsigned long pacing_rate;
1256
1257 if (sizeof(ulval) != sizeof(val) &&
1258 optlen >= sizeof(ulval) &&
1259 copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1260 return -EFAULT;
1261 }
1262 if (ulval != ~0UL)
1263 cmpxchg(&sk->sk_pacing_status,
1264 SK_PACING_NONE,
1265 SK_PACING_NEEDED);
1266 /* Pairs with READ_ONCE() from sk_getsockopt() */
1267 WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1268 pacing_rate = READ_ONCE(sk->sk_pacing_rate);
1269 if (ulval < pacing_rate)
1270 WRITE_ONCE(sk->sk_pacing_rate, ulval);
1271 return 0;
1272 }
1273 case SO_TXREHASH:
1274 if (!sk_is_tcp(sk))
1275 return -EOPNOTSUPP;
1276 if (val < -1 || val > 1)
1277 return -EINVAL;
1278 if ((u8)val == SOCK_TXREHASH_DEFAULT)
1279 val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1280 /* Paired with READ_ONCE() in tcp_rtx_synack()
1281 * and sk_getsockopt().
1282 */
1283 WRITE_ONCE(sk->sk_txrehash, (u8)val);
1284 return 0;
1285 case SO_PEEK_OFF:
1286 {
1287 int (*set_peek_off)(struct sock *sk, int val);
1288
1289 set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1290 if (set_peek_off)
1291 ret = set_peek_off(sk, val);
1292 else
1293 ret = -EOPNOTSUPP;
1294 return ret;
1295 }
1296#ifdef CONFIG_PAGE_POOL
1297 case SO_DEVMEM_DONTNEED:
1298 return sock_devmem_dontneed(sk, optval, optlen);
1299#endif
1300 case SO_SNDTIMEO_OLD:
1301 case SO_SNDTIMEO_NEW:
1302 return sock_set_timeout(&sk->sk_sndtimeo, optval,
1303 optlen, optname == SO_SNDTIMEO_OLD);
1304 case SO_RCVTIMEO_OLD:
1305 case SO_RCVTIMEO_NEW:
1306 return sock_set_timeout(&sk->sk_rcvtimeo, optval,
1307 optlen, optname == SO_RCVTIMEO_OLD);
1308 }
1309
1310 sockopt_lock_sock(sk);
1311
1312 switch (optname) {
1313 case SO_DEBUG:
1314 if (val && !sockopt_capable(CAP_NET_ADMIN))
1315 ret = -EACCES;
1316 else
1317 sock_valbool_flag(sk, SOCK_DBG, valbool);
1318 break;
1319 case SO_REUSEADDR:
1320 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1321 break;
1322 case SO_REUSEPORT:
1323 if (valbool && !sk_is_inet(sk))
1324 ret = -EOPNOTSUPP;
1325 else
1326 sk->sk_reuseport = valbool;
1327 break;
1328 case SO_DONTROUTE:
1329 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1330 sk_dst_reset(sk);
1331 break;
1332 case SO_BROADCAST:
1333 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1334 break;
1335 case SO_SNDBUF:
1336 /* Don't error on this; BSD doesn't, and if you think
1337 * about it, this is right. Otherwise apps have to
1338 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1339 * are treated in BSD as hints.
1340 */
1341 val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1342set_sndbuf:
1343 /* Ensure val * 2 fits into an int, to prevent max_t()
1344 * from treating it as a negative value.
1345 */
1346 val = min_t(int, val, INT_MAX / 2);
1347 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1348 WRITE_ONCE(sk->sk_sndbuf,
1349 max_t(int, val * 2, SOCK_MIN_SNDBUF));
1350 /* Wake up sending tasks if we upped the value. */
1351 sk->sk_write_space(sk);
1352 break;
1353
1354 case SO_SNDBUFFORCE:
1355 if (!sockopt_capable(CAP_NET_ADMIN)) {
1356 ret = -EPERM;
1357 break;
1358 }
1359
1360 /* No negative values (to prevent underflow, as val will be
1361 * multiplied by 2).
1362 */
1363 if (val < 0)
1364 val = 0;
1365 goto set_sndbuf;
1366
1367 case SO_RCVBUF:
1368 /* Don't error on this; BSD doesn't, and if you think
1369 * about it, this is right. Otherwise apps have to
1370 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1371 * are treated in BSD as hints.
1372 */
1373 __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1374 break;
1375
1376 case SO_RCVBUFFORCE:
1377 if (!sockopt_capable(CAP_NET_ADMIN)) {
1378 ret = -EPERM;
1379 break;
1380 }
1381
1382 /* No negative values (to prevent underflow, as val will be
1383 * multiplied by 2).
1384 */
1385 __sock_set_rcvbuf(sk, max(val, 0));
1386 break;
1387
1388 case SO_KEEPALIVE:
1389 if (sk->sk_prot->keepalive)
1390 sk->sk_prot->keepalive(sk, valbool);
1391 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1392 break;
1393
1394 case SO_OOBINLINE:
1395 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1396 break;
1397
1398 case SO_NO_CHECK:
1399 sk->sk_no_check_tx = valbool;
1400 break;
1401
1402 case SO_LINGER:
1403 if (optlen < sizeof(ling)) {
1404 ret = -EINVAL; /* 1003.1g */
1405 break;
1406 }
1407 if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1408 ret = -EFAULT;
1409 break;
1410 }
1411 if (!ling.l_onoff) {
1412 sock_reset_flag(sk, SOCK_LINGER);
1413 } else {
1414 unsigned long t_sec = ling.l_linger;
1415
1416 if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1417 WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1418 else
1419 WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1420 sock_set_flag(sk, SOCK_LINGER);
1421 }
1422 break;
1423
1424 case SO_BSDCOMPAT:
1425 break;
1426
1427 case SO_TIMESTAMP_OLD:
1428 case SO_TIMESTAMP_NEW:
1429 case SO_TIMESTAMPNS_OLD:
1430 case SO_TIMESTAMPNS_NEW:
1431 sock_set_timestamp(sk, optname, valbool);
1432 break;
1433
1434 case SO_TIMESTAMPING_NEW:
1435 case SO_TIMESTAMPING_OLD:
1436 if (optlen == sizeof(timestamping)) {
1437 if (copy_from_sockptr(&timestamping, optval,
1438 sizeof(timestamping))) {
1439 ret = -EFAULT;
1440 break;
1441 }
1442 } else {
1443 memset(&timestamping, 0, sizeof(timestamping));
1444 timestamping.flags = val;
1445 }
1446 ret = sock_set_timestamping(sk, optname, timestamping);
1447 break;
1448
1449 case SO_RCVLOWAT:
1450 {
1451 int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1452
1453 if (val < 0)
1454 val = INT_MAX;
1455 if (sock)
1456 set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1457 if (set_rcvlowat)
1458 ret = set_rcvlowat(sk, val);
1459 else
1460 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1461 break;
1462 }
1463 case SO_ATTACH_FILTER: {
1464 struct sock_fprog fprog;
1465
1466 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1467 if (!ret)
1468 ret = sk_attach_filter(&fprog, sk);
1469 break;
1470 }
1471 case SO_ATTACH_BPF:
1472 ret = -EINVAL;
1473 if (optlen == sizeof(u32)) {
1474 u32 ufd;
1475
1476 ret = -EFAULT;
1477 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1478 break;
1479
1480 ret = sk_attach_bpf(ufd, sk);
1481 }
1482 break;
1483
1484 case SO_ATTACH_REUSEPORT_CBPF: {
1485 struct sock_fprog fprog;
1486
1487 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1488 if (!ret)
1489 ret = sk_reuseport_attach_filter(&fprog, sk);
1490 break;
1491 }
1492 case SO_ATTACH_REUSEPORT_EBPF:
1493 ret = -EINVAL;
1494 if (optlen == sizeof(u32)) {
1495 u32 ufd;
1496
1497 ret = -EFAULT;
1498 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1499 break;
1500
1501 ret = sk_reuseport_attach_bpf(ufd, sk);
1502 }
1503 break;
1504
1505 case SO_DETACH_REUSEPORT_BPF:
1506 ret = reuseport_detach_prog(sk);
1507 break;
1508
1509 case SO_DETACH_FILTER:
1510 ret = sk_detach_filter(sk);
1511 break;
1512
1513 case SO_LOCK_FILTER:
1514 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1515 ret = -EPERM;
1516 else
1517 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1518 break;
1519
1520 case SO_MARK:
1521 if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1522 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1523 ret = -EPERM;
1524 break;
1525 }
1526
1527 __sock_set_mark(sk, val);
1528 break;
1529 case SO_RCVMARK:
1530 sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1531 break;
1532
1533 case SO_RCVPRIORITY:
1534 sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool);
1535 break;
1536
1537 case SO_RXQ_OVFL:
1538 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1539 break;
1540
1541 case SO_WIFI_STATUS:
1542 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1543 break;
1544
1545 case SO_NOFCS:
1546 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1547 break;
1548
1549 case SO_SELECT_ERR_QUEUE:
1550 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1551 break;
1552
1553 case SO_PASSCRED:
1554 if (sk_may_scm_recv(sk))
1555 sk->sk_scm_credentials = valbool;
1556 else
1557 ret = -EOPNOTSUPP;
1558 break;
1559
1560 case SO_PASSSEC:
1561 if (IS_ENABLED(CONFIG_SECURITY_NETWORK) && sk_may_scm_recv(sk))
1562 sk->sk_scm_security = valbool;
1563 else
1564 ret = -EOPNOTSUPP;
1565 break;
1566
1567 case SO_PASSPIDFD:
1568 if (sk_is_unix(sk))
1569 sk->sk_scm_pidfd = valbool;
1570 else
1571 ret = -EOPNOTSUPP;
1572 break;
1573
1574 case SO_PASSRIGHTS:
1575 if (sk_is_unix(sk))
1576 sk->sk_scm_rights = valbool;
1577 else
1578 ret = -EOPNOTSUPP;
1579 break;
1580
1581 case SO_INCOMING_CPU:
1582 reuseport_update_incoming_cpu(sk, val);
1583 break;
1584
1585 case SO_CNX_ADVICE:
1586 if (val == 1)
1587 dst_negative_advice(sk);
1588 break;
1589
1590 case SO_ZEROCOPY:
1591 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1592 if (!(sk_is_tcp(sk) ||
1593 (sk->sk_type == SOCK_DGRAM &&
1594 sk->sk_protocol == IPPROTO_UDP)))
1595 ret = -EOPNOTSUPP;
1596 } else if (sk->sk_family != PF_RDS) {
1597 ret = -EOPNOTSUPP;
1598 }
1599 if (!ret) {
1600 if (val < 0 || val > 1)
1601 ret = -EINVAL;
1602 else
1603 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1604 }
1605 break;
1606
1607 case SO_TXTIME:
1608 if (optlen != sizeof(struct sock_txtime)) {
1609 ret = -EINVAL;
1610 break;
1611 } else if (copy_from_sockptr(&sk_txtime, optval,
1612 sizeof(struct sock_txtime))) {
1613 ret = -EFAULT;
1614 break;
1615 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1616 ret = -EINVAL;
1617 break;
1618 }
1619 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1620 * scheduler has enough safeguards.
1621 */
1622 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1623 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1624 ret = -EPERM;
1625 break;
1626 }
1627
1628 ret = sockopt_validate_clockid(sk_txtime.clockid);
1629 if (ret)
1630 break;
1631
1632 sock_valbool_flag(sk, SOCK_TXTIME, true);
1633 sk->sk_clockid = sk_txtime.clockid;
1634 sk->sk_txtime_deadline_mode =
1635 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1636 sk->sk_txtime_report_errors =
1637 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1638 break;
1639
1640 case SO_BINDTOIFINDEX:
1641 ret = sock_bindtoindex_locked(sk, val);
1642 break;
1643
1644 case SO_BUF_LOCK:
1645 if (val & ~SOCK_BUF_LOCK_MASK) {
1646 ret = -EINVAL;
1647 break;
1648 }
1649 sk->sk_userlocks = val | (sk->sk_userlocks &
1650 ~SOCK_BUF_LOCK_MASK);
1651 break;
1652
1653 case SO_RESERVE_MEM:
1654 {
1655 int delta;
1656
1657 if (val < 0) {
1658 ret = -EINVAL;
1659 break;
1660 }
1661
1662 delta = val - sk->sk_reserved_mem;
1663 if (delta < 0)
1664 sock_release_reserved_memory(sk, -delta);
1665 else
1666 ret = sock_reserve_memory(sk, delta);
1667 break;
1668 }
1669
1670 default:
1671 ret = -ENOPROTOOPT;
1672 break;
1673 }
1674 sockopt_release_sock(sk);
1675 return ret;
1676}
1677
1678int sock_setsockopt(struct socket *sock, int level, int optname,
1679 sockptr_t optval, unsigned int optlen)
1680{
1681 return sk_setsockopt(sock->sk, level, optname,
1682 optval, optlen);
1683}
1684EXPORT_SYMBOL(sock_setsockopt);
1685
1686static const struct cred *sk_get_peer_cred(struct sock *sk)
1687{
1688 const struct cred *cred;
1689
1690 spin_lock(&sk->sk_peer_lock);
1691 cred = get_cred(sk->sk_peer_cred);
1692 spin_unlock(&sk->sk_peer_lock);
1693
1694 return cred;
1695}
1696
1697static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1698 struct ucred *ucred)
1699{
1700 ucred->pid = pid_vnr(pid);
1701 ucred->uid = ucred->gid = -1;
1702 if (cred) {
1703 struct user_namespace *current_ns = current_user_ns();
1704
1705 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1706 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1707 }
1708}
1709
1710static int groups_to_user(sockptr_t dst, const struct group_info *src)
1711{
1712 struct user_namespace *user_ns = current_user_ns();
1713 int i;
1714
1715 for (i = 0; i < src->ngroups; i++) {
1716 gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1717
1718 if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1719 return -EFAULT;
1720 }
1721
1722 return 0;
1723}
1724
1725int sk_getsockopt(struct sock *sk, int level, int optname,
1726 sockptr_t optval, sockptr_t optlen)
1727{
1728 struct socket *sock = sk->sk_socket;
1729
1730 union {
1731 int val;
1732 u64 val64;
1733 unsigned long ulval;
1734 struct linger ling;
1735 struct old_timeval32 tm32;
1736 struct __kernel_old_timeval tm;
1737 struct __kernel_sock_timeval stm;
1738 struct sock_txtime txtime;
1739 struct so_timestamping timestamping;
1740 } v;
1741
1742 int lv = sizeof(int);
1743 int len;
1744
1745 if (copy_from_sockptr(&len, optlen, sizeof(int)))
1746 return -EFAULT;
1747 if (len < 0)
1748 return -EINVAL;
1749
1750 memset(&v, 0, sizeof(v));
1751
1752 switch (optname) {
1753 case SO_DEBUG:
1754 v.val = sock_flag(sk, SOCK_DBG);
1755 break;
1756
1757 case SO_DONTROUTE:
1758 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1759 break;
1760
1761 case SO_BROADCAST:
1762 v.val = sock_flag(sk, SOCK_BROADCAST);
1763 break;
1764
1765 case SO_SNDBUF:
1766 v.val = READ_ONCE(sk->sk_sndbuf);
1767 break;
1768
1769 case SO_RCVBUF:
1770 v.val = READ_ONCE(sk->sk_rcvbuf);
1771 break;
1772
1773 case SO_REUSEADDR:
1774 v.val = sk->sk_reuse;
1775 break;
1776
1777 case SO_REUSEPORT:
1778 v.val = sk->sk_reuseport;
1779 break;
1780
1781 case SO_KEEPALIVE:
1782 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1783 break;
1784
1785 case SO_TYPE:
1786 v.val = sk->sk_type;
1787 break;
1788
1789 case SO_PROTOCOL:
1790 v.val = sk->sk_protocol;
1791 break;
1792
1793 case SO_DOMAIN:
1794 v.val = sk->sk_family;
1795 break;
1796
1797 case SO_ERROR:
1798 v.val = -sock_error(sk);
1799 if (v.val == 0)
1800 v.val = xchg(&sk->sk_err_soft, 0);
1801 break;
1802
1803 case SO_OOBINLINE:
1804 v.val = sock_flag(sk, SOCK_URGINLINE);
1805 break;
1806
1807 case SO_NO_CHECK:
1808 v.val = sk->sk_no_check_tx;
1809 break;
1810
1811 case SO_PRIORITY:
1812 v.val = READ_ONCE(sk->sk_priority);
1813 break;
1814
1815 case SO_LINGER:
1816 lv = sizeof(v.ling);
1817 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
1818 v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ;
1819 break;
1820
1821 case SO_BSDCOMPAT:
1822 break;
1823
1824 case SO_TIMESTAMP_OLD:
1825 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1826 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1827 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1828 break;
1829
1830 case SO_TIMESTAMPNS_OLD:
1831 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1832 break;
1833
1834 case SO_TIMESTAMP_NEW:
1835 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1836 break;
1837
1838 case SO_TIMESTAMPNS_NEW:
1839 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1840 break;
1841
1842 case SO_TIMESTAMPING_OLD:
1843 case SO_TIMESTAMPING_NEW:
1844 lv = sizeof(v.timestamping);
1845 /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1846 * returning the flags when they were set through the same option.
1847 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1848 */
1849 if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1850 v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1851 v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1852 }
1853 break;
1854
1855 case SO_RCVTIMEO_OLD:
1856 case SO_RCVTIMEO_NEW:
1857 lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1858 SO_RCVTIMEO_OLD == optname);
1859 break;
1860
1861 case SO_SNDTIMEO_OLD:
1862 case SO_SNDTIMEO_NEW:
1863 lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1864 SO_SNDTIMEO_OLD == optname);
1865 break;
1866
1867 case SO_RCVLOWAT:
1868 v.val = READ_ONCE(sk->sk_rcvlowat);
1869 break;
1870
1871 case SO_SNDLOWAT:
1872 v.val = 1;
1873 break;
1874
1875 case SO_PASSCRED:
1876 if (!sk_may_scm_recv(sk))
1877 return -EOPNOTSUPP;
1878
1879 v.val = sk->sk_scm_credentials;
1880 break;
1881
1882 case SO_PASSPIDFD:
1883 if (!sk_is_unix(sk))
1884 return -EOPNOTSUPP;
1885
1886 v.val = sk->sk_scm_pidfd;
1887 break;
1888
1889 case SO_PASSRIGHTS:
1890 if (!sk_is_unix(sk))
1891 return -EOPNOTSUPP;
1892
1893 v.val = sk->sk_scm_rights;
1894 break;
1895
1896 case SO_PEERCRED:
1897 {
1898 struct ucred peercred;
1899 if (len > sizeof(peercred))
1900 len = sizeof(peercred);
1901
1902 spin_lock(&sk->sk_peer_lock);
1903 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1904 spin_unlock(&sk->sk_peer_lock);
1905
1906 if (copy_to_sockptr(optval, &peercred, len))
1907 return -EFAULT;
1908 goto lenout;
1909 }
1910
1911 case SO_PEERPIDFD:
1912 {
1913 struct pid *peer_pid;
1914 struct file *pidfd_file = NULL;
1915 unsigned int flags = 0;
1916 int pidfd;
1917
1918 if (len > sizeof(pidfd))
1919 len = sizeof(pidfd);
1920
1921 spin_lock(&sk->sk_peer_lock);
1922 peer_pid = get_pid(sk->sk_peer_pid);
1923 spin_unlock(&sk->sk_peer_lock);
1924
1925 if (!peer_pid)
1926 return -ENODATA;
1927
1928 /* The use of PIDFD_STALE requires stashing of struct pid
1929 * on pidfs with pidfs_register_pid(), and only AF_UNIX
1930 * sockets were prepared for this.
1931 */
1932 if (sk->sk_family == AF_UNIX)
1933 flags = PIDFD_STALE;
1934
1935 pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file);
1936 put_pid(peer_pid);
1937 if (pidfd < 0)
1938 return pidfd;
1939
1940 if (copy_to_sockptr(optval, &pidfd, len) ||
1941 copy_to_sockptr(optlen, &len, sizeof(int))) {
1942 put_unused_fd(pidfd);
1943 fput(pidfd_file);
1944
1945 return -EFAULT;
1946 }
1947
1948 fd_install(pidfd, pidfd_file);
1949 return 0;
1950 }
1951
1952 case SO_PEERGROUPS:
1953 {
1954 const struct cred *cred;
1955 int ret, n;
1956
1957 cred = sk_get_peer_cred(sk);
1958 if (!cred)
1959 return -ENODATA;
1960
1961 n = cred->group_info->ngroups;
1962 if (len < n * sizeof(gid_t)) {
1963 len = n * sizeof(gid_t);
1964 put_cred(cred);
1965 return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1966 }
1967 len = n * sizeof(gid_t);
1968
1969 ret = groups_to_user(optval, cred->group_info);
1970 put_cred(cred);
1971 if (ret)
1972 return ret;
1973 goto lenout;
1974 }
1975
1976 case SO_PEERNAME:
1977 {
1978 struct sockaddr_storage address;
1979
1980 lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1981 if (lv < 0)
1982 return -ENOTCONN;
1983 if (lv < len)
1984 return -EINVAL;
1985 if (copy_to_sockptr(optval, &address, len))
1986 return -EFAULT;
1987 goto lenout;
1988 }
1989
1990 /* Dubious BSD thing... Probably nobody even uses it, but
1991 * the UNIX standard wants it for whatever reason... -DaveM
1992 */
1993 case SO_ACCEPTCONN:
1994 v.val = sk->sk_state == TCP_LISTEN;
1995 break;
1996
1997 case SO_PASSSEC:
1998 if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk))
1999 return -EOPNOTSUPP;
2000
2001 v.val = sk->sk_scm_security;
2002 break;
2003
2004 case SO_PEERSEC:
2005 return security_socket_getpeersec_stream(sock,
2006 optval, optlen, len);
2007
2008 case SO_MARK:
2009 v.val = READ_ONCE(sk->sk_mark);
2010 break;
2011
2012 case SO_RCVMARK:
2013 v.val = sock_flag(sk, SOCK_RCVMARK);
2014 break;
2015
2016 case SO_RCVPRIORITY:
2017 v.val = sock_flag(sk, SOCK_RCVPRIORITY);
2018 break;
2019
2020 case SO_RXQ_OVFL:
2021 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
2022 break;
2023
2024 case SO_WIFI_STATUS:
2025 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
2026 break;
2027
2028 case SO_PEEK_OFF:
2029 if (!READ_ONCE(sock->ops)->set_peek_off)
2030 return -EOPNOTSUPP;
2031
2032 v.val = READ_ONCE(sk->sk_peek_off);
2033 break;
2034 case SO_NOFCS:
2035 v.val = sock_flag(sk, SOCK_NOFCS);
2036 break;
2037
2038 case SO_BINDTODEVICE:
2039 return sock_getbindtodevice(sk, optval, optlen, len);
2040
2041 case SO_GET_FILTER:
2042 len = sk_get_filter(sk, optval, len);
2043 if (len < 0)
2044 return len;
2045
2046 goto lenout;
2047
2048 case SO_LOCK_FILTER:
2049 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
2050 break;
2051
2052 case SO_BPF_EXTENSIONS:
2053 v.val = bpf_tell_extensions();
2054 break;
2055
2056 case SO_SELECT_ERR_QUEUE:
2057 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
2058 break;
2059
2060#ifdef CONFIG_NET_RX_BUSY_POLL
2061 case SO_BUSY_POLL:
2062 v.val = READ_ONCE(sk->sk_ll_usec);
2063 break;
2064 case SO_PREFER_BUSY_POLL:
2065 v.val = READ_ONCE(sk->sk_prefer_busy_poll);
2066 break;
2067#endif
2068
2069 case SO_MAX_PACING_RATE:
2070 /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
2071 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
2072 lv = sizeof(v.ulval);
2073 v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
2074 } else {
2075 /* 32bit version */
2076 v.val = min_t(unsigned long, ~0U,
2077 READ_ONCE(sk->sk_max_pacing_rate));
2078 }
2079 break;
2080
2081 case SO_INCOMING_CPU:
2082 v.val = READ_ONCE(sk->sk_incoming_cpu);
2083 break;
2084
2085 case SO_MEMINFO:
2086 {
2087 u32 meminfo[SK_MEMINFO_VARS];
2088
2089 sk_get_meminfo(sk, meminfo);
2090
2091 len = min_t(unsigned int, len, sizeof(meminfo));
2092 if (copy_to_sockptr(optval, &meminfo, len))
2093 return -EFAULT;
2094
2095 goto lenout;
2096 }
2097
2098#ifdef CONFIG_NET_RX_BUSY_POLL
2099 case SO_INCOMING_NAPI_ID:
2100 v.val = READ_ONCE(sk->sk_napi_id);
2101
2102 /* aggregate non-NAPI IDs down to 0 */
2103 if (!napi_id_valid(v.val))
2104 v.val = 0;
2105
2106 break;
2107#endif
2108
2109 case SO_COOKIE:
2110 lv = sizeof(u64);
2111 if (len < lv)
2112 return -EINVAL;
2113 v.val64 = sock_gen_cookie(sk);
2114 break;
2115
2116 case SO_ZEROCOPY:
2117 v.val = sock_flag(sk, SOCK_ZEROCOPY);
2118 break;
2119
2120 case SO_TXTIME:
2121 lv = sizeof(v.txtime);
2122 v.txtime.clockid = sk->sk_clockid;
2123 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
2124 SOF_TXTIME_DEADLINE_MODE : 0;
2125 v.txtime.flags |= sk->sk_txtime_report_errors ?
2126 SOF_TXTIME_REPORT_ERRORS : 0;
2127 break;
2128
2129 case SO_BINDTOIFINDEX:
2130 v.val = READ_ONCE(sk->sk_bound_dev_if);
2131 break;
2132
2133 case SO_NETNS_COOKIE:
2134 lv = sizeof(u64);
2135 if (len != lv)
2136 return -EINVAL;
2137 v.val64 = sock_net(sk)->net_cookie;
2138 break;
2139
2140 case SO_BUF_LOCK:
2141 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
2142 break;
2143
2144 case SO_RESERVE_MEM:
2145 v.val = READ_ONCE(sk->sk_reserved_mem);
2146 break;
2147
2148 case SO_TXREHASH:
2149 if (!sk_is_tcp(sk))
2150 return -EOPNOTSUPP;
2151
2152 /* Paired with WRITE_ONCE() in sk_setsockopt() */
2153 v.val = READ_ONCE(sk->sk_txrehash);
2154 break;
2155
2156 default:
2157 /* We implement the SO_SNDLOWAT etc to not be settable
2158 * (1003.1g 7).
2159 */
2160 return -ENOPROTOOPT;
2161 }
2162
2163 if (len > lv)
2164 len = lv;
2165 if (copy_to_sockptr(optval, &v, len))
2166 return -EFAULT;
2167lenout:
2168 if (copy_to_sockptr(optlen, &len, sizeof(int)))
2169 return -EFAULT;
2170 return 0;
2171}
2172
2173/*
2174 * Initialize an sk_lock.
2175 *
2176 * (We also register the sk_lock with the lock validator.)
2177 */
2178static inline void sock_lock_init(struct sock *sk)
2179{
2180 sk_owner_clear(sk);
2181
2182 if (sk->sk_kern_sock)
2183 sock_lock_init_class_and_name(
2184 sk,
2185 af_family_kern_slock_key_strings[sk->sk_family],
2186 af_family_kern_slock_keys + sk->sk_family,
2187 af_family_kern_key_strings[sk->sk_family],
2188 af_family_kern_keys + sk->sk_family);
2189 else
2190 sock_lock_init_class_and_name(
2191 sk,
2192 af_family_slock_key_strings[sk->sk_family],
2193 af_family_slock_keys + sk->sk_family,
2194 af_family_key_strings[sk->sk_family],
2195 af_family_keys + sk->sk_family);
2196}
2197
2198/*
2199 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2200 * even temporarily, because of RCU lookups. sk_node should also be left as is.
2201 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
2202 */
2203static void sock_copy(struct sock *nsk, const struct sock *osk)
2204{
2205 const struct proto *prot = READ_ONCE(osk->sk_prot);
2206#ifdef CONFIG_SECURITY_NETWORK
2207 void *sptr = nsk->sk_security;
2208#endif
2209
2210 /* If we move sk_tx_queue_mapping out of the private section,
2211 * we must check if sk_tx_queue_clear() is called after
2212 * sock_copy() in sk_clone_lock().
2213 */
2214 BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2215 offsetof(struct sock, sk_dontcopy_begin) ||
2216 offsetof(struct sock, sk_tx_queue_mapping) >=
2217 offsetof(struct sock, sk_dontcopy_end));
2218
2219 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2220
2221 unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2222 prot->obj_size - offsetof(struct sock, sk_dontcopy_end),
2223 /* alloc is larger than struct, see sk_prot_alloc() */);
2224
2225#ifdef CONFIG_SECURITY_NETWORK
2226 nsk->sk_security = sptr;
2227 security_sk_clone(osk, nsk);
2228#endif
2229}
2230
2231static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2232 int family)
2233{
2234 struct sock *sk;
2235 struct kmem_cache *slab;
2236
2237 slab = prot->slab;
2238 if (slab != NULL) {
2239 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2240 if (!sk)
2241 return sk;
2242 if (want_init_on_alloc(priority))
2243 sk_prot_clear_nulls(sk, prot->obj_size);
2244 } else
2245 sk = kmalloc(prot->obj_size, priority);
2246
2247 if (sk != NULL) {
2248 if (security_sk_alloc(sk, family, priority))
2249 goto out_free;
2250
2251 if (!try_module_get(prot->owner))
2252 goto out_free_sec;
2253 }
2254
2255 return sk;
2256
2257out_free_sec:
2258 security_sk_free(sk);
2259out_free:
2260 if (slab != NULL)
2261 kmem_cache_free(slab, sk);
2262 else
2263 kfree(sk);
2264 return NULL;
2265}
2266
2267static void sk_prot_free(struct proto *prot, struct sock *sk)
2268{
2269 struct kmem_cache *slab;
2270 struct module *owner;
2271
2272 owner = prot->owner;
2273 slab = prot->slab;
2274
2275 cgroup_sk_free(&sk->sk_cgrp_data);
2276 mem_cgroup_sk_free(sk);
2277 security_sk_free(sk);
2278
2279 sk_owner_put(sk);
2280
2281 if (slab != NULL)
2282 kmem_cache_free(slab, sk);
2283 else
2284 kfree(sk);
2285 module_put(owner);
2286}
2287
2288/**
2289 * sk_alloc - All socket objects are allocated here
2290 * @net: the applicable net namespace
2291 * @family: protocol family
2292 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2293 * @prot: struct proto associated with this new sock instance
2294 * @kern: is this to be a kernel socket?
2295 */
2296struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2297 struct proto *prot, int kern)
2298{
2299 struct sock *sk;
2300
2301 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2302 if (sk) {
2303 sk->sk_family = family;
2304 /*
2305 * See comment in struct sock definition to understand
2306 * why we need sk_prot_creator -acme
2307 */
2308 sk->sk_prot = sk->sk_prot_creator = prot;
2309
2310 if (READ_ONCE(net->core.sysctl_bypass_prot_mem))
2311 sk->sk_bypass_prot_mem = 1;
2312
2313 sk->sk_kern_sock = kern;
2314 sock_lock_init(sk);
2315
2316 sk->sk_net_refcnt = kern ? 0 : 1;
2317 if (likely(sk->sk_net_refcnt)) {
2318 get_net_track(net, &sk->ns_tracker, priority);
2319 sock_inuse_add(net, 1);
2320 } else {
2321 net_passive_inc(net);
2322 __netns_tracker_alloc(net, &sk->ns_tracker,
2323 false, priority);
2324 }
2325
2326 sock_net_set(sk, net);
2327 refcount_set(&sk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS);
2328
2329 mem_cgroup_sk_alloc(sk);
2330 cgroup_sk_alloc(&sk->sk_cgrp_data);
2331 sock_update_classid(&sk->sk_cgrp_data);
2332 sock_update_netprioidx(&sk->sk_cgrp_data);
2333 sk_tx_queue_clear(sk);
2334 }
2335
2336 return sk;
2337}
2338EXPORT_SYMBOL(sk_alloc);
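
/* For illustration, a minimal sketch of how a protocol family's ->create()
 * handler typically pairs sk_alloc() with sock_init_data() (defined later in
 * this file); "my_proto" is a placeholder struct proto, not a real one:
 *
 *	sk = sk_alloc(net, PF_INET, GFP_KERNEL, &my_proto, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *	sk->sk_protocol = protocol;
 */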
2339
2340/* Sockets having SOCK_RCU_FREE will call this function after one RCU
2341 * grace period. This is the case for UDP sockets and TCP listeners.
2342 */
2343static void __sk_destruct(struct rcu_head *head)
2344{
2345 struct sock *sk = container_of(head, struct sock, sk_rcu);
2346 struct net *net = sock_net(sk);
2347 struct sk_filter *filter;
2348
2349 if (sk->sk_destruct)
2350 sk->sk_destruct(sk);
2351
2352 filter = rcu_dereference_check(sk->sk_filter,
2353 refcount_read(&sk->sk_wmem_alloc) == 0);
2354 if (filter) {
2355 sk_filter_uncharge(sk, filter);
2356 RCU_INIT_POINTER(sk->sk_filter, NULL);
2357 }
2358
2359 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2360
2361#ifdef CONFIG_BPF_SYSCALL
2362 bpf_sk_storage_free(sk);
2363#endif
2364
2365 if (atomic_read(&sk->sk_omem_alloc))
2366 pr_debug("%s: optmem leakage (%d bytes) detected\n",
2367 __func__, atomic_read(&sk->sk_omem_alloc));
2368
2369 if (sk->sk_frag.page) {
2370 put_page(sk->sk_frag.page);
2371 sk->sk_frag.page = NULL;
2372 }
2373
2374 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2375 put_cred(sk->sk_peer_cred);
2376 put_pid(sk->sk_peer_pid);
2377
2378 if (likely(sk->sk_net_refcnt)) {
2379 put_net_track(net, &sk->ns_tracker);
2380 } else {
2381 __netns_tracker_free(net, &sk->ns_tracker, false);
2382 net_passive_dec(net);
2383 }
2384 sk_prot_free(sk->sk_prot_creator, sk);
2385}
2386
2387void sk_net_refcnt_upgrade(struct sock *sk)
2388{
2389 struct net *net = sock_net(sk);
2390
2391 WARN_ON_ONCE(sk->sk_net_refcnt);
2392 __netns_tracker_free(net, &sk->ns_tracker, false);
2393 net_passive_dec(net);
2394 sk->sk_net_refcnt = 1;
2395 get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
2396 sock_inuse_add(net, 1);
2397}
2398EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade);
2399
2400void sk_destruct(struct sock *sk)
2401{
2402 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2403
2404 if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2405 reuseport_detach_sock(sk);
2406 use_call_rcu = true;
2407 }
2408
2409 if (use_call_rcu)
2410 call_rcu(&sk->sk_rcu, __sk_destruct);
2411 else
2412 __sk_destruct(&sk->sk_rcu);
2413}
2414
2415static void __sk_free(struct sock *sk)
2416{
2417 if (likely(sk->sk_net_refcnt))
2418 sock_inuse_add(sock_net(sk), -1);
2419
2420 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2421 sock_diag_broadcast_destroy(sk);
2422 else
2423 sk_destruct(sk);
2424}
2425
2426void sk_free(struct sock *sk)
2427{
2428 /*
2429	 * We subtract one from sk_wmem_alloc so we can tell whether
2430	 * some packets are still in a tx queue.
2431	 * If the count does not reach zero, sock_wfree() will call __sk_free(sk) later.
2432 */
2433 if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2434 __sk_free(sk);
2435}
2436EXPORT_SYMBOL(sk_free);
2437
2438static void sk_init_common(struct sock *sk)
2439{
2440 skb_queue_head_init(&sk->sk_receive_queue);
2441 skb_queue_head_init(&sk->sk_write_queue);
2442 skb_queue_head_init(&sk->sk_error_queue);
2443
2444 rwlock_init(&sk->sk_callback_lock);
2445 lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2446 af_rlock_keys + sk->sk_family,
2447 af_family_rlock_key_strings[sk->sk_family]);
2448 lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2449 af_wlock_keys + sk->sk_family,
2450 af_family_wlock_key_strings[sk->sk_family]);
2451 lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2452 af_elock_keys + sk->sk_family,
2453 af_family_elock_key_strings[sk->sk_family]);
2454 if (sk->sk_kern_sock)
2455 lockdep_set_class_and_name(&sk->sk_callback_lock,
2456 af_kern_callback_keys + sk->sk_family,
2457 af_family_kern_clock_key_strings[sk->sk_family]);
2458 else
2459 lockdep_set_class_and_name(&sk->sk_callback_lock,
2460 af_callback_keys + sk->sk_family,
2461 af_family_clock_key_strings[sk->sk_family]);
2462}
2463
2464/**
2465 * sk_clone - clone a socket
2466 * @sk: the socket to clone
2467 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2468 * @lock: if true, lock the cloned sk
2469 *
2470 * If @lock is true, the clone is locked by bh_lock_sock(), and
2471 * caller must unlock socket even in error path by bh_unlock_sock().
2472 */
2473struct sock *sk_clone(const struct sock *sk, const gfp_t priority,
2474 bool lock)
2475{
2476 struct proto *prot = READ_ONCE(sk->sk_prot);
2477 struct sk_filter *filter;
2478 bool is_charged = true;
2479 struct sock *newsk;
2480
2481 newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2482 if (!newsk)
2483 goto out;
2484
2485 sock_copy(newsk, sk);
2486
2487 newsk->sk_prot_creator = prot;
2488
2489 /* SANITY */
2490 if (likely(newsk->sk_net_refcnt)) {
2491 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2492 sock_inuse_add(sock_net(newsk), 1);
2493 } else {
2494 /* Kernel sockets are not elevating the struct net refcount.
2495 * Instead, use a tracker to more easily detect if a layer
2496 * is not properly dismantling its kernel sockets at netns
2497 * destroy time.
2498 */
2499 net_passive_inc(sock_net(newsk));
2500 __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2501 false, priority);
2502 }
2503
2504 sk_node_init(&newsk->sk_node);
2505 sock_lock_init(newsk);
2506
2507 if (lock)
2508 bh_lock_sock(newsk);
2509
2510 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
2511 newsk->sk_backlog.len = 0;
2512
2513 atomic_set(&newsk->sk_rmem_alloc, 0);
2514
2515 refcount_set(&newsk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS);
2516
2517 atomic_set(&newsk->sk_omem_alloc, 0);
2518 sk_init_common(newsk);
2519
2520 newsk->sk_dst_cache = NULL;
2521 newsk->sk_dst_pending_confirm = 0;
2522 newsk->sk_wmem_queued = 0;
2523 newsk->sk_forward_alloc = 0;
2524 newsk->sk_reserved_mem = 0;
2525 DEBUG_NET_WARN_ON_ONCE(newsk->sk_drop_counters);
2526 sk_drops_reset(newsk);
2527 newsk->sk_send_head = NULL;
2528 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2529 atomic_set(&newsk->sk_zckey, 0);
2530
2531 sock_reset_flag(newsk, SOCK_DONE);
2532
2533#ifdef CONFIG_MEMCG
2534 /* sk->sk_memcg will be populated at accept() time */
2535 newsk->sk_memcg = NULL;
2536#endif
2537
2538 cgroup_sk_clone(&newsk->sk_cgrp_data);
2539
2540 rcu_read_lock();
2541 filter = rcu_dereference(sk->sk_filter);
2542 if (filter != NULL)
2543		/* though it's an empty new sock, the charging may fail
2544		 * if sysctl_optmem_max was changed between the creation of
2545		 * the original socket and this clone
2546 */
2547 is_charged = sk_filter_charge(newsk, filter);
2548 RCU_INIT_POINTER(newsk->sk_filter, filter);
2549 rcu_read_unlock();
2550
2551 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2552 /* We need to make sure that we don't uncharge the new
2553 * socket if we couldn't charge it in the first place
2554 * as otherwise we uncharge the parent's filter.
2555 */
2556 if (!is_charged)
2557 RCU_INIT_POINTER(newsk->sk_filter, NULL);
2558
2559 goto free;
2560 }
2561
2562 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2563
2564 if (bpf_sk_storage_clone(sk, newsk))
2565 goto free;
2566
2567 /* Clear sk_user_data if parent had the pointer tagged
2568 * as not suitable for copying when cloning.
2569 */
2570 if (sk_user_data_is_nocopy(newsk))
2571 newsk->sk_user_data = NULL;
2572
2573 newsk->sk_err = 0;
2574 newsk->sk_err_soft = 0;
2575 newsk->sk_priority = 0;
2576 newsk->sk_incoming_cpu = raw_smp_processor_id();
2577
2578 /* Before updating sk_refcnt, we must commit prior changes to memory
2579 * (Documentation/RCU/rculist_nulls.rst for details)
2580 */
2581 smp_wmb();
2582 refcount_set(&newsk->sk_refcnt, 2);
2583
2584 sk_set_socket(newsk, NULL);
2585 sk_tx_queue_clear(newsk);
2586 RCU_INIT_POINTER(newsk->sk_wq, NULL);
2587
2588 if (newsk->sk_prot->sockets_allocated)
2589 sk_sockets_allocated_inc(newsk);
2590
2591 if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2592 net_enable_timestamp();
2593out:
2594 return newsk;
2595free:
2596	/* It is still a raw copy of the parent, so invalidate
2597	 * the destructor and do a plain sk_free()
2598 */
2599 newsk->sk_destruct = NULL;
2600 if (lock)
2601 bh_unlock_sock(newsk);
2602 sk_free(newsk);
2603 newsk = NULL;
2604 goto out;
2605}
2606EXPORT_SYMBOL_GPL(sk_clone);
2607
2608static u32 sk_dst_gso_max_size(struct sock *sk, const struct net_device *dev)
2609{
2610 bool is_ipv6 = false;
2611 u32 max_size;
2612
2613#if IS_ENABLED(CONFIG_IPV6)
2614 is_ipv6 = (sk->sk_family == AF_INET6 &&
2615 !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2616#endif
2617 /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2618 max_size = is_ipv6 ? READ_ONCE(dev->gso_max_size) :
2619 READ_ONCE(dev->gso_ipv4_max_size);
2620 if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2621 max_size = GSO_LEGACY_MAX_SIZE;
2622
2623 return max_size - (MAX_TCP_HEADER + 1);
2624}
2625
2626void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2627{
2628 const struct net_device *dev;
2629 u32 max_segs = 1;
2630
2631 rcu_read_lock();
2632 dev = dst_dev_rcu(dst);
2633 sk->sk_route_caps = dev->features;
2634 if (sk_is_tcp(sk)) {
2635 struct inet_connection_sock *icsk = inet_csk(sk);
2636
2637 sk->sk_route_caps |= NETIF_F_GSO;
2638 icsk->icsk_ack.dst_quick_ack = dst_metric(dst, RTAX_QUICKACK);
2639 }
2640 if (sk->sk_route_caps & NETIF_F_GSO)
2641 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2642 if (unlikely(sk->sk_gso_disabled))
2643 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2644 if (sk_can_gso(sk)) {
2645 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2646 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2647 } else {
2648 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2649 sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dev);
2650 /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2651 max_segs = max_t(u32, READ_ONCE(dev->gso_max_segs), 1);
2652 }
2653 }
2654 sk->sk_gso_max_segs = max_segs;
2655 sk_dst_set(sk, dst);
2656 rcu_read_unlock();
2657}
2658EXPORT_SYMBOL_GPL(sk_setup_caps);
2659
2660/*
2661 * Simple resource managers for sockets.
2662 */
2663
2664
2665/*
2666 * Write buffer destructor automatically called from kfree_skb.
2667 */
2668void sock_wfree(struct sk_buff *skb)
2669{
2670 unsigned int len = skb->truesize;
2671 struct sock *sk = skb->sk;
2672 bool free;
2673 int old;
2674
2675 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2676 if (sock_flag(sk, SOCK_RCU_FREE) &&
2677 sk->sk_write_space == sock_def_write_space) {
2678 rcu_read_lock();
2679 free = __refcount_sub_and_test(len, &sk->sk_wmem_alloc,
2680 &old);
2681 sock_def_write_space_wfree(sk, old - len);
2682 rcu_read_unlock();
2683 if (unlikely(free))
2684 __sk_free(sk);
2685 return;
2686 }
2687
2688 /*
2689 * Keep a reference on sk_wmem_alloc, this will be released
2690 * after sk_write_space() call
2691 */
2692 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2693 sk->sk_write_space(sk);
2694 len = 1;
2695 }
2696 /*
2697 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2698 * could not do because of in-flight packets
2699 */
2700 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2701 __sk_free(sk);
2702}
2703EXPORT_SYMBOL(sock_wfree);
2704
2705/* This variant of sock_wfree() is used by TCP,
2706 * since it sets SOCK_USE_WRITE_QUEUE.
2707 */
2708void __sock_wfree(struct sk_buff *skb)
2709{
2710 struct sock *sk = skb->sk;
2711
2712 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2713 __sk_free(sk);
2714}
2715
2716void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2717{
2718 int old_wmem;
2719
2720 skb_orphan(skb);
2721#ifdef CONFIG_INET
2722 if (unlikely(!sk_fullsock(sk)))
2723 return skb_set_owner_edemux(skb, sk);
2724#endif
2725 skb->sk = sk;
2726 skb->destructor = sock_wfree;
2727 skb_set_hash_from_sk(skb, sk);
2728 /*
2729	 * We used to take a refcount on sk, but the following operation
2730	 * is enough to guarantee sk_free() won't free this sock until
2731	 * all in-flight packets have completed
2732 */
2733 __refcount_add(skb->truesize, &sk->sk_wmem_alloc, &old_wmem);
2734
2735 /* (old_wmem == SK_WMEM_ALLOC_BIAS) if no other TX packet for this socket
2736 * is in a host queue (qdisc, NIC queue).
2737 * Set skb->ooo_okay so that netdev_pick_tx() can choose a TX queue
2738 * based on XPS for better performance.
2739 * Otherwise clear ooo_okay to not risk Out Of Order delivery.
2740 */
2741 skb->ooo_okay = (old_wmem == SK_WMEM_ALLOC_BIAS);
2742}
2743EXPORT_SYMBOL(skb_set_owner_w);
2744
2745static bool can_skb_orphan_partial(const struct sk_buff *skb)
2746{
2747	/* Drivers depend on in-order delivery for crypto offload;
2748	 * a partial orphan breaks the out-of-order-OK logic.
2749 */
2750 if (skb_is_decrypted(skb))
2751 return false;
2752
2753 return (skb->destructor == sock_wfree ||
2754 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2755}
2756
2757/* This helper is used by netem, as it can hold packets in its
2758 * delay queue. We want to allow the owner socket to send more
2759 * packets, as if they were already TX completed by a typical driver.
2760 * But we also want to keep skb->sk set because some packet schedulers
2761 * rely on it (sch_fq for example).
2762 */
2763void skb_orphan_partial(struct sk_buff *skb)
2764{
2765 if (skb_is_tcp_pure_ack(skb))
2766 return;
2767
2768 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2769 return;
2770
2771 skb_orphan(skb);
2772}
2773EXPORT_SYMBOL(skb_orphan_partial);
2774
2775/*
2776 * Read buffer destructor automatically called from kfree_skb.
2777 */
2778void sock_rfree(struct sk_buff *skb)
2779{
2780 struct sock *sk = skb->sk;
2781 unsigned int len = skb->truesize;
2782
2783 atomic_sub(len, &sk->sk_rmem_alloc);
2784 sk_mem_uncharge(sk, len);
2785}
2786EXPORT_SYMBOL(sock_rfree);
2787
2788/*
2789 * Buffer destructor for skbs that are not used directly in read or write
2790 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2791 */
2792void sock_efree(struct sk_buff *skb)
2793{
2794 sock_put(skb->sk);
2795}
2796EXPORT_SYMBOL(sock_efree);
2797
2798/* Buffer destructor for prefetch/receive path where reference count may
2799 * not be held, e.g. for listen sockets.
2800 */
2801#ifdef CONFIG_INET
2802void sock_pfree(struct sk_buff *skb)
2803{
2804 struct sock *sk = skb->sk;
2805
2806 if (!sk_is_refcounted(sk))
2807 return;
2808
2809 if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
2810 inet_reqsk(sk)->rsk_listener = NULL;
2811 reqsk_free(inet_reqsk(sk));
2812 return;
2813 }
2814
2815 sock_gen_put(sk);
2816}
2817EXPORT_SYMBOL(sock_pfree);
2818#endif /* CONFIG_INET */
2819
2820/*
2821 * Allocate a skb from the socket's send buffer.
2822 */
2823struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2824 gfp_t priority)
2825{
2826 if (force ||
2827 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2828 struct sk_buff *skb = alloc_skb(size, priority);
2829
2830 if (skb) {
2831 skb_set_owner_w(skb, sk);
2832 return skb;
2833 }
2834 }
2835 return NULL;
2836}
2837EXPORT_SYMBOL(sock_wmalloc);
2838
2839static void sock_ofree(struct sk_buff *skb)
2840{
2841 struct sock *sk = skb->sk;
2842
2843 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2844}
2845
2846struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2847 gfp_t priority)
2848{
2849 struct sk_buff *skb;
2850
2851 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2852 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2853 READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
2854 return NULL;
2855
2856 skb = alloc_skb(size, priority);
2857 if (!skb)
2858 return NULL;
2859
2860 atomic_add(skb->truesize, &sk->sk_omem_alloc);
2861 skb->sk = sk;
2862 skb->destructor = sock_ofree;
2863 return skb;
2864}
2865
2866/*
2867 * Allocate a memory block from the socket's option memory buffer.
2868 */
2869void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2870{
2871 int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
2872
2873 if ((unsigned int)size <= optmem_max &&
2874 atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2875 void *mem;
2876 /* First do the add, to avoid the race if kmalloc
2877 * might sleep.
2878 */
2879 atomic_add(size, &sk->sk_omem_alloc);
2880 mem = kmalloc(size, priority);
2881 if (mem)
2882 return mem;
2883 atomic_sub(size, &sk->sk_omem_alloc);
2884 }
2885 return NULL;
2886}
2887EXPORT_SYMBOL(sock_kmalloc);
2888
2889/*
2890 * Duplicate the input "src" memory block using the socket's
2891 * option memory buffer.
2892 */
2893void *sock_kmemdup(struct sock *sk, const void *src,
2894 int size, gfp_t priority)
2895{
2896 void *mem;
2897
2898 mem = sock_kmalloc(sk, size, priority);
2899 if (mem)
2900 memcpy(mem, src, size);
2901 return mem;
2902}
2903EXPORT_SYMBOL(sock_kmemdup);
2904
2905/* Free an option memory block. Note, we actually want the inline
2906 * here as this allows gcc to detect the nullify and fold away the
2907 * condition entirely.
2908 */
2909static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2910 const bool nullify)
2911{
2912 if (WARN_ON_ONCE(!mem))
2913 return;
2914 if (nullify)
2915 kfree_sensitive(mem);
2916 else
2917 kfree(mem);
2918 atomic_sub(size, &sk->sk_omem_alloc);
2919}
2920
2921void sock_kfree_s(struct sock *sk, void *mem, int size)
2922{
2923 __sock_kfree_s(sk, mem, size, false);
2924}
2925EXPORT_SYMBOL(sock_kfree_s);
2926
2927void sock_kzfree_s(struct sock *sk, void *mem, int size)
2928{
2929 __sock_kfree_s(sk, mem, size, true);
2930}
2931EXPORT_SYMBOL(sock_kzfree_s);
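
/* For illustration, the expected pairing: memory taken from the option buffer
 * with sock_kmalloc() must be returned with sock_kfree_s() (or sock_kzfree_s()
 * for sensitive data), passing the same size so sk_omem_alloc stays balanced:
 *
 *	void *buf = sock_kmalloc(sk, len, GFP_KERNEL);
 *
 *	if (!buf)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, buf, len);
 */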
2932
2933/* This is almost wait_for_tcp_memory() minus release_sock()/lock_sock().
2934   I think these locks should be removed for datagram sockets.
2935 */
2936static long sock_wait_for_wmem(struct sock *sk, long timeo)
2937{
2938 DEFINE_WAIT(wait);
2939
2940 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2941 for (;;) {
2942 if (!timeo)
2943 break;
2944 if (signal_pending(current))
2945 break;
2946 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2947 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2948 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2949 break;
2950 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2951 break;
2952 if (READ_ONCE(sk->sk_err))
2953 break;
2954 timeo = schedule_timeout(timeo);
2955 }
2956 finish_wait(sk_sleep(sk), &wait);
2957 return timeo;
2958}
2959
2960
2961/*
2962 * Generic send/receive buffer handlers
2963 */
2964
2965struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2966 unsigned long data_len, int noblock,
2967 int *errcode, int max_page_order)
2968{
2969 struct sk_buff *skb;
2970 long timeo;
2971 int err;
2972
2973 timeo = sock_sndtimeo(sk, noblock);
2974 for (;;) {
2975 err = sock_error(sk);
2976 if (err != 0)
2977 goto failure;
2978
2979 err = -EPIPE;
2980 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2981 goto failure;
2982
2983 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2984 break;
2985
2986 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2987 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2988 err = -EAGAIN;
2989 if (!timeo)
2990 goto failure;
2991 if (signal_pending(current))
2992 goto interrupted;
2993 timeo = sock_wait_for_wmem(sk, timeo);
2994 }
2995 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2996 errcode, sk->sk_allocation);
2997 if (skb)
2998 skb_set_owner_w(skb, sk);
2999 return skb;
3000
3001interrupted:
3002 err = sock_intr_errno(timeo);
3003failure:
3004 *errcode = err;
3005 return NULL;
3006}
3007EXPORT_SYMBOL(sock_alloc_send_pskb);
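
/* Most callers do not invoke sock_alloc_send_pskb() directly; at the time of
 * writing they go through the sock_alloc_send_skb() inline from
 * include/net/sock.h, which is roughly:
 *
 *	skb = sock_alloc_send_pskb(sk, size, 0, noblock, &err, 0);
 *
 * i.e. all data in the linear area and no high-order page fragments.
 */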
3008
3009int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
3010 struct sockcm_cookie *sockc)
3011{
3012 u32 tsflags;
3013
3014 BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31));
3015
3016 switch (cmsg->cmsg_type) {
3017 case SO_MARK:
3018 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
3019 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3020 return -EPERM;
3021 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3022 return -EINVAL;
3023 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
3024 break;
3025 case SO_TIMESTAMPING_OLD:
3026 case SO_TIMESTAMPING_NEW:
3027 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3028 return -EINVAL;
3029
3030 tsflags = *(u32 *)CMSG_DATA(cmsg);
3031 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
3032 return -EINVAL;
3033
3034 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
3035 sockc->tsflags |= tsflags;
3036 break;
3037 case SCM_TXTIME:
3038 if (!sock_flag(sk, SOCK_TXTIME))
3039 return -EINVAL;
3040 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
3041 return -EINVAL;
3042 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
3043 break;
3044 case SCM_TS_OPT_ID:
3045 if (sk_is_tcp(sk))
3046 return -EINVAL;
3047 tsflags = READ_ONCE(sk->sk_tsflags);
3048 if (!(tsflags & SOF_TIMESTAMPING_OPT_ID))
3049 return -EINVAL;
3050 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3051 return -EINVAL;
3052 sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg);
3053 sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID;
3054 break;
3055 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
3056 case SCM_RIGHTS:
3057 case SCM_CREDENTIALS:
3058 break;
3059 case SO_PRIORITY:
3060 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3061 return -EINVAL;
3062 if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg)))
3063 return -EPERM;
3064 sockc->priority = *(u32 *)CMSG_DATA(cmsg);
3065 break;
3066 case SCM_DEVMEM_DMABUF:
3067 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3068 return -EINVAL;
3069 sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg);
3070 break;
3071 default:
3072 return -EINVAL;
3073 }
3074 return 0;
3075}
3076EXPORT_SYMBOL(__sock_cmsg_send);
3077
3078int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
3079 struct sockcm_cookie *sockc)
3080{
3081 struct cmsghdr *cmsg;
3082 int ret;
3083
3084 for_each_cmsghdr(cmsg, msg) {
3085 if (!CMSG_OK(msg, cmsg))
3086 return -EINVAL;
3087 if (cmsg->cmsg_level != SOL_SOCKET)
3088 continue;
3089 ret = __sock_cmsg_send(sk, cmsg, sockc);
3090 if (ret)
3091 return ret;
3092 }
3093 return 0;
3094}
3095EXPORT_SYMBOL(sock_cmsg_send);
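
/* For illustration, a sketch of how a sendmsg() implementation typically
 * consumes SOL_SOCKET control messages: sockcm_init() seeds the cookie with
 * the socket's defaults before sock_cmsg_send() applies any overrides:
 *
 *	struct sockcm_cookie sockc;
 *
 *	sockcm_init(&sockc, sk);
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (unlikely(err))
 *			return err;
 *	}
 */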
3096
3097static void sk_enter_memory_pressure(struct sock *sk)
3098{
3099 if (!sk->sk_prot->enter_memory_pressure)
3100 return;
3101
3102 sk->sk_prot->enter_memory_pressure(sk);
3103}
3104
3105static void sk_leave_memory_pressure(struct sock *sk)
3106{
3107 if (sk->sk_prot->leave_memory_pressure) {
3108 INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
3109 tcp_leave_memory_pressure, sk);
3110 } else {
3111 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
3112
3113 if (memory_pressure && READ_ONCE(*memory_pressure))
3114 WRITE_ONCE(*memory_pressure, 0);
3115 }
3116}
3117
3118DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
3119
3120/**
3121 * skb_page_frag_refill - check that a page_frag contains enough room
3122 * @sz: minimum size of the fragment we want to get
3123 * @pfrag: pointer to page_frag
3124 * @gfp: priority for memory allocation
3125 *
3126 * Note: While this allocator tries to use high order pages, there is
3127 * no guarantee that allocations succeed. Therefore, @sz MUST be
3128 * less than or equal to PAGE_SIZE.
3129 */
3130bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
3131{
3132 if (pfrag->page) {
3133 if (page_ref_count(pfrag->page) == 1) {
3134 pfrag->offset = 0;
3135 return true;
3136 }
3137 if (pfrag->offset + sz <= pfrag->size)
3138 return true;
3139 put_page(pfrag->page);
3140 }
3141
3142 pfrag->offset = 0;
3143 if (SKB_FRAG_PAGE_ORDER &&
3144 !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
3145 /* Avoid direct reclaim but allow kswapd to wake */
3146 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
3147 __GFP_COMP | __GFP_NOWARN |
3148 __GFP_NORETRY,
3149 SKB_FRAG_PAGE_ORDER);
3150 if (likely(pfrag->page)) {
3151 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
3152 return true;
3153 }
3154 }
3155 pfrag->page = alloc_page(gfp);
3156 if (likely(pfrag->page)) {
3157 pfrag->size = PAGE_SIZE;
3158 return true;
3159 }
3160 return false;
3161}
3162EXPORT_SYMBOL(skb_page_frag_refill);
3163
3164bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
3165{
3166 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
3167 return true;
3168
3169 if (!sk->sk_bypass_prot_mem)
3170 sk_enter_memory_pressure(sk);
3171
3172 sk_stream_moderate_sndbuf(sk);
3173
3174 return false;
3175}
3176EXPORT_SYMBOL(sk_page_frag_refill);
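
/* For illustration, a sketch of the intended caller pattern: refill the
 * per-socket (or per-task) page_frag, then copy payload at pfrag->offset and
 * advance the offset by the amount consumed:
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, copy, pfrag->size - pfrag->offset);
 *	(copy data into pfrag->page at pfrag->offset)
 *	pfrag->offset += copy;
 */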
3177
3178void __lock_sock(struct sock *sk)
3179 __releases(&sk->sk_lock.slock)
3180 __acquires(&sk->sk_lock.slock)
3181{
3182 DEFINE_WAIT(wait);
3183
3184 for (;;) {
3185 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
3186 TASK_UNINTERRUPTIBLE);
3187 spin_unlock_bh(&sk->sk_lock.slock);
3188 schedule();
3189 spin_lock_bh(&sk->sk_lock.slock);
3190 if (!sock_owned_by_user(sk))
3191 break;
3192 }
3193 finish_wait(&sk->sk_lock.wq, &wait);
3194}
3195
3196void __release_sock(struct sock *sk)
3197 __releases(&sk->sk_lock.slock)
3198 __acquires(&sk->sk_lock.slock)
3199{
3200 struct sk_buff *skb, *next;
3201 int nb = 0;
3202
3203 while ((skb = sk->sk_backlog.head) != NULL) {
3204 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
3205
3206 spin_unlock_bh(&sk->sk_lock.slock);
3207
3208 while (1) {
3209 next = skb->next;
3210 prefetch(next);
3211 DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
3212 skb_mark_not_on_list(skb);
3213 sk_backlog_rcv(sk, skb);
3214
3215 skb = next;
3216 if (!skb)
3217 break;
3218
3219 if (!(++nb & 15))
3220 cond_resched();
3221 }
3222
3223 spin_lock_bh(&sk->sk_lock.slock);
3224 }
3225
3226 /*
3227	 * Doing the zeroing here guarantees we cannot loop forever
3228	 * while a wild producer attempts to flood us.
3229 */
3230 sk->sk_backlog.len = 0;
3231}
3232
3233void __sk_flush_backlog(struct sock *sk)
3234{
3235 spin_lock_bh(&sk->sk_lock.slock);
3236 __release_sock(sk);
3237
3238 if (sk->sk_prot->release_cb)
3239 INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3240 tcp_release_cb, sk);
3241
3242 spin_unlock_bh(&sk->sk_lock.slock);
3243}
3244EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3245
3246/**
3247 * sk_wait_data - wait for data to arrive at sk_receive_queue
3248 * @sk: sock to wait on
3249 * @timeo: for how long
3250 * @skb: last skb seen on sk_receive_queue
3251 *
3252 * Now the socket state, including sk->sk_err, is changed only under the lock,
3253 * hence we may omit checks after joining the wait queue.
3254 * We check the receive queue before schedule() only as an optimization;
3255 * it is very likely that release_sock() added new data.
3256 */
3257int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3258{
3259 DEFINE_WAIT_FUNC(wait, woken_wake_function);
3260 int rc;
3261
3262 add_wait_queue(sk_sleep(sk), &wait);
3263 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3264 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3265 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3266 remove_wait_queue(sk_sleep(sk), &wait);
3267 return rc;
3268}
3269EXPORT_SYMBOL(sk_wait_data);
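
/* For illustration, a sketch of the usual blocking recvmsg() loop around
 * sk_wait_data(); the caller holds the socket lock and rechecks the receive
 * queue after the wait returns:
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 */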
3270
3271/**
3272 * __sk_mem_raise_allocated - increase memory_allocated
3273 * @sk: socket
3274 * @size: memory size to allocate
3275 * @amt: pages to allocate
3276 * @kind: allocation type
3277 *
3278 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
3279 *
3280 * Unlike the globally shared limits among the sockets under the same protocol,
3281 * consuming the budget of a memcg won't have a direct effect on other memcgs.
3282 * So be optimistic about the memcg's tolerance, and leave it to the callers to
3283 * decide whether or not to raise allocated through sk_under_memory_pressure() or
3284 * its variants.
3285 */
3286int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3287{
3288 bool memcg_enabled = false, charged = false;
3289 struct proto *prot = sk->sk_prot;
3290 long allocated = 0;
3291
3292 if (!sk->sk_bypass_prot_mem) {
3293 sk_memory_allocated_add(sk, amt);
3294 allocated = sk_memory_allocated(sk);
3295 }
3296
3297 if (mem_cgroup_sk_enabled(sk)) {
3298 memcg_enabled = true;
3299 charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge());
3300 if (!charged)
3301 goto suppress_allocation;
3302 }
3303
3304 if (!allocated)
3305 return 1;
3306
3307 /* Under limit. */
3308 if (allocated <= sk_prot_mem_limits(sk, 0)) {
3309 sk_leave_memory_pressure(sk);
3310 return 1;
3311 }
3312
3313 /* Under pressure. */
3314 if (allocated > sk_prot_mem_limits(sk, 1))
3315 sk_enter_memory_pressure(sk);
3316
3317 /* Over hard limit. */
3318 if (allocated > sk_prot_mem_limits(sk, 2))
3319 goto suppress_allocation;
3320
3321 /* Guarantee minimum buffer size under pressure (either global
3322 * or memcg) to make sure features described in RFC 7323 (TCP
3323 * Extensions for High Performance) work properly.
3324 *
3325	 * This rule does NOT hold once the global or memcg hard limit is
3326	 * exceeded, or else a DoS attack could be mounted by spawning
3327	 * lots of sockets whose usage stays under the minimum buffer size.
3328 */
3329 if (kind == SK_MEM_RECV) {
3330 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3331 return 1;
3332
3333 } else { /* SK_MEM_SEND */
3334 int wmem0 = sk_get_wmem0(sk, prot);
3335
3336 if (sk->sk_type == SOCK_STREAM) {
3337 if (sk->sk_wmem_queued < wmem0)
3338 return 1;
3339 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3340 return 1;
3341 }
3342 }
3343
3344 if (sk_has_memory_pressure(sk)) {
3345 u64 alloc;
3346
3347 /* The following 'average' heuristic is within the
3348 * scope of global accounting, so it only makes
3349 * sense for global memory pressure.
3350 */
3351 if (!sk_under_global_memory_pressure(sk))
3352 return 1;
3353
3354		/* Try to be fair among all the sockets under global
3355		 * pressure by allowing the ones whose usage is below
3356		 * average to grow.
3357 */
3358 alloc = sk_sockets_allocated_read_positive(sk);
3359 if (sk_prot_mem_limits(sk, 2) > alloc *
3360 sk_mem_pages(sk->sk_wmem_queued +
3361 atomic_read(&sk->sk_rmem_alloc) +
3362 sk->sk_forward_alloc))
3363 return 1;
3364 }
3365
3366suppress_allocation:
3367
3368 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3369 sk_stream_moderate_sndbuf(sk);
3370
3371		/* Fail only if the socket is _under_ its sndbuf.
3372		 * In this case we cannot block, so we have to fail.
3373 */
3374 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3375 /* Force charge with __GFP_NOFAIL */
3376 if (memcg_enabled && !charged)
3377 mem_cgroup_sk_charge(sk, amt,
3378 gfp_memcg_charge() | __GFP_NOFAIL);
3379 return 1;
3380 }
3381 }
3382
3383 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3384
3385 if (allocated)
3386 sk_memory_allocated_sub(sk, amt);
3387
3388 if (charged)
3389 mem_cgroup_sk_uncharge(sk, amt);
3390
3391 return 0;
3392}
3393
3394/**
3395 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3396 * @sk: socket
3397 * @size: memory size to allocate
3398 * @kind: allocation type
3399 *
3400 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3401 * rmem allocation. This function assumes that protocols which have
3402 * memory_pressure use sk_wmem_queued as write buffer accounting.
3403 */
3404int __sk_mem_schedule(struct sock *sk, int size, int kind)
3405{
3406 int ret, amt = sk_mem_pages(size);
3407
3408 sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3409 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3410 if (!ret)
3411 sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3412 return ret;
3413}
3414EXPORT_SYMBOL(__sk_mem_schedule);
3415
3416/**
3417 * __sk_mem_reduce_allocated - reclaim memory_allocated
3418 * @sk: socket
3419 * @amount: number of quanta
3420 *
3421 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3422 */
3423void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3424{
3425 if (mem_cgroup_sk_enabled(sk))
3426 mem_cgroup_sk_uncharge(sk, amount);
3427
3428 if (sk->sk_bypass_prot_mem)
3429 return;
3430
3431 sk_memory_allocated_sub(sk, amount);
3432
3433 if (sk_under_global_memory_pressure(sk) &&
3434 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3435 sk_leave_memory_pressure(sk);
3436}
3437
3438/**
3439 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3440 * @sk: socket
3441 * @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3442 */
3443void __sk_mem_reclaim(struct sock *sk, int amount)
3444{
3445 amount >>= PAGE_SHIFT;
3446 sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3447 __sk_mem_reduce_allocated(sk, amount);
3448}
3449EXPORT_SYMBOL(__sk_mem_reclaim);
3450
3451void __sk_charge(struct sock *sk, gfp_t gfp)
3452{
3453 int amt;
3454
3455 gfp |= __GFP_NOFAIL;
3456 if (mem_cgroup_from_sk(sk)) {
3457 /* The socket has not been accepted yet, no need
3458 * to look at newsk->sk_wmem_queued.
3459 */
3460 amt = sk_mem_pages(sk->sk_forward_alloc +
3461 atomic_read(&sk->sk_rmem_alloc));
3462 if (amt)
3463 mem_cgroup_sk_charge(sk, amt, gfp);
3464 }
3465
3466 kmem_cache_charge(sk, gfp);
3467}
3468
3469int sk_set_peek_off(struct sock *sk, int val)
3470{
3471 WRITE_ONCE(sk->sk_peek_off, val);
3472 return 0;
3473}
3474EXPORT_SYMBOL_GPL(sk_set_peek_off);
3475
3476/*
3477 * Set of default routines for initialising struct proto_ops when
3478 * the protocol does not support a particular function. In certain
3479 * cases where it makes no sense for a protocol to have a "do nothing"
3480 * function, some default processing is provided.
3481 */
3482
3483int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len)
3484{
3485 return -EOPNOTSUPP;
3486}
3487EXPORT_SYMBOL(sock_no_bind);
3488
3489int sock_no_connect(struct socket *sock, struct sockaddr_unsized *saddr,
3490 int len, int flags)
3491{
3492 return -EOPNOTSUPP;
3493}
3494EXPORT_SYMBOL(sock_no_connect);
3495
3496int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3497{
3498 return -EOPNOTSUPP;
3499}
3500EXPORT_SYMBOL(sock_no_socketpair);
3501
3502int sock_no_accept(struct socket *sock, struct socket *newsock,
3503 struct proto_accept_arg *arg)
3504{
3505 return -EOPNOTSUPP;
3506}
3507EXPORT_SYMBOL(sock_no_accept);
3508
3509int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3510 int peer)
3511{
3512 return -EOPNOTSUPP;
3513}
3514EXPORT_SYMBOL(sock_no_getname);
3515
3516int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3517{
3518 return -EOPNOTSUPP;
3519}
3520EXPORT_SYMBOL(sock_no_ioctl);
3521
3522int sock_no_listen(struct socket *sock, int backlog)
3523{
3524 return -EOPNOTSUPP;
3525}
3526EXPORT_SYMBOL(sock_no_listen);
3527
3528int sock_no_shutdown(struct socket *sock, int how)
3529{
3530 return -EOPNOTSUPP;
3531}
3532EXPORT_SYMBOL(sock_no_shutdown);
3533
3534int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3535{
3536 return -EOPNOTSUPP;
3537}
3538EXPORT_SYMBOL(sock_no_sendmsg);
3539
3540int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3541{
3542 return -EOPNOTSUPP;
3543}
3544EXPORT_SYMBOL(sock_no_sendmsg_locked);
3545
3546int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3547 int flags)
3548{
3549 return -EOPNOTSUPP;
3550}
3551EXPORT_SYMBOL(sock_no_recvmsg);
3552
3553int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3554{
3555 /* Mirror missing mmap method error code */
3556 return -ENODEV;
3557}
3558EXPORT_SYMBOL(sock_no_mmap);
3559
3560/*
3561 * When a file is received (via SCM_RIGHTS, etc), we must bump the
3562 * various sock-based usage counts.
3563 */
3564void __receive_sock(struct file *file)
3565{
3566 struct socket *sock;
3567
3568 sock = sock_from_file(file);
3569 if (sock) {
3570 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3571 sock_update_classid(&sock->sk->sk_cgrp_data);
3572 }
3573}
3574
3575/*
3576 * Default Socket Callbacks
3577 */
3578
3579static void sock_def_wakeup(struct sock *sk)
3580{
3581 struct socket_wq *wq;
3582
3583 rcu_read_lock();
3584 wq = rcu_dereference(sk->sk_wq);
3585 if (skwq_has_sleeper(wq))
3586 wake_up_interruptible_all(&wq->wait);
3587 rcu_read_unlock();
3588}
3589
3590static void sock_def_error_report(struct sock *sk)
3591{
3592 struct socket_wq *wq;
3593
3594 rcu_read_lock();
3595 wq = rcu_dereference(sk->sk_wq);
3596 if (skwq_has_sleeper(wq))
3597 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3598 sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
3599 rcu_read_unlock();
3600}
3601
3602void sock_def_readable(struct sock *sk)
3603{
3604 struct socket_wq *wq;
3605
3606 trace_sk_data_ready(sk);
3607
3608 rcu_read_lock();
3609 wq = rcu_dereference(sk->sk_wq);
3610 if (skwq_has_sleeper(wq))
3611 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3612 EPOLLRDNORM | EPOLLRDBAND);
3613 sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
3614 rcu_read_unlock();
3615}
3616
3617static void sock_def_write_space(struct sock *sk)
3618{
3619 struct socket_wq *wq;
3620
3621 rcu_read_lock();
3622
3623 /* Do not wake up a writer until he can make "significant"
3624 * progress. --DaveM
3625 */
3626 if (sock_writeable(sk)) {
3627 wq = rcu_dereference(sk->sk_wq);
3628 if (skwq_has_sleeper(wq))
3629 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3630 EPOLLWRNORM | EPOLLWRBAND);
3631
3632 /* Should agree with poll, otherwise some programs break */
3633 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3634 }
3635
3636 rcu_read_unlock();
3637}
3638
3639/* An optimised version of sock_def_write_space(); it should only be called
3640 * for SOCK_RCU_FREE sockets, under an RCU read-side section, and after
3641 * putting ->sk_wmem_alloc.
3642 */
3643static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc)
3644{
3645 /* Do not wake up a writer until he can make "significant"
3646 * progress. --DaveM
3647 */
3648 if (__sock_writeable(sk, wmem_alloc)) {
3649 struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3650
3651 /* rely on refcount_sub from sock_wfree() */
3652 smp_mb__after_atomic();
3653 if (wq && waitqueue_active(&wq->wait))
3654 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3655 EPOLLWRNORM | EPOLLWRBAND);
3656
3657 /* Should agree with poll, otherwise some programs break */
3658 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3659 }
3660}
3661
3662static void sock_def_destruct(struct sock *sk)
3663{
3664}
3665
3666void sk_send_sigurg(struct sock *sk)
3667{
3668 if (sk->sk_socket && sk->sk_socket->file)
3669 if (send_sigurg(sk->sk_socket->file))
3670 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3671}
3672EXPORT_SYMBOL(sk_send_sigurg);
3673
3674void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3675 unsigned long expires)
3676{
3677 if (!mod_timer(timer, expires))
3678 sock_hold(sk);
3679}
3680EXPORT_SYMBOL(sk_reset_timer);
3681
3682void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3683{
3684 if (timer_delete(timer))
3685 __sock_put(sk);
3686}
3687EXPORT_SYMBOL(sk_stop_timer);
3688
3689void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3690{
3691 if (timer_delete_sync(timer))
3692 __sock_put(sk);
3693}
3694EXPORT_SYMBOL(sk_stop_timer_sync);
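
/* For illustration, the reference-counting contract behind the helpers above:
 * each armed timer holds one reference on the socket (sock_hold() in
 * sk_reset_timer()), dropped either by sk_stop_timer()/sk_stop_timer_sync()
 * when a pending timer is deleted, or by the timer handler itself, which
 * typically ends with:
 *
 *	sock_put(sk);
 */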
3695
3696void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3697{
3698 sk_init_common(sk);
3699 sk->sk_send_head = NULL;
3700
3701 timer_setup(&sk->sk_timer, NULL, 0);
3702
3703 sk->sk_allocation = GFP_KERNEL;
3704 sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default);
3705 sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
3706 sk->sk_state = TCP_CLOSE;
3707 sk->sk_use_task_frag = true;
3708 sk_set_socket(sk, sock);
3709
3710 sock_set_flag(sk, SOCK_ZAPPED);
3711
3712 if (sock) {
3713 sk->sk_type = sock->type;
3714 RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3715 sock->sk = sk;
3716 } else {
3717 RCU_INIT_POINTER(sk->sk_wq, NULL);
3718 }
3719 sk->sk_uid = uid;
3720
3721 sk->sk_state_change = sock_def_wakeup;
3722 sk->sk_data_ready = sock_def_readable;
3723 sk->sk_write_space = sock_def_write_space;
3724 sk->sk_error_report = sock_def_error_report;
3725 sk->sk_destruct = sock_def_destruct;
3726
3727 sk->sk_frag.page = NULL;
3728 sk->sk_frag.offset = 0;
3729 sk->sk_peek_off = -1;
3730
3731 sk->sk_peer_pid = NULL;
3732 sk->sk_peer_cred = NULL;
3733 spin_lock_init(&sk->sk_peer_lock);
3734
3735 sk->sk_write_pending = 0;
3736 sk->sk_rcvlowat = 1;
3737 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
3738 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
3739
3740 sk->sk_stamp = SK_DEFAULT_STAMP;
3741#if BITS_PER_LONG==32
3742 seqlock_init(&sk->sk_stamp_seq);
3743#endif
3744 atomic_set(&sk->sk_zckey, 0);
3745
3746#ifdef CONFIG_NET_RX_BUSY_POLL
3747 sk->sk_napi_id = 0;
3748 sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read);
3749#endif
3750
3751 sk->sk_max_pacing_rate = ~0UL;
3752 sk->sk_pacing_rate = ~0UL;
3753 WRITE_ONCE(sk->sk_pacing_shift, 10);
3754 sk->sk_incoming_cpu = -1;
3755
3756 sk_rx_queue_clear(sk);
3757 /*
3758 * Before updating sk_refcnt, we must commit prior changes to memory
3759 * (Documentation/RCU/rculist_nulls.rst for details)
3760 */
3761 smp_wmb();
3762 refcount_set(&sk->sk_refcnt, 1);
3763 sk_drops_reset(sk);
3764}
3765EXPORT_SYMBOL(sock_init_data_uid);
3766
3767void sock_init_data(struct socket *sock, struct sock *sk)
3768{
3769 kuid_t uid = sock ?
3770 SOCK_INODE(sock)->i_uid :
3771 make_kuid(sock_net(sk)->user_ns, 0);
3772
3773 sock_init_data_uid(sock, sk, uid);
3774}
3775EXPORT_SYMBOL(sock_init_data);
3776
3777void lock_sock_nested(struct sock *sk, int subclass)
3778{
3779 /* The sk_lock has mutex_lock() semantics here. */
3780 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3781
3782 might_sleep();
3783 spin_lock_bh(&sk->sk_lock.slock);
3784 if (sock_owned_by_user_nocheck(sk))
3785 __lock_sock(sk);
3786 sk->sk_lock.owned = 1;
3787 spin_unlock_bh(&sk->sk_lock.slock);
3788}
3789EXPORT_SYMBOL(lock_sock_nested);
3790
3791void release_sock(struct sock *sk)
3792{
3793 spin_lock_bh(&sk->sk_lock.slock);
3794 if (sk->sk_backlog.tail)
3795 __release_sock(sk);
3796
3797 if (sk->sk_prot->release_cb)
3798 INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3799 tcp_release_cb, sk);
3800
3801 sock_release_ownership(sk);
3802 if (waitqueue_active(&sk->sk_lock.wq))
3803 wake_up(&sk->sk_lock.wq);
3804 spin_unlock_bh(&sk->sk_lock.slock);
3805}
3806EXPORT_SYMBOL(release_sock);
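
/* For illustration, the standard process-context locking pattern these
 * helpers implement; lock_sock() is the usual entry point and maps to
 * lock_sock_nested(sk, 0):
 *
 *	lock_sock(sk);
 *	(modify socket state; incoming packets are backlogged meanwhile)
 *	release_sock(sk);
 *
 * release_sock() then processes the backlog accumulated while the lock
 * was owned.
 */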
3807
3808bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3809{
3810 might_sleep();
3811 spin_lock_bh(&sk->sk_lock.slock);
3812
3813 if (!sock_owned_by_user_nocheck(sk)) {
3814 /*
3815 * Fast path return with bottom halves disabled and
3816 * sock::sk_lock.slock held.
3817 *
3818 * The 'mutex' is not contended and holding
3819 * sock::sk_lock.slock prevents all other lockers to
3820 * proceed so the corresponding unlock_sock_fast() can
3821 * avoid the slow path of release_sock() completely and
3822 * just release slock.
3823 *
3824 * From a semantical POV this is equivalent to 'acquiring'
3825 * the 'mutex', hence the corresponding lockdep
3826 * mutex_release() has to happen in the fast path of
3827 * unlock_sock_fast().
3828 */
3829 return false;
3830 }
3831
3832 __lock_sock(sk);
3833 sk->sk_lock.owned = 1;
3834 __acquire(&sk->sk_lock.slock);
3835 spin_unlock_bh(&sk->sk_lock.slock);
3836 return true;
3837}
3838EXPORT_SYMBOL(__lock_sock_fast);
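
/* For illustration, how the fast-lock pair is meant to be used; the boolean
 * returned by lock_sock_fast() tells unlock_sock_fast() whether the slow
 * path (a full release_sock()) is needed:
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	(short, non-sleeping critical section)
 *	unlock_sock_fast(sk, slow);
 */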
3839
3840int sock_gettstamp(struct socket *sock, void __user *userstamp,
3841 bool timeval, bool time32)
3842{
3843 struct sock *sk = sock->sk;
3844 struct timespec64 ts;
3845
3846 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3847 ts = ktime_to_timespec64(sock_read_timestamp(sk));
3848 if (ts.tv_sec == -1)
3849 return -ENOENT;
3850 if (ts.tv_sec == 0) {
3851 ktime_t kt = ktime_get_real();
3852 sock_write_timestamp(sk, kt);
3853 ts = ktime_to_timespec64(kt);
3854 }
3855
3856 if (timeval)
3857 ts.tv_nsec /= 1000;
3858
3859#ifdef CONFIG_COMPAT_32BIT_TIME
3860 if (time32)
3861 return put_old_timespec32(&ts, userstamp);
3862#endif
3863#ifdef CONFIG_SPARC64
3864 /* beware of padding in sparc64 timeval */
3865 if (timeval && !in_compat_syscall()) {
3866 struct __kernel_old_timeval __user tv = {
3867 .tv_sec = ts.tv_sec,
3868 .tv_usec = ts.tv_nsec,
3869 };
3870 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3871 return -EFAULT;
3872 return 0;
3873 }
3874#endif
3875 return put_timespec64(&ts, userstamp);
3876}
3877EXPORT_SYMBOL(sock_gettstamp);
3878
3879void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3880{
3881 if (!sock_flag(sk, flag)) {
3882 unsigned long previous_flags = sk->sk_flags;
3883
3884 sock_set_flag(sk, flag);
3885 /*
3886 * we just set one of the two flags which require net
3887 * time stamping, but time stamping might have been on
3888 * already because of the other one
3889 */
3890 if (sock_needs_netstamp(sk) &&
3891 !(previous_flags & SK_FLAGS_TIMESTAMP))
3892 net_enable_timestamp();
3893 }
3894}
3895
3896int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3897 int level, int type)
3898{
3899 struct sock_extended_err ee;
3900 struct sk_buff *skb;
3901 int copied, err;
3902
3903 err = -EAGAIN;
3904 skb = sock_dequeue_err_skb(sk);
3905 if (skb == NULL)
3906 goto out;
3907
3908 copied = skb->len;
3909 if (copied > len) {
3910 msg->msg_flags |= MSG_TRUNC;
3911 copied = len;
3912 }
3913 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3914 if (err)
3915 goto out_free_skb;
3916
3917 sock_recv_timestamp(msg, sk, skb);
3918
3919 /* We must use a bounce buffer for CONFIG_HARDENED_USERCOPY=y */
3920 ee = SKB_EXT_ERR(skb)->ee;
3921 put_cmsg(msg, level, type, sizeof(ee), &ee);
3922
3923 msg->msg_flags |= MSG_ERRQUEUE;
3924 err = copied;
3925
3926out_free_skb:
3927 kfree_skb(skb);
3928out:
3929 return err;
3930}
3931EXPORT_SYMBOL(sock_recv_errqueue);
3932
3933/*
3934 * Get a socket option on a socket.
3935 *
3936 * FIX: POSIX 1003.1g is very ambiguous here. It states that
3937 * asynchronous errors should be reported by getsockopt. We assume
3938 * this means if you specify SO_ERROR (otherwise what is the point of it).
3939 */
3940int sock_common_getsockopt(struct socket *sock, int level, int optname,
3941 char __user *optval, int __user *optlen)
3942{
3943 struct sock *sk = sock->sk;
3944
3945 /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3946 return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3947}
3948EXPORT_SYMBOL(sock_common_getsockopt);
3949
3950int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3951 int flags)
3952{
3953 struct sock *sk = sock->sk;
3954 int addr_len = 0;
3955 int err;
3956
3957 err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3958 if (err >= 0)
3959 msg->msg_namelen = addr_len;
3960 return err;
3961}
3962EXPORT_SYMBOL(sock_common_recvmsg);
3963
3964/*
3965 * Set socket options on an inet socket.
3966 */
3967int sock_common_setsockopt(struct socket *sock, int level, int optname,
3968 sockptr_t optval, unsigned int optlen)
3969{
3970 struct sock *sk = sock->sk;
3971
3972 /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3973 return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3974}
3975EXPORT_SYMBOL(sock_common_setsockopt);
3976
3977void sk_common_release(struct sock *sk)
3978{
3979 if (sk->sk_prot->destroy)
3980 sk->sk_prot->destroy(sk);
3981
3982 /*
3983	 * Observation: when sk_common_release() is called, processes no longer
3984	 * have access to the socket, but the network stack still does.
3985 * Step one, detach it from networking:
3986 *
3987 * A. Remove from hash tables.
3988 */
3989
3990 sk->sk_prot->unhash(sk);
3991
3992 /*
3993	 * At this point the socket cannot receive new packets, but it is possible
3994	 * that some packets are still in flight because another CPU ran the receiver
3995	 * and did the hash table lookup before we unhashed the socket. They will
3996	 * reach the receive queue and be purged by the socket destructor.
3997	 *
3998	 * Also we still have packets pending on the receive queue and probably
3999	 * our own packets waiting in device queues. sock_destroy will drain the
4000	 * receive queue, but transmitted packets will delay socket destruction
4001	 * until the last reference is released.
4002 */
4003
4004 sock_orphan(sk);
4005
4006 xfrm_sk_free_policy(sk);
4007
4008 sock_put(sk);
4009}
4010EXPORT_SYMBOL(sk_common_release);
4011
4012void sk_get_meminfo(const struct sock *sk, u32 *mem)
4013{
4014 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
4015
4016 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
4017 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
4018 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
4019 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
4020 mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc);
4021 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
4022 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
4023 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
4024 mem[SK_MEMINFO_DROPS] = sk_drops_read(sk);
4025}
4026
4027#ifdef CONFIG_PROC_FS
4028static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
4029
4030int sock_prot_inuse_get(struct net *net, struct proto *prot)
4031{
4032 int cpu, idx = prot->inuse_idx;
4033 int res = 0;
4034
4035 for_each_possible_cpu(cpu)
4036 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
4037
4038 return res >= 0 ? res : 0;
4039}
4040EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
4041
4042int sock_inuse_get(struct net *net)
4043{
4044 int cpu, res = 0;
4045
4046 for_each_possible_cpu(cpu)
4047 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
4048
4049 return res;
4050}
4051
4052EXPORT_SYMBOL_GPL(sock_inuse_get);
4053
4054static int __net_init sock_inuse_init_net(struct net *net)
4055{
4056 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
4057 if (net->core.prot_inuse == NULL)
4058 return -ENOMEM;
4059 return 0;
4060}
4061
4062static void __net_exit sock_inuse_exit_net(struct net *net)
4063{
4064 free_percpu(net->core.prot_inuse);
4065}
4066
4067static struct pernet_operations net_inuse_ops = {
4068 .init = sock_inuse_init_net,
4069 .exit = sock_inuse_exit_net,
4070};
4071
4072static __init int net_inuse_init(void)
4073{
4074 if (register_pernet_subsys(&net_inuse_ops))
4075 panic("Cannot initialize net inuse counters");
4076
4077 return 0;
4078}
4079
4080core_initcall(net_inuse_init);
4081
4082static int assign_proto_idx(struct proto *prot)
4083{
4084 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
4085
4086 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) {
4087 pr_err("PROTO_INUSE_NR exhausted\n");
4088 return -ENOSPC;
4089 }
4090
4091 set_bit(prot->inuse_idx, proto_inuse_idx);
4092 return 0;
4093}
4094
4095static void release_proto_idx(struct proto *prot)
4096{
4097 if (prot->inuse_idx != PROTO_INUSE_NR)
4098 clear_bit(prot->inuse_idx, proto_inuse_idx);
4099}
4100#else
4101static inline int assign_proto_idx(struct proto *prot)
4102{
4103 return 0;
4104}
4105
4106static inline void release_proto_idx(struct proto *prot)
4107{
4108}
4109
4110#endif

static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
{
        if (!twsk_prot)
                return;
        kfree(twsk_prot->twsk_slab_name);
        twsk_prot->twsk_slab_name = NULL;
        kmem_cache_destroy(twsk_prot->twsk_slab);
        twsk_prot->twsk_slab = NULL;
}

static int tw_prot_init(const struct proto *prot)
{
        struct timewait_sock_ops *twsk_prot = prot->twsk_prot;

        if (!twsk_prot)
                return 0;

        twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
                                              prot->name);
        if (!twsk_prot->twsk_slab_name)
                return -ENOMEM;

        twsk_prot->twsk_slab =
                kmem_cache_create(twsk_prot->twsk_slab_name,
                                  twsk_prot->twsk_obj_size, 0,
                                  SLAB_ACCOUNT | prot->slab_flags,
                                  NULL);
        if (!twsk_prot->twsk_slab) {
                pr_crit("%s: Can't create timewait sock SLAB cache!\n",
                        prot->name);
                return -ENOMEM;
        }

        return 0;
}

static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
{
        if (!rsk_prot)
                return;
        kfree(rsk_prot->slab_name);
        rsk_prot->slab_name = NULL;
        kmem_cache_destroy(rsk_prot->slab);
        rsk_prot->slab = NULL;
}

static int req_prot_init(const struct proto *prot)
{
        struct request_sock_ops *rsk_prot = prot->rsk_prot;

        if (!rsk_prot)
                return 0;

        rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
                                        prot->name);
        if (!rsk_prot->slab_name)
                return -ENOMEM;

        rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
                                           rsk_prot->obj_size, 0,
                                           SLAB_ACCOUNT | prot->slab_flags,
                                           NULL);

        if (!rsk_prot->slab) {
                pr_crit("%s: Can't create request sock SLAB cache!\n",
                        prot->name);
                return -ENOMEM;
        }
        return 0;
}

int proto_register(struct proto *prot, int alloc_slab)
{
        int ret = -ENOBUFS;

        if (prot->memory_allocated && !prot->sysctl_mem) {
                pr_err("%s: missing sysctl_mem\n", prot->name);
                return -EINVAL;
        }
        if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
                pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
                return -EINVAL;
        }
        if (alloc_slab) {
                prot->slab = kmem_cache_create_usercopy(prot->name,
                                        prot->obj_size, 0,
                                        SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
                                        prot->slab_flags,
                                        prot->useroffset, prot->usersize,
                                        NULL);

                if (prot->slab == NULL) {
                        pr_crit("%s: Can't create sock SLAB cache!\n",
                                prot->name);
                        goto out;
                }

                if (req_prot_init(prot))
                        goto out_free_request_sock_slab;

                if (tw_prot_init(prot))
                        goto out_free_timewait_sock_slab;
        }

        mutex_lock(&proto_list_mutex);
        ret = assign_proto_idx(prot);
        if (ret) {
                mutex_unlock(&proto_list_mutex);
                goto out_free_timewait_sock_slab;
        }
        list_add(&prot->node, &proto_list);
        mutex_unlock(&proto_list_mutex);
        return ret;

out_free_timewait_sock_slab:
        if (alloc_slab)
                tw_prot_cleanup(prot->twsk_prot);
out_free_request_sock_slab:
        if (alloc_slab) {
                req_prot_cleanup(prot->rsk_prot);

                kmem_cache_destroy(prot->slab);
                prot->slab = NULL;
        }
out:
        return ret;
}
EXPORT_SYMBOL(proto_register);

void proto_unregister(struct proto *prot)
{
        mutex_lock(&proto_list_mutex);
        release_proto_idx(prot);
        list_del(&prot->node);
        mutex_unlock(&proto_list_mutex);

        kmem_cache_destroy(prot->slab);
        prot->slab = NULL;

        req_prot_cleanup(prot->rsk_prot);
        tw_prot_cleanup(prot->twsk_prot);
}
EXPORT_SYMBOL(proto_unregister);
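
/* Illustrative sketch (editorial addition): minimal use of proto_register()
 * and proto_unregister() for a hypothetical protocol. example_proto and
 * struct example_sock are invented for this sketch; a real protocol would
 * also provide .close/.sendmsg/.recvmsg etc. and register its socket family
 * separately.
 */
struct example_sock {
        struct sock sk;
        /* protocol-private fields would follow */
};

static struct proto example_proto __maybe_unused = {
        .name     = "EXAMPLE",
        .owner    = THIS_MODULE,
        .obj_size = sizeof(struct example_sock),
};

static int __maybe_unused example_proto_load(void)
{
        /* alloc_slab == 1: let the core create a dedicated kmem cache */
        return proto_register(&example_proto, 1);
}

static void __maybe_unused example_proto_unload(void)
{
        proto_unregister(&example_proto);
}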

int sock_load_diag_module(int family, int protocol)
{
        if (!protocol) {
                if (!sock_is_registered(family))
                        return -ENOENT;

                return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
                                      NETLINK_SOCK_DIAG, family);
        }

#ifdef CONFIG_INET
        if (family == AF_INET &&
            protocol != IPPROTO_RAW &&
            protocol < MAX_INET_PROTOS &&
            !rcu_access_pointer(inet_protos[protocol]))
                return -ENOENT;
#endif

        return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
                              NETLINK_SOCK_DIAG, family, protocol);
}
EXPORT_SYMBOL(sock_load_diag_module);
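
/* Editorial note (illustrative, not from the upstream file): the
 * request_module() strings built above are matched by aliases that diag
 * modules declare with MODULE_ALIAS_NET_PF_PROTO_TYPE() from <linux/net.h>,
 * for example (family value invented for illustration):
 *
 *      MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 27);
 *
 * so that sock_load_diag_module(27, 0) can load the handler on demand.
 */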

#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(proto_list_mutex)
{
        mutex_lock(&proto_list_mutex);
        return seq_list_start_head(&proto_list, *pos);
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        return seq_list_next(v, &proto_list, pos);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
        __releases(proto_list_mutex)
{
        mutex_unlock(&proto_list_mutex);
}

static char proto_method_implemented(const void *method)
{
        return method == NULL ? 'n' : 'y';
}

static long sock_prot_memory_allocated(struct proto *proto)
{
        return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
}

static const char *sock_prot_memory_pressure(struct proto *proto)
{
        return proto->memory_pressure != NULL ?
        proto_memory_pressure(proto) ? "yes" : "no" : "NI";
}

static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
        seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
                   proto->name,
                   proto->obj_size,
                   sock_prot_inuse_get(seq_file_net(seq), proto),
                   sock_prot_memory_allocated(proto),
                   sock_prot_memory_pressure(proto),
                   proto->max_header,
                   proto->slab == NULL ? "no" : "yes",
                   module_name(proto->owner),
                   proto_method_implemented(proto->close),
                   proto_method_implemented(proto->connect),
                   proto_method_implemented(proto->disconnect),
                   proto_method_implemented(proto->accept),
                   proto_method_implemented(proto->ioctl),
                   proto_method_implemented(proto->init),
                   proto_method_implemented(proto->destroy),
                   proto_method_implemented(proto->shutdown),
                   proto_method_implemented(proto->setsockopt),
                   proto_method_implemented(proto->getsockopt),
                   proto_method_implemented(proto->sendmsg),
                   proto_method_implemented(proto->recvmsg),
                   proto_method_implemented(proto->bind),
                   proto_method_implemented(proto->backlog_rcv),
                   proto_method_implemented(proto->hash),
                   proto_method_implemented(proto->unhash),
                   proto_method_implemented(proto->get_port),
                   proto_method_implemented(proto->enter_memory_pressure));
}

static int proto_seq_show(struct seq_file *seq, void *v)
{
        if (v == &proto_list)
                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
                           "protocol",
                           "size",
                           "sockets",
                           "memory",
                           "press",
                           "maxhdr",
                           "slab",
                           "module",
                           "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
        else
                proto_seq_printf(seq, list_entry(v, struct proto, node));
        return 0;
}

static const struct seq_operations proto_seq_ops = {
        .start  = proto_seq_start,
        .next   = proto_seq_next,
        .stop   = proto_seq_stop,
        .show   = proto_seq_show,
};

static __net_init int proto_init_net(struct net *net)
{
        if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
                             sizeof(struct seq_net_private)))
                return -ENOMEM;

        return 0;
}

static __net_exit void proto_exit_net(struct net *net)
{
        remove_proc_entry("protocols", net->proc_net);
}

static __net_initdata struct pernet_operations proto_net_ops = {
        .init = proto_init_net,
        .exit = proto_exit_net,
};

static int __init proto_init(void)
{
        return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* PROC_FS */

#ifdef CONFIG_NET_RX_BUSY_POLL
bool sk_busy_loop_end(void *p, unsigned long start_time)
{
        struct sock *sk = p;

        if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
                return true;

        if (sk_is_udp(sk) &&
            !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
                return true;

        return sk_busy_loop_timeout(sk, start_time);
}
EXPORT_SYMBOL(sk_busy_loop_end);
#endif /* CONFIG_NET_RX_BUSY_POLL */
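
#ifdef CONFIG_NET_RX_BUSY_POLL
/* Illustrative sketch (editorial addition): sk_busy_loop_end() is meant to be
 * used as a loop_end callback by the busy-polling helpers. This hypothetical
 * example_busy_wait() shows the calling convention only; it spins without
 * doing the NAPI polling a real busy-poll loop performs.
 */
static void __maybe_unused example_busy_wait(struct sock *sk)
{
        unsigned long start_time = busy_loop_current_time();

        while (!sk_busy_loop_end(sk, start_time))
                cpu_relax();
}
#endif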

int sock_bind_add(struct sock *sk, struct sockaddr_unsized *addr, int addr_len)
{
        if (!sk->sk_prot->bind_add)
                return -EOPNOTSUPP;
        return sk->sk_prot->bind_add(sk, addr, addr_len);
}
EXPORT_SYMBOL(sock_bind_add);

/* Copy 'size' bytes from userspace into a kernel buffer, run the protocol
 * ioctl handler on it, and copy the 'size'-byte result back to userspace.
 */
int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
                     void __user *arg, void *karg, size_t size)
{
        int ret;

        if (copy_from_user(karg, arg, size))
                return -EFAULT;

        ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
        if (ret)
                return ret;

        if (copy_to_user(arg, karg, size))
                return -EFAULT;

        return 0;
}
EXPORT_SYMBOL(sock_ioctl_inout);
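
/* Illustrative sketch (editorial addition): how a per-protocol ioctl
 * dispatcher might use sock_ioctl_inout() for a command that exchanges a
 * fixed-size structure with userspace. struct example_req and
 * example_sk_ioctl() are hypothetical.
 */
static int __maybe_unused example_sk_ioctl(struct sock *sk, unsigned int cmd,
                                           void __user *arg)
{
        struct example_req {
                u32 packets;
                u32 bytes;
        } req;

        /* copy in, run sk->sk_prot->ioctl() on kernel memory, copy back out */
        return sock_ioctl_inout(sk, cmd, arg, &req, sizeof(req));
}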

/* This is the most common ioctl prep function: the 4-byte result is copied
 * back to userspace if the ioctl() succeeds, and no input is copied from
 * userspace.
 */
static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
{
        int ret, karg = 0;

        ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
        if (ret)
                return ret;

        return put_user(karg, (int __user *)arg);
}

/* A wrapper around protocol ioctls which copies data in from userspace
 * (depending on the protocol/ioctl) and copies the result back to userspace.
 * Its main purpose is to let protocol ioctl callbacks operate on kernel
 * memory instead of userspace memory.
 */
int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
{
        int rc = 1;

        if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
                rc = ipmr_sk_ioctl(sk, cmd, arg);
        else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
                rc = ip6mr_sk_ioctl(sk, cmd, arg);
        else if (sk_is_phonet(sk))
                rc = phonet_sk_ioctl(sk, cmd, arg);

        /* If the ioctl was handled above, return its result */
        if (rc <= 0)
                return rc;

        /* Otherwise fall back to the default handler */
        return sock_ioctl_out(sk, cmd, arg);
}
EXPORT_SYMBOL(sk_ioctl);
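
/* Illustrative sketch (editorial addition): with sk_ioctl() handing kernel
 * memory to the protocol, a ->ioctl() callback following the int *karg
 * convention used by sock_ioctl_out() above simply fills in the integer it
 * is given. example_proto_ioctl() is hypothetical and ignores cmd for
 * brevity.
 */
static int __maybe_unused example_proto_ioctl(struct sock *sk, int cmd,
                                              int *karg)
{
        /* report bytes of queued receive memory; the caller copies it out */
        *karg = sk_rmem_alloc_get(sk);
        return 0;
}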

static int __init sock_struct_check(void)
{
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);

        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);

        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
#ifdef CONFIG_MEMCG
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
#endif

        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);

        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_err_soft);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);

        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_pending_confirm);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_status);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_uid);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_protocol);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndbuf);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
        return 0;
}

core_initcall(sock_struct_check);
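
/* Illustrative sketch (editorial addition): the CACHELINE_ASSERT_GROUP_MEMBER()
 * checks above pair with __cacheline_group_begin()/__cacheline_group_end()
 * markers placed inside the structure definition. A hypothetical struct using
 * the same idiom:
 */
struct example_hot_struct {
        __cacheline_group_begin(example_fast_path);
        u32 rx_count;
        u32 tx_count;
        __cacheline_group_end(example_fast_path);
        /* rarely-touched fields follow */
        char name[32];
};

static void __maybe_unused example_struct_check(void)
{
        /* compile-time failure if a field ever moves out of its group */
        CACHELINE_ASSERT_GROUP_MEMBER(struct example_hot_struct,
                                      example_fast_path, rx_count);
        CACHELINE_ASSERT_GROUP_MEMBER(struct example_hot_struct,
                                      example_fast_path, tx_count);
}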