Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * NET3 Protocol independent device support routines.
4 *
5 * Derived from the non IP parts of dev.c 1.0.19
6 * Authors: Ross Biro
7 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
8 * Mark Evans, <evansmp@uhura.aston.ac.uk>
9 *
10 * Additional Authors:
11 * Florian la Roche <rzsfl@rz.uni-sb.de>
12 * Alan Cox <gw4pts@gw4pts.ampr.org>
13 * David Hinds <dahinds@users.sourceforge.net>
14 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
15 * Adam Sulmicki <adam@cfar.umd.edu>
16 * Pekka Riikonen <priikone@poesidon.pspt.fi>
17 *
18 * Changes:
19 * D.J. Barrow : Fixed bug where dev->refcnt gets set
20 * to 2 if register_netdev gets called
21 * before net_dev_init & also removed a
22 * few lines of code in the process.
23 * Alan Cox : device private ioctl copies fields back.
24 * Alan Cox : Transmit queue code does relevant
25 * stunts to keep the queue safe.
26 * Alan Cox : Fixed double lock.
27 * Alan Cox : Fixed promisc NULL pointer trap
28 * ???????? : Support the full private ioctl range
29 * Alan Cox : Moved ioctl permission check into
30 * drivers
31 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
32 * Alan Cox : 100 backlog just doesn't cut it when
33 * you start doing multicast video 8)
34 * Alan Cox : Rewrote net_bh and list manager.
35 * Alan Cox : Fix ETH_P_ALL echoback lengths.
36 * Alan Cox : Took out transmit every packet pass
37 * Saved a few bytes in the ioctl handler
38 * Alan Cox : Network driver sets packet type before
39 * calling netif_rx. Saves a function
40 * call a packet.
41 * Alan Cox : Hashed net_bh()
42 * Richard Kooijman: Timestamp fixes.
43 * Alan Cox : Wrong field in SIOCGIFDSTADDR
44 * Alan Cox : Device lock protection.
45 * Alan Cox : Fixed nasty side effect of device close
46 * changes.
47 * Rudi Cilibrasi : Pass the right thing to
48 * set_mac_address()
49 * Dave Miller : 32bit quantity for the device lock to
50 * make it work out on a Sparc.
51 * Bjorn Ekwall : Added KERNELD hack.
52 * Alan Cox : Cleaned up the backlog initialise.
53 * Craig Metz : SIOCGIFCONF fix if space for under
54 * 1 device.
55 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
56 * is no device open function.
57 * Andi Kleen : Fix error reporting for SIOCGIFCONF
58 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
59 * Cyrus Durgin : Cleaned for KMOD
60 * Adam Sulmicki : Bug Fix : Network Device Unload
61 * A network device unload needs to purge
62 * the backlog queue.
63 * Paul Rusty Russell : SIOCSIFNAME
64 * Pekka Riikonen : Netdev boot-time settings code
65 * Andrew Morton : Make unregister_netdevice wait
66 * indefinitely on dev->refcnt
67 * J Hadi Salim : - Backlog queue sampling
68 * - netif_rx() feedback
69 */
70
71#include <linux/uaccess.h>
72#include <linux/bitmap.h>
73#include <linux/capability.h>
74#include <linux/cpu.h>
75#include <linux/types.h>
76#include <linux/kernel.h>
77#include <linux/hash.h>
78#include <linux/slab.h>
79#include <linux/sched.h>
80#include <linux/sched/isolation.h>
81#include <linux/sched/mm.h>
82#include <linux/smpboot.h>
83#include <linux/mutex.h>
84#include <linux/rwsem.h>
85#include <linux/string.h>
86#include <linux/mm.h>
87#include <linux/socket.h>
88#include <linux/sockios.h>
89#include <linux/errno.h>
90#include <linux/interrupt.h>
91#include <linux/if_ether.h>
92#include <linux/netdevice.h>
93#include <linux/etherdevice.h>
94#include <linux/ethtool.h>
95#include <linux/ethtool_netlink.h>
96#include <linux/skbuff.h>
97#include <linux/kthread.h>
98#include <linux/bpf.h>
99#include <linux/bpf_trace.h>
100#include <net/net_namespace.h>
101#include <net/sock.h>
102#include <net/busy_poll.h>
103#include <linux/rtnetlink.h>
104#include <linux/stat.h>
105#include <net/dsa.h>
106#include <net/dst.h>
107#include <net/dst_metadata.h>
108#include <net/gro.h>
109#include <net/netdev_queues.h>
110#include <net/pkt_sched.h>
111#include <net/pkt_cls.h>
112#include <net/checksum.h>
113#include <net/xfrm.h>
114#include <net/tcx.h>
115#include <linux/highmem.h>
116#include <linux/init.h>
117#include <linux/module.h>
118#include <linux/netpoll.h>
119#include <linux/rcupdate.h>
120#include <linux/delay.h>
121#include <net/iw_handler.h>
122#include <asm/current.h>
123#include <linux/audit.h>
124#include <linux/dmaengine.h>
125#include <linux/err.h>
126#include <linux/ctype.h>
127#include <linux/if_arp.h>
128#include <linux/if_vlan.h>
129#include <linux/ip.h>
130#include <net/ip.h>
131#include <net/mpls.h>
132#include <linux/ipv6.h>
133#include <linux/in.h>
134#include <linux/jhash.h>
135#include <linux/random.h>
136#include <trace/events/napi.h>
137#include <trace/events/net.h>
138#include <trace/events/skb.h>
139#include <trace/events/qdisc.h>
140#include <trace/events/xdp.h>
141#include <linux/inetdevice.h>
142#include <linux/cpu_rmap.h>
143#include <linux/static_key.h>
144#include <linux/hashtable.h>
145#include <linux/vmalloc.h>
146#include <linux/if_macvlan.h>
147#include <linux/errqueue.h>
148#include <linux/hrtimer.h>
149#include <linux/netfilter_netdev.h>
150#include <linux/crash_dump.h>
151#include <linux/sctp.h>
152#include <net/udp_tunnel.h>
153#include <linux/net_namespace.h>
154#include <linux/indirect_call_wrapper.h>
155#include <net/devlink.h>
156#include <linux/pm_runtime.h>
157#include <linux/prandom.h>
158#include <linux/once_lite.h>
159#include <net/netdev_rx_queue.h>
160#include <net/page_pool/types.h>
161#include <net/page_pool/helpers.h>
162#include <net/rps.h>
163#include <linux/phy_link_topology.h>
164
165#include "dev.h"
166#include "devmem.h"
167#include "net-sysfs.h"
168
169static DEFINE_SPINLOCK(ptype_lock);
170struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
171
172static int netif_rx_internal(struct sk_buff *skb);
173static int call_netdevice_notifiers_extack(unsigned long val,
174 struct net_device *dev,
175 struct netlink_ext_ack *extack);
176
177static DEFINE_MUTEX(ifalias_mutex);
178
179/* protects napi_hash addition/deletion and napi_gen_id */
180static DEFINE_SPINLOCK(napi_hash_lock);
181
182static unsigned int napi_gen_id = NR_CPUS;
183static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
184
185static inline void dev_base_seq_inc(struct net *net)
186{
187 unsigned int val = net->dev_base_seq + 1;
188
189 WRITE_ONCE(net->dev_base_seq, val ?: 1);
190}
191
192static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
193{
194 unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
195
196 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
197}
198
199static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
200{
201 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
202}
203
204#ifndef CONFIG_PREEMPT_RT
205
206static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key);
207
208static int __init setup_backlog_napi_threads(char *arg)
209{
210 static_branch_enable(&use_backlog_threads_key);
211 return 0;
212}
213early_param("thread_backlog_napi", setup_backlog_napi_threads);
214
215static bool use_backlog_threads(void)
216{
217 return static_branch_unlikely(&use_backlog_threads_key);
218}
219
220#else
221
222static bool use_backlog_threads(void)
223{
224 return true;
225}
226
227#endif
228
229static inline void backlog_lock_irq_save(struct softnet_data *sd,
230 unsigned long *flags)
231{
232 if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
233 spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
234 else
235 local_irq_save(*flags);
236}
237
238static inline void backlog_lock_irq_disable(struct softnet_data *sd)
239{
240 if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
241 spin_lock_irq(&sd->input_pkt_queue.lock);
242 else
243 local_irq_disable();
244}
245
246static inline void backlog_unlock_irq_restore(struct softnet_data *sd,
247 unsigned long *flags)
248{
249 if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
250 spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
251 else
252 local_irq_restore(*flags);
253}
254
255static inline void backlog_unlock_irq_enable(struct softnet_data *sd)
256{
257 if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
258 spin_unlock_irq(&sd->input_pkt_queue.lock);
259 else
260 local_irq_enable();
261}
262
263static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
264 const char *name)
265{
266 struct netdev_name_node *name_node;
267
268 name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
269 if (!name_node)
270 return NULL;
271 INIT_HLIST_NODE(&name_node->hlist);
272 name_node->dev = dev;
273 name_node->name = name;
274 return name_node;
275}
276
277static struct netdev_name_node *
278netdev_name_node_head_alloc(struct net_device *dev)
279{
280 struct netdev_name_node *name_node;
281
282 name_node = netdev_name_node_alloc(dev, dev->name);
283 if (!name_node)
284 return NULL;
285 INIT_LIST_HEAD(&name_node->list);
286 return name_node;
287}
288
289static void netdev_name_node_free(struct netdev_name_node *name_node)
290{
291 kfree(name_node);
292}
293
294static void netdev_name_node_add(struct net *net,
295 struct netdev_name_node *name_node)
296{
297 hlist_add_head_rcu(&name_node->hlist,
298 dev_name_hash(net, name_node->name));
299}
300
301static void netdev_name_node_del(struct netdev_name_node *name_node)
302{
303 hlist_del_rcu(&name_node->hlist);
304}
305
306static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
307 const char *name)
308{
309 struct hlist_head *head = dev_name_hash(net, name);
310 struct netdev_name_node *name_node;
311
312 hlist_for_each_entry(name_node, head, hlist)
313 if (!strcmp(name_node->name, name))
314 return name_node;
315 return NULL;
316}
317
318static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
319 const char *name)
320{
321 struct hlist_head *head = dev_name_hash(net, name);
322 struct netdev_name_node *name_node;
323
324 hlist_for_each_entry_rcu(name_node, head, hlist)
325 if (!strcmp(name_node->name, name))
326 return name_node;
327 return NULL;
328}
329
330bool netdev_name_in_use(struct net *net, const char *name)
331{
332 return netdev_name_node_lookup(net, name);
333}
334EXPORT_SYMBOL(netdev_name_in_use);
335
336int netdev_name_node_alt_create(struct net_device *dev, const char *name)
337{
338 struct netdev_name_node *name_node;
339 struct net *net = dev_net(dev);
340
341 name_node = netdev_name_node_lookup(net, name);
342 if (name_node)
343 return -EEXIST;
344 name_node = netdev_name_node_alloc(dev, name);
345 if (!name_node)
346 return -ENOMEM;
347 netdev_name_node_add(net, name_node);
348 /* The node that holds dev->name acts as a head of per-device list. */
349 list_add_tail_rcu(&name_node->list, &dev->name_node->list);
350
351 return 0;
352}
353
354static void netdev_name_node_alt_free(struct rcu_head *head)
355{
356 struct netdev_name_node *name_node =
357 container_of(head, struct netdev_name_node, rcu);
358
359 kfree(name_node->name);
360 netdev_name_node_free(name_node);
361}
362
363static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
364{
365 netdev_name_node_del(name_node);
366 list_del(&name_node->list);
367 call_rcu(&name_node->rcu, netdev_name_node_alt_free);
368}
369
370int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
371{
372 struct netdev_name_node *name_node;
373 struct net *net = dev_net(dev);
374
375 name_node = netdev_name_node_lookup(net, name);
376 if (!name_node)
377 return -ENOENT;
378 /* lookup might have found our primary name or a name belonging
379 * to another device.
380 */
381 if (name_node == dev->name_node || name_node->dev != dev)
382 return -EINVAL;
383
384 __netdev_name_node_alt_destroy(name_node);
385 return 0;
386}
387
388static void netdev_name_node_alt_flush(struct net_device *dev)
389{
390 struct netdev_name_node *name_node, *tmp;
391
392 list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) {
393 list_del(&name_node->list);
394 netdev_name_node_alt_free(&name_node->rcu);
395 }
396}
397
398/* Device list insertion */
399static void list_netdevice(struct net_device *dev)
400{
401 struct netdev_name_node *name_node;
402 struct net *net = dev_net(dev);
403
404 ASSERT_RTNL();
405
406 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
407 netdev_name_node_add(net, dev->name_node);
408 hlist_add_head_rcu(&dev->index_hlist,
409 dev_index_hash(net, dev->ifindex));
410
411 netdev_for_each_altname(dev, name_node)
412 netdev_name_node_add(net, name_node);
413
414 /* We reserved the ifindex, this can't fail */
415 WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL));
416
417 dev_base_seq_inc(net);
418}
419
420/* Device list removal
421 * caller must respect a RCU grace period before freeing/reusing dev
422 */
423static void unlist_netdevice(struct net_device *dev)
424{
425 struct netdev_name_node *name_node;
426 struct net *net = dev_net(dev);
427
428 ASSERT_RTNL();
429
430 xa_erase(&net->dev_by_index, dev->ifindex);
431
432 netdev_for_each_altname(dev, name_node)
433 netdev_name_node_del(name_node);
434
435 /* Unlink dev from the device chain */
436 list_del_rcu(&dev->dev_list);
437 netdev_name_node_del(dev->name_node);
438 hlist_del_rcu(&dev->index_hlist);
439
440 dev_base_seq_inc(dev_net(dev));
441}
442
443/*
444 * Our notifier list
445 */
446
447static RAW_NOTIFIER_HEAD(netdev_chain);
448
449/*
450 * Device drivers call our routines to queue packets here. We empty the
451 * queue in the local softnet handler.
452 */
453
454DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data) = {
455 .process_queue_bh_lock = INIT_LOCAL_LOCK(process_queue_bh_lock),
456};
457EXPORT_PER_CPU_SYMBOL(softnet_data);
458
459/* Page_pool has a lockless array/stack to alloc/recycle pages.
460 * PP consumers must pay attention to run APIs in the appropriate context
461 * (e.g. NAPI context).
462 */
463DEFINE_PER_CPU(struct page_pool *, system_page_pool);
464
465#ifdef CONFIG_LOCKDEP
466/*
467 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
468 * according to dev->type
469 */
470static const unsigned short netdev_lock_type[] = {
471 ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
472 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
473 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
474 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
475 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
476 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
477 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
478 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
479 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
480 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
481 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
482 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
483 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
484 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
485 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
486
487static const char *const netdev_lock_name[] = {
488 "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
489 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
490 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
491 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
492 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
493 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
494 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
495 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
496 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
497 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
498 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
499 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
500 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
501 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
502 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
503
504static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
505static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
506
507static inline unsigned short netdev_lock_pos(unsigned short dev_type)
508{
509 int i;
510
511 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
512 if (netdev_lock_type[i] == dev_type)
513 return i;
514 /* the last key is used by default */
515 return ARRAY_SIZE(netdev_lock_type) - 1;
516}
517
518static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
519 unsigned short dev_type)
520{
521 int i;
522
523 i = netdev_lock_pos(dev_type);
524 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
525 netdev_lock_name[i]);
526}
527
528static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
529{
530 int i;
531
532 i = netdev_lock_pos(dev->type);
533 lockdep_set_class_and_name(&dev->addr_list_lock,
534 &netdev_addr_lock_key[i],
535 netdev_lock_name[i]);
536}
537#else
538static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
539 unsigned short dev_type)
540{
541}
542
543static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
544{
545}
546#endif
547
548/*******************************************************************************
549 *
550 * Protocol management and registration routines
551 *
552 *******************************************************************************/
553
554
555/*
556 * Add a protocol ID to the list. Now that the input handler is
557 * smarter we can dispense with all the messy stuff that used to be
558 * here.
559 *
560 * BEWARE!!! Protocol handlers, mangling input packets,
561 * MUST BE last in hash buckets and checking protocol handlers
562 * MUST start from promiscuous ptype_all chain in net_bh.
563 * It is true now, do not change it.
564 * Explanation follows: if protocol handler, mangling packet, will
565 * be the first on list, it is not able to sense, that packet
566 * is cloned and should be copied-on-write, so that it will
567 * change it and subsequent readers will get broken packet.
568 * --ANK (980803)
569 */
570
571static inline struct list_head *ptype_head(const struct packet_type *pt)
572{
573 if (pt->type == htons(ETH_P_ALL))
574 return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all;
575 else
576 return pt->dev ? &pt->dev->ptype_specific :
577 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
578}
579
580/**
581 * dev_add_pack - add packet handler
582 * @pt: packet type declaration
583 *
584 * Add a protocol handler to the networking stack. The passed &packet_type
585 * is linked into kernel lists and may not be freed until it has been
586 * removed from the kernel lists.
587 *
588 * This call does not sleep therefore it can not
589 * guarantee all CPU's that are in middle of receiving packets
590 * will see the new packet type (until the next received packet).
591 */
592
593void dev_add_pack(struct packet_type *pt)
594{
595 struct list_head *head = ptype_head(pt);
596
597 spin_lock(&ptype_lock);
598 list_add_rcu(&pt->list, head);
599 spin_unlock(&ptype_lock);
600}
601EXPORT_SYMBOL(dev_add_pack);
602
603/**
604 * __dev_remove_pack - remove packet handler
605 * @pt: packet type declaration
606 *
607 * Remove a protocol handler that was previously added to the kernel
608 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
609 * from the kernel lists and can be freed or reused once this function
610 * returns.
611 *
612 * The packet type might still be in use by receivers
613 * and must not be freed until after all the CPU's have gone
614 * through a quiescent state.
615 */
616void __dev_remove_pack(struct packet_type *pt)
617{
618 struct list_head *head = ptype_head(pt);
619 struct packet_type *pt1;
620
621 spin_lock(&ptype_lock);
622
623 list_for_each_entry(pt1, head, list) {
624 if (pt == pt1) {
625 list_del_rcu(&pt->list);
626 goto out;
627 }
628 }
629
630 pr_warn("dev_remove_pack: %p not found\n", pt);
631out:
632 spin_unlock(&ptype_lock);
633}
634EXPORT_SYMBOL(__dev_remove_pack);
635
636/**
637 * dev_remove_pack - remove packet handler
638 * @pt: packet type declaration
639 *
640 * Remove a protocol handler that was previously added to the kernel
641 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
642 * from the kernel lists and can be freed or reused once this function
643 * returns.
644 *
645 * This call sleeps to guarantee that no CPU is looking at the packet
646 * type after return.
647 */
648void dev_remove_pack(struct packet_type *pt)
649{
650 __dev_remove_pack(pt);
651
652 synchronize_net();
653}
654EXPORT_SYMBOL(dev_remove_pack);
655
656
657/*******************************************************************************
658 *
659 * Device Interface Subroutines
660 *
661 *******************************************************************************/
662
663/**
664 * dev_get_iflink - get 'iflink' value of a interface
665 * @dev: targeted interface
666 *
667 * Indicates the ifindex the interface is linked to.
668 * Physical interfaces have the same 'ifindex' and 'iflink' values.
669 */
670
671int dev_get_iflink(const struct net_device *dev)
672{
673 if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
674 return dev->netdev_ops->ndo_get_iflink(dev);
675
676 return READ_ONCE(dev->ifindex);
677}
678EXPORT_SYMBOL(dev_get_iflink);
679
680/**
681 * dev_fill_metadata_dst - Retrieve tunnel egress information.
682 * @dev: targeted interface
683 * @skb: The packet.
684 *
685 * For better visibility of tunnel traffic OVS needs to retrieve
686 * egress tunnel information for a packet. Following API allows
687 * user to get this info.
688 */
689int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
690{
691 struct ip_tunnel_info *info;
692
693 if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
694 return -EINVAL;
695
696 info = skb_tunnel_info_unclone(skb);
697 if (!info)
698 return -ENOMEM;
699 if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
700 return -EINVAL;
701
702 return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
703}
704EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
705
706static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack)
707{
708 int k = stack->num_paths++;
709
710 if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX))
711 return NULL;
712
713 return &stack->path[k];
714}
715
716int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
717 struct net_device_path_stack *stack)
718{
719 const struct net_device *last_dev;
720 struct net_device_path_ctx ctx = {
721 .dev = dev,
722 };
723 struct net_device_path *path;
724 int ret = 0;
725
726 memcpy(ctx.daddr, daddr, sizeof(ctx.daddr));
727 stack->num_paths = 0;
728 while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) {
729 last_dev = ctx.dev;
730 path = dev_fwd_path(stack);
731 if (!path)
732 return -1;
733
734 memset(path, 0, sizeof(struct net_device_path));
735 ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path);
736 if (ret < 0)
737 return -1;
738
739 if (WARN_ON_ONCE(last_dev == ctx.dev))
740 return -1;
741 }
742
743 if (!ctx.dev)
744 return ret;
745
746 path = dev_fwd_path(stack);
747 if (!path)
748 return -1;
749 path->type = DEV_PATH_ETHERNET;
750 path->dev = ctx.dev;
751
752 return ret;
753}
754EXPORT_SYMBOL_GPL(dev_fill_forward_path);
755
756/* must be called under rcu_read_lock(), as we dont take a reference */
757static struct napi_struct *napi_by_id(unsigned int napi_id)
758{
759 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
760 struct napi_struct *napi;
761
762 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
763 if (napi->napi_id == napi_id)
764 return napi;
765
766 return NULL;
767}
768
769/* must be called under rcu_read_lock(), as we dont take a reference */
770static struct napi_struct *
771netdev_napi_by_id(struct net *net, unsigned int napi_id)
772{
773 struct napi_struct *napi;
774
775 napi = napi_by_id(napi_id);
776 if (!napi)
777 return NULL;
778
779 if (WARN_ON_ONCE(!napi->dev))
780 return NULL;
781 if (!net_eq(net, dev_net(napi->dev)))
782 return NULL;
783
784 return napi;
785}
786
787/**
788 * netdev_napi_by_id_lock() - find a device by NAPI ID and lock it
789 * @net: the applicable net namespace
790 * @napi_id: ID of a NAPI of a target device
791 *
792 * Find a NAPI instance with @napi_id. Lock its device.
793 * The device must be in %NETREG_REGISTERED state for lookup to succeed.
794 * netdev_unlock() must be called to release it.
795 *
796 * Return: pointer to NAPI, its device with lock held, NULL if not found.
797 */
798struct napi_struct *
799netdev_napi_by_id_lock(struct net *net, unsigned int napi_id)
800{
801 struct napi_struct *napi;
802 struct net_device *dev;
803
804 rcu_read_lock();
805 napi = netdev_napi_by_id(net, napi_id);
806 if (!napi || READ_ONCE(napi->dev->reg_state) != NETREG_REGISTERED) {
807 rcu_read_unlock();
808 return NULL;
809 }
810
811 dev = napi->dev;
812 dev_hold(dev);
813 rcu_read_unlock();
814
815 dev = __netdev_put_lock(dev);
816 if (!dev)
817 return NULL;
818
819 rcu_read_lock();
820 napi = netdev_napi_by_id(net, napi_id);
821 if (napi && napi->dev != dev)
822 napi = NULL;
823 rcu_read_unlock();
824
825 if (!napi)
826 netdev_unlock(dev);
827 return napi;
828}
829
830/**
831 * __dev_get_by_name - find a device by its name
832 * @net: the applicable net namespace
833 * @name: name to find
834 *
835 * Find an interface by name. Must be called under RTNL semaphore.
836 * If the name is found a pointer to the device is returned.
837 * If the name is not found then %NULL is returned. The
838 * reference counters are not incremented so the caller must be
839 * careful with locks.
840 */
841
842struct net_device *__dev_get_by_name(struct net *net, const char *name)
843{
844 struct netdev_name_node *node_name;
845
846 node_name = netdev_name_node_lookup(net, name);
847 return node_name ? node_name->dev : NULL;
848}
849EXPORT_SYMBOL(__dev_get_by_name);
850
851/**
852 * dev_get_by_name_rcu - find a device by its name
853 * @net: the applicable net namespace
854 * @name: name to find
855 *
856 * Find an interface by name.
857 * If the name is found a pointer to the device is returned.
858 * If the name is not found then %NULL is returned.
859 * The reference counters are not incremented so the caller must be
860 * careful with locks. The caller must hold RCU lock.
861 */
862
863struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
864{
865 struct netdev_name_node *node_name;
866
867 node_name = netdev_name_node_lookup_rcu(net, name);
868 return node_name ? node_name->dev : NULL;
869}
870EXPORT_SYMBOL(dev_get_by_name_rcu);
871
872/* Deprecated for new users, call netdev_get_by_name() instead */
873struct net_device *dev_get_by_name(struct net *net, const char *name)
874{
875 struct net_device *dev;
876
877 rcu_read_lock();
878 dev = dev_get_by_name_rcu(net, name);
879 dev_hold(dev);
880 rcu_read_unlock();
881 return dev;
882}
883EXPORT_SYMBOL(dev_get_by_name);
884
885/**
886 * netdev_get_by_name() - find a device by its name
887 * @net: the applicable net namespace
888 * @name: name to find
889 * @tracker: tracking object for the acquired reference
890 * @gfp: allocation flags for the tracker
891 *
892 * Find an interface by name. This can be called from any
893 * context and does its own locking. The returned handle has
894 * the usage count incremented and the caller must use netdev_put() to
895 * release it when it is no longer needed. %NULL is returned if no
896 * matching device is found.
897 */
898struct net_device *netdev_get_by_name(struct net *net, const char *name,
899 netdevice_tracker *tracker, gfp_t gfp)
900{
901 struct net_device *dev;
902
903 dev = dev_get_by_name(net, name);
904 if (dev)
905 netdev_tracker_alloc(dev, tracker, gfp);
906 return dev;
907}
908EXPORT_SYMBOL(netdev_get_by_name);
909
910/**
911 * __dev_get_by_index - find a device by its ifindex
912 * @net: the applicable net namespace
913 * @ifindex: index of device
914 *
915 * Search for an interface by index. Returns %NULL if the device
916 * is not found or a pointer to the device. The device has not
917 * had its reference counter increased so the caller must be careful
918 * about locking. The caller must hold the RTNL semaphore.
919 */
920
921struct net_device *__dev_get_by_index(struct net *net, int ifindex)
922{
923 struct net_device *dev;
924 struct hlist_head *head = dev_index_hash(net, ifindex);
925
926 hlist_for_each_entry(dev, head, index_hlist)
927 if (dev->ifindex == ifindex)
928 return dev;
929
930 return NULL;
931}
932EXPORT_SYMBOL(__dev_get_by_index);
933
934/**
935 * dev_get_by_index_rcu - find a device by its ifindex
936 * @net: the applicable net namespace
937 * @ifindex: index of device
938 *
939 * Search for an interface by index. Returns %NULL if the device
940 * is not found or a pointer to the device. The device has not
941 * had its reference counter increased so the caller must be careful
942 * about locking. The caller must hold RCU lock.
943 */
944
945struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
946{
947 struct net_device *dev;
948 struct hlist_head *head = dev_index_hash(net, ifindex);
949
950 hlist_for_each_entry_rcu(dev, head, index_hlist)
951 if (dev->ifindex == ifindex)
952 return dev;
953
954 return NULL;
955}
956EXPORT_SYMBOL(dev_get_by_index_rcu);
957
958/* Deprecated for new users, call netdev_get_by_index() instead */
959struct net_device *dev_get_by_index(struct net *net, int ifindex)
960{
961 struct net_device *dev;
962
963 rcu_read_lock();
964 dev = dev_get_by_index_rcu(net, ifindex);
965 dev_hold(dev);
966 rcu_read_unlock();
967 return dev;
968}
969EXPORT_SYMBOL(dev_get_by_index);
970
971/**
972 * netdev_get_by_index() - find a device by its ifindex
973 * @net: the applicable net namespace
974 * @ifindex: index of device
975 * @tracker: tracking object for the acquired reference
976 * @gfp: allocation flags for the tracker
977 *
978 * Search for an interface by index. Returns NULL if the device
979 * is not found or a pointer to the device. The device returned has
980 * had a reference added and the pointer is safe until the user calls
981 * netdev_put() to indicate they have finished with it.
982 */
983struct net_device *netdev_get_by_index(struct net *net, int ifindex,
984 netdevice_tracker *tracker, gfp_t gfp)
985{
986 struct net_device *dev;
987
988 dev = dev_get_by_index(net, ifindex);
989 if (dev)
990 netdev_tracker_alloc(dev, tracker, gfp);
991 return dev;
992}
993EXPORT_SYMBOL(netdev_get_by_index);
994
995/**
996 * dev_get_by_napi_id - find a device by napi_id
997 * @napi_id: ID of the NAPI struct
998 *
999 * Search for an interface by NAPI ID. Returns %NULL if the device
1000 * is not found or a pointer to the device. The device has not had
1001 * its reference counter increased so the caller must be careful
1002 * about locking. The caller must hold RCU lock.
1003 */
1004struct net_device *dev_get_by_napi_id(unsigned int napi_id)
1005{
1006 struct napi_struct *napi;
1007
1008 WARN_ON_ONCE(!rcu_read_lock_held());
1009
1010 if (napi_id < MIN_NAPI_ID)
1011 return NULL;
1012
1013 napi = napi_by_id(napi_id);
1014
1015 return napi ? napi->dev : NULL;
1016}
1017
1018/* Release the held reference on the net_device, and if the net_device
1019 * is still registered try to lock the instance lock. If device is being
1020 * unregistered NULL will be returned (but the reference has been released,
1021 * either way!)
1022 *
1023 * This helper is intended for locking net_device after it has been looked up
1024 * using a lockless lookup helper. Lock prevents the instance from going away.
1025 */
1026struct net_device *__netdev_put_lock(struct net_device *dev)
1027{
1028 netdev_lock(dev);
1029 if (dev->reg_state > NETREG_REGISTERED) {
1030 netdev_unlock(dev);
1031 dev_put(dev);
1032 return NULL;
1033 }
1034 dev_put(dev);
1035 return dev;
1036}
1037
1038/**
1039 * netdev_get_by_index_lock() - find a device by its ifindex
1040 * @net: the applicable net namespace
1041 * @ifindex: index of device
1042 *
1043 * Search for an interface by index. If a valid device
1044 * with @ifindex is found it will be returned with netdev->lock held.
1045 * netdev_unlock() must be called to release it.
1046 *
1047 * Return: pointer to a device with lock held, NULL if not found.
1048 */
1049struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex)
1050{
1051 struct net_device *dev;
1052
1053 dev = dev_get_by_index(net, ifindex);
1054 if (!dev)
1055 return NULL;
1056
1057 return __netdev_put_lock(dev);
1058}
1059
1060struct net_device *
1061netdev_xa_find_lock(struct net *net, struct net_device *dev,
1062 unsigned long *index)
1063{
1064 if (dev)
1065 netdev_unlock(dev);
1066
1067 do {
1068 rcu_read_lock();
1069 dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT);
1070 if (!dev) {
1071 rcu_read_unlock();
1072 return NULL;
1073 }
1074 dev_hold(dev);
1075 rcu_read_unlock();
1076
1077 dev = __netdev_put_lock(dev);
1078 if (dev)
1079 return dev;
1080
1081 (*index)++;
1082 } while (true);
1083}
1084
1085static DEFINE_SEQLOCK(netdev_rename_lock);
1086
1087void netdev_copy_name(struct net_device *dev, char *name)
1088{
1089 unsigned int seq;
1090
1091 do {
1092 seq = read_seqbegin(&netdev_rename_lock);
1093 strscpy(name, dev->name, IFNAMSIZ);
1094 } while (read_seqretry(&netdev_rename_lock, seq));
1095}
1096
1097/**
1098 * netdev_get_name - get a netdevice name, knowing its ifindex.
1099 * @net: network namespace
1100 * @name: a pointer to the buffer where the name will be stored.
1101 * @ifindex: the ifindex of the interface to get the name from.
1102 */
1103int netdev_get_name(struct net *net, char *name, int ifindex)
1104{
1105 struct net_device *dev;
1106 int ret;
1107
1108 rcu_read_lock();
1109
1110 dev = dev_get_by_index_rcu(net, ifindex);
1111 if (!dev) {
1112 ret = -ENODEV;
1113 goto out;
1114 }
1115
1116 netdev_copy_name(dev, name);
1117
1118 ret = 0;
1119out:
1120 rcu_read_unlock();
1121 return ret;
1122}
1123
1124static bool dev_addr_cmp(struct net_device *dev, unsigned short type,
1125 const char *ha)
1126{
1127 return dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len);
1128}
1129
1130/**
1131 * dev_getbyhwaddr_rcu - find a device by its hardware address
1132 * @net: the applicable net namespace
1133 * @type: media type of device
1134 * @ha: hardware address
1135 *
1136 * Search for an interface by MAC address. Returns NULL if the device
1137 * is not found or a pointer to the device.
1138 * The caller must hold RCU.
1139 * The returned device has not had its ref count increased
1140 * and the caller must therefore be careful about locking
1141 *
1142 */
1143
1144struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
1145 const char *ha)
1146{
1147 struct net_device *dev;
1148
1149 for_each_netdev_rcu(net, dev)
1150 if (dev_addr_cmp(dev, type, ha))
1151 return dev;
1152
1153 return NULL;
1154}
1155EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
1156
1157/**
1158 * dev_getbyhwaddr() - find a device by its hardware address
1159 * @net: the applicable net namespace
1160 * @type: media type of device
1161 * @ha: hardware address
1162 *
1163 * Similar to dev_getbyhwaddr_rcu(), but the owner needs to hold
1164 * rtnl_lock.
1165 *
1166 * Context: rtnl_lock() must be held.
1167 * Return: pointer to the net_device, or NULL if not found
1168 */
1169struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type,
1170 const char *ha)
1171{
1172 struct net_device *dev;
1173
1174 ASSERT_RTNL();
1175 for_each_netdev(net, dev)
1176 if (dev_addr_cmp(dev, type, ha))
1177 return dev;
1178
1179 return NULL;
1180}
1181EXPORT_SYMBOL(dev_getbyhwaddr);
1182
1183struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
1184{
1185 struct net_device *dev, *ret = NULL;
1186
1187 rcu_read_lock();
1188 for_each_netdev_rcu(net, dev)
1189 if (dev->type == type) {
1190 dev_hold(dev);
1191 ret = dev;
1192 break;
1193 }
1194 rcu_read_unlock();
1195 return ret;
1196}
1197EXPORT_SYMBOL(dev_getfirstbyhwtype);
1198
1199/**
1200 * __dev_get_by_flags - find any device with given flags
1201 * @net: the applicable net namespace
1202 * @if_flags: IFF_* values
1203 * @mask: bitmask of bits in if_flags to check
1204 *
1205 * Search for any interface with the given flags. Returns NULL if a device
1206 * is not found or a pointer to the device. Must be called inside
1207 * rtnl_lock(), and result refcount is unchanged.
1208 */
1209
1210struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
1211 unsigned short mask)
1212{
1213 struct net_device *dev, *ret;
1214
1215 ASSERT_RTNL();
1216
1217 ret = NULL;
1218 for_each_netdev(net, dev) {
1219 if (((dev->flags ^ if_flags) & mask) == 0) {
1220 ret = dev;
1221 break;
1222 }
1223 }
1224 return ret;
1225}
1226EXPORT_SYMBOL(__dev_get_by_flags);
1227
1228/**
1229 * dev_valid_name - check if name is okay for network device
1230 * @name: name string
1231 *
1232 * Network device names need to be valid file names to
1233 * allow sysfs to work. We also disallow any kind of
1234 * whitespace.
1235 */
1236bool dev_valid_name(const char *name)
1237{
1238 if (*name == '\0')
1239 return false;
1240 if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
1241 return false;
1242 if (!strcmp(name, ".") || !strcmp(name, ".."))
1243 return false;
1244
1245 while (*name) {
1246 if (*name == '/' || *name == ':' || isspace(*name))
1247 return false;
1248 name++;
1249 }
1250 return true;
1251}
1252EXPORT_SYMBOL(dev_valid_name);
1253
1254/**
1255 * __dev_alloc_name - allocate a name for a device
1256 * @net: network namespace to allocate the device name in
1257 * @name: name format string
1258 * @res: result name string
1259 *
1260 * Passed a format string - eg "lt%d" it will try and find a suitable
1261 * id. It scans list of devices to build up a free map, then chooses
1262 * the first empty slot. The caller must hold the dev_base or rtnl lock
1263 * while allocating the name and adding the device in order to avoid
1264 * duplicates.
1265 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1266 * Returns the number of the unit assigned or a negative errno code.
1267 */
1268
1269static int __dev_alloc_name(struct net *net, const char *name, char *res)
1270{
1271 int i = 0;
1272 const char *p;
1273 const int max_netdevices = 8*PAGE_SIZE;
1274 unsigned long *inuse;
1275 struct net_device *d;
1276 char buf[IFNAMSIZ];
1277
1278 /* Verify the string as this thing may have come from the user.
1279 * There must be one "%d" and no other "%" characters.
1280 */
1281 p = strchr(name, '%');
1282 if (!p || p[1] != 'd' || strchr(p + 2, '%'))
1283 return -EINVAL;
1284
1285 /* Use one page as a bit array of possible slots */
1286 inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC);
1287 if (!inuse)
1288 return -ENOMEM;
1289
1290 for_each_netdev(net, d) {
1291 struct netdev_name_node *name_node;
1292
1293 netdev_for_each_altname(d, name_node) {
1294 if (!sscanf(name_node->name, name, &i))
1295 continue;
1296 if (i < 0 || i >= max_netdevices)
1297 continue;
1298
1299 /* avoid cases where sscanf is not exact inverse of printf */
1300 snprintf(buf, IFNAMSIZ, name, i);
1301 if (!strncmp(buf, name_node->name, IFNAMSIZ))
1302 __set_bit(i, inuse);
1303 }
1304 if (!sscanf(d->name, name, &i))
1305 continue;
1306 if (i < 0 || i >= max_netdevices)
1307 continue;
1308
1309 /* avoid cases where sscanf is not exact inverse of printf */
1310 snprintf(buf, IFNAMSIZ, name, i);
1311 if (!strncmp(buf, d->name, IFNAMSIZ))
1312 __set_bit(i, inuse);
1313 }
1314
1315 i = find_first_zero_bit(inuse, max_netdevices);
1316 bitmap_free(inuse);
1317 if (i == max_netdevices)
1318 return -ENFILE;
1319
1320 /* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */
1321 strscpy(buf, name, IFNAMSIZ);
1322 snprintf(res, IFNAMSIZ, buf, i);
1323 return i;
1324}
1325
1326/* Returns negative errno or allocated unit id (see __dev_alloc_name()) */
1327static int dev_prep_valid_name(struct net *net, struct net_device *dev,
1328 const char *want_name, char *out_name,
1329 int dup_errno)
1330{
1331 if (!dev_valid_name(want_name))
1332 return -EINVAL;
1333
1334 if (strchr(want_name, '%'))
1335 return __dev_alloc_name(net, want_name, out_name);
1336
1337 if (netdev_name_in_use(net, want_name))
1338 return -dup_errno;
1339 if (out_name != want_name)
1340 strscpy(out_name, want_name, IFNAMSIZ);
1341 return 0;
1342}
1343
1344/**
1345 * dev_alloc_name - allocate a name for a device
1346 * @dev: device
1347 * @name: name format string
1348 *
1349 * Passed a format string - eg "lt%d" it will try and find a suitable
1350 * id. It scans list of devices to build up a free map, then chooses
1351 * the first empty slot. The caller must hold the dev_base or rtnl lock
1352 * while allocating the name and adding the device in order to avoid
1353 * duplicates.
1354 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1355 * Returns the number of the unit assigned or a negative errno code.
1356 */
1357
1358int dev_alloc_name(struct net_device *dev, const char *name)
1359{
1360 return dev_prep_valid_name(dev_net(dev), dev, name, dev->name, ENFILE);
1361}
1362EXPORT_SYMBOL(dev_alloc_name);
1363
1364static int dev_get_valid_name(struct net *net, struct net_device *dev,
1365 const char *name)
1366{
1367 int ret;
1368
1369 ret = dev_prep_valid_name(net, dev, name, dev->name, EEXIST);
1370 return ret < 0 ? ret : 0;
1371}
1372
1373/**
1374 * dev_change_name - change name of a device
1375 * @dev: device
1376 * @newname: name (or format string) must be at least IFNAMSIZ
1377 *
1378 * Change name of a device, can pass format strings "eth%d".
1379 * for wildcarding.
1380 */
1381int dev_change_name(struct net_device *dev, const char *newname)
1382{
1383 struct net *net = dev_net(dev);
1384 unsigned char old_assign_type;
1385 char oldname[IFNAMSIZ];
1386 int err = 0;
1387 int ret;
1388
1389 ASSERT_RTNL_NET(net);
1390
1391 if (!strncmp(newname, dev->name, IFNAMSIZ))
1392 return 0;
1393
1394 memcpy(oldname, dev->name, IFNAMSIZ);
1395
1396 write_seqlock_bh(&netdev_rename_lock);
1397 err = dev_get_valid_name(net, dev, newname);
1398 write_sequnlock_bh(&netdev_rename_lock);
1399
1400 if (err < 0)
1401 return err;
1402
1403 if (oldname[0] && !strchr(oldname, '%'))
1404 netdev_info(dev, "renamed from %s%s\n", oldname,
1405 dev->flags & IFF_UP ? " (while UP)" : "");
1406
1407 old_assign_type = dev->name_assign_type;
1408 WRITE_ONCE(dev->name_assign_type, NET_NAME_RENAMED);
1409
1410rollback:
1411 ret = device_rename(&dev->dev, dev->name);
1412 if (ret) {
1413 write_seqlock_bh(&netdev_rename_lock);
1414 memcpy(dev->name, oldname, IFNAMSIZ);
1415 write_sequnlock_bh(&netdev_rename_lock);
1416 WRITE_ONCE(dev->name_assign_type, old_assign_type);
1417 return ret;
1418 }
1419
1420 netdev_adjacent_rename_links(dev, oldname);
1421
1422 netdev_name_node_del(dev->name_node);
1423
1424 synchronize_net();
1425
1426 netdev_name_node_add(net, dev->name_node);
1427
1428 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1429 ret = notifier_to_errno(ret);
1430
1431 if (ret) {
1432 /* err >= 0 after dev_alloc_name() or stores the first errno */
1433 if (err >= 0) {
1434 err = ret;
1435 write_seqlock_bh(&netdev_rename_lock);
1436 memcpy(dev->name, oldname, IFNAMSIZ);
1437 write_sequnlock_bh(&netdev_rename_lock);
1438 memcpy(oldname, newname, IFNAMSIZ);
1439 WRITE_ONCE(dev->name_assign_type, old_assign_type);
1440 old_assign_type = NET_NAME_RENAMED;
1441 goto rollback;
1442 } else {
1443 netdev_err(dev, "name change rollback failed: %d\n",
1444 ret);
1445 }
1446 }
1447
1448 return err;
1449}
1450
1451/**
1452 * dev_set_alias - change ifalias of a device
1453 * @dev: device
1454 * @alias: name up to IFALIASZ
1455 * @len: limit of bytes to copy from info
1456 *
1457 * Set ifalias for a device,
1458 */
1459int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1460{
1461 struct dev_ifalias *new_alias = NULL;
1462
1463 if (len >= IFALIASZ)
1464 return -EINVAL;
1465
1466 if (len) {
1467 new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
1468 if (!new_alias)
1469 return -ENOMEM;
1470
1471 memcpy(new_alias->ifalias, alias, len);
1472 new_alias->ifalias[len] = 0;
1473 }
1474
1475 mutex_lock(&ifalias_mutex);
1476 new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
1477 mutex_is_locked(&ifalias_mutex));
1478 mutex_unlock(&ifalias_mutex);
1479
1480 if (new_alias)
1481 kfree_rcu(new_alias, rcuhead);
1482
1483 return len;
1484}
1485EXPORT_SYMBOL(dev_set_alias);
1486
1487/**
1488 * dev_get_alias - get ifalias of a device
1489 * @dev: device
1490 * @name: buffer to store name of ifalias
1491 * @len: size of buffer
1492 *
1493 * get ifalias for a device. Caller must make sure dev cannot go
1494 * away, e.g. rcu read lock or own a reference count to device.
1495 */
1496int dev_get_alias(const struct net_device *dev, char *name, size_t len)
1497{
1498 const struct dev_ifalias *alias;
1499 int ret = 0;
1500
1501 rcu_read_lock();
1502 alias = rcu_dereference(dev->ifalias);
1503 if (alias)
1504 ret = snprintf(name, len, "%s", alias->ifalias);
1505 rcu_read_unlock();
1506
1507 return ret;
1508}
1509
1510/**
1511 * netdev_features_change - device changes features
1512 * @dev: device to cause notification
1513 *
1514 * Called to indicate a device has changed features.
1515 */
1516void netdev_features_change(struct net_device *dev)
1517{
1518 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1519}
1520EXPORT_SYMBOL(netdev_features_change);
1521
1522/**
1523 * netdev_state_change - device changes state
1524 * @dev: device to cause notification
1525 *
1526 * Called to indicate a device has changed state. This function calls
1527 * the notifier chains for netdev_chain and sends a NEWLINK message
1528 * to the routing socket.
1529 */
1530void netdev_state_change(struct net_device *dev)
1531{
1532 if (dev->flags & IFF_UP) {
1533 struct netdev_notifier_change_info change_info = {
1534 .info.dev = dev,
1535 };
1536
1537 call_netdevice_notifiers_info(NETDEV_CHANGE,
1538 &change_info.info);
1539 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL);
1540 }
1541}
1542EXPORT_SYMBOL(netdev_state_change);
1543
1544/**
1545 * __netdev_notify_peers - notify network peers about existence of @dev,
1546 * to be called when rtnl lock is already held.
1547 * @dev: network device
1548 *
1549 * Generate traffic such that interested network peers are aware of
1550 * @dev, such as by generating a gratuitous ARP. This may be used when
1551 * a device wants to inform the rest of the network about some sort of
1552 * reconfiguration such as a failover event or virtual machine
1553 * migration.
1554 */
1555void __netdev_notify_peers(struct net_device *dev)
1556{
1557 ASSERT_RTNL();
1558 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1559 call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
1560}
1561EXPORT_SYMBOL(__netdev_notify_peers);
1562
1563/**
1564 * netdev_notify_peers - notify network peers about existence of @dev
1565 * @dev: network device
1566 *
1567 * Generate traffic such that interested network peers are aware of
1568 * @dev, such as by generating a gratuitous ARP. This may be used when
1569 * a device wants to inform the rest of the network about some sort of
1570 * reconfiguration such as a failover event or virtual machine
1571 * migration.
1572 */
1573void netdev_notify_peers(struct net_device *dev)
1574{
1575 rtnl_lock();
1576 __netdev_notify_peers(dev);
1577 rtnl_unlock();
1578}
1579EXPORT_SYMBOL(netdev_notify_peers);
1580
1581static int napi_threaded_poll(void *data);
1582
1583static int napi_kthread_create(struct napi_struct *n)
1584{
1585 int err = 0;
1586
1587 /* Create and wake up the kthread once to put it in
1588 * TASK_INTERRUPTIBLE mode to avoid the blocked task
1589 * warning and work with loadavg.
1590 */
1591 n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
1592 n->dev->name, n->napi_id);
1593 if (IS_ERR(n->thread)) {
1594 err = PTR_ERR(n->thread);
1595 pr_err("kthread_run failed with err %d\n", err);
1596 n->thread = NULL;
1597 }
1598
1599 return err;
1600}
1601
1602static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1603{
1604 const struct net_device_ops *ops = dev->netdev_ops;
1605 int ret;
1606
1607 ASSERT_RTNL();
1608 dev_addr_check(dev);
1609
1610 if (!netif_device_present(dev)) {
1611 /* may be detached because parent is runtime-suspended */
1612 if (dev->dev.parent)
1613 pm_runtime_resume(dev->dev.parent);
1614 if (!netif_device_present(dev))
1615 return -ENODEV;
1616 }
1617
1618 /* Block netpoll from trying to do any rx path servicing.
1619 * If we don't do this there is a chance ndo_poll_controller
1620 * or ndo_poll may be running while we open the device
1621 */
1622 netpoll_poll_disable(dev);
1623
1624 ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
1625 ret = notifier_to_errno(ret);
1626 if (ret)
1627 return ret;
1628
1629 set_bit(__LINK_STATE_START, &dev->state);
1630
1631 if (ops->ndo_validate_addr)
1632 ret = ops->ndo_validate_addr(dev);
1633
1634 if (!ret && ops->ndo_open)
1635 ret = ops->ndo_open(dev);
1636
1637 netpoll_poll_enable(dev);
1638
1639 if (ret)
1640 clear_bit(__LINK_STATE_START, &dev->state);
1641 else {
1642 netif_set_up(dev, true);
1643 dev_set_rx_mode(dev);
1644 dev_activate(dev);
1645 add_device_randomness(dev->dev_addr, dev->addr_len);
1646 }
1647
1648 return ret;
1649}
1650
1651/**
1652 * dev_open - prepare an interface for use.
1653 * @dev: device to open
1654 * @extack: netlink extended ack
1655 *
1656 * Takes a device from down to up state. The device's private open
1657 * function is invoked and then the multicast lists are loaded. Finally
1658 * the device is moved into the up state and a %NETDEV_UP message is
1659 * sent to the netdev notifier chain.
1660 *
1661 * Calling this function on an active interface is a nop. On a failure
1662 * a negative errno code is returned.
1663 */
1664int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1665{
1666 int ret;
1667
1668 if (dev->flags & IFF_UP)
1669 return 0;
1670
1671 ret = __dev_open(dev, extack);
1672 if (ret < 0)
1673 return ret;
1674
1675 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
1676 call_netdevice_notifiers(NETDEV_UP, dev);
1677
1678 return ret;
1679}
1680EXPORT_SYMBOL(dev_open);
1681
1682static void __dev_close_many(struct list_head *head)
1683{
1684 struct net_device *dev;
1685
1686 ASSERT_RTNL();
1687 might_sleep();
1688
1689 list_for_each_entry(dev, head, close_list) {
1690 /* Temporarily disable netpoll until the interface is down */
1691 netpoll_poll_disable(dev);
1692
1693 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1694
1695 clear_bit(__LINK_STATE_START, &dev->state);
1696
1697 /* Synchronize to scheduled poll. We cannot touch poll list, it
1698 * can be even on different cpu. So just clear netif_running().
1699 *
1700 * dev->stop() will invoke napi_disable() on all of it's
1701 * napi_struct instances on this device.
1702 */
1703 smp_mb__after_atomic(); /* Commit netif_running(). */
1704 }
1705
1706 dev_deactivate_many(head);
1707
1708 list_for_each_entry(dev, head, close_list) {
1709 const struct net_device_ops *ops = dev->netdev_ops;
1710
1711 /*
1712 * Call the device specific close. This cannot fail.
1713 * Only if device is UP
1714 *
1715 * We allow it to be called even after a DETACH hot-plug
1716 * event.
1717 */
1718 if (ops->ndo_stop)
1719 ops->ndo_stop(dev);
1720
1721 netif_set_up(dev, false);
1722 netpoll_poll_enable(dev);
1723 }
1724}
1725
1726static void __dev_close(struct net_device *dev)
1727{
1728 LIST_HEAD(single);
1729
1730 list_add(&dev->close_list, &single);
1731 __dev_close_many(&single);
1732 list_del(&single);
1733}
1734
1735void dev_close_many(struct list_head *head, bool unlink)
1736{
1737 struct net_device *dev, *tmp;
1738
1739 /* Remove the devices that don't need to be closed */
1740 list_for_each_entry_safe(dev, tmp, head, close_list)
1741 if (!(dev->flags & IFF_UP))
1742 list_del_init(&dev->close_list);
1743
1744 __dev_close_many(head);
1745
1746 list_for_each_entry_safe(dev, tmp, head, close_list) {
1747 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
1748 call_netdevice_notifiers(NETDEV_DOWN, dev);
1749 if (unlink)
1750 list_del_init(&dev->close_list);
1751 }
1752}
1753EXPORT_SYMBOL(dev_close_many);
1754
1755/**
1756 * dev_close - shutdown an interface.
1757 * @dev: device to shutdown
1758 *
1759 * This function moves an active device into down state. A
1760 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1761 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1762 * chain.
1763 */
1764void dev_close(struct net_device *dev)
1765{
1766 if (dev->flags & IFF_UP) {
1767 LIST_HEAD(single);
1768
1769 list_add(&dev->close_list, &single);
1770 dev_close_many(&single, true);
1771 list_del(&single);
1772 }
1773}
1774EXPORT_SYMBOL(dev_close);
1775
1776
1777/**
1778 * dev_disable_lro - disable Large Receive Offload on a device
1779 * @dev: device
1780 *
1781 * Disable Large Receive Offload (LRO) on a net device. Must be
1782 * called under RTNL. This is needed if received packets may be
1783 * forwarded to another interface.
1784 */
1785void dev_disable_lro(struct net_device *dev)
1786{
1787 struct net_device *lower_dev;
1788 struct list_head *iter;
1789
1790 dev->wanted_features &= ~NETIF_F_LRO;
1791 netdev_update_features(dev);
1792
1793 if (unlikely(dev->features & NETIF_F_LRO))
1794 netdev_WARN(dev, "failed to disable LRO!\n");
1795
1796 netdev_for_each_lower_dev(dev, lower_dev, iter)
1797 dev_disable_lro(lower_dev);
1798}
1799EXPORT_SYMBOL(dev_disable_lro);
1800
1801/**
1802 * dev_disable_gro_hw - disable HW Generic Receive Offload on a device
1803 * @dev: device
1804 *
1805 * Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be
1806 * called under RTNL. This is needed if Generic XDP is installed on
1807 * the device.
1808 */
1809static void dev_disable_gro_hw(struct net_device *dev)
1810{
1811 dev->wanted_features &= ~NETIF_F_GRO_HW;
1812 netdev_update_features(dev);
1813
1814 if (unlikely(dev->features & NETIF_F_GRO_HW))
1815 netdev_WARN(dev, "failed to disable GRO_HW!\n");
1816}
1817
1818const char *netdev_cmd_to_name(enum netdev_cmd cmd)
1819{
1820#define N(val) \
1821 case NETDEV_##val: \
1822 return "NETDEV_" __stringify(val);
1823 switch (cmd) {
1824 N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
1825 N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
1826 N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
1827 N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN)
1828 N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA)
1829 N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE)
1830 N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
1831 N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
1832 N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
1833 N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
1834 N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
1835 N(XDP_FEAT_CHANGE)
1836 }
1837#undef N
1838 return "UNKNOWN_NETDEV_EVENT";
1839}
1840EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
1841
1842static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1843 struct net_device *dev)
1844{
1845 struct netdev_notifier_info info = {
1846 .dev = dev,
1847 };
1848
1849 return nb->notifier_call(nb, val, &info);
1850}
1851
1852static int call_netdevice_register_notifiers(struct notifier_block *nb,
1853 struct net_device *dev)
1854{
1855 int err;
1856
1857 err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1858 err = notifier_to_errno(err);
1859 if (err)
1860 return err;
1861
1862 if (!(dev->flags & IFF_UP))
1863 return 0;
1864
1865 call_netdevice_notifier(nb, NETDEV_UP, dev);
1866 return 0;
1867}
1868
1869static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
1870 struct net_device *dev)
1871{
1872 if (dev->flags & IFF_UP) {
1873 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1874 dev);
1875 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1876 }
1877 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1878}
1879
1880static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
1881 struct net *net)
1882{
1883 struct net_device *dev;
1884 int err;
1885
1886 for_each_netdev(net, dev) {
1887 err = call_netdevice_register_notifiers(nb, dev);
1888 if (err)
1889 goto rollback;
1890 }
1891 return 0;
1892
1893rollback:
1894 for_each_netdev_continue_reverse(net, dev)
1895 call_netdevice_unregister_notifiers(nb, dev);
1896 return err;
1897}
1898
1899static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
1900 struct net *net)
1901{
1902 struct net_device *dev;
1903
1904 for_each_netdev(net, dev)
1905 call_netdevice_unregister_notifiers(nb, dev);
1906}
1907
1908static int dev_boot_phase = 1;
1909
1910/**
1911 * register_netdevice_notifier - register a network notifier block
1912 * @nb: notifier
1913 *
1914 * Register a notifier to be called when network device events occur.
1915 * The notifier passed is linked into the kernel structures and must
1916 * not be reused until it has been unregistered. A negative errno code
1917 * is returned on a failure.
1918 *
1919 * When registered, all registration and up events are replayed
1920 * to the new notifier to allow the caller to have a race-free
1921 * view of the network device list.
1922 */
1923
1924int register_netdevice_notifier(struct notifier_block *nb)
1925{
1926 struct net *net;
1927 int err;
1928
1929 /* Close race with setup_net() and cleanup_net() */
1930 down_write(&pernet_ops_rwsem);
1931
1932 /* When RTNL is removed, we need protection for netdev_chain. */
1933 rtnl_lock();
1934
1935 err = raw_notifier_chain_register(&netdev_chain, nb);
1936 if (err)
1937 goto unlock;
1938 if (dev_boot_phase)
1939 goto unlock;
1940 for_each_net(net) {
1941 __rtnl_net_lock(net);
1942 err = call_netdevice_register_net_notifiers(nb, net);
1943 __rtnl_net_unlock(net);
1944 if (err)
1945 goto rollback;
1946 }
1947
1948unlock:
1949 rtnl_unlock();
1950 up_write(&pernet_ops_rwsem);
1951 return err;
1952
1953rollback:
1954 for_each_net_continue_reverse(net) {
1955 __rtnl_net_lock(net);
1956 call_netdevice_unregister_net_notifiers(nb, net);
1957 __rtnl_net_unlock(net);
1958 }
1959
1960 raw_notifier_chain_unregister(&netdev_chain, nb);
1961 goto unlock;
1962}
1963EXPORT_SYMBOL(register_netdevice_notifier);
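
/*
 * Example: a minimal usage sketch of this API (illustrative only; the names
 * foo_netdev_event() and foo_netdev_nb are hypothetical and not part of the
 * kernel source):
 *
 *	static int foo_netdev_event(struct notifier_block *nb,
 *				    unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		switch (event) {
 *		case NETDEV_REGISTER:
 *		case NETDEV_UP:
 *			netdev_dbg(dev, "device registered or brought up\n");
 *			break;
 *		}
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block foo_netdev_nb = {
 *		.notifier_call = foo_netdev_event,
 *	};
 *
 *	err = register_netdevice_notifier(&foo_netdev_nb);
 *
 * Because registration and up events are replayed, foo_netdev_event() sees
 * every existing device as well as devices registered later.
 */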
1964
1965/**
1966 * unregister_netdevice_notifier - unregister a network notifier block
1967 * @nb: notifier
1968 *
1969 * Unregister a notifier previously registered by
1970 * register_netdevice_notifier(). The notifier is unlinked from the
1971 * kernel structures and may then be reused. A negative errno code
1972 * is returned on a failure.
1973 *
1974 * After unregistering, unregister and down device events are synthesized
1975 * for all devices on the device list and delivered to the removed
1976 * notifier, removing the need for special-case cleanup code.
1977 */
1978
1979int unregister_netdevice_notifier(struct notifier_block *nb)
1980{
1981 struct net *net;
1982 int err;
1983
1984 /* Close race with setup_net() and cleanup_net() */
1985 down_write(&pernet_ops_rwsem);
1986 rtnl_lock();
1987 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1988 if (err)
1989 goto unlock;
1990
1991 for_each_net(net) {
1992 __rtnl_net_lock(net);
1993 call_netdevice_unregister_net_notifiers(nb, net);
1994 __rtnl_net_unlock(net);
1995 }
1996
1997unlock:
1998 rtnl_unlock();
1999 up_write(&pernet_ops_rwsem);
2000 return err;
2001}
2002EXPORT_SYMBOL(unregister_netdevice_notifier);
2003
2004static int __register_netdevice_notifier_net(struct net *net,
2005 struct notifier_block *nb,
2006 bool ignore_call_fail)
2007{
2008 int err;
2009
2010 err = raw_notifier_chain_register(&net->netdev_chain, nb);
2011 if (err)
2012 return err;
2013 if (dev_boot_phase)
2014 return 0;
2015
2016 err = call_netdevice_register_net_notifiers(nb, net);
2017 if (err && !ignore_call_fail)
2018 goto chain_unregister;
2019
2020 return 0;
2021
2022chain_unregister:
2023 raw_notifier_chain_unregister(&net->netdev_chain, nb);
2024 return err;
2025}
2026
2027static int __unregister_netdevice_notifier_net(struct net *net,
2028 struct notifier_block *nb)
2029{
2030 int err;
2031
2032 err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
2033 if (err)
2034 return err;
2035
2036 call_netdevice_unregister_net_notifiers(nb, net);
2037 return 0;
2038}
2039
2040/**
2041 * register_netdevice_notifier_net - register a per-netns network notifier block
2042 * @net: network namespace
2043 * @nb: notifier
2044 *
2045 * Register a notifier to be called when network device events occur.
2046 * The notifier passed is linked into the kernel structures and must
2047 * not be reused until it has been unregistered. A negative errno code
2048 * is returned on a failure.
2049 *
2050 * When registered, all registration and up events are replayed
2051 * to the new notifier to allow the caller to have a race-free
2052 * view of the network device list.
2053 */
2054
2055int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
2056{
2057 int err;
2058
2059 rtnl_net_lock(net);
2060 err = __register_netdevice_notifier_net(net, nb, false);
2061 rtnl_net_unlock(net);
2062
2063 return err;
2064}
2065EXPORT_SYMBOL(register_netdevice_notifier_net);
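
/*
 * Example (illustrative): a caller that only cares about devices in the
 * initial namespace can register against &init_net and avoid being invoked
 * for every other netns:
 *
 *	err = register_netdevice_notifier_net(&init_net, &foo_netdev_nb);
 *	...
 *	unregister_netdevice_notifier_net(&init_net, &foo_netdev_nb);
 *
 * (foo_netdev_nb is the hypothetical notifier block from the sketch above.)
 */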
2066
2067/**
2068 * unregister_netdevice_notifier_net - unregister a per-netns
2069 * network notifier block
2070 * @net: network namespace
2071 * @nb: notifier
2072 *
2073 * Unregister a notifier previously registered by
2074 * register_netdevice_notifier_net(). The notifier is unlinked from the
2075 * kernel structures and may then be reused. A negative errno code
2076 * is returned on a failure.
2077 *
2078 * After unregistering, unregister and down device events are synthesized
2079 * for all devices on the device list and delivered to the removed
2080 * notifier, removing the need for special-case cleanup code.
2081 */
2082
2083int unregister_netdevice_notifier_net(struct net *net,
2084 struct notifier_block *nb)
2085{
2086 int err;
2087
2088 rtnl_net_lock(net);
2089 err = __unregister_netdevice_notifier_net(net, nb);
2090 rtnl_net_unlock(net);
2091
2092 return err;
2093}
2094EXPORT_SYMBOL(unregister_netdevice_notifier_net);
2095
2096static void __move_netdevice_notifier_net(struct net *src_net,
2097 struct net *dst_net,
2098 struct notifier_block *nb)
2099{
2100 __unregister_netdevice_notifier_net(src_net, nb);
2101 __register_netdevice_notifier_net(dst_net, nb, true);
2102}
2103
2104static void rtnl_net_dev_lock(struct net_device *dev)
2105{
2106 bool again;
2107
2108 do {
2109 struct net *net;
2110
2111 again = false;
2112
2113 /* netns might be being dismantled. */
2114 rcu_read_lock();
2115 net = dev_net_rcu(dev);
2116 net_passive_inc(net);
2117 rcu_read_unlock();
2118
2119 rtnl_net_lock(net);
2120
2121#ifdef CONFIG_NET_NS
2122 /* dev might have been moved to another netns. */
2123 if (!net_eq(net, rcu_access_pointer(dev->nd_net.net))) {
2124 rtnl_net_unlock(net);
2125 net_passive_dec(net);
2126 again = true;
2127 }
2128#endif
2129 } while (again);
2130}
2131
2132static void rtnl_net_dev_unlock(struct net_device *dev)
2133{
2134 struct net *net = dev_net(dev);
2135
2136 rtnl_net_unlock(net);
2137 net_passive_dec(net);
2138}
2139
2140int register_netdevice_notifier_dev_net(struct net_device *dev,
2141 struct notifier_block *nb,
2142 struct netdev_net_notifier *nn)
2143{
2144 struct net *net = dev_net(dev);
2145 int err;
2146
2147 /* rtnl_net_lock() assumes dev is not yet published by
2148 * register_netdevice().
2149 */
2150 DEBUG_NET_WARN_ON_ONCE(!list_empty(&dev->dev_list));
2151
2152 rtnl_net_lock(net);
2153 err = __register_netdevice_notifier_net(net, nb, false);
2154 if (!err) {
2155 nn->nb = nb;
2156 list_add(&nn->list, &dev->net_notifier_list);
2157 }
2158 rtnl_net_unlock(net);
2159
2160 return err;
2161}
2162EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
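
/*
 * Example (illustrative): a driver that wants per-netns notifications which
 * follow its own device across namespace moves embeds a
 * struct netdev_net_notifier next to its notifier_block, typically in its
 * private data, and registers both before register_netdevice():
 *
 *	struct foo_priv {				(hypothetical)
 *		struct notifier_block		nb;
 *		struct netdev_net_notifier	nn;
 *	};
 *
 *	err = register_netdevice_notifier_dev_net(dev, &priv->nb, &priv->nn);
 *
 * When the device later moves to another namespace, the core re-registers
 * the notifier there via move_netdevice_notifiers_dev_net().
 */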
2163
2164int unregister_netdevice_notifier_dev_net(struct net_device *dev,
2165 struct notifier_block *nb,
2166 struct netdev_net_notifier *nn)
2167{
2168 int err;
2169
2170 rtnl_net_dev_lock(dev);
2171 list_del(&nn->list);
2172 err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
2173 rtnl_net_dev_unlock(dev);
2174
2175 return err;
2176}
2177EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
2178
2179static void move_netdevice_notifiers_dev_net(struct net_device *dev,
2180 struct net *net)
2181{
2182 struct netdev_net_notifier *nn;
2183
2184 list_for_each_entry(nn, &dev->net_notifier_list, list)
2185 __move_netdevice_notifier_net(dev_net(dev), net, nn->nb);
2186}
2187
2188/**
2189 * call_netdevice_notifiers_info - call all network notifier blocks
2190 * @val: value passed unmodified to notifier function
2191 * @info: notifier information data
2192 *
2193 * Call all network notifier blocks. Parameters and return value
2194 * are as for raw_notifier_call_chain().
2195 */
2196
2197int call_netdevice_notifiers_info(unsigned long val,
2198 struct netdev_notifier_info *info)
2199{
2200 struct net *net = dev_net(info->dev);
2201 int ret;
2202
2203 ASSERT_RTNL();
2204
2205 /* Run per-netns notifier block chain first, then run the global one.
2206 * Hopefully, one day, the global one is going to be removed after
2207	 * all notifier block registrants are converted to be per-netns.
2208 */
2209 ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
2210 if (ret & NOTIFY_STOP_MASK)
2211 return ret;
2212 return raw_notifier_call_chain(&netdev_chain, val, info);
2213}
2214
2215/**
2216 * call_netdevice_notifiers_info_robust - call per-netns notifier blocks
2217 * and roll back on error
2218 * @val_up: value passed unmodified to notifier function
2219 * @val_down: value passed unmodified to the notifier function when
2220 * recovering from an error on @val_up
2221 * @info: notifier information data
2222 *
2223 * Call all per-netns network notifier blocks, but not notifier blocks on
2224 * the global notifier chain. Parameters and return value are as for
2225 * raw_notifier_call_chain_robust().
2226 */
2227
2228static int
2229call_netdevice_notifiers_info_robust(unsigned long val_up,
2230 unsigned long val_down,
2231 struct netdev_notifier_info *info)
2232{
2233 struct net *net = dev_net(info->dev);
2234
2235 ASSERT_RTNL();
2236
2237 return raw_notifier_call_chain_robust(&net->netdev_chain,
2238 val_up, val_down, info);
2239}
2240
2241static int call_netdevice_notifiers_extack(unsigned long val,
2242 struct net_device *dev,
2243 struct netlink_ext_ack *extack)
2244{
2245 struct netdev_notifier_info info = {
2246 .dev = dev,
2247 .extack = extack,
2248 };
2249
2250 return call_netdevice_notifiers_info(val, &info);
2251}
2252
2253/**
2254 * call_netdevice_notifiers - call all network notifier blocks
2255 * @val: value passed unmodified to notifier function
2256 * @dev: net_device pointer passed unmodified to notifier function
2257 *
2258 * Call all network notifier blocks. Parameters and return value
2259 * are as for raw_notifier_call_chain().
2260 */
2261
2262int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
2263{
2264 return call_netdevice_notifiers_extack(val, dev, NULL);
2265}
2266EXPORT_SYMBOL(call_netdevice_notifiers);
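
/*
 * Example (illustrative): stacked or virtual drivers use this to tell the
 * rest of the stack about state they changed themselves, for instance to
 * ask neighbours to be refreshed after a migration or failover:
 *
 *	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
 *
 * The call must be made under RTNL, as enforced by the ASSERT_RTNL() in
 * call_netdevice_notifiers_info().
 */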
2267
2268/**
2269 * call_netdevice_notifiers_mtu - call all network notifier blocks
2270 * @val: value passed unmodified to notifier function
2271 * @dev: net_device pointer passed unmodified to notifier function
2272 * @arg: additional u32 argument passed to the notifier function
2273 *
2274 * Call all network notifier blocks. Parameters and return value
2275 * are as for raw_notifier_call_chain().
2276 */
2277static int call_netdevice_notifiers_mtu(unsigned long val,
2278 struct net_device *dev, u32 arg)
2279{
2280 struct netdev_notifier_info_ext info = {
2281 .info.dev = dev,
2282 .ext.mtu = arg,
2283 };
2284
2285 BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
2286
2287 return call_netdevice_notifiers_info(val, &info.info);
2288}
2289
2290#ifdef CONFIG_NET_INGRESS
2291static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
2292
2293void net_inc_ingress_queue(void)
2294{
2295 static_branch_inc(&ingress_needed_key);
2296}
2297EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
2298
2299void net_dec_ingress_queue(void)
2300{
2301 static_branch_dec(&ingress_needed_key);
2302}
2303EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
2304#endif
2305
2306#ifdef CONFIG_NET_EGRESS
2307static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
2308
2309void net_inc_egress_queue(void)
2310{
2311 static_branch_inc(&egress_needed_key);
2312}
2313EXPORT_SYMBOL_GPL(net_inc_egress_queue);
2314
2315void net_dec_egress_queue(void)
2316{
2317 static_branch_dec(&egress_needed_key);
2318}
2319EXPORT_SYMBOL_GPL(net_dec_egress_queue);
2320#endif
2321
2322#ifdef CONFIG_NET_CLS_ACT
2323DEFINE_STATIC_KEY_FALSE(tcf_sw_enabled_key);
2324EXPORT_SYMBOL(tcf_sw_enabled_key);
2325#endif
2326
2327DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
2328EXPORT_SYMBOL(netstamp_needed_key);
2329#ifdef CONFIG_JUMP_LABEL
2330static atomic_t netstamp_needed_deferred;
2331static atomic_t netstamp_wanted;
2332static void netstamp_clear(struct work_struct *work)
2333{
2334 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
2335 int wanted;
2336
2337 wanted = atomic_add_return(deferred, &netstamp_wanted);
2338 if (wanted > 0)
2339 static_branch_enable(&netstamp_needed_key);
2340 else
2341 static_branch_disable(&netstamp_needed_key);
2342}
2343static DECLARE_WORK(netstamp_work, netstamp_clear);
2344#endif
2345
2346void net_enable_timestamp(void)
2347{
2348#ifdef CONFIG_JUMP_LABEL
2349 int wanted = atomic_read(&netstamp_wanted);
2350
2351 while (wanted > 0) {
2352 if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted + 1))
2353 return;
2354 }
2355 atomic_inc(&netstamp_needed_deferred);
2356 schedule_work(&netstamp_work);
2357#else
2358 static_branch_inc(&netstamp_needed_key);
2359#endif
2360}
2361EXPORT_SYMBOL(net_enable_timestamp);
2362
2363void net_disable_timestamp(void)
2364{
2365#ifdef CONFIG_JUMP_LABEL
2366 int wanted = atomic_read(&netstamp_wanted);
2367
2368 while (wanted > 1) {
2369 if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted - 1))
2370 return;
2371 }
2372 atomic_dec(&netstamp_needed_deferred);
2373 schedule_work(&netstamp_work);
2374#else
2375 static_branch_dec(&netstamp_needed_key);
2376#endif
2377}
2378EXPORT_SYMBOL(net_disable_timestamp);
2379
2380static inline void net_timestamp_set(struct sk_buff *skb)
2381{
2382 skb->tstamp = 0;
2383 skb->tstamp_type = SKB_CLOCK_REALTIME;
2384 if (static_branch_unlikely(&netstamp_needed_key))
2385 skb->tstamp = ktime_get_real();
2386}
2387
2388#define net_timestamp_check(COND, SKB) \
2389 if (static_branch_unlikely(&netstamp_needed_key)) { \
2390 if ((COND) && !(SKB)->tstamp) \
2391 (SKB)->tstamp = ktime_get_real(); \
2392 } \
2393
2394bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
2395{
2396 return __is_skb_forwardable(dev, skb, true);
2397}
2398EXPORT_SYMBOL_GPL(is_skb_forwardable);
2399
2400static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb,
2401 bool check_mtu)
2402{
2403 int ret = ____dev_forward_skb(dev, skb, check_mtu);
2404
2405 if (likely(!ret)) {
2406 skb->protocol = eth_type_trans(skb, dev);
2407 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
2408 }
2409
2410 return ret;
2411}
2412
2413int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2414{
2415 return __dev_forward_skb2(dev, skb, true);
2416}
2417EXPORT_SYMBOL_GPL(__dev_forward_skb);
2418
2419/**
2420 * dev_forward_skb - loopback an skb to another netif
2421 *
2422 * @dev: destination network device
2423 * @skb: buffer to forward
2424 *
2425 * return values:
2426 * NET_RX_SUCCESS (no congestion)
2427 * NET_RX_DROP (packet was dropped, but freed)
2428 *
2429 * dev_forward_skb can be used for injecting an skb from the
2430 * start_xmit function of one device into the receive queue
2431 * of another device.
2432 *
2433 * The receiving device may be in another namespace, so
2434 * we have to clear all information in the skb that could
2435 * impact namespace isolation.
2436 */
2437int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2438{
2439 return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
2440}
2441EXPORT_SYMBOL_GPL(dev_forward_skb);
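
/*
 * Example (illustrative): a hypothetical pair/loopback-style driver can hand
 * a packet from its own ndo_start_xmit() to its peer's receive path; the
 * helper foo_get_peer() below is made up for the sketch:
 *
 *	static netdev_tx_t foo_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct net_device *peer = foo_get_peer(dev);
 *
 *		if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
 *			dev->stats.tx_dropped++;
 *		return NETDEV_TX_OK;
 *	}
 *
 * On NET_RX_DROP the skb has already been freed, so the caller must not
 * touch it afterwards.
 */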
2442
2443int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb)
2444{
2445 return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb);
2446}
2447
2448static inline int deliver_skb(struct sk_buff *skb,
2449 struct packet_type *pt_prev,
2450 struct net_device *orig_dev)
2451{
2452 if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
2453 return -ENOMEM;
2454 refcount_inc(&skb->users);
2455 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2456}
2457
2458static inline void deliver_ptype_list_skb(struct sk_buff *skb,
2459 struct packet_type **pt,
2460 struct net_device *orig_dev,
2461 __be16 type,
2462 struct list_head *ptype_list)
2463{
2464 struct packet_type *ptype, *pt_prev = *pt;
2465
2466 list_for_each_entry_rcu(ptype, ptype_list, list) {
2467 if (ptype->type != type)
2468 continue;
2469 if (pt_prev)
2470 deliver_skb(skb, pt_prev, orig_dev);
2471 pt_prev = ptype;
2472 }
2473 *pt = pt_prev;
2474}
2475
2476static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
2477{
2478 if (!ptype->af_packet_priv || !skb->sk)
2479 return false;
2480
2481 if (ptype->id_match)
2482 return ptype->id_match(ptype, skb->sk);
2483 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
2484 return true;
2485
2486 return false;
2487}
2488
2489/**
2490 * dev_nit_active - return true if any network interface taps are in use
2491 *
2492 * @dev: network device to check for the presence of taps
2493 */
2494bool dev_nit_active(struct net_device *dev)
2495{
2496 return !list_empty(&net_hotdata.ptype_all) ||
2497 !list_empty(&dev->ptype_all);
2498}
2499EXPORT_SYMBOL_GPL(dev_nit_active);
2500
2501/*
2502 * Support routine. Sends outgoing frames to any network
2503 * taps currently in use.
2504 */
2505
2506void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
2507{
2508 struct list_head *ptype_list = &net_hotdata.ptype_all;
2509 struct packet_type *ptype, *pt_prev = NULL;
2510 struct sk_buff *skb2 = NULL;
2511
2512 rcu_read_lock();
2513again:
2514 list_for_each_entry_rcu(ptype, ptype_list, list) {
2515 if (READ_ONCE(ptype->ignore_outgoing))
2516 continue;
2517
2518 /* Never send packets back to the socket
2519 * they originated from - MvS (miquels@drinkel.ow.org)
2520 */
2521 if (skb_loop_sk(ptype, skb))
2522 continue;
2523
2524 if (pt_prev) {
2525 deliver_skb(skb2, pt_prev, skb->dev);
2526 pt_prev = ptype;
2527 continue;
2528 }
2529
2530 /* need to clone skb, done only once */
2531 skb2 = skb_clone(skb, GFP_ATOMIC);
2532 if (!skb2)
2533 goto out_unlock;
2534
2535 net_timestamp_set(skb2);
2536
2537 /* The network header should have been set correctly by the
2538 * sender, so the check below is just protection against
2539 * buggy protocols.
2540 */
2541 skb_reset_mac_header(skb2);
2542
2543 if (skb_network_header(skb2) < skb2->data ||
2544 skb_network_header(skb2) > skb_tail_pointer(skb2)) {
2545 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
2546 ntohs(skb2->protocol),
2547 dev->name);
2548 skb_reset_network_header(skb2);
2549 }
2550
2551 skb2->transport_header = skb2->network_header;
2552 skb2->pkt_type = PACKET_OUTGOING;
2553 pt_prev = ptype;
2554 }
2555
2556 if (ptype_list == &net_hotdata.ptype_all) {
2557 ptype_list = &dev->ptype_all;
2558 goto again;
2559 }
2560out_unlock:
2561 if (pt_prev) {
2562 if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
2563 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
2564 else
2565 kfree_skb(skb2);
2566 }
2567 rcu_read_unlock();
2568}
2569EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
2570
2571/**
2572 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
2573 * @dev: Network device
2574 * @txq: number of queues available
2575 *
2576 * If real_num_tx_queues is changed, the tc mappings may no longer be
2577 * valid. To resolve this, verify that each tc mapping is still valid
2578 * and, if not, zero the mapping; with no priorities mapping to that
2579 * offset/count pair it will no longer be used. In the worst case, when
2580 * TC0 itself is invalid, nothing can be done, so priority mappings are
2581 * disabled entirely. It is expected that drivers will fix an invalid
2582 * mapping, if they can, before calling netif_set_real_num_tx_queues.
2583 */
2584static void netif_setup_tc(struct net_device *dev, unsigned int txq)
2585{
2586 int i;
2587 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2588
2589 /* If TC0 is invalidated disable TC mapping */
2590 if (tc->offset + tc->count > txq) {
2591 netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
2592 dev->num_tc = 0;
2593 return;
2594 }
2595
2596 /* Invalidated prio to tc mappings set to TC0 */
2597 for (i = 1; i < TC_BITMASK + 1; i++) {
2598 int q = netdev_get_prio_tc_map(dev, i);
2599
2600 tc = &dev->tc_to_txq[q];
2601 if (tc->offset + tc->count > txq) {
2602 netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
2603 i, q);
2604 netdev_set_prio_tc_map(dev, i, 0);
2605 }
2606 }
2607}
2608
2609int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
2610{
2611 if (dev->num_tc) {
2612 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2613 int i;
2614
2615 /* walk through the TCs and see if it falls into any of them */
2616 for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
2617 if ((txq - tc->offset) < tc->count)
2618 return i;
2619 }
2620
2621 /* didn't find it, just return -1 to indicate no match */
2622 return -1;
2623 }
2624
2625 return 0;
2626}
2627EXPORT_SYMBOL(netdev_txq_to_tc);
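
/*
 * Worked example: with two traffic classes configured as
 * tc_to_txq[0] = {offset 0, count 4} and tc_to_txq[1] = {offset 4, count 4},
 * txq 5 satisfies (5 - 4) < 4 for TC1, so netdev_txq_to_tc() returns 1,
 * while txq 9 falls outside both ranges and returns -1.
 */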
2628
2629#ifdef CONFIG_XPS
2630static struct static_key xps_needed __read_mostly;
2631static struct static_key xps_rxqs_needed __read_mostly;
2632static DEFINE_MUTEX(xps_map_mutex);
2633#define xmap_dereference(P) \
2634 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
2635
2636static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
2637 struct xps_dev_maps *old_maps, int tci, u16 index)
2638{
2639 struct xps_map *map = NULL;
2640 int pos;
2641
2642 map = xmap_dereference(dev_maps->attr_map[tci]);
2643 if (!map)
2644 return false;
2645
2646 for (pos = map->len; pos--;) {
2647 if (map->queues[pos] != index)
2648 continue;
2649
2650 if (map->len > 1) {
2651 map->queues[pos] = map->queues[--map->len];
2652 break;
2653 }
2654
2655 if (old_maps)
2656 RCU_INIT_POINTER(old_maps->attr_map[tci], NULL);
2657 RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2658 kfree_rcu(map, rcu);
2659 return false;
2660 }
2661
2662 return true;
2663}
2664
2665static bool remove_xps_queue_cpu(struct net_device *dev,
2666 struct xps_dev_maps *dev_maps,
2667 int cpu, u16 offset, u16 count)
2668{
2669 int num_tc = dev_maps->num_tc;
2670 bool active = false;
2671 int tci;
2672
2673 for (tci = cpu * num_tc; num_tc--; tci++) {
2674 int i, j;
2675
2676 for (i = count, j = offset; i--; j++) {
2677 if (!remove_xps_queue(dev_maps, NULL, tci, j))
2678 break;
2679 }
2680
2681 active |= i < 0;
2682 }
2683
2684 return active;
2685}
2686
2687static void reset_xps_maps(struct net_device *dev,
2688 struct xps_dev_maps *dev_maps,
2689 enum xps_map_type type)
2690{
2691 static_key_slow_dec_cpuslocked(&xps_needed);
2692 if (type == XPS_RXQS)
2693 static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
2694
2695 RCU_INIT_POINTER(dev->xps_maps[type], NULL);
2696
2697 kfree_rcu(dev_maps, rcu);
2698}
2699
2700static void clean_xps_maps(struct net_device *dev, enum xps_map_type type,
2701 u16 offset, u16 count)
2702{
2703 struct xps_dev_maps *dev_maps;
2704 bool active = false;
2705 int i, j;
2706
2707 dev_maps = xmap_dereference(dev->xps_maps[type]);
2708 if (!dev_maps)
2709 return;
2710
2711 for (j = 0; j < dev_maps->nr_ids; j++)
2712 active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count);
2713 if (!active)
2714 reset_xps_maps(dev, dev_maps, type);
2715
2716 if (type == XPS_CPUS) {
2717 for (i = offset + (count - 1); count--; i--)
2718 netdev_queue_numa_node_write(
2719 netdev_get_tx_queue(dev, i), NUMA_NO_NODE);
2720 }
2721}
2722
2723static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2724 u16 count)
2725{
2726 if (!static_key_false(&xps_needed))
2727 return;
2728
2729 cpus_read_lock();
2730 mutex_lock(&xps_map_mutex);
2731
2732 if (static_key_false(&xps_rxqs_needed))
2733 clean_xps_maps(dev, XPS_RXQS, offset, count);
2734
2735 clean_xps_maps(dev, XPS_CPUS, offset, count);
2736
2737 mutex_unlock(&xps_map_mutex);
2738 cpus_read_unlock();
2739}
2740
2741static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2742{
2743 netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2744}
2745
2746static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
2747 u16 index, bool is_rxqs_map)
2748{
2749 struct xps_map *new_map;
2750 int alloc_len = XPS_MIN_MAP_ALLOC;
2751 int i, pos;
2752
2753 for (pos = 0; map && pos < map->len; pos++) {
2754 if (map->queues[pos] != index)
2755 continue;
2756 return map;
2757 }
2758
2759 /* Need to add tx-queue to this CPU's/rx-queue's existing map */
2760 if (map) {
2761 if (pos < map->alloc_len)
2762 return map;
2763
2764 alloc_len = map->alloc_len * 2;
2765 }
2766
2767 /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
2768 * map
2769 */
2770 if (is_rxqs_map)
2771 new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
2772 else
2773 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2774 cpu_to_node(attr_index));
2775 if (!new_map)
2776 return NULL;
2777
2778 for (i = 0; i < pos; i++)
2779 new_map->queues[i] = map->queues[i];
2780 new_map->alloc_len = alloc_len;
2781 new_map->len = pos;
2782
2783 return new_map;
2784}
2785
2786/* Copy xps maps at a given index */
2787static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps,
2788 struct xps_dev_maps *new_dev_maps, int index,
2789 int tc, bool skip_tc)
2790{
2791 int i, tci = index * dev_maps->num_tc;
2792 struct xps_map *map;
2793
2794 /* copy maps belonging to foreign traffic classes */
2795 for (i = 0; i < dev_maps->num_tc; i++, tci++) {
2796 if (i == tc && skip_tc)
2797 continue;
2798
2799 /* fill in the new device map from the old device map */
2800 map = xmap_dereference(dev_maps->attr_map[tci]);
2801 RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2802 }
2803}
2804
2805/* Must be called under cpus_read_lock */
2806int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
2807 u16 index, enum xps_map_type type)
2808{
2809 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL;
2810 const unsigned long *online_mask = NULL;
2811 bool active = false, copy = false;
2812 int i, j, tci, numa_node_id = -2;
2813 int maps_sz, num_tc = 1, tc = 0;
2814 struct xps_map *map, *new_map;
2815 unsigned int nr_ids;
2816
2817 WARN_ON_ONCE(index >= dev->num_tx_queues);
2818
2819 if (dev->num_tc) {
2820 /* Do not allow XPS on subordinate device directly */
2821 num_tc = dev->num_tc;
2822 if (num_tc < 0)
2823 return -EINVAL;
2824
2825 /* If queue belongs to subordinate dev use its map */
2826 dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
2827
2828 tc = netdev_txq_to_tc(dev, index);
2829 if (tc < 0)
2830 return -EINVAL;
2831 }
2832
2833 mutex_lock(&xps_map_mutex);
2834
2835 dev_maps = xmap_dereference(dev->xps_maps[type]);
2836 if (type == XPS_RXQS) {
2837 maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
2838 nr_ids = dev->num_rx_queues;
2839 } else {
2840 maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
2841 if (num_possible_cpus() > 1)
2842 online_mask = cpumask_bits(cpu_online_mask);
2843 nr_ids = nr_cpu_ids;
2844 }
2845
2846 if (maps_sz < L1_CACHE_BYTES)
2847 maps_sz = L1_CACHE_BYTES;
2848
2849 /* The old dev_maps could be larger or smaller than the one we're
2850 * setting up now, as dev->num_tc or nr_ids could have been updated in
2851 * between. We could try to be smart, but let's be safe instead and only
2852 * copy foreign traffic classes if the two map sizes match.
2853 */
2854 if (dev_maps &&
2855 dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids)
2856 copy = true;
2857
2858 /* allocate memory for queue storage */
2859 for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
2860 j < nr_ids;) {
2861 if (!new_dev_maps) {
2862 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2863 if (!new_dev_maps) {
2864 mutex_unlock(&xps_map_mutex);
2865 return -ENOMEM;
2866 }
2867
2868 new_dev_maps->nr_ids = nr_ids;
2869 new_dev_maps->num_tc = num_tc;
2870 }
2871
2872 tci = j * num_tc + tc;
2873 map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL;
2874
2875 map = expand_xps_map(map, j, index, type == XPS_RXQS);
2876 if (!map)
2877 goto error;
2878
2879 RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2880 }
2881
2882 if (!new_dev_maps)
2883 goto out_no_new_maps;
2884
2885 if (!dev_maps) {
2886 /* Increment static keys at most once per type */
2887 static_key_slow_inc_cpuslocked(&xps_needed);
2888 if (type == XPS_RXQS)
2889 static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
2890 }
2891
2892 for (j = 0; j < nr_ids; j++) {
2893 bool skip_tc = false;
2894
2895 tci = j * num_tc + tc;
2896 if (netif_attr_test_mask(j, mask, nr_ids) &&
2897 netif_attr_test_online(j, online_mask, nr_ids)) {
2898 /* add tx-queue to CPU/rx-queue maps */
2899 int pos = 0;
2900
2901 skip_tc = true;
2902
2903 map = xmap_dereference(new_dev_maps->attr_map[tci]);
2904 while ((pos < map->len) && (map->queues[pos] != index))
2905 pos++;
2906
2907 if (pos == map->len)
2908 map->queues[map->len++] = index;
2909#ifdef CONFIG_NUMA
2910 if (type == XPS_CPUS) {
2911 if (numa_node_id == -2)
2912 numa_node_id = cpu_to_node(j);
2913 else if (numa_node_id != cpu_to_node(j))
2914 numa_node_id = -1;
2915 }
2916#endif
2917 }
2918
2919 if (copy)
2920 xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc,
2921 skip_tc);
2922 }
2923
2924 rcu_assign_pointer(dev->xps_maps[type], new_dev_maps);
2925
2926 /* Cleanup old maps */
2927 if (!dev_maps)
2928 goto out_no_old_maps;
2929
2930 for (j = 0; j < dev_maps->nr_ids; j++) {
2931 for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) {
2932 map = xmap_dereference(dev_maps->attr_map[tci]);
2933 if (!map)
2934 continue;
2935
2936 if (copy) {
2937 new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2938 if (map == new_map)
2939 continue;
2940 }
2941
2942 RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2943 kfree_rcu(map, rcu);
2944 }
2945 }
2946
2947 old_dev_maps = dev_maps;
2948
2949out_no_old_maps:
2950 dev_maps = new_dev_maps;
2951 active = true;
2952
2953out_no_new_maps:
2954 if (type == XPS_CPUS)
2955 /* update Tx queue numa node */
2956 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2957 (numa_node_id >= 0) ?
2958 numa_node_id : NUMA_NO_NODE);
2959
2960 if (!dev_maps)
2961 goto out_no_maps;
2962
2963 /* removes tx-queue from unused CPUs/rx-queues */
2964 for (j = 0; j < dev_maps->nr_ids; j++) {
2965 tci = j * dev_maps->num_tc;
2966
2967 for (i = 0; i < dev_maps->num_tc; i++, tci++) {
2968 if (i == tc &&
2969 netif_attr_test_mask(j, mask, dev_maps->nr_ids) &&
2970 netif_attr_test_online(j, online_mask, dev_maps->nr_ids))
2971 continue;
2972
2973 active |= remove_xps_queue(dev_maps,
2974 copy ? old_dev_maps : NULL,
2975 tci, index);
2976 }
2977 }
2978
2979 if (old_dev_maps)
2980 kfree_rcu(old_dev_maps, rcu);
2981
2982 /* free map if not active */
2983 if (!active)
2984 reset_xps_maps(dev, dev_maps, type);
2985
2986out_no_maps:
2987 mutex_unlock(&xps_map_mutex);
2988
2989 return 0;
2990error:
2991 /* remove any maps that we added */
2992 for (j = 0; j < nr_ids; j++) {
2993 for (i = num_tc, tci = j * num_tc; i--; tci++) {
2994 new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2995 map = copy ?
2996 xmap_dereference(dev_maps->attr_map[tci]) :
2997 NULL;
2998 if (new_map && new_map != map)
2999 kfree(new_map);
3000 }
3001 }
3002
3003 mutex_unlock(&xps_map_mutex);
3004
3005 kfree(new_dev_maps);
3006 return -ENOMEM;
3007}
3008EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
3009
3010int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
3011 u16 index)
3012{
3013 int ret;
3014
3015 cpus_read_lock();
3016 ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS);
3017 cpus_read_unlock();
3018
3019 return ret;
3020}
3021EXPORT_SYMBOL(netif_set_xps_queue);
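
/*
 * Example (illustrative): multi-queue drivers commonly pin each Tx queue to
 * the CPU whose interrupt/NAPI context services it, e.g. from their queue
 * setup path; the exact CPU-to-queue policy below is only a sketch:
 *
 *	for (i = 0; i < dev->real_num_tx_queues; i++)
 *		netif_set_xps_queue(dev, cpumask_of(i % num_online_cpus()), i);
 */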
3022
3023#endif
3024static void netdev_unbind_all_sb_channels(struct net_device *dev)
3025{
3026 struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
3027
3028 /* Unbind any subordinate channels */
3029 while (txq-- != &dev->_tx[0]) {
3030 if (txq->sb_dev)
3031 netdev_unbind_sb_channel(dev, txq->sb_dev);
3032 }
3033}
3034
3035void netdev_reset_tc(struct net_device *dev)
3036{
3037#ifdef CONFIG_XPS
3038 netif_reset_xps_queues_gt(dev, 0);
3039#endif
3040 netdev_unbind_all_sb_channels(dev);
3041
3042 /* Reset TC configuration of device */
3043 dev->num_tc = 0;
3044 memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
3045 memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
3046}
3047EXPORT_SYMBOL(netdev_reset_tc);
3048
3049int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
3050{
3051 if (tc >= dev->num_tc)
3052 return -EINVAL;
3053
3054#ifdef CONFIG_XPS
3055 netif_reset_xps_queues(dev, offset, count);
3056#endif
3057 dev->tc_to_txq[tc].count = count;
3058 dev->tc_to_txq[tc].offset = offset;
3059 return 0;
3060}
3061EXPORT_SYMBOL(netdev_set_tc_queue);
3062
3063int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
3064{
3065 if (num_tc > TC_MAX_QUEUE)
3066 return -EINVAL;
3067
3068#ifdef CONFIG_XPS
3069 netif_reset_xps_queues_gt(dev, 0);
3070#endif
3071 netdev_unbind_all_sb_channels(dev);
3072
3073 dev->num_tc = num_tc;
3074 return 0;
3075}
3076EXPORT_SYMBOL(netdev_set_num_tc);
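
/*
 * Example (illustrative): offloading an mqprio-style configuration means
 * declaring the number of traffic classes and then carving the Tx queue
 * range into per-TC offset/count pairs (count[] and offset[] are assumed to
 * come from the offload request):
 *
 *	netdev_set_num_tc(dev, num_tc);
 *	for (tc = 0; tc < num_tc; tc++)
 *		netdev_set_tc_queue(dev, tc, count[tc], offset[tc]);
 *
 * followed by netdev_set_prio_tc_map() calls to map priorities onto the
 * traffic classes.
 */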
3077
3078void netdev_unbind_sb_channel(struct net_device *dev,
3079 struct net_device *sb_dev)
3080{
3081 struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
3082
3083#ifdef CONFIG_XPS
3084 netif_reset_xps_queues_gt(sb_dev, 0);
3085#endif
3086 memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
3087 memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
3088
3089 while (txq-- != &dev->_tx[0]) {
3090 if (txq->sb_dev == sb_dev)
3091 txq->sb_dev = NULL;
3092 }
3093}
3094EXPORT_SYMBOL(netdev_unbind_sb_channel);
3095
3096int netdev_bind_sb_channel_queue(struct net_device *dev,
3097 struct net_device *sb_dev,
3098 u8 tc, u16 count, u16 offset)
3099{
3100 /* Make certain the sb_dev and dev are already configured */
3101 if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
3102 return -EINVAL;
3103
3104 /* We cannot hand out queues we don't have */
3105 if ((offset + count) > dev->real_num_tx_queues)
3106 return -EINVAL;
3107
3108 /* Record the mapping */
3109 sb_dev->tc_to_txq[tc].count = count;
3110 sb_dev->tc_to_txq[tc].offset = offset;
3111
3112 /* Provide a way for Tx queue to find the tc_to_txq map or
3113 * XPS map for itself.
3114 */
3115 while (count--)
3116 netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
3117
3118 return 0;
3119}
3120EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
3121
3122int netdev_set_sb_channel(struct net_device *dev, u16 channel)
3123{
3124 /* Do not use a multiqueue device to represent a subordinate channel */
3125 if (netif_is_multiqueue(dev))
3126 return -ENODEV;
3127
3128 /* We allow channels 1 - 32767 to be used for subordinate channels.
3129 * Channel 0 is meant to be "native" mode and used only to represent
3130 * the main root device. We allow writing 0 to reset the device back
3131 * to normal mode after being used as a subordinate channel.
3132 */
3133 if (channel > S16_MAX)
3134 return -EINVAL;
3135
3136 dev->num_tc = -channel;
3137
3138 return 0;
3139}
3140EXPORT_SYMBOL(netdev_set_sb_channel);
3141
3142/*
3143 * Routine to help set real_num_tx_queues. To avoid skbs being mapped to
3144 * queues greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
3145 */
3146int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
3147{
3148 bool disabling;
3149 int rc;
3150
3151 disabling = txq < dev->real_num_tx_queues;
3152
3153 if (txq < 1 || txq > dev->num_tx_queues)
3154 return -EINVAL;
3155
3156 if (dev->reg_state == NETREG_REGISTERED ||
3157 dev->reg_state == NETREG_UNREGISTERING) {
3158 ASSERT_RTNL();
3159
3160 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
3161 txq);
3162 if (rc)
3163 return rc;
3164
3165 if (dev->num_tc)
3166 netif_setup_tc(dev, txq);
3167
3168 net_shaper_set_real_num_tx_queues(dev, txq);
3169
3170 dev_qdisc_change_real_num_tx(dev, txq);
3171
3172 dev->real_num_tx_queues = txq;
3173
3174 if (disabling) {
3175 synchronize_net();
3176 qdisc_reset_all_tx_gt(dev, txq);
3177#ifdef CONFIG_XPS
3178 netif_reset_xps_queues_gt(dev, txq);
3179#endif
3180 }
3181 } else {
3182 dev->real_num_tx_queues = txq;
3183 }
3184
3185 return 0;
3186}
3187EXPORT_SYMBOL(netif_set_real_num_tx_queues);
3188
3189#ifdef CONFIG_SYSFS
3190/**
3191 * netif_set_real_num_rx_queues - set actual number of RX queues used
3192 * @dev: Network device
3193 * @rxq: Actual number of RX queues
3194 *
3195 * This must be called either with the rtnl_lock held or before
3196 * registration of the net device. Returns 0 on success, or a
3197 * negative error code. If called before registration, it always
3198 * succeeds.
3199 */
3200int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
3201{
3202 int rc;
3203
3204 if (rxq < 1 || rxq > dev->num_rx_queues)
3205 return -EINVAL;
3206
3207 if (dev->reg_state == NETREG_REGISTERED) {
3208 ASSERT_RTNL();
3209
3210 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
3211 rxq);
3212 if (rc)
3213 return rc;
3214 }
3215
3216 dev->real_num_rx_queues = rxq;
3217 return 0;
3218}
3219EXPORT_SYMBOL(netif_set_real_num_rx_queues);
3220#endif
3221
3222/**
3223 * netif_set_real_num_queues - set actual number of RX and TX queues used
3224 * @dev: Network device
3225 * @txq: Actual number of TX queues
3226 * @rxq: Actual number of RX queues
3227 *
3228 * Set the real number of both TX and RX queues.
3229 * Does nothing if the number of queues is already correct.
3230 */
3231int netif_set_real_num_queues(struct net_device *dev,
3232 unsigned int txq, unsigned int rxq)
3233{
3234 unsigned int old_rxq = dev->real_num_rx_queues;
3235 int err;
3236
3237 if (txq < 1 || txq > dev->num_tx_queues ||
3238 rxq < 1 || rxq > dev->num_rx_queues)
3239 return -EINVAL;
3240
3241 /* Start from increases, so the error path only does decreases -
3242 * decreases can't fail.
3243 */
3244 if (rxq > dev->real_num_rx_queues) {
3245 err = netif_set_real_num_rx_queues(dev, rxq);
3246 if (err)
3247 return err;
3248 }
3249 if (txq > dev->real_num_tx_queues) {
3250 err = netif_set_real_num_tx_queues(dev, txq);
3251 if (err)
3252 goto undo_rx;
3253 }
3254 if (rxq < dev->real_num_rx_queues)
3255 WARN_ON(netif_set_real_num_rx_queues(dev, rxq));
3256 if (txq < dev->real_num_tx_queues)
3257 WARN_ON(netif_set_real_num_tx_queues(dev, txq));
3258
3259 return 0;
3260undo_rx:
3261 WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq));
3262 return err;
3263}
3264EXPORT_SYMBOL(netif_set_real_num_queues);
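
/*
 * Example (illustrative): a driver that has just (re)negotiated its channel
 * count can shrink or grow both directions in one call, relying on the
 * increase-first ordering above for error handling:
 *
 *	rtnl_lock();
 *	err = netif_set_real_num_queues(dev, new_tx, new_rx);
 *	rtnl_unlock();
 */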
3265
3266/**
3267 * netif_set_tso_max_size() - set the max size of TSO frames supported
3268 * @dev: netdev to update
3269 * @size: max skb->len of a TSO frame
3270 *
3271 * Set the limit on the size of TSO super-frames the device can handle.
3272 * Unless explicitly set the stack will assume the value of
3273 * %GSO_LEGACY_MAX_SIZE.
3274 */
3275void netif_set_tso_max_size(struct net_device *dev, unsigned int size)
3276{
3277 dev->tso_max_size = min(GSO_MAX_SIZE, size);
3278 if (size < READ_ONCE(dev->gso_max_size))
3279 netif_set_gso_max_size(dev, size);
3280 if (size < READ_ONCE(dev->gso_ipv4_max_size))
3281 netif_set_gso_ipv4_max_size(dev, size);
3282}
3283EXPORT_SYMBOL(netif_set_tso_max_size);
3284
3285/**
3286 * netif_set_tso_max_segs() - set the max number of segs supported for TSO
3287 * @dev: netdev to update
3288 * @segs: max number of TCP segments
3289 *
3290 * Set the limit on the number of TCP segments the device can generate from
3291 * a single TSO super-frame.
3292 * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS.
3293 */
3294void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs)
3295{
3296 dev->tso_max_segs = segs;
3297 if (segs < READ_ONCE(dev->gso_max_segs))
3298 netif_set_gso_max_segs(dev, segs);
3299}
3300EXPORT_SYMBOL(netif_set_tso_max_segs);
3301
3302/**
3303 * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper
3304 * @to: netdev to update
3305 * @from: netdev from which to copy the limits
3306 */
3307void netif_inherit_tso_max(struct net_device *to, const struct net_device *from)
3308{
3309 netif_set_tso_max_size(to, from->tso_max_size);
3310 netif_set_tso_max_segs(to, from->tso_max_segs);
3311}
3312EXPORT_SYMBOL(netif_inherit_tso_max);
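
/*
 * Example (illustrative): a device that can only produce, say, 32 TCP
 * segments from one TSO super-frame would cap the limit at probe time, and
 * a stacked device can then copy the lower device's limits:
 *
 *	netif_set_tso_max_segs(lower, 32);	(hypothetical hardware limit)
 *	netif_inherit_tso_max(upper, lower);
 */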
3313
3314/**
3315 * netif_get_num_default_rss_queues - default number of RSS queues
3316 *
3317 * The default is the number of physical cores when there are no more than
3318 * two of them, or half that number (rounded up) when there are more.
3319 */
3320int netif_get_num_default_rss_queues(void)
3321{
3322 cpumask_var_t cpus;
3323 int cpu, count = 0;
3324
3325 if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL)))
3326 return 1;
3327
3328 cpumask_copy(cpus, cpu_online_mask);
3329 for_each_cpu(cpu, cpus) {
3330 ++count;
3331 cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu));
3332 }
3333 free_cpumask_var(cpus);
3334
3335 return count > 2 ? DIV_ROUND_UP(count, 2) : count;
3336}
3337EXPORT_SYMBOL(netif_get_num_default_rss_queues);
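
/*
 * Worked example: on a machine with 8 physical cores and SMT enabled
 * (16 online CPUs), removing sibling threads leaves a count of 8, and since
 * 8 > 2 the function returns DIV_ROUND_UP(8, 2) = 4 default RSS queues.
 * On a 2-core machine it simply returns 2.
 */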
3338
3339static void __netif_reschedule(struct Qdisc *q)
3340{
3341 struct softnet_data *sd;
3342 unsigned long flags;
3343
3344 local_irq_save(flags);
3345 sd = this_cpu_ptr(&softnet_data);
3346 q->next_sched = NULL;
3347 *sd->output_queue_tailp = q;
3348 sd->output_queue_tailp = &q->next_sched;
3349 raise_softirq_irqoff(NET_TX_SOFTIRQ);
3350 local_irq_restore(flags);
3351}
3352
3353void __netif_schedule(struct Qdisc *q)
3354{
3355 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
3356 __netif_reschedule(q);
3357}
3358EXPORT_SYMBOL(__netif_schedule);
3359
3360struct dev_kfree_skb_cb {
3361 enum skb_drop_reason reason;
3362};
3363
3364static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
3365{
3366 return (struct dev_kfree_skb_cb *)skb->cb;
3367}
3368
3369void netif_schedule_queue(struct netdev_queue *txq)
3370{
3371 rcu_read_lock();
3372 if (!netif_xmit_stopped(txq)) {
3373 struct Qdisc *q = rcu_dereference(txq->qdisc);
3374
3375 __netif_schedule(q);
3376 }
3377 rcu_read_unlock();
3378}
3379EXPORT_SYMBOL(netif_schedule_queue);
3380
3381void netif_tx_wake_queue(struct netdev_queue *dev_queue)
3382{
3383 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
3384 struct Qdisc *q;
3385
3386 rcu_read_lock();
3387 q = rcu_dereference(dev_queue->qdisc);
3388 __netif_schedule(q);
3389 rcu_read_unlock();
3390 }
3391}
3392EXPORT_SYMBOL(netif_tx_wake_queue);
3393
3394void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason)
3395{
3396 unsigned long flags;
3397
3398 if (unlikely(!skb))
3399 return;
3400
3401 if (likely(refcount_read(&skb->users) == 1)) {
3402 smp_rmb();
3403 refcount_set(&skb->users, 0);
3404 } else if (likely(!refcount_dec_and_test(&skb->users))) {
3405 return;
3406 }
3407 get_kfree_skb_cb(skb)->reason = reason;
3408 local_irq_save(flags);
3409 skb->next = __this_cpu_read(softnet_data.completion_queue);
3410 __this_cpu_write(softnet_data.completion_queue, skb);
3411 raise_softirq_irqoff(NET_TX_SOFTIRQ);
3412 local_irq_restore(flags);
3413}
3414EXPORT_SYMBOL(dev_kfree_skb_irq_reason);
3415
3416void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason)
3417{
3418 if (in_hardirq() || irqs_disabled())
3419 dev_kfree_skb_irq_reason(skb, reason);
3420 else
3421 kfree_skb_reason(skb, reason);
3422}
3423EXPORT_SYMBOL(dev_kfree_skb_any_reason);
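
/*
 * Example (illustrative): Tx completion handlers that may run either in hard
 * interrupt context or in process context (e.g. during teardown) use the
 * _any variant so the right free path is picked automatically:
 *
 *	dev_kfree_skb_any(skb);
 */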
3424
3425
3426/**
3427 * netif_device_detach - mark device as removed
3428 * @dev: network device
3429 *
3430 * Mark device as removed from system and therefore no longer available.
3431 */
3432void netif_device_detach(struct net_device *dev)
3433{
3434 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
3435 netif_running(dev)) {
3436 netif_tx_stop_all_queues(dev);
3437 }
3438}
3439EXPORT_SYMBOL(netif_device_detach);
3440
3441/**
3442 * netif_device_attach - mark device as attached
3443 * @dev: network device
3444 *
3445 * Mark device as attached to the system and restart its queues if needed.
3446 */
3447void netif_device_attach(struct net_device *dev)
3448{
3449 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
3450 netif_running(dev)) {
3451 netif_tx_wake_all_queues(dev);
3452 netdev_watchdog_up(dev);
3453 }
3454}
3455EXPORT_SYMBOL(netif_device_attach);
3456
3457/*
3458 * Returns a Tx hash for the given packet descriptor, using the device's
3459 * number of Tx queues as the distribution range.
3460 */
3461static u16 skb_tx_hash(const struct net_device *dev,
3462 const struct net_device *sb_dev,
3463 struct sk_buff *skb)
3464{
3465 u32 hash;
3466 u16 qoffset = 0;
3467 u16 qcount = dev->real_num_tx_queues;
3468
3469 if (dev->num_tc) {
3470 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
3471
3472 qoffset = sb_dev->tc_to_txq[tc].offset;
3473 qcount = sb_dev->tc_to_txq[tc].count;
3474 if (unlikely(!qcount)) {
3475 net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n",
3476 sb_dev->name, qoffset, tc);
3477 qoffset = 0;
3478 qcount = dev->real_num_tx_queues;
3479 }
3480 }
3481
3482 if (skb_rx_queue_recorded(skb)) {
3483 DEBUG_NET_WARN_ON_ONCE(qcount == 0);
3484 hash = skb_get_rx_queue(skb);
3485 if (hash >= qoffset)
3486 hash -= qoffset;
3487 while (unlikely(hash >= qcount))
3488 hash -= qcount;
3489 return hash + qoffset;
3490 }
3491
3492 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
3493}
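
/*
 * Worked example: without traffic classes and with 8 real Tx queues,
 * reciprocal_scale() maps the 32-bit flow hash h to ((u64)h * 8) >> 32,
 * i.e. evenly into queues 0..7; with traffic classes the same scaling is
 * applied within the qoffset/qcount window of the skb's TC.
 */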
3494
3495void skb_warn_bad_offload(const struct sk_buff *skb)
3496{
3497 static const netdev_features_t null_features;
3498 struct net_device *dev = skb->dev;
3499 const char *name = "";
3500
3501 if (!net_ratelimit())
3502 return;
3503
3504 if (dev) {
3505 if (dev->dev.parent)
3506 name = dev_driver_string(dev->dev.parent);
3507 else
3508 name = netdev_name(dev);
3509 }
3510 skb_dump(KERN_WARNING, skb, false);
3511 WARN(1, "%s: caps=(%pNF, %pNF)\n",
3512 name, dev ? &dev->features : &null_features,
3513 skb->sk ? &skb->sk->sk_route_caps : &null_features);
3514}
3515
3516/*
3517 * Invalidate hardware checksum when packet is to be mangled, and
3518 * complete checksum manually on outgoing path.
3519 */
3520int skb_checksum_help(struct sk_buff *skb)
3521{
3522 __wsum csum;
3523 int ret = 0, offset;
3524
3525 if (skb->ip_summed == CHECKSUM_COMPLETE)
3526 goto out_set_summed;
3527
3528 if (unlikely(skb_is_gso(skb))) {
3529 skb_warn_bad_offload(skb);
3530 return -EINVAL;
3531 }
3532
3533 if (!skb_frags_readable(skb)) {
3534 return -EFAULT;
3535 }
3536
3537 /* Before computing a checksum, we should make sure no frag could
3538 * be modified by an external entity: the checksum could be wrong.
3539 */
3540 if (skb_has_shared_frag(skb)) {
3541 ret = __skb_linearize(skb);
3542 if (ret)
3543 goto out;
3544 }
3545
3546 offset = skb_checksum_start_offset(skb);
3547 ret = -EINVAL;
3548 if (unlikely(offset >= skb_headlen(skb))) {
3549 DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
3550 WARN_ONCE(true, "offset (%d) >= skb_headlen() (%u)\n",
3551 offset, skb_headlen(skb));
3552 goto out;
3553 }
3554 csum = skb_checksum(skb, offset, skb->len - offset, 0);
3555
3556 offset += skb->csum_offset;
3557 if (unlikely(offset + sizeof(__sum16) > skb_headlen(skb))) {
3558 DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
3559 WARN_ONCE(true, "offset+2 (%zu) > skb_headlen() (%u)\n",
3560 offset + sizeof(__sum16), skb_headlen(skb));
3561 goto out;
3562 }
3563 ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
3564 if (ret)
3565 goto out;
3566
3567 *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
3568out_set_summed:
3569 skb->ip_summed = CHECKSUM_NONE;
3570out:
3571 return ret;
3572}
3573EXPORT_SYMBOL(skb_checksum_help);
3574
3575int skb_crc32c_csum_help(struct sk_buff *skb)
3576{
3577 __le32 crc32c_csum;
3578 int ret = 0, offset, start;
3579
3580 if (skb->ip_summed != CHECKSUM_PARTIAL)
3581 goto out;
3582
3583 if (unlikely(skb_is_gso(skb)))
3584 goto out;
3585
3586 /* Before computing a checksum, we should make sure no frag could
3587 * be modified by an external entity: the checksum could be wrong.
3588 */
3589 if (unlikely(skb_has_shared_frag(skb))) {
3590 ret = __skb_linearize(skb);
3591 if (ret)
3592 goto out;
3593 }
3594 start = skb_checksum_start_offset(skb);
3595 offset = start + offsetof(struct sctphdr, checksum);
3596 if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
3597 ret = -EINVAL;
3598 goto out;
3599 }
3600
3601 ret = skb_ensure_writable(skb, offset + sizeof(__le32));
3602 if (ret)
3603 goto out;
3604
3605 crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
3606 skb->len - start, ~(__u32)0,
3607 crc32c_csum_stub));
3608 *(__le32 *)(skb->data + offset) = crc32c_csum;
3609 skb_reset_csum_not_inet(skb);
3610out:
3611 return ret;
3612}
3613EXPORT_SYMBOL(skb_crc32c_csum_help);
3614
3615__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
3616{
3617 __be16 type = skb->protocol;
3618
3619 /* Tunnel gso handlers can set protocol to ethernet. */
3620 if (type == htons(ETH_P_TEB)) {
3621 struct ethhdr *eth;
3622
3623 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
3624 return 0;
3625
3626 eth = (struct ethhdr *)skb->data;
3627 type = eth->h_proto;
3628 }
3629
3630 return vlan_get_protocol_and_depth(skb, type, depth);
3631}
3632
3633
3634/* Take action when hardware reception checksum errors are detected. */
3635#ifdef CONFIG_BUG
3636static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3637{
3638 netdev_err(dev, "hw csum failure\n");
3639 skb_dump(KERN_ERR, skb, true);
3640 dump_stack();
3641}
3642
3643void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3644{
3645 DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb);
3646}
3647EXPORT_SYMBOL(netdev_rx_csum_fault);
3648#endif
3649
3650/* XXX: check that highmem exists at all on the given machine. */
3651static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
3652{
3653#ifdef CONFIG_HIGHMEM
3654 int i;
3655
3656 if (!(dev->features & NETIF_F_HIGHDMA)) {
3657 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
3658 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
3659 struct page *page = skb_frag_page(frag);
3660
3661 if (page && PageHighMem(page))
3662 return 1;
3663 }
3664 }
3665#endif
3666 return 0;
3667}
3668
3669/* If this is an MPLS offload request, verify we are testing hardware MPLS
3670 * features instead of the standard features for the netdev.
3671 */
3672#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
3673static netdev_features_t net_mpls_features(struct sk_buff *skb,
3674 netdev_features_t features,
3675 __be16 type)
3676{
3677 if (eth_p_mpls(type))
3678 features &= skb->dev->mpls_features;
3679
3680 return features;
3681}
3682#else
3683static netdev_features_t net_mpls_features(struct sk_buff *skb,
3684 netdev_features_t features,
3685 __be16 type)
3686{
3687 return features;
3688}
3689#endif
3690
3691static netdev_features_t harmonize_features(struct sk_buff *skb,
3692 netdev_features_t features)
3693{
3694 __be16 type;
3695
3696 type = skb_network_protocol(skb, NULL);
3697 features = net_mpls_features(skb, features, type);
3698
3699 if (skb->ip_summed != CHECKSUM_NONE &&
3700 !can_checksum_protocol(features, type)) {
3701 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
3702 }
3703 if (illegal_highdma(skb->dev, skb))
3704 features &= ~NETIF_F_SG;
3705
3706 return features;
3707}
3708
3709netdev_features_t passthru_features_check(struct sk_buff *skb,
3710 struct net_device *dev,
3711 netdev_features_t features)
3712{
3713 return features;
3714}
3715EXPORT_SYMBOL(passthru_features_check);
3716
3717static netdev_features_t dflt_features_check(struct sk_buff *skb,
3718 struct net_device *dev,
3719 netdev_features_t features)
3720{
3721 return vlan_features_check(skb, features);
3722}
3723
3724static netdev_features_t gso_features_check(const struct sk_buff *skb,
3725 struct net_device *dev,
3726 netdev_features_t features)
3727{
3728 u16 gso_segs = skb_shinfo(skb)->gso_segs;
3729
3730 if (gso_segs > READ_ONCE(dev->gso_max_segs))
3731 return features & ~NETIF_F_GSO_MASK;
3732
3733 if (unlikely(skb->len >= netif_get_gso_max_size(dev, skb)))
3734 return features & ~NETIF_F_GSO_MASK;
3735
3736 if (!skb_shinfo(skb)->gso_type) {
3737 skb_warn_bad_offload(skb);
3738 return features & ~NETIF_F_GSO_MASK;
3739 }
3740
3741 /* Support for GSO partial features requires software
3742 * intervention before we can actually process the packets,
3743 * so strip support for any partial features now; they can
3744 * be pulled back in after the frame has been partially
3745 * segmented.
3746 */
3747 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
3748 features &= ~dev->gso_partial_features;
3749
3750 /* Make sure to clear the IPv4 ID mangling feature if the
3751 * IPv4 header has the potential to be fragmented.
3752 */
3753 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
3754 struct iphdr *iph = skb->encapsulation ?
3755 inner_ip_hdr(skb) : ip_hdr(skb);
3756
3757 if (!(iph->frag_off & htons(IP_DF)))
3758 features &= ~NETIF_F_TSO_MANGLEID;
3759 }
3760
3761 return features;
3762}
3763
3764netdev_features_t netif_skb_features(struct sk_buff *skb)
3765{
3766 struct net_device *dev = skb->dev;
3767 netdev_features_t features = dev->features;
3768
3769 if (skb_is_gso(skb))
3770 features = gso_features_check(skb, dev, features);
3771
3772 /* If this is an encapsulation offload request, verify we are
3773 * testing hardware encapsulation features instead of the standard
3774 * features for the netdev.
3775 */
3776 if (skb->encapsulation)
3777 features &= dev->hw_enc_features;
3778
3779 if (skb_vlan_tagged(skb))
3780 features = netdev_intersect_features(features,
3781 dev->vlan_features |
3782 NETIF_F_HW_VLAN_CTAG_TX |
3783 NETIF_F_HW_VLAN_STAG_TX);
3784
3785 if (dev->netdev_ops->ndo_features_check)
3786 features &= dev->netdev_ops->ndo_features_check(skb, dev,
3787 features);
3788 else
3789 features &= dflt_features_check(skb, dev, features);
3790
3791 return harmonize_features(skb, features);
3792}
3793EXPORT_SYMBOL(netif_skb_features);
3794
3795static int xmit_one(struct sk_buff *skb, struct net_device *dev,
3796 struct netdev_queue *txq, bool more)
3797{
3798 unsigned int len;
3799 int rc;
3800
3801 if (dev_nit_active(dev))
3802 dev_queue_xmit_nit(skb, dev);
3803
3804 len = skb->len;
3805 trace_net_dev_start_xmit(skb, dev);
3806 rc = netdev_start_xmit(skb, dev, txq, more);
3807 trace_net_dev_xmit(skb, rc, dev, len);
3808
3809 return rc;
3810}
3811
3812struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
3813 struct netdev_queue *txq, int *ret)
3814{
3815 struct sk_buff *skb = first;
3816 int rc = NETDEV_TX_OK;
3817
3818 while (skb) {
3819 struct sk_buff *next = skb->next;
3820
3821 skb_mark_not_on_list(skb);
3822 rc = xmit_one(skb, dev, txq, next != NULL);
3823 if (unlikely(!dev_xmit_complete(rc))) {
3824 skb->next = next;
3825 goto out;
3826 }
3827
3828 skb = next;
3829 if (netif_tx_queue_stopped(txq) && skb) {
3830 rc = NETDEV_TX_BUSY;
3831 break;
3832 }
3833 }
3834
3835out:
3836 *ret = rc;
3837 return skb;
3838}
3839
3840static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
3841 netdev_features_t features)
3842{
3843 if (skb_vlan_tag_present(skb) &&
3844 !vlan_hw_offload_capable(features, skb->vlan_proto))
3845 skb = __vlan_hwaccel_push_inside(skb);
3846 return skb;
3847}
3848
3849int skb_csum_hwoffload_help(struct sk_buff *skb,
3850 const netdev_features_t features)
3851{
3852 if (unlikely(skb_csum_is_sctp(skb)))
3853 return !!(features & NETIF_F_SCTP_CRC) ? 0 :
3854 skb_crc32c_csum_help(skb);
3855
3856 if (features & NETIF_F_HW_CSUM)
3857 return 0;
3858
3859 if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
3860 if (vlan_get_protocol(skb) == htons(ETH_P_IPV6) &&
3861 skb_network_header_len(skb) != sizeof(struct ipv6hdr) &&
3862 !ipv6_has_hopopt_jumbo(skb))
3863 goto sw_checksum;
3864
3865 switch (skb->csum_offset) {
3866 case offsetof(struct tcphdr, check):
3867 case offsetof(struct udphdr, check):
3868 return 0;
3869 }
3870 }
3871
3872sw_checksum:
3873 return skb_checksum_help(skb);
3874}
3875EXPORT_SYMBOL(skb_csum_hwoffload_help);
3876
3877static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
3878{
3879 netdev_features_t features;
3880
3881 features = netif_skb_features(skb);
3882 skb = validate_xmit_vlan(skb, features);
3883 if (unlikely(!skb))
3884 goto out_null;
3885
3886 skb = sk_validate_xmit_skb(skb, dev);
3887 if (unlikely(!skb))
3888 goto out_null;
3889
3890 if (netif_needs_gso(skb, features)) {
3891 struct sk_buff *segs;
3892
3893 segs = skb_gso_segment(skb, features);
3894 if (IS_ERR(segs)) {
3895 goto out_kfree_skb;
3896 } else if (segs) {
3897 consume_skb(skb);
3898 skb = segs;
3899 }
3900 } else {
3901 if (skb_needs_linearize(skb, features) &&
3902 __skb_linearize(skb))
3903 goto out_kfree_skb;
3904
3905 /* If packet is not checksummed and device does not
3906 * support checksumming for this protocol, complete
3907 * checksumming here.
3908 */
3909 if (skb->ip_summed == CHECKSUM_PARTIAL) {
3910 if (skb->encapsulation)
3911 skb_set_inner_transport_header(skb,
3912 skb_checksum_start_offset(skb));
3913 else
3914 skb_set_transport_header(skb,
3915 skb_checksum_start_offset(skb));
3916 if (skb_csum_hwoffload_help(skb, features))
3917 goto out_kfree_skb;
3918 }
3919 }
3920
3921 skb = validate_xmit_xfrm(skb, features, again);
3922
3923 return skb;
3924
3925out_kfree_skb:
3926 kfree_skb(skb);
3927out_null:
3928 dev_core_stats_tx_dropped_inc(dev);
3929 return NULL;
3930}
3931
3932struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
3933{
3934 struct sk_buff *next, *head = NULL, *tail;
3935
3936 for (; skb != NULL; skb = next) {
3937 next = skb->next;
3938 skb_mark_not_on_list(skb);
3939
3940 /* in case skb won't be segmented, point to itself */
3941 skb->prev = skb;
3942
3943 skb = validate_xmit_skb(skb, dev, again);
3944 if (!skb)
3945 continue;
3946
3947 if (!head)
3948 head = skb;
3949 else
3950 tail->next = skb;
3951 /* If skb was segmented, skb->prev points to
3952 * the last segment. If not, it still contains skb.
3953 */
3954 tail = skb->prev;
3955 }
3956 return head;
3957}
3958EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3959
3960static void qdisc_pkt_len_init(struct sk_buff *skb)
3961{
3962 const struct skb_shared_info *shinfo = skb_shinfo(skb);
3963
3964 qdisc_skb_cb(skb)->pkt_len = skb->len;
3965
3966	/* To get a more precise estimate of the bytes sent on the wire,
3967	 * we add to pkt_len the header size of all segments.
3968	 */
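	/* Worked example (hypothetical numbers): a TSO skb with
	 * gso_segs == 10 and hdr_len == 66 (Ethernet + IPv4 + TCP with
	 * timestamps) gets (10 - 1) * 66 = 594 bytes of replicated
	 * headers added to pkt_len below, on top of skb->len.
	 */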
3969 if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
3970 u16 gso_segs = shinfo->gso_segs;
3971 unsigned int hdr_len;
3972
3973 /* mac layer + network layer */
3974 hdr_len = skb_transport_offset(skb);
3975
3976 /* + transport layer */
3977 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
3978 const struct tcphdr *th;
3979 struct tcphdr _tcphdr;
3980
3981 th = skb_header_pointer(skb, hdr_len,
3982 sizeof(_tcphdr), &_tcphdr);
3983 if (likely(th))
3984 hdr_len += __tcp_hdrlen(th);
3985 } else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
3986 struct udphdr _udphdr;
3987
3988 if (skb_header_pointer(skb, hdr_len,
3989 sizeof(_udphdr), &_udphdr))
3990 hdr_len += sizeof(struct udphdr);
3991 }
3992
3993 if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) {
3994 int payload = skb->len - hdr_len;
3995
3996 /* Malicious packet. */
3997 if (payload <= 0)
3998 return;
3999 gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size);
4000 }
4001 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
4002 }
4003}
4004
4005static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q,
4006 struct sk_buff **to_free,
4007 struct netdev_queue *txq)
4008{
4009 int rc;
4010
4011 rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK;
4012 if (rc == NET_XMIT_SUCCESS)
4013 trace_qdisc_enqueue(q, txq, skb);
4014 return rc;
4015}
4016
4017static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
4018 struct net_device *dev,
4019 struct netdev_queue *txq)
4020{
4021 spinlock_t *root_lock = qdisc_lock(q);
4022 struct sk_buff *to_free = NULL;
4023 bool contended;
4024 int rc;
4025
4026 qdisc_calculate_pkt_len(skb, q);
4027
4028 tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_DROP);
4029
4030 if (q->flags & TCQ_F_NOLOCK) {
4031 if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) &&
4032 qdisc_run_begin(q)) {
4033 /* Retest nolock_qdisc_is_empty() within the protection
4034 * of q->seqlock to protect from racing with requeuing.
4035 */
4036 if (unlikely(!nolock_qdisc_is_empty(q))) {
4037 rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
4038 __qdisc_run(q);
4039 qdisc_run_end(q);
4040
4041 goto no_lock_out;
4042 }
4043
4044 qdisc_bstats_cpu_update(q, skb);
4045 if (sch_direct_xmit(skb, q, dev, txq, NULL, true) &&
4046 !nolock_qdisc_is_empty(q))
4047 __qdisc_run(q);
4048
4049 qdisc_run_end(q);
4050 return NET_XMIT_SUCCESS;
4051 }
4052
4053 rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
4054 qdisc_run(q);
4055
4056no_lock_out:
4057 if (unlikely(to_free))
4058 kfree_skb_list_reason(to_free,
4059 tcf_get_drop_reason(to_free));
4060 return rc;
4061 }
4062
4063 if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) {
4064 kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP);
4065 return NET_XMIT_DROP;
4066 }
4067	/*
4068	 * Heuristic to force contended enqueues to serialize on a
4069	 * separate lock before trying to get the qdisc main lock.
4070	 * This permits the qdisc->running owner to get the lock more
4071	 * often and dequeue packets faster.
4072	 * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit,
4073	 * after which other tasks will only enqueue packets. Those packets will
4074	 * be sent after the qdisc owner is scheduled again. To prevent this
4075	 * scenario, tasks always serialize on the lock.
4076	 */
4077 contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
4078 if (unlikely(contended))
4079 spin_lock(&q->busylock);
4080
4081 spin_lock(root_lock);
4082 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
4083 __qdisc_drop(skb, &to_free);
4084 rc = NET_XMIT_DROP;
4085 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
4086 qdisc_run_begin(q)) {
4087 /*
4088 * This is a work-conserving queue; there are no old skbs
4089 * waiting to be sent out; and the qdisc is not running -
4090 * xmit the skb directly.
4091 */
4092
4093 qdisc_bstats_update(q, skb);
4094
4095 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
4096 if (unlikely(contended)) {
4097 spin_unlock(&q->busylock);
4098 contended = false;
4099 }
4100 __qdisc_run(q);
4101 }
4102
4103 qdisc_run_end(q);
4104 rc = NET_XMIT_SUCCESS;
4105 } else {
4106 WRITE_ONCE(q->owner, smp_processor_id());
4107 rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
4108 WRITE_ONCE(q->owner, -1);
4109 if (qdisc_run_begin(q)) {
4110 if (unlikely(contended)) {
4111 spin_unlock(&q->busylock);
4112 contended = false;
4113 }
4114 __qdisc_run(q);
4115 qdisc_run_end(q);
4116 }
4117 }
4118 spin_unlock(root_lock);
4119 if (unlikely(to_free))
4120 kfree_skb_list_reason(to_free,
4121 tcf_get_drop_reason(to_free));
4122 if (unlikely(contended))
4123 spin_unlock(&q->busylock);
4124 return rc;
4125}
4126
4127#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
4128static void skb_update_prio(struct sk_buff *skb)
4129{
4130 const struct netprio_map *map;
4131 const struct sock *sk;
4132 unsigned int prioidx;
4133
4134 if (skb->priority)
4135 return;
4136 map = rcu_dereference_bh(skb->dev->priomap);
4137 if (!map)
4138 return;
4139 sk = skb_to_full_sk(skb);
4140 if (!sk)
4141 return;
4142
4143 prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
4144
4145 if (prioidx < map->priomap_len)
4146 skb->priority = map->priomap[prioidx];
4147}
4148#else
4149#define skb_update_prio(skb)
4150#endif
4151
4152/**
4153 * dev_loopback_xmit - loop back @skb
4154 * @net: network namespace this loopback is happening in
4155 * @sk: socket; needed only so this function can be used as a netfilter okfn
4156 * @skb: buffer to transmit
4157 */
4158int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
4159{
4160 skb_reset_mac_header(skb);
4161 __skb_pull(skb, skb_network_offset(skb));
4162 skb->pkt_type = PACKET_LOOPBACK;
4163 if (skb->ip_summed == CHECKSUM_NONE)
4164 skb->ip_summed = CHECKSUM_UNNECESSARY;
4165 DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb));
4166 skb_dst_force(skb);
4167 netif_rx(skb);
4168 return 0;
4169}
4170EXPORT_SYMBOL(dev_loopback_xmit);
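/* Note: dev_loopback_xmit() is typically used as the okfn of an NF_HOOK in
 * the IPv4/IPv6 output paths when a copy of a multicast/broadcast packet is
 * looped back to local listeners; the skb is re-injected into the receive
 * path via netif_rx().
 */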
4171
4172#ifdef CONFIG_NET_EGRESS
4173static struct netdev_queue *
4174netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb)
4175{
4176 int qm = skb_get_queue_mapping(skb);
4177
4178 return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm));
4179}
4180
4181#ifndef CONFIG_PREEMPT_RT
4182static bool netdev_xmit_txqueue_skipped(void)
4183{
4184 return __this_cpu_read(softnet_data.xmit.skip_txqueue);
4185}
4186
4187void netdev_xmit_skip_txqueue(bool skip)
4188{
4189 __this_cpu_write(softnet_data.xmit.skip_txqueue, skip);
4190}
4191EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
4192
4193#else
4194static bool netdev_xmit_txqueue_skipped(void)
4195{
4196 return current->net_xmit.skip_txqueue;
4197}
4198
4199void netdev_xmit_skip_txqueue(bool skip)
4200{
4201 current->net_xmit.skip_txqueue = skip;
4202}
4203EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
4204#endif
4205#endif /* CONFIG_NET_EGRESS */
4206
4207#ifdef CONFIG_NET_XGRESS
4208static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
4209 enum skb_drop_reason *drop_reason)
4210{
4211 int ret = TC_ACT_UNSPEC;
4212#ifdef CONFIG_NET_CLS_ACT
4213 struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq);
4214 struct tcf_result res;
4215
4216 if (!miniq)
4217 return ret;
4218
4219 /* Global bypass */
4220 if (!static_branch_likely(&tcf_sw_enabled_key))
4221 return ret;
4222
4223 /* Block-wise bypass */
4224 if (tcf_block_bypass_sw(miniq->block))
4225 return ret;
4226
4227 tc_skb_cb(skb)->mru = 0;
4228 tc_skb_cb(skb)->post_ct = false;
4229 tcf_set_drop_reason(skb, *drop_reason);
4230
4231 mini_qdisc_bstats_cpu_update(miniq, skb);
4232 ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
4233 /* Only tcf related quirks below. */
4234 switch (ret) {
4235 case TC_ACT_SHOT:
4236 *drop_reason = tcf_get_drop_reason(skb);
4237 mini_qdisc_qstats_cpu_drop(miniq);
4238 break;
4239 case TC_ACT_OK:
4240 case TC_ACT_RECLASSIFY:
4241 skb->tc_index = TC_H_MIN(res.classid);
4242 break;
4243 }
4244#endif /* CONFIG_NET_CLS_ACT */
4245 return ret;
4246}
4247
4248static DEFINE_STATIC_KEY_FALSE(tcx_needed_key);
4249
4250void tcx_inc(void)
4251{
4252 static_branch_inc(&tcx_needed_key);
4253}
4254
4255void tcx_dec(void)
4256{
4257 static_branch_dec(&tcx_needed_key);
4258}
4259
4260static __always_inline enum tcx_action_base
4261tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb,
4262 const bool needs_mac)
4263{
4264 const struct bpf_mprog_fp *fp;
4265 const struct bpf_prog *prog;
4266 int ret = TCX_NEXT;
4267
4268 if (needs_mac)
4269 __skb_push(skb, skb->mac_len);
4270 bpf_mprog_foreach_prog(entry, fp, prog) {
4271 bpf_compute_data_pointers(skb);
4272 ret = bpf_prog_run(prog, skb);
4273 if (ret != TCX_NEXT)
4274 break;
4275 }
4276 if (needs_mac)
4277 __skb_pull(skb, skb->mac_len);
4278 return tcx_action_code(skb, ret);
4279}
4280
4281static __always_inline struct sk_buff *
4282sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
4283 struct net_device *orig_dev, bool *another)
4284{
4285 struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
4286 enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS;
4287 struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
4288 int sch_ret;
4289
4290 if (!entry)
4291 return skb;
4292
4293 bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
4294 if (*pt_prev) {
4295 *ret = deliver_skb(skb, *pt_prev, orig_dev);
4296 *pt_prev = NULL;
4297 }
4298
4299 qdisc_skb_cb(skb)->pkt_len = skb->len;
4300 tcx_set_ingress(skb, true);
4301
4302 if (static_branch_unlikely(&tcx_needed_key)) {
4303 sch_ret = tcx_run(entry, skb, true);
4304 if (sch_ret != TC_ACT_UNSPEC)
4305 goto ingress_verdict;
4306 }
4307 sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
4308ingress_verdict:
4309 switch (sch_ret) {
4310 case TC_ACT_REDIRECT:
4311 /* skb_mac_header check was done by BPF, so we can safely
4312 * push the L2 header back before redirecting to another
4313 * netdev.
4314 */
4315 __skb_push(skb, skb->mac_len);
4316 if (skb_do_redirect(skb) == -EAGAIN) {
4317 __skb_pull(skb, skb->mac_len);
4318 *another = true;
4319 break;
4320 }
4321 *ret = NET_RX_SUCCESS;
4322 bpf_net_ctx_clear(bpf_net_ctx);
4323 return NULL;
4324 case TC_ACT_SHOT:
4325 kfree_skb_reason(skb, drop_reason);
4326 *ret = NET_RX_DROP;
4327 bpf_net_ctx_clear(bpf_net_ctx);
4328 return NULL;
4329 /* used by tc_run */
4330 case TC_ACT_STOLEN:
4331 case TC_ACT_QUEUED:
4332 case TC_ACT_TRAP:
4333 consume_skb(skb);
4334 fallthrough;
4335 case TC_ACT_CONSUMED:
4336 *ret = NET_RX_SUCCESS;
4337 bpf_net_ctx_clear(bpf_net_ctx);
4338 return NULL;
4339 }
4340 bpf_net_ctx_clear(bpf_net_ctx);
4341
4342 return skb;
4343}
4344
4345static __always_inline struct sk_buff *
4346sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
4347{
4348 struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
4349 enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS;
4350 struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
4351 int sch_ret;
4352
4353 if (!entry)
4354 return skb;
4355
4356 bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
4357
4358	/* qdisc_skb_cb(skb)->pkt_len was already set and tcx_set_ingress()
4359	 * was already called by the caller.
4360 */
4361 if (static_branch_unlikely(&tcx_needed_key)) {
4362 sch_ret = tcx_run(entry, skb, false);
4363 if (sch_ret != TC_ACT_UNSPEC)
4364 goto egress_verdict;
4365 }
4366 sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
4367egress_verdict:
4368 switch (sch_ret) {
4369 case TC_ACT_REDIRECT:
4370 /* No need to push/pop skb's mac_header here on egress! */
4371 skb_do_redirect(skb);
4372 *ret = NET_XMIT_SUCCESS;
4373 bpf_net_ctx_clear(bpf_net_ctx);
4374 return NULL;
4375 case TC_ACT_SHOT:
4376 kfree_skb_reason(skb, drop_reason);
4377 *ret = NET_XMIT_DROP;
4378 bpf_net_ctx_clear(bpf_net_ctx);
4379 return NULL;
4380 /* used by tc_run */
4381 case TC_ACT_STOLEN:
4382 case TC_ACT_QUEUED:
4383 case TC_ACT_TRAP:
4384 consume_skb(skb);
4385 fallthrough;
4386 case TC_ACT_CONSUMED:
4387 *ret = NET_XMIT_SUCCESS;
4388 bpf_net_ctx_clear(bpf_net_ctx);
4389 return NULL;
4390 }
4391 bpf_net_ctx_clear(bpf_net_ctx);
4392
4393 return skb;
4394}
4395#else
4396static __always_inline struct sk_buff *
4397sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
4398 struct net_device *orig_dev, bool *another)
4399{
4400 return skb;
4401}
4402
4403static __always_inline struct sk_buff *
4404sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
4405{
4406 return skb;
4407}
4408#endif /* CONFIG_NET_XGRESS */
4409
4410#ifdef CONFIG_XPS
4411static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
4412 struct xps_dev_maps *dev_maps, unsigned int tci)
4413{
4414 int tc = netdev_get_prio_tc_map(dev, skb->priority);
4415 struct xps_map *map;
4416 int queue_index = -1;
4417
4418 if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids)
4419 return queue_index;
4420
4421 tci *= dev_maps->num_tc;
4422 tci += tc;
4423
4424 map = rcu_dereference(dev_maps->attr_map[tci]);
4425 if (map) {
4426 if (map->len == 1)
4427 queue_index = map->queues[0];
4428 else
4429 queue_index = map->queues[reciprocal_scale(
4430 skb_get_hash(skb), map->len)];
4431 if (unlikely(queue_index >= dev->real_num_tx_queues))
4432 queue_index = -1;
4433 }
4434 return queue_index;
4435}
4436#endif
4437
4438static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
4439 struct sk_buff *skb)
4440{
4441#ifdef CONFIG_XPS
4442 struct xps_dev_maps *dev_maps;
4443 struct sock *sk = skb->sk;
4444 int queue_index = -1;
4445
4446 if (!static_key_false(&xps_needed))
4447 return -1;
4448
4449 rcu_read_lock();
4450 if (!static_key_false(&xps_rxqs_needed))
4451 goto get_cpus_map;
4452
4453 dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]);
4454 if (dev_maps) {
4455 int tci = sk_rx_queue_get(sk);
4456
4457 if (tci >= 0)
4458 queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
4459 tci);
4460 }
4461
4462get_cpus_map:
4463 if (queue_index < 0) {
4464 dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]);
4465 if (dev_maps) {
4466 unsigned int tci = skb->sender_cpu - 1;
4467
4468 queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
4469 tci);
4470 }
4471 }
4472 rcu_read_unlock();
4473
4474 return queue_index;
4475#else
4476 return -1;
4477#endif
4478}
4479
4480u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
4481 struct net_device *sb_dev)
4482{
4483 return 0;
4484}
4485EXPORT_SYMBOL(dev_pick_tx_zero);
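/* dev_pick_tx_zero() can be used by drivers as their .ndo_select_queue
 * callback when all traffic should go through TX queue 0, e.g. when the
 * device does its own internal scheduling.
 */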
4486
4487u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
4488 struct net_device *sb_dev)
4489{
4490 struct sock *sk = skb->sk;
4491 int queue_index = sk_tx_queue_get(sk);
4492
4493 sb_dev = sb_dev ? : dev;
4494
4495 if (queue_index < 0 || skb->ooo_okay ||
4496 queue_index >= dev->real_num_tx_queues) {
4497 int new_index = get_xps_queue(dev, sb_dev, skb);
4498
4499 if (new_index < 0)
4500 new_index = skb_tx_hash(dev, sb_dev, skb);
4501
4502 if (queue_index != new_index && sk &&
4503 sk_fullsock(sk) &&
4504 rcu_access_pointer(sk->sk_dst_cache))
4505 sk_tx_queue_set(sk, new_index);
4506
4507 queue_index = new_index;
4508 }
4509
4510 return queue_index;
4511}
4512EXPORT_SYMBOL(netdev_pick_tx);
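/* A minimal sketch (hypothetical driver) of how an .ndo_select_queue
 * implementation commonly steers special traffic and falls back to
 * netdev_pick_tx() for everything else:
 *
 *	static u16 foo_select_queue(struct net_device *dev, struct sk_buff *skb,
 *				    struct net_device *sb_dev)
 *	{
 *		if (unlikely(skb->priority == TC_PRIO_CONTROL))
 *			return dev->real_num_tx_queues - 1;
 *		return netdev_pick_tx(dev, skb, sb_dev);
 *	}
 */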
4513
4514struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
4515 struct sk_buff *skb,
4516 struct net_device *sb_dev)
4517{
4518 int queue_index = 0;
4519
4520#ifdef CONFIG_XPS
4521 u32 sender_cpu = skb->sender_cpu - 1;
4522
4523 if (sender_cpu >= (u32)NR_CPUS)
4524 skb->sender_cpu = raw_smp_processor_id() + 1;
4525#endif
4526
4527 if (dev->real_num_tx_queues != 1) {
4528 const struct net_device_ops *ops = dev->netdev_ops;
4529
4530 if (ops->ndo_select_queue)
4531 queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
4532 else
4533 queue_index = netdev_pick_tx(dev, skb, sb_dev);
4534
4535 queue_index = netdev_cap_txqueue(dev, queue_index);
4536 }
4537
4538 skb_set_queue_mapping(skb, queue_index);
4539 return netdev_get_tx_queue(dev, queue_index);
4540}
4541
4542/**
4543 * __dev_queue_xmit() - transmit a buffer
4544 * @skb: buffer to transmit
4545 * @sb_dev: subordinate device used for L2 forwarding offload
4546 *
4547 * Queue a buffer for transmission to a network device. The caller must
4548 * have set the device and priority and built the buffer before calling
4549 * this function. The function can be called from an interrupt.
4550 *
4551 * When calling this method, interrupts MUST be enabled. This is because
4552 * the BH enable code must have IRQs enabled so that it will not deadlock.
4553 *
4554 * Regardless of the return value, the skb is consumed, so it is currently
4555 * difficult to retry a send to this method. (You can bump the ref count
4556 * before sending to hold a reference for retry if you are careful.)
4557 *
4558 * Return:
4559 * * 0 - buffer successfully transmitted
4560 * * positive qdisc return code - NET_XMIT_DROP etc.
4561 * * negative errno - other errors
4562 */
4563int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
4564{
4565 struct net_device *dev = skb->dev;
4566 struct netdev_queue *txq = NULL;
4567 struct Qdisc *q;
4568 int rc = -ENOMEM;
4569 bool again = false;
4570
4571 skb_reset_mac_header(skb);
4572 skb_assert_len(skb);
4573
4574 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
4575 __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
4576
4577 /* Disable soft irqs for various locks below. Also
4578 * stops preemption for RCU.
4579 */
4580 rcu_read_lock_bh();
4581
4582 skb_update_prio(skb);
4583
4584 qdisc_pkt_len_init(skb);
4585 tcx_set_ingress(skb, false);
4586#ifdef CONFIG_NET_EGRESS
4587 if (static_branch_unlikely(&egress_needed_key)) {
4588 if (nf_hook_egress_active()) {
4589 skb = nf_hook_egress(skb, &rc, dev);
4590 if (!skb)
4591 goto out;
4592 }
4593
4594 netdev_xmit_skip_txqueue(false);
4595
4596 nf_skip_egress(skb, true);
4597 skb = sch_handle_egress(skb, &rc, dev);
4598 if (!skb)
4599 goto out;
4600 nf_skip_egress(skb, false);
4601
4602 if (netdev_xmit_txqueue_skipped())
4603 txq = netdev_tx_queue_mapping(dev, skb);
4604 }
4605#endif
4606	/* If the device/qdisc doesn't need skb->dst, release it right now
4607	 * while it's still hot in this CPU's cache.
4608 */
4609 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
4610 skb_dst_drop(skb);
4611 else
4612 skb_dst_force(skb);
4613
4614 if (!txq)
4615 txq = netdev_core_pick_tx(dev, skb, sb_dev);
4616
4617 q = rcu_dereference_bh(txq->qdisc);
4618
4619 trace_net_dev_queue(skb);
4620 if (q->enqueue) {
4621 rc = __dev_xmit_skb(skb, q, dev, txq);
4622 goto out;
4623 }
4624
4625	/* The device has no queue. Common case for software devices:
4626	 * loopback, all sorts of tunnels...
4627	 *
4628	 * Really, it is unlikely that netif_tx_lock protection is necessary
4629	 * here. (e.g. loopback and IP tunnels are clean ignoring statistics
4630	 * counters.)
4631	 * However, it is possible that they rely on the protection
4632	 * made by us here.
4633	 *
4634	 * Check this and take the lock. It is not prone to deadlocks.
4635	 * Either way, shoot through the noqueue qdisc; it is even simpler 8)
4636	 */
4637 if (dev->flags & IFF_UP) {
4638 int cpu = smp_processor_id(); /* ok because BHs are off */
4639
4640 /* Other cpus might concurrently change txq->xmit_lock_owner
4641 * to -1 or to their cpu id, but not to our id.
4642 */
4643 if (READ_ONCE(txq->xmit_lock_owner) != cpu) {
4644 if (dev_xmit_recursion())
4645 goto recursion_alert;
4646
4647 skb = validate_xmit_skb(skb, dev, &again);
4648 if (!skb)
4649 goto out;
4650
4651 HARD_TX_LOCK(dev, txq, cpu);
4652
4653 if (!netif_xmit_stopped(txq)) {
4654 dev_xmit_recursion_inc();
4655 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
4656 dev_xmit_recursion_dec();
4657 if (dev_xmit_complete(rc)) {
4658 HARD_TX_UNLOCK(dev, txq);
4659 goto out;
4660 }
4661 }
4662 HARD_TX_UNLOCK(dev, txq);
4663 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
4664 dev->name);
4665 } else {
4666 /* Recursion is detected! It is possible,
4667 * unfortunately
4668 */
4669recursion_alert:
4670 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
4671 dev->name);
4672 }
4673 }
4674
4675 rc = -ENETDOWN;
4676 rcu_read_unlock_bh();
4677
4678 dev_core_stats_tx_dropped_inc(dev);
4679 kfree_skb_list(skb);
4680 return rc;
4681out:
4682 rcu_read_unlock_bh();
4683 return rc;
4684}
4685EXPORT_SYMBOL(__dev_queue_xmit);
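/* A minimal sketch of a typical caller: protocol code builds the skb, sets
 * skb->dev, and then hands it off through the dev_queue_xmit() wrapper
 * (which calls __dev_queue_xmit(skb, NULL)):
 *
 *	skb->dev = dev;
 *	skb->protocol = htons(ETH_P_IP);
 *	if (dev_hard_header(skb, dev, ETH_P_IP, daddr, dev->dev_addr,
 *			    skb->len) < 0)
 *		goto drop;
 *	dev_queue_xmit(skb);
 *
 * Regardless of the return value the skb is consumed, as noted above.
 */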
4686
4687int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
4688{
4689 struct net_device *dev = skb->dev;
4690 struct sk_buff *orig_skb = skb;
4691 struct netdev_queue *txq;
4692 int ret = NETDEV_TX_BUSY;
4693 bool again = false;
4694
4695 if (unlikely(!netif_running(dev) ||
4696 !netif_carrier_ok(dev)))
4697 goto drop;
4698
4699 skb = validate_xmit_skb_list(skb, dev, &again);
4700 if (skb != orig_skb)
4701 goto drop;
4702
4703 skb_set_queue_mapping(skb, queue_id);
4704 txq = skb_get_tx_queue(dev, skb);
4705
4706 local_bh_disable();
4707
4708 dev_xmit_recursion_inc();
4709 HARD_TX_LOCK(dev, txq, smp_processor_id());
4710 if (!netif_xmit_frozen_or_drv_stopped(txq))
4711 ret = netdev_start_xmit(skb, dev, txq, false);
4712 HARD_TX_UNLOCK(dev, txq);
4713 dev_xmit_recursion_dec();
4714
4715 local_bh_enable();
4716 return ret;
4717drop:
4718 dev_core_stats_tx_dropped_inc(dev);
4719 kfree_skb_list(skb);
4720 return NET_XMIT_DROP;
4721}
4722EXPORT_SYMBOL(__dev_direct_xmit);
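/* Note: __dev_direct_xmit() bypasses the qdisc layer entirely and is used
 * (directly or via the dev_direct_xmit() wrapper) by callers such as the
 * AF_XDP generic transmit path that manage their own queue selection and
 * flow control.
 */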
4723
4724/*************************************************************************
4725 * Receiver routines
4726 *************************************************************************/
4727static DEFINE_PER_CPU(struct task_struct *, backlog_napi);
4728
4729int weight_p __read_mostly = 64; /* old backlog weight */
4730int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */
4731int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */
4732
4733/* Called with irq disabled */
4734static inline void ____napi_schedule(struct softnet_data *sd,
4735 struct napi_struct *napi)
4736{
4737 struct task_struct *thread;
4738
4739 lockdep_assert_irqs_disabled();
4740
4741 if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
4742 /* Paired with smp_mb__before_atomic() in
4743 * napi_enable()/dev_set_threaded().
4744 * Use READ_ONCE() to guarantee a complete
4745 * read on napi->thread. Only call
4746 * wake_up_process() when it's not NULL.
4747 */
4748 thread = READ_ONCE(napi->thread);
4749 if (thread) {
4750 if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi))
4751 goto use_local_napi;
4752
4753 set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
4754 wake_up_process(thread);
4755 return;
4756 }
4757 }
4758
4759use_local_napi:
4760 list_add_tail(&napi->poll_list, &sd->poll_list);
4761 WRITE_ONCE(napi->list_owner, smp_processor_id());
4762 /* If not called from net_rx_action()
4763 * we have to raise NET_RX_SOFTIRQ.
4764 */
4765 if (!sd->in_net_rx_action)
4766 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4767}
4768
4769#ifdef CONFIG_RPS
4770
4771struct static_key_false rps_needed __read_mostly;
4772EXPORT_SYMBOL(rps_needed);
4773struct static_key_false rfs_needed __read_mostly;
4774EXPORT_SYMBOL(rfs_needed);
4775
4776static struct rps_dev_flow *
4777set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4778 struct rps_dev_flow *rflow, u16 next_cpu)
4779{
4780 if (next_cpu < nr_cpu_ids) {
4781 u32 head;
4782#ifdef CONFIG_RFS_ACCEL
4783 struct netdev_rx_queue *rxqueue;
4784 struct rps_dev_flow_table *flow_table;
4785 struct rps_dev_flow *old_rflow;
4786 u16 rxq_index;
4787 u32 flow_id;
4788 int rc;
4789
4790 /* Should we steer this flow to a different hardware queue? */
4791 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
4792 !(dev->features & NETIF_F_NTUPLE))
4793 goto out;
4794 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
4795 if (rxq_index == skb_get_rx_queue(skb))
4796 goto out;
4797
4798 rxqueue = dev->_rx + rxq_index;
4799 flow_table = rcu_dereference(rxqueue->rps_flow_table);
4800 if (!flow_table)
4801 goto out;
4802 flow_id = skb_get_hash(skb) & flow_table->mask;
4803 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
4804 rxq_index, flow_id);
4805 if (rc < 0)
4806 goto out;
4807 old_rflow = rflow;
4808 rflow = &flow_table->flows[flow_id];
4809 WRITE_ONCE(rflow->filter, rc);
4810 if (old_rflow->filter == rc)
4811 WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER);
4812 out:
4813#endif
4814 head = READ_ONCE(per_cpu(softnet_data, next_cpu).input_queue_head);
4815 rps_input_queue_tail_save(&rflow->last_qtail, head);
4816 }
4817
4818 WRITE_ONCE(rflow->cpu, next_cpu);
4819 return rflow;
4820}
4821
4822/*
4823 * get_rps_cpu is called from netif_receive_skb and returns the target
4824 * CPU from the RPS map of the receiving queue for a given skb.
4825 * rcu_read_lock must be held on entry.
4826 */
4827static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4828 struct rps_dev_flow **rflowp)
4829{
4830 const struct rps_sock_flow_table *sock_flow_table;
4831 struct netdev_rx_queue *rxqueue = dev->_rx;
4832 struct rps_dev_flow_table *flow_table;
4833 struct rps_map *map;
4834 int cpu = -1;
4835 u32 tcpu;
4836 u32 hash;
4837
4838 if (skb_rx_queue_recorded(skb)) {
4839 u16 index = skb_get_rx_queue(skb);
4840
4841 if (unlikely(index >= dev->real_num_rx_queues)) {
4842 WARN_ONCE(dev->real_num_rx_queues > 1,
4843 "%s received packet on queue %u, but number "
4844 "of RX queues is %u\n",
4845 dev->name, index, dev->real_num_rx_queues);
4846 goto done;
4847 }
4848 rxqueue += index;
4849 }
4850
4851 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
4852
4853 flow_table = rcu_dereference(rxqueue->rps_flow_table);
4854 map = rcu_dereference(rxqueue->rps_map);
4855 if (!flow_table && !map)
4856 goto done;
4857
4858 skb_reset_network_header(skb);
4859 hash = skb_get_hash(skb);
4860 if (!hash)
4861 goto done;
4862
4863 sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table);
4864 if (flow_table && sock_flow_table) {
4865 struct rps_dev_flow *rflow;
4866 u32 next_cpu;
4867 u32 ident;
4868
4869		/* First, check the global flow table for a match.
4870 * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow().
4871 */
4872 ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]);
4873 if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask)
4874 goto try_rps;
4875
4876 next_cpu = ident & net_hotdata.rps_cpu_mask;
4877
4878 /* OK, now we know there is a match,
4879 * we can look at the local (per receive queue) flow table
4880 */
4881 rflow = &flow_table->flows[hash & flow_table->mask];
4882 tcpu = rflow->cpu;
4883
4884 /*
4885 * If the desired CPU (where last recvmsg was done) is
4886 * different from current CPU (one in the rx-queue flow
4887 * table entry), switch if one of the following holds:
4888 * - Current CPU is unset (>= nr_cpu_ids).
4889 * - Current CPU is offline.
4890 * - The current CPU's queue tail has advanced beyond the
4891 * last packet that was enqueued using this table entry.
4892 * This guarantees that all previous packets for the flow
4893 * have been dequeued, thus preserving in order delivery.
4894 */
4895 if (unlikely(tcpu != next_cpu) &&
4896 (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
4897 ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) -
4898 rflow->last_qtail)) >= 0)) {
4899 tcpu = next_cpu;
4900 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
4901 }
4902
4903 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
4904 *rflowp = rflow;
4905 cpu = tcpu;
4906 goto done;
4907 }
4908 }
4909
4910try_rps:
4911
4912 if (map) {
4913 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
4914 if (cpu_online(tcpu)) {
4915 cpu = tcpu;
4916 goto done;
4917 }
4918 }
4919
4920done:
4921 return cpu;
4922}
4923
4924#ifdef CONFIG_RFS_ACCEL
4925
4926/**
4927 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
4928 * @dev: Device on which the filter was set
4929 * @rxq_index: RX queue index
4930 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
4931 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
4932 *
4933 * Drivers that implement ndo_rx_flow_steer() should periodically call
4934 * this function for each installed filter and remove the filters for
4935 * which it returns %true.
4936 */
4937bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
4938 u32 flow_id, u16 filter_id)
4939{
4940 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
4941 struct rps_dev_flow_table *flow_table;
4942 struct rps_dev_flow *rflow;
4943 bool expire = true;
4944 unsigned int cpu;
4945
4946 rcu_read_lock();
4947 flow_table = rcu_dereference(rxqueue->rps_flow_table);
4948 if (flow_table && flow_id <= flow_table->mask) {
4949 rflow = &flow_table->flows[flow_id];
4950 cpu = READ_ONCE(rflow->cpu);
4951 if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids &&
4952 ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) -
4953 READ_ONCE(rflow->last_qtail)) <
4954 (int)(10 * flow_table->mask)))
4955 expire = false;
4956 }
4957 rcu_read_unlock();
4958 return expire;
4959}
4960EXPORT_SYMBOL(rps_may_expire_flow);
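/* A minimal sketch (hypothetical driver) of the periodic expiry scan that
 * ndo_rx_flow_steer() implementations typically run, as described above:
 *
 *	for (i = 0; i < foo->arfs_filter_count; i++) {
 *		struct foo_arfs_rule *rule = &foo->arfs_rules[i];
 *
 *		if (rps_may_expire_flow(foo->netdev, rule->rxq_index,
 *					rule->flow_id, rule->filter_id))
 *			foo_remove_hw_filter(foo, rule);
 *	}
 */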
4961
4962#endif /* CONFIG_RFS_ACCEL */
4963
4964/* Called from hardirq (IPI) context */
4965static void rps_trigger_softirq(void *data)
4966{
4967 struct softnet_data *sd = data;
4968
4969 ____napi_schedule(sd, &sd->backlog);
4970 sd->received_rps++;
4971}
4972
4973#endif /* CONFIG_RPS */
4974
4975/* Called from hardirq (IPI) context */
4976static void trigger_rx_softirq(void *data)
4977{
4978 struct softnet_data *sd = data;
4979
4980 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4981 smp_store_release(&sd->defer_ipi_scheduled, 0);
4982}
4983
4984/*
4985 * After queueing a packet into sd->input_pkt_queue,
4986 * we need to make sure this queue is serviced soon.
4987 *
4988 * - If this is another CPU's queue, link it to our rps_ipi_list,
4989 *   and make sure we will process rps_ipi_list from net_rx_action().
4990 *
4991 * - If this is our own queue, NAPI-schedule our backlog.
4992 * Note that this also raises NET_RX_SOFTIRQ.
4993 */
4994static void napi_schedule_rps(struct softnet_data *sd)
4995{
4996 struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
4997
4998#ifdef CONFIG_RPS
4999 if (sd != mysd) {
5000 if (use_backlog_threads()) {
5001 __napi_schedule_irqoff(&sd->backlog);
5002 return;
5003 }
5004
5005 sd->rps_ipi_next = mysd->rps_ipi_list;
5006 mysd->rps_ipi_list = sd;
5007
5008 /* If not called from net_rx_action() or napi_threaded_poll()
5009 * we have to raise NET_RX_SOFTIRQ.
5010 */
5011 if (!mysd->in_net_rx_action && !mysd->in_napi_threaded_poll)
5012 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
5013 return;
5014 }
5015#endif /* CONFIG_RPS */
5016 __napi_schedule_irqoff(&mysd->backlog);
5017}
5018
5019void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu)
5020{
5021 unsigned long flags;
5022
5023 if (use_backlog_threads()) {
5024 backlog_lock_irq_save(sd, &flags);
5025
5026 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
5027 __napi_schedule_irqoff(&sd->backlog);
5028
5029 backlog_unlock_irq_restore(sd, &flags);
5030
5031 } else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) {
5032 smp_call_function_single_async(cpu, &sd->defer_csd);
5033 }
5034}
5035
5036#ifdef CONFIG_NET_FLOW_LIMIT
5037int netdev_flow_limit_table_len __read_mostly = (1 << 12);
5038#endif
5039
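/* skb_flow_limit() implements the (optional) per-CPU flow limit: once the
 * backlog is more than half full, it tracks the most recently enqueued
 * FLOW_LIMIT_HISTORY flows in a small bucket history and reports "limit"
 * for any single flow that accounts for more than half of them, so that one
 * elephant flow cannot monopolize the remaining backlog space. It is
 * configured via the net.core.flow_limit_cpu_bitmap and
 * net.core.flow_limit_table_len sysctls (see CONFIG_NET_FLOW_LIMIT).
 */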
5040static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
5041{
5042#ifdef CONFIG_NET_FLOW_LIMIT
5043 struct sd_flow_limit *fl;
5044 struct softnet_data *sd;
5045 unsigned int old_flow, new_flow;
5046
5047 if (qlen < (READ_ONCE(net_hotdata.max_backlog) >> 1))
5048 return false;
5049
5050 sd = this_cpu_ptr(&softnet_data);
5051
5052 rcu_read_lock();
5053 fl = rcu_dereference(sd->flow_limit);
5054 if (fl) {
5055 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
5056 old_flow = fl->history[fl->history_head];
5057 fl->history[fl->history_head] = new_flow;
5058
5059 fl->history_head++;
5060 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
5061
5062 if (likely(fl->buckets[old_flow]))
5063 fl->buckets[old_flow]--;
5064
5065 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
5066 fl->count++;
5067 rcu_read_unlock();
5068 return true;
5069 }
5070 }
5071 rcu_read_unlock();
5072#endif
5073 return false;
5074}
5075
5076/*
5077 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
5078 * queue (may be a remote CPU queue).
5079 */
5080static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
5081 unsigned int *qtail)
5082{
5083 enum skb_drop_reason reason;
5084 struct softnet_data *sd;
5085 unsigned long flags;
5086 unsigned int qlen;
5087 int max_backlog;
5088 u32 tail;
5089
5090 reason = SKB_DROP_REASON_DEV_READY;
5091 if (!netif_running(skb->dev))
5092 goto bad_dev;
5093
5094 reason = SKB_DROP_REASON_CPU_BACKLOG;
5095 sd = &per_cpu(softnet_data, cpu);
5096
5097 qlen = skb_queue_len_lockless(&sd->input_pkt_queue);
5098 max_backlog = READ_ONCE(net_hotdata.max_backlog);
5099 if (unlikely(qlen > max_backlog))
5100 goto cpu_backlog_drop;
5101 backlog_lock_irq_save(sd, &flags);
5102 qlen = skb_queue_len(&sd->input_pkt_queue);
5103 if (qlen <= max_backlog && !skb_flow_limit(skb, qlen)) {
5104 if (!qlen) {
5105			/* Schedule NAPI for the backlog device. We can use a
5106			 * non-atomic operation as we own the queue lock.
5107 */
5108 if (!__test_and_set_bit(NAPI_STATE_SCHED,
5109 &sd->backlog.state))
5110 napi_schedule_rps(sd);
5111 }
5112 __skb_queue_tail(&sd->input_pkt_queue, skb);
5113 tail = rps_input_queue_tail_incr(sd);
5114 backlog_unlock_irq_restore(sd, &flags);
5115
5116 /* save the tail outside of the critical section */
5117 rps_input_queue_tail_save(qtail, tail);
5118 return NET_RX_SUCCESS;
5119 }
5120
5121 backlog_unlock_irq_restore(sd, &flags);
5122
5123cpu_backlog_drop:
5124 atomic_inc(&sd->dropped);
5125bad_dev:
5126 dev_core_stats_rx_dropped_inc(skb->dev);
5127 kfree_skb_reason(skb, reason);
5128 return NET_RX_DROP;
5129}
5130
5131static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
5132{
5133 struct net_device *dev = skb->dev;
5134 struct netdev_rx_queue *rxqueue;
5135
5136 rxqueue = dev->_rx;
5137
5138 if (skb_rx_queue_recorded(skb)) {
5139 u16 index = skb_get_rx_queue(skb);
5140
5141 if (unlikely(index >= dev->real_num_rx_queues)) {
5142 WARN_ONCE(dev->real_num_rx_queues > 1,
5143 "%s received packet on queue %u, but number "
5144 "of RX queues is %u\n",
5145 dev->name, index, dev->real_num_rx_queues);
5146
5147 return rxqueue; /* Return first rxqueue */
5148 }
5149 rxqueue += index;
5150 }
5151 return rxqueue;
5152}
5153
5154u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
5155 const struct bpf_prog *xdp_prog)
5156{
5157 void *orig_data, *orig_data_end, *hard_start;
5158 struct netdev_rx_queue *rxqueue;
5159 bool orig_bcast, orig_host;
5160 u32 mac_len, frame_sz;
5161 __be16 orig_eth_type;
5162 struct ethhdr *eth;
5163 u32 metalen, act;
5164 int off;
5165
5166 /* The XDP program wants to see the packet starting at the MAC
5167 * header.
5168 */
5169 mac_len = skb->data - skb_mac_header(skb);
5170 hard_start = skb->data - skb_headroom(skb);
5171
5172	/* The SKB "head" area always has tailroom for skb_shared_info */
5173 frame_sz = (void *)skb_end_pointer(skb) - hard_start;
5174 frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
5175
5176 rxqueue = netif_get_rxqueue(skb);
5177 xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
5178 xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
5179 skb_headlen(skb) + mac_len, true);
5180 if (skb_is_nonlinear(skb)) {
5181 skb_shinfo(skb)->xdp_frags_size = skb->data_len;
5182 xdp_buff_set_frags_flag(xdp);
5183 } else {
5184 xdp_buff_clear_frags_flag(xdp);
5185 }
5186
5187 orig_data_end = xdp->data_end;
5188 orig_data = xdp->data;
5189 eth = (struct ethhdr *)xdp->data;
5190 orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr);
5191 orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
5192 orig_eth_type = eth->h_proto;
5193
5194 act = bpf_prog_run_xdp(xdp_prog, xdp);
5195
5196 /* check if bpf_xdp_adjust_head was used */
5197 off = xdp->data - orig_data;
5198 if (off) {
5199 if (off > 0)
5200 __skb_pull(skb, off);
5201 else if (off < 0)
5202 __skb_push(skb, -off);
5203
5204 skb->mac_header += off;
5205 skb_reset_network_header(skb);
5206 }
5207
5208 /* check if bpf_xdp_adjust_tail was used */
5209 off = xdp->data_end - orig_data_end;
5210 if (off != 0) {
5211 skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
5212 skb->len += off; /* positive on grow, negative on shrink */
5213 }
5214
5215	/* XDP frag metadata (e.g. nr_frags) is updated in eBPF helpers
5216	 * (e.g. bpf_xdp_adjust_tail), so we need to update data_len here.
5217 */
5218 if (xdp_buff_has_frags(xdp))
5219 skb->data_len = skb_shinfo(skb)->xdp_frags_size;
5220 else
5221 skb->data_len = 0;
5222
5223	/* check if XDP changed the eth hdr such that the SKB needs an update */
5224 eth = (struct ethhdr *)xdp->data;
5225 if ((orig_eth_type != eth->h_proto) ||
5226 (orig_host != ether_addr_equal_64bits(eth->h_dest,
5227 skb->dev->dev_addr)) ||
5228 (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
5229 __skb_push(skb, ETH_HLEN);
5230 skb->pkt_type = PACKET_HOST;
5231 skb->protocol = eth_type_trans(skb, skb->dev);
5232 }
5233
5234	/* Redirect/Tx gives an L2 packet; code that will reuse the skb must
5235	 * __skb_pull it before calling us again on the redirect path. We do not
5236	 * call do_redirect as we leave that up to the caller.
5237 *
5238 * Caller is responsible for managing lifetime of skb (i.e. calling
5239 * kfree_skb in response to actions it cannot handle/XDP_DROP).
5240 */
5241 switch (act) {
5242 case XDP_REDIRECT:
5243 case XDP_TX:
5244 __skb_push(skb, mac_len);
5245 break;
5246 case XDP_PASS:
5247 metalen = xdp->data - xdp->data_meta;
5248 if (metalen)
5249 skb_metadata_set(skb, metalen);
5250 break;
5251 }
5252
5253 return act;
5254}
5255
5256static int
5257netif_skb_check_for_xdp(struct sk_buff **pskb, const struct bpf_prog *prog)
5258{
5259 struct sk_buff *skb = *pskb;
5260 int err, hroom, troom;
5261
5262 if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog))
5263 return 0;
5264
5265	/* In case we have to go down this path and also linearize,
5266	 * let's do the pskb_expand_head() work just once here.
5267 */
5268 hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
5269 troom = skb->tail + skb->data_len - skb->end;
5270 err = pskb_expand_head(skb,
5271 hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
5272 troom > 0 ? troom + 128 : 0, GFP_ATOMIC);
5273 if (err)
5274 return err;
5275
5276 return skb_linearize(skb);
5277}
5278
5279static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
5280 struct xdp_buff *xdp,
5281 const struct bpf_prog *xdp_prog)
5282{
5283 struct sk_buff *skb = *pskb;
5284 u32 mac_len, act = XDP_DROP;
5285
5286 /* Reinjected packets coming from act_mirred or similar should
5287 * not get XDP generic processing.
5288 */
5289 if (skb_is_redirected(skb))
5290 return XDP_PASS;
5291
5292 /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM
5293	 * bytes. This is the guarantee that native XDP also provides,
5294	 * thus we need to enforce it here as well.
5295 */
5296 mac_len = skb->data - skb_mac_header(skb);
5297 __skb_push(skb, mac_len);
5298
5299 if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
5300 skb_headroom(skb) < XDP_PACKET_HEADROOM) {
5301 if (netif_skb_check_for_xdp(pskb, xdp_prog))
5302 goto do_drop;
5303 }
5304
5305 __skb_pull(*pskb, mac_len);
5306
5307 act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog);
5308 switch (act) {
5309 case XDP_REDIRECT:
5310 case XDP_TX:
5311 case XDP_PASS:
5312 break;
5313 default:
5314 bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act);
5315 fallthrough;
5316 case XDP_ABORTED:
5317 trace_xdp_exception((*pskb)->dev, xdp_prog, act);
5318 fallthrough;
5319 case XDP_DROP:
5320 do_drop:
5321 kfree_skb(*pskb);
5322 break;
5323 }
5324
5325 return act;
5326}
5327
5328/* When doing generic XDP we have to bypass the qdisc layer and the
5329 * network taps in order to match in-driver-XDP behavior. This also means
5330 * that XDP packets are able to starve other packets going through a qdisc,
5331 * and DDoS attacks will be more effective. In-driver XDP uses dedicated TX
5332 * queues, so it does not have this starvation issue.
5333 */
5334void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog)
5335{
5336 struct net_device *dev = skb->dev;
5337 struct netdev_queue *txq;
5338 bool free_skb = true;
5339 int cpu, rc;
5340
5341 txq = netdev_core_pick_tx(dev, skb, NULL);
5342 cpu = smp_processor_id();
5343 HARD_TX_LOCK(dev, txq, cpu);
5344 if (!netif_xmit_frozen_or_drv_stopped(txq)) {
5345 rc = netdev_start_xmit(skb, dev, txq, 0);
5346 if (dev_xmit_complete(rc))
5347 free_skb = false;
5348 }
5349 HARD_TX_UNLOCK(dev, txq);
5350 if (free_skb) {
5351 trace_xdp_exception(dev, xdp_prog, XDP_TX);
5352 dev_core_stats_tx_dropped_inc(dev);
5353 kfree_skb(skb);
5354 }
5355}
5356
5357static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
5358
5359int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb)
5360{
5361 struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
5362
5363 if (xdp_prog) {
5364 struct xdp_buff xdp;
5365 u32 act;
5366 int err;
5367
5368 bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
5369 act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog);
5370 if (act != XDP_PASS) {
5371 switch (act) {
5372 case XDP_REDIRECT:
5373 err = xdp_do_generic_redirect((*pskb)->dev, *pskb,
5374 &xdp, xdp_prog);
5375 if (err)
5376 goto out_redir;
5377 break;
5378 case XDP_TX:
5379 generic_xdp_tx(*pskb, xdp_prog);
5380 break;
5381 }
5382 bpf_net_ctx_clear(bpf_net_ctx);
5383 return XDP_DROP;
5384 }
5385 bpf_net_ctx_clear(bpf_net_ctx);
5386 }
5387 return XDP_PASS;
5388out_redir:
5389 bpf_net_ctx_clear(bpf_net_ctx);
5390 kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP);
5391 return XDP_DROP;
5392}
5393EXPORT_SYMBOL_GPL(do_xdp_generic);
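/* do_xdp_generic() is invoked from __netif_receive_skb_core() below whenever
 * the generic_xdp_needed_key static branch is enabled, i.e. when an XDP
 * program has been attached in generic (skb) mode rather than in the
 * driver's native XDP path.
 */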
5394
5395static int netif_rx_internal(struct sk_buff *skb)
5396{
5397 int ret;
5398
5399 net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);
5400
5401 trace_netif_rx(skb);
5402
5403#ifdef CONFIG_RPS
5404 if (static_branch_unlikely(&rps_needed)) {
5405 struct rps_dev_flow voidflow, *rflow = &voidflow;
5406 int cpu;
5407
5408 rcu_read_lock();
5409
5410 cpu = get_rps_cpu(skb->dev, skb, &rflow);
5411 if (cpu < 0)
5412 cpu = smp_processor_id();
5413
5414 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5415
5416 rcu_read_unlock();
5417 } else
5418#endif
5419 {
5420 unsigned int qtail;
5421
5422 ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail);
5423 }
5424 return ret;
5425}
5426
5427/**
5428 * __netif_rx - Slightly optimized version of netif_rx
5429 * @skb: buffer to post
5430 *
5431 * This behaves as netif_rx except that it does not disable bottom halves.
5432 * As a result this function may only be invoked from the interrupt context
5433 * (either hard or soft interrupt).
5434 */
5435int __netif_rx(struct sk_buff *skb)
5436{
5437 int ret;
5438
5439 lockdep_assert_once(hardirq_count() | softirq_count());
5440
5441 trace_netif_rx_entry(skb);
5442 ret = netif_rx_internal(skb);
5443 trace_netif_rx_exit(ret);
5444 return ret;
5445}
5446EXPORT_SYMBOL(__netif_rx);
5447
5448/**
5449 * netif_rx - post buffer to the network code
5450 * @skb: buffer to post
5451 *
5452 * This function receives a packet from a device driver and queues it for
5453 * the upper (protocol) levels to process via the backlog NAPI device. It
5454 * always succeeds. The buffer may be dropped during processing for
5455 * congestion control or by the protocol layers.
5456 * The buffer is handed to the backlog NAPI device for processing. Modern
5457 * NIC drivers should use NAPI and GRO instead.
5458 * This function can be used from interrupt and from process context. A
5459 * caller in process context must not disable interrupts before invoking
5460 * this function.
5461 *
5462 * return values:
5463 * NET_RX_SUCCESS (no congestion)
5464 * NET_RX_DROP (packet was dropped)
5465 *
5466 */
5467int netif_rx(struct sk_buff *skb)
5468{
5469 bool need_bh_off = !(hardirq_count() | softirq_count());
5470 int ret;
5471
5472 if (need_bh_off)
5473 local_bh_disable();
5474 trace_netif_rx_entry(skb);
5475 ret = netif_rx_internal(skb);
5476 trace_netif_rx_exit(ret);
5477 if (need_bh_off)
5478 local_bh_enable();
5479 return ret;
5480}
5481EXPORT_SYMBOL(netif_rx);
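/* A minimal sketch (hypothetical driver) of the legacy receive path that
 * feeds packets into netif_rx(); as noted above, new drivers should use
 * NAPI and napi_gro_receive() instead:
 *
 *	skb = netdev_alloc_skb(dev, pkt_len + NET_IP_ALIGN);
 *	if (!skb)
 *		return;
 *	skb_reserve(skb, NET_IP_ALIGN);
 *	skb_put_data(skb, rx_buf, pkt_len);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 */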
5482
5483static __latent_entropy void net_tx_action(void)
5484{
5485 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5486
5487 if (sd->completion_queue) {
5488 struct sk_buff *clist;
5489
5490 local_irq_disable();
5491 clist = sd->completion_queue;
5492 sd->completion_queue = NULL;
5493 local_irq_enable();
5494
5495 while (clist) {
5496 struct sk_buff *skb = clist;
5497
5498 clist = clist->next;
5499
5500 WARN_ON(refcount_read(&skb->users));
5501 if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED))
5502 trace_consume_skb(skb, net_tx_action);
5503 else
5504 trace_kfree_skb(skb, net_tx_action,
5505 get_kfree_skb_cb(skb)->reason, NULL);
5506
5507 if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
5508 __kfree_skb(skb);
5509 else
5510 __napi_kfree_skb(skb,
5511 get_kfree_skb_cb(skb)->reason);
5512 }
5513 }
5514
5515 if (sd->output_queue) {
5516 struct Qdisc *head;
5517
5518 local_irq_disable();
5519 head = sd->output_queue;
5520 sd->output_queue = NULL;
5521 sd->output_queue_tailp = &sd->output_queue;
5522 local_irq_enable();
5523
5524 rcu_read_lock();
5525
5526 while (head) {
5527 struct Qdisc *q = head;
5528 spinlock_t *root_lock = NULL;
5529
5530 head = head->next_sched;
5531
5532 /* We need to make sure head->next_sched is read
5533 * before clearing __QDISC_STATE_SCHED
5534 */
5535 smp_mb__before_atomic();
5536
5537 if (!(q->flags & TCQ_F_NOLOCK)) {
5538 root_lock = qdisc_lock(q);
5539 spin_lock(root_lock);
5540 } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
5541 &q->state))) {
5542 /* There is a synchronize_net() between
5543 * STATE_DEACTIVATED flag being set and
5544 * qdisc_reset()/some_qdisc_is_busy() in
5545 * dev_deactivate(), so we can safely bail out
5546 * early here to avoid data race between
5547 * qdisc_deactivate() and some_qdisc_is_busy()
5548 * for lockless qdisc.
5549 */
5550 clear_bit(__QDISC_STATE_SCHED, &q->state);
5551 continue;
5552 }
5553
5554 clear_bit(__QDISC_STATE_SCHED, &q->state);
5555 qdisc_run(q);
5556 if (root_lock)
5557 spin_unlock(root_lock);
5558 }
5559
5560 rcu_read_unlock();
5561 }
5562
5563 xfrm_dev_backlog(sd);
5564}
5565
5566#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
5567/* This hook is defined here for ATM LANE */
5568int (*br_fdb_test_addr_hook)(struct net_device *dev,
5569 unsigned char *addr) __read_mostly;
5570EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
5571#endif
5572
5573/**
5574 * netdev_is_rx_handler_busy - check if receive handler is registered
5575 * @dev: device to check
5576 *
5577 * Check if a receive handler is already registered for a given device.
5578 * Return true if there is one.
5579 *
5580 * The caller must hold the rtnl_mutex.
5581 */
5582bool netdev_is_rx_handler_busy(struct net_device *dev)
5583{
5584 ASSERT_RTNL();
5585 return dev && rtnl_dereference(dev->rx_handler);
5586}
5587EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
5588
5589/**
5590 * netdev_rx_handler_register - register receive handler
5591 * @dev: device to register a handler for
5592 * @rx_handler: receive handler to register
5593 * @rx_handler_data: data pointer that is used by rx handler
5594 *
5595 * Register a receive handler for a device. This handler will then be
5596 * called from __netif_receive_skb. A negative errno code is returned
5597 * on a failure.
5598 *
5599 * The caller must hold the rtnl_mutex.
5600 *
5601 * For a general description of rx_handler, see enum rx_handler_result.
5602 */
5603int netdev_rx_handler_register(struct net_device *dev,
5604 rx_handler_func_t *rx_handler,
5605 void *rx_handler_data)
5606{
5607 if (netdev_is_rx_handler_busy(dev))
5608 return -EBUSY;
5609
5610 if (dev->priv_flags & IFF_NO_RX_HANDLER)
5611 return -EINVAL;
5612
5613 /* Note: rx_handler_data must be set before rx_handler */
5614 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
5615 rcu_assign_pointer(dev->rx_handler, rx_handler);
5616
5617 return 0;
5618}
5619EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
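/* A minimal sketch of how upper devices such as bridge, bonding, or macvlan
 * attach themselves to a lower device (hypothetical names, under rtnl_lock):
 *
 *	err = netdev_rx_handler_register(lower_dev, foo_handle_frame, foo_port);
 *	if (err)
 *		return err;
 *
 * where foo_handle_frame() later retrieves its private data with
 * rcu_dereference(skb->dev->rx_handler_data) and returns one of the
 * RX_HANDLER_* verdicts consumed by __netif_receive_skb_core() below.
 */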
5620
5621/**
5622 * netdev_rx_handler_unregister - unregister receive handler
5623 * @dev: device to unregister a handler from
5624 *
5625 * Unregister a receive handler from a device.
5626 *
5627 * The caller must hold the rtnl_mutex.
5628 */
5629void netdev_rx_handler_unregister(struct net_device *dev)
5630{
5631
5632 ASSERT_RTNL();
5633 RCU_INIT_POINTER(dev->rx_handler, NULL);
5634	/* A reader seeing a non-NULL rx_handler in an rcu_read_lock()
5635	 * section is guaranteed to see a non-NULL rx_handler_data
5636	 * as well.
5637 */
5638 synchronize_net();
5639 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
5640}
5641EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
5642
5643/*
5644 * Limit the use of PFMEMALLOC reserves to those protocols that implement
5645 * the special handling of PFMEMALLOC skbs.
5646 */
5647static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
5648{
5649 switch (skb->protocol) {
5650 case htons(ETH_P_ARP):
5651 case htons(ETH_P_IP):
5652 case htons(ETH_P_IPV6):
5653 case htons(ETH_P_8021Q):
5654 case htons(ETH_P_8021AD):
5655 return true;
5656 default:
5657 return false;
5658 }
5659}
5660
5661static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
5662 int *ret, struct net_device *orig_dev)
5663{
5664 if (nf_hook_ingress_active(skb)) {
5665 int ingress_retval;
5666
5667 if (*pt_prev) {
5668 *ret = deliver_skb(skb, *pt_prev, orig_dev);
5669 *pt_prev = NULL;
5670 }
5671
5672 rcu_read_lock();
5673 ingress_retval = nf_hook_ingress(skb);
5674 rcu_read_unlock();
5675 return ingress_retval;
5676 }
5677 return 0;
5678}
5679
5680static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
5681 struct packet_type **ppt_prev)
5682{
5683 struct packet_type *ptype, *pt_prev;
5684 rx_handler_func_t *rx_handler;
5685 struct sk_buff *skb = *pskb;
5686 struct net_device *orig_dev;
5687 bool deliver_exact = false;
5688 int ret = NET_RX_DROP;
5689 __be16 type;
5690
5691 net_timestamp_check(!READ_ONCE(net_hotdata.tstamp_prequeue), skb);
5692
5693 trace_netif_receive_skb(skb);
5694
5695 orig_dev = skb->dev;
5696
5697 skb_reset_network_header(skb);
5698#if !defined(CONFIG_DEBUG_NET)
5699 /* We plan to no longer reset the transport header here.
5700	 * Give fuzzers and dev builds some time to catch bugs
5701	 * in network stacks.
5702 */
5703 if (!skb_transport_header_was_set(skb))
5704 skb_reset_transport_header(skb);
5705#endif
5706 skb_reset_mac_len(skb);
5707
5708 pt_prev = NULL;
5709
5710another_round:
5711 skb->skb_iif = skb->dev->ifindex;
5712
5713 __this_cpu_inc(softnet_data.processed);
5714
5715 if (static_branch_unlikely(&generic_xdp_needed_key)) {
5716 int ret2;
5717
5718 migrate_disable();
5719 ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
5720 &skb);
5721 migrate_enable();
5722
5723 if (ret2 != XDP_PASS) {
5724 ret = NET_RX_DROP;
5725 goto out;
5726 }
5727 }
5728
5729 if (eth_type_vlan(skb->protocol)) {
5730 skb = skb_vlan_untag(skb);
5731 if (unlikely(!skb))
5732 goto out;
5733 }
5734
5735 if (skb_skip_tc_classify(skb))
5736 goto skip_classify;
5737
5738 if (pfmemalloc)
5739 goto skip_taps;
5740
5741 list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) {
5742 if (pt_prev)
5743 ret = deliver_skb(skb, pt_prev, orig_dev);
5744 pt_prev = ptype;
5745 }
5746
5747 list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
5748 if (pt_prev)
5749 ret = deliver_skb(skb, pt_prev, orig_dev);
5750 pt_prev = ptype;
5751 }
5752
5753skip_taps:
5754#ifdef CONFIG_NET_INGRESS
5755 if (static_branch_unlikely(&ingress_needed_key)) {
5756 bool another = false;
5757
5758 nf_skip_egress(skb, true);
5759 skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
5760 &another);
5761 if (another)
5762 goto another_round;
5763 if (!skb)
5764 goto out;
5765
5766 nf_skip_egress(skb, false);
5767 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
5768 goto out;
5769 }
5770#endif
5771 skb_reset_redirect(skb);
5772skip_classify:
5773 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
5774 goto drop;
5775
5776 if (skb_vlan_tag_present(skb)) {
5777 if (pt_prev) {
5778 ret = deliver_skb(skb, pt_prev, orig_dev);
5779 pt_prev = NULL;
5780 }
5781 if (vlan_do_receive(&skb))
5782 goto another_round;
5783 else if (unlikely(!skb))
5784 goto out;
5785 }
5786
5787 rx_handler = rcu_dereference(skb->dev->rx_handler);
5788 if (rx_handler) {
5789 if (pt_prev) {
5790 ret = deliver_skb(skb, pt_prev, orig_dev);
5791 pt_prev = NULL;
5792 }
5793 switch (rx_handler(&skb)) {
5794 case RX_HANDLER_CONSUMED:
5795 ret = NET_RX_SUCCESS;
5796 goto out;
5797 case RX_HANDLER_ANOTHER:
5798 goto another_round;
5799 case RX_HANDLER_EXACT:
5800 deliver_exact = true;
5801 break;
5802 case RX_HANDLER_PASS:
5803 break;
5804 default:
5805 BUG();
5806 }
5807 }
5808
5809 if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
5810check_vlan_id:
5811 if (skb_vlan_tag_get_id(skb)) {
5812 /* Vlan id is non 0 and vlan_do_receive() above couldn't
5813 * find vlan device.
5814 */
5815 skb->pkt_type = PACKET_OTHERHOST;
5816 } else if (eth_type_vlan(skb->protocol)) {
5817 /* Outer header is 802.1P with vlan 0, inner header is
5818 * 802.1Q or 802.1AD and vlan_do_receive() above could
5819 * not find vlan dev for vlan id 0.
5820 */
5821 __vlan_hwaccel_clear_tag(skb);
5822 skb = skb_vlan_untag(skb);
5823 if (unlikely(!skb))
5824 goto out;
5825 if (vlan_do_receive(&skb))
5826 /* After stripping off 802.1P header with vlan 0
5827 * vlan dev is found for inner header.
5828 */
5829 goto another_round;
5830 else if (unlikely(!skb))
5831 goto out;
5832 else
5833 /* We have stripped outer 802.1P vlan 0 header.
5834 * But could not find vlan dev.
5835 * check again for vlan id to set OTHERHOST.
5836 */
5837 goto check_vlan_id;
5838 }
5839 /* Note: we might in the future use prio bits
5840 * and set skb->priority like in vlan_do_receive()
5841 * For the time being, just ignore Priority Code Point
5842 */
5843 __vlan_hwaccel_clear_tag(skb);
5844 }
5845
5846 type = skb->protocol;
5847
5848 /* deliver only exact match when indicated */
5849 if (likely(!deliver_exact)) {
5850 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5851 &ptype_base[ntohs(type) &
5852 PTYPE_HASH_MASK]);
5853 }
5854
5855 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5856 &orig_dev->ptype_specific);
5857
5858 if (unlikely(skb->dev != orig_dev)) {
5859 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5860 &skb->dev->ptype_specific);
5861 }
5862
5863 if (pt_prev) {
5864 if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
5865 goto drop;
5866 *ppt_prev = pt_prev;
5867 } else {
5868drop:
5869 if (!deliver_exact)
5870 dev_core_stats_rx_dropped_inc(skb->dev);
5871 else
5872 dev_core_stats_rx_nohandler_inc(skb->dev);
5873 kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO);
5874		/* Jamal, now you will not be able to escape explaining
5875		 * to me how you were going to use this. :-)
5876 */
5877 ret = NET_RX_DROP;
5878 }
5879
5880out:
5881 /* The invariant here is that if *ppt_prev is not NULL
5882 * then skb should also be non-NULL.
5883 *
5884 * Apparently the *ppt_prev assignment above upholds this invariant,
5885 * because skb is dereferenced right next to it.
5886 */
5887 *pskb = skb;
5888 return ret;
5889}
5890
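/* Single-skb wrapper around __netif_receive_skb_core(): run the core receive
 * path and, if a packet_type handler was left pending, deliver the skb to it.
 * INDIRECT_CALL_INET() avoids a retpoline for the common ip_rcv()/ipv6_rcv()
 * handlers.
 */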
5891static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
5892{
5893 struct net_device *orig_dev = skb->dev;
5894 struct packet_type *pt_prev = NULL;
5895 int ret;
5896
5897 ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
5898 if (pt_prev)
5899 ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
5900 skb->dev, pt_prev, orig_dev);
5901 return ret;
5902}
5903
5904/**
5905 * netif_receive_skb_core - special purpose version of netif_receive_skb
5906 * @skb: buffer to process
5907 *
5908 * More direct receive version of netif_receive_skb(). It should
5909 * only be used by callers that have a need to skip RPS and Generic XDP.
5910 * The caller must also take care of handling ``(page_is_)pfmemalloc`` pages.
5911 *
5912 * This function may only be called from softirq context and interrupts
5913 * should be enabled.
5914 *
5915 * Return values (usually ignored):
5916 * NET_RX_SUCCESS: no congestion
5917 * NET_RX_DROP: packet was dropped
5918 */
5919int netif_receive_skb_core(struct sk_buff *skb)
5920{
5921 int ret;
5922
5923 rcu_read_lock();
5924 ret = __netif_receive_skb_one_core(skb, false);
5925 rcu_read_unlock();
5926
5927 return ret;
5928}
5929EXPORT_SYMBOL(netif_receive_skb_core);
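
/* Illustrative usage sketch (editorial addition, not kernel code): a driver
 * that wants to bypass RPS and generic XDP could feed packets in from its
 * NAPI poll handler, e.g. with a hypothetical helper my_drv_next_rx_skb():
 *
 *	while ((skb = my_drv_next_rx_skb(rxq)) != NULL)
 *		netif_receive_skb_core(skb);
 *
 * The calls must come from softirq context with interrupts enabled, and the
 * caller remains responsible for pfmemalloc handling, as noted above.
 */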
5930
5931static inline void __netif_receive_skb_list_ptype(struct list_head *head,
5932 struct packet_type *pt_prev,
5933 struct net_device *orig_dev)
5934{
5935 struct sk_buff *skb, *next;
5936
5937 if (!pt_prev)
5938 return;
5939 if (list_empty(head))
5940 return;
5941 if (pt_prev->list_func != NULL)
5942 INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
5943 ip_list_rcv, head, pt_prev, orig_dev);
5944 else
5945 list_for_each_entry_safe(skb, next, head, list) {
5946 skb_list_del_init(skb);
5947 pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
5948 }
5949}
5950
5951static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
5952{
5953 /* Fast-path assumptions:
5954 * - There is no RX handler.
5955 * - Only one packet_type matches.
5956 * If either of these fails, we will end up doing some per-packet
5957 * processing in-line, then handling the 'last ptype' for the whole
5958 * sublist. This can't cause out-of-order delivery to any single ptype,
5959 * because the 'last ptype' must be constant across the sublist, and all
5960 * other ptypes are handled per-packet.
5961 */
5962 /* Current (common) ptype of sublist */
5963 struct packet_type *pt_curr = NULL;
5964 /* Current (common) orig_dev of sublist */
5965 struct net_device *od_curr = NULL;
5966 struct sk_buff *skb, *next;
5967 LIST_HEAD(sublist);
5968
5969 list_for_each_entry_safe(skb, next, head, list) {
5970 struct net_device *orig_dev = skb->dev;
5971 struct packet_type *pt_prev = NULL;
5972
5973 skb_list_del_init(skb);
5974 __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
5975 if (!pt_prev)
5976 continue;
5977 if (pt_curr != pt_prev || od_curr != orig_dev) {
5978 /* dispatch old sublist */
5979 __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5980 /* start new sublist */
5981 INIT_LIST_HEAD(&sublist);
5982 pt_curr = pt_prev;
5983 od_curr = orig_dev;
5984 }
5985 list_add_tail(&skb->list, &sublist);
5986 }
5987
5988 /* dispatch final sublist */
5989 __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5990}
5991
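/* Receive one skb, taking pfmemalloc into account: pfmemalloc skbs are
 * processed with PF_MEMALLOC set via memalloc_noreclaim_save(), so that
 * allocations made by the protocol handlers may dip into memory reserves,
 * while __netif_receive_skb_core() limits delivery to pfmemalloc-capable
 * protocols.
 */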
5992static int __netif_receive_skb(struct sk_buff *skb)
5993{
5994 int ret;
5995
5996 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
5997 unsigned int noreclaim_flag;
5998
5999 /*
6000 * PFMEMALLOC skbs are special, they should
6001 * - be delivered to SOCK_MEMALLOC sockets only
6002 * - stay away from userspace
6003 * - have bounded memory usage
6004 *
6005 * Use PF_MEMALLOC as this saves us from propagating the allocation
6006 * context down to all allocation sites.
6007 */
6008 noreclaim_flag = memalloc_noreclaim_save();
6009 ret = __netif_receive_skb_one_core(skb, true);
6010 memalloc_noreclaim_restore(noreclaim_flag);
6011 } else
6012 ret = __netif_receive_skb_one_core(skb, false);
6013
6014 return ret;
6015}
6016
6017static void __netif_receive_skb_list(struct list_head *head)
6018{
6019 unsigned long noreclaim_flag = 0;
6020 struct sk_buff *skb, *next;
6021 bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
6022
6023 list_for_each_entry_safe(skb, next, head, list) {
6024 if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
6025 struct list_head sublist;
6026
6027 /* Handle the previous sublist */
6028 list_cut_before(&sublist, head, &skb->list);
6029 if (!list_empty(&sublist))
6030 __netif_receive_skb_list_core(&sublist, pfmemalloc);
6031 pfmemalloc = !pfmemalloc;
6032 /* See comments in __netif_receive_skb */
6033 if (pfmemalloc)
6034 noreclaim_flag = memalloc_noreclaim_save();
6035 else
6036 memalloc_noreclaim_restore(noreclaim_flag);
6037 }
6038 }
6039 /* Handle the remaining sublist */
6040 if (!list_empty(head))
6041 __netif_receive_skb_list_core(head, pfmemalloc);
6042 /* Restore pflags */
6043 if (pfmemalloc)
6044 memalloc_noreclaim_restore(noreclaim_flag);
6045}
6046
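/* Attach or detach the generic (skb-based) XDP program for a device.
 * XDP_SETUP_PROG swaps dev->xdp_prog and adjusts the generic_xdp_needed_key
 * static branch; when the first program is installed, LRO and hardware GRO
 * are disabled so generic XDP does not see aggregated packets.
 */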
6047static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
6048{
6049 struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
6050 struct bpf_prog *new = xdp->prog;
6051 int ret = 0;
6052
6053 switch (xdp->command) {
6054 case XDP_SETUP_PROG:
6055 rcu_assign_pointer(dev->xdp_prog, new);
6056 if (old)
6057 bpf_prog_put(old);
6058
6059 if (old && !new) {
6060 static_branch_dec(&generic_xdp_needed_key);
6061 } else if (new && !old) {
6062 static_branch_inc(&generic_xdp_needed_key);
6063 dev_disable_lro(dev);
6064 dev_disable_gro_hw(dev);
6065 }
6066 break;
6067
6068 default:
6069 ret = -EINVAL;
6070 break;
6071 }
6072
6073 return ret;
6074}
6075
6076static int netif_receive_skb_internal(struct sk_buff *skb)
6077{
6078 int ret;
6079
6080 net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);
6081
6082 if (skb_defer_rx_timestamp(skb))
6083 return NET_RX_SUCCESS;
6084
6085 rcu_read_lock();
6086#ifdef CONFIG_RPS
6087 if (static_branch_unlikely(&rps_needed)) {
6088 struct rps_dev_flow voidflow, *rflow = &voidflow;
6089 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
6090
6091 if (cpu >= 0) {
6092 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
6093 rcu_read_unlock();
6094 return ret;
6095 }
6096 }
6097#endif
6098 ret = __netif_receive_skb(skb);
6099 rcu_read_unlock();
6100 return ret;
6101}
6102
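/* List counterpart of netif_receive_skb_internal(): timestamp the skbs,
 * filter out the ones consumed by skb_defer_rx_timestamp(), let RPS steer
 * individual skbs to remote backlogs, and hand the remainder to
 * __netif_receive_skb_list().
 */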
6103void netif_receive_skb_list_internal(struct list_head *head)
6104{
6105 struct sk_buff *skb, *next;
6106 LIST_HEAD(sublist);
6107
6108 list_for_each_entry_safe(skb, next, head, list) {
6109 net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue),
6110 skb);
6111 skb_list_del_init(skb);
6112 if (!skb_defer_rx_timestamp(skb))
6113 list_add_tail(&skb->list, &sublist);
6114 }
6115 list_splice_init(&sublist, head);
6116
6117 rcu_read_lock();
6118#ifdef CONFIG_RPS
6119 if (static_branch_unlikely(&rps_needed)) {
6120 list_for_each_entry_safe(skb, next, head, list) {
6121 struct rps_dev_flow voidflow, *rflow = &voidflow;
6122 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
6123
6124 if (cpu >= 0) {
6125 /* Will be handled, remove from list */
6126 skb_list_del_init(skb);
6127 enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
6128 }
6129 }
6130 }
6131#endif
6132 __netif_receive_skb_list(head);
6133 rcu_read_unlock();
6134}
6135
6136/**
6137 * netif_receive_skb - process receive buffer from network
6138 * @skb: buffer to process
6139 *
6140 * netif_receive_skb() is the main receive data processing function.
6141 * It always succeeds. The buffer may be dropped during processing
6142 * for congestion control or by the protocol layers.
6143 *
6144 * This function may only be called from softirq context and interrupts
6145 * should be enabled.
6146 *
6147 * Return values (usually ignored):
6148 * NET_RX_SUCCESS: no congestion
6149 * NET_RX_DROP: packet was dropped
6150 */
6151int netif_receive_skb(struct sk_buff *skb)
6152{
6153 int ret;
6154
6155 trace_netif_receive_skb_entry(skb);
6156
6157 ret = netif_receive_skb_internal(skb);
6158 trace_netif_receive_skb_exit(ret);
6159
6160 return ret;
6161}
6162EXPORT_SYMBOL(netif_receive_skb);
6163
6164/**
6165 * netif_receive_skb_list - process many receive buffers from network
6166 * @head: list of skbs to process.
6167 *
6168 * Since the return value of netif_receive_skb() is normally ignored, and
6169 * wouldn't be meaningful for a list, this function returns void.
6170 *
6171 * This function may only be called from softirq context and interrupts
6172 * should be enabled.
6173 */
6174void netif_receive_skb_list(struct list_head *head)
6175{
6176 struct sk_buff *skb;
6177
6178 if (list_empty(head))
6179 return;
6180 if (trace_netif_receive_skb_list_entry_enabled()) {
6181 list_for_each_entry(skb, head, list)
6182 trace_netif_receive_skb_list_entry(skb);
6183 }
6184 netif_receive_skb_list_internal(head);
6185 trace_netif_receive_skb_list_exit(0);
6186}
6187EXPORT_SYMBOL(netif_receive_skb_list);
6188
6189/* Network device is going away, flush any packets still pending */
6190static void flush_backlog(struct work_struct *work)
6191{
6192 struct sk_buff *skb, *tmp;
6193 struct softnet_data *sd;
6194
6195 local_bh_disable();
6196 sd = this_cpu_ptr(&softnet_data);
6197
6198 backlog_lock_irq_disable(sd);
6199 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
6200 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
6201 __skb_unlink(skb, &sd->input_pkt_queue);
6202 dev_kfree_skb_irq(skb);
6203 rps_input_queue_head_incr(sd);
6204 }
6205 }
6206 backlog_unlock_irq_enable(sd);
6207
6208 local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
6209 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
6210 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
6211 __skb_unlink(skb, &sd->process_queue);
6212 kfree_skb(skb);
6213 rps_input_queue_head_incr(sd);
6214 }
6215 }
6216 local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
6217 local_bh_enable();
6218}
6219
6220static bool flush_required(int cpu)
6221{
6222#if IS_ENABLED(CONFIG_RPS)
6223 struct softnet_data *sd = &per_cpu(softnet_data, cpu);
6224 bool do_flush;
6225
6226 backlog_lock_irq_disable(sd);
6227
6228 /* As insertion into process_queue happens with the RPS lock held,
6229 * process_queue access may race only with dequeue.
6230 */
6231 do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
6232 !skb_queue_empty_lockless(&sd->process_queue);
6233 backlog_unlock_irq_enable(sd);
6234
6235 return do_flush;
6236#endif
6237 /* Without RPS we can't safely check input_pkt_queue: during a
6238 * concurrent remote skb_queue_splice() we could see both
6239 * input_pkt_queue and process_queue as empty even though the latter
6240 * might end up containing a lot of packets.
6241 */
6242 return true;
6243}
6244
6245struct flush_backlogs {
6246 cpumask_t flush_cpus;
6247 struct work_struct w[];
6248};
6249
6250static struct flush_backlogs *flush_backlogs_alloc(void)
6251{
6252 return kmalloc(struct_size_t(struct flush_backlogs, w, nr_cpu_ids),
6253 GFP_KERNEL);
6254}
6255
6256static struct flush_backlogs *flush_backlogs_fallback;
6257static DEFINE_MUTEX(flush_backlogs_mutex);
6258
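/* Flush the backlog queues of every online CPU that has pending packets.
 * A flush_backlog work item is queued on each such CPU and waited for; if
 * the flush_backlogs structure cannot be allocated, a static fallback
 * protected by flush_backlogs_mutex is used instead.
 */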
6259static void flush_all_backlogs(void)
6260{
6261 struct flush_backlogs *ptr = flush_backlogs_alloc();
6262 unsigned int cpu;
6263
6264 if (!ptr) {
6265 mutex_lock(&flush_backlogs_mutex);
6266 ptr = flush_backlogs_fallback;
6267 }
6268 cpumask_clear(&ptr->flush_cpus);
6269
6270 cpus_read_lock();
6271
6272 for_each_online_cpu(cpu) {
6273 if (flush_required(cpu)) {
6274 INIT_WORK(&ptr->w[cpu], flush_backlog);
6275 queue_work_on(cpu, system_highpri_wq, &ptr->w[cpu]);
6276 __cpumask_set_cpu(cpu, &ptr->flush_cpus);
6277 }
6278 }
6279
6280 /* We can have in-flight packets on the CPUs we are not flushing;
6281 * synchronize_net() in unregister_netdevice_many() will take care of
6282 * them.
6283 */
6284 for_each_cpu(cpu, &ptr->flush_cpus)
6285 flush_work(&ptr->w[cpu]);
6286
6287 cpus_read_unlock();
6288
6289 if (ptr != flush_backlogs_fallback)
6290 kfree(ptr);
6291 else
6292 mutex_unlock(&flush_backlogs_mutex);
6293}
6294
6295static void net_rps_send_ipi(struct softnet_data *remsd)
6296{
6297#ifdef CONFIG_RPS
6298 while (remsd) {
6299 struct softnet_data *next = remsd->rps_ipi_next;
6300
6301 if (cpu_online(remsd->cpu))
6302 smp_call_function_single_async(remsd->cpu, &remsd->csd);
6303 remsd = next;
6304 }
6305#endif
6306}
6307
6308/*
6309 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
6310 * Note: called with local irq disabled, but exits with local irq enabled.
6311 */
6312static void net_rps_action_and_irq_enable(struct softnet_data *sd)
6313{
6314#ifdef CONFIG_RPS
6315 struct softnet_data *remsd = sd->rps_ipi_list;
6316
6317 if (!use_backlog_threads() && remsd) {
6318 sd->rps_ipi_list = NULL;
6319
6320 local_irq_enable();
6321
6322 /* Send pending IPI's to kick RPS processing on remote cpus. */
6323 net_rps_send_ipi(remsd);
6324 } else
6325#endif
6326 local_irq_enable();
6327}
6328
6329static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
6330{
6331#ifdef CONFIG_RPS
6332 return !use_backlog_threads() && sd->rps_ipi_list;
6333#else
6334 return false;
6335#endif
6336}
6337
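/* NAPI poll callback of the per-CPU backlog queue: drain sd->process_queue,
 * refilling it from sd->input_pkt_queue under the backlog lock, until either
 * the quota is hit or both queues are empty.
 */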
6338static int process_backlog(struct napi_struct *napi, int quota)
6339{
6340 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
6341 bool again = true;
6342 int work = 0;
6343
6344 /* Check if we have pending IPIs; it's better to send them now
6345 * rather than waiting for net_rx_action() to end.
6346 */
6347 if (sd_has_rps_ipi_waiting(sd)) {
6348 local_irq_disable();
6349 net_rps_action_and_irq_enable(sd);
6350 }
6351
6352 napi->weight = READ_ONCE(net_hotdata.dev_rx_weight);
6353 while (again) {
6354 struct sk_buff *skb;
6355
6356 local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
6357 while ((skb = __skb_dequeue(&sd->process_queue))) {
6358 local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
6359 rcu_read_lock();
6360 __netif_receive_skb(skb);
6361 rcu_read_unlock();
6362 if (++work >= quota) {
6363 rps_input_queue_head_add(sd, work);
6364 return work;
6365 }
6366
6367 local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
6368 }
6369 local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
6370
6371 backlog_lock_irq_disable(sd);
6372 if (skb_queue_empty(&sd->input_pkt_queue)) {
6373 /*
6374 * Inline a custom version of __napi_complete().
6375 * Only the current CPU owns and manipulates this napi,
6376 * and NAPI_STATE_SCHED is the only possible flag set
6377 * on the backlog.
6378 * We can use a plain write instead of clear_bit(),
6379 * and we don't need an smp_mb() memory barrier.
6380 */
6381 napi->state &= NAPIF_STATE_THREADED;
6382 again = false;
6383 } else {
6384 local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
6385 skb_queue_splice_tail_init(&sd->input_pkt_queue,
6386 &sd->process_queue);
6387 local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
6388 }
6389 backlog_unlock_irq_enable(sd);
6390 }
6391
6392 if (work)
6393 rps_input_queue_head_add(sd, work);
6394 return work;
6395}
6396
6397/**
6398 * __napi_schedule - schedule for receive
6399 * @n: entry to schedule
6400 *
6401 * The entry's receive function will be scheduled to run.
6402 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
6403 */
6404void __napi_schedule(struct napi_struct *n)
6405{
6406 unsigned long flags;
6407
6408 local_irq_save(flags);
6409 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
6410 local_irq_restore(flags);
6411}
6412EXPORT_SYMBOL(__napi_schedule);
6413
6414/**
6415 * napi_schedule_prep - check if napi can be scheduled
6416 * @n: napi context
6417 *
6418 * Test if NAPI routine is already running, and if not mark
6419 * it as running. This is used as a condition variable to
6420 * ensure only one NAPI poll instance runs. We also make
6421 * sure there is no pending NAPI disable.
6422 */
6423bool napi_schedule_prep(struct napi_struct *n)
6424{
6425 unsigned long new, val = READ_ONCE(n->state);
6426
6427 do {
6428 if (unlikely(val & NAPIF_STATE_DISABLE))
6429 return false;
6430 new = val | NAPIF_STATE_SCHED;
6431
6432 /* Sets the STATE_MISSED bit if STATE_SCHED was already set.
6433 * This was suggested by Alexander Duyck, as the compiler
6434 * emits better code than:
6435 * if (val & NAPIF_STATE_SCHED)
6436 * new |= NAPIF_STATE_MISSED;
6437 */
6438 new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
6439 NAPIF_STATE_MISSED;
6440 } while (!try_cmpxchg(&n->state, &val, new));
6441
6442 return !(val & NAPIF_STATE_SCHED);
6443}
6444EXPORT_SYMBOL(napi_schedule_prep);
6445
6446/**
6447 * __napi_schedule_irqoff - schedule for receive
6448 * @n: entry to schedule
6449 *
6450 * Variant of __napi_schedule() assuming hard irqs are masked.
6451 *
6452 * On PREEMPT_RT enabled kernels this maps to __napi_schedule()
6453 * because the interrupt disabled assumption might not be true
6454 * due to force-threaded interrupts and spinlock substitution.
6455 */
6456void __napi_schedule_irqoff(struct napi_struct *n)
6457{
6458 if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6459 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
6460 else
6461 __napi_schedule(n);
6462}
6463EXPORT_SYMBOL(__napi_schedule_irqoff);
6464
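/* Complete a NAPI poll cycle. Returns false when the caller must keep the
 * instance scheduled, e.g. because busy polling owns it, because interrupt
 * deferral (defer_hard_irqs/gro_flush_timeout) is armed, or because another
 * schedule request was missed while polling.
 */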
6465bool napi_complete_done(struct napi_struct *n, int work_done)
6466{
6467 unsigned long flags, val, new, timeout = 0;
6468 bool ret = true;
6469
6470 /*
6471 * 1) Don't let napi dequeue from the cpu poll list
6472 * just in case it's running on a different cpu.
6473 * 2) If we are busy polling, do nothing here, we have
6474 * the guarantee we will be called later.
6475 */
6476 if (unlikely(n->state & (NAPIF_STATE_NPSVC |
6477 NAPIF_STATE_IN_BUSY_POLL)))
6478 return false;
6479
6480 if (work_done) {
6481 if (n->gro_bitmask)
6482 timeout = napi_get_gro_flush_timeout(n);
6483 n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n);
6484 }
6485 if (n->defer_hard_irqs_count > 0) {
6486 n->defer_hard_irqs_count--;
6487 timeout = napi_get_gro_flush_timeout(n);
6488 if (timeout)
6489 ret = false;
6490 }
6491 if (n->gro_bitmask) {
6492 /* When the NAPI instance uses a timeout and keeps postponing
6493 * it, we need to somehow bound the time packets are kept in
6494 * the GRO layer.
6495 */
6496 napi_gro_flush(n, !!timeout);
6497 }
6498
6499 gro_normal_list(n);
6500
6501 if (unlikely(!list_empty(&n->poll_list))) {
6502 /* If n->poll_list is not empty, we need to mask irqs */
6503 local_irq_save(flags);
6504 list_del_init(&n->poll_list);
6505 local_irq_restore(flags);
6506 }
6507 WRITE_ONCE(n->list_owner, -1);
6508
6509 val = READ_ONCE(n->state);
6510 do {
6511 WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
6512
6513 new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
6514 NAPIF_STATE_SCHED_THREADED |
6515 NAPIF_STATE_PREFER_BUSY_POLL);
6516
6517 /* If STATE_MISSED was set, leave STATE_SCHED set,
6518 * because we will call napi->poll() one more time.
6519 * This C code was suggested by Alexander Duyck to help gcc.
6520 */
6521 new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
6522 NAPIF_STATE_SCHED;
6523 } while (!try_cmpxchg(&n->state, &val, new));
6524
6525 if (unlikely(val & NAPIF_STATE_MISSED)) {
6526 __napi_schedule(n);
6527 return false;
6528 }
6529
6530 if (timeout)
6531 hrtimer_start(&n->timer, ns_to_ktime(timeout),
6532 HRTIMER_MODE_REL_PINNED);
6533 return ret;
6534}
6535EXPORT_SYMBOL(napi_complete_done);
6536
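/* Free the skbs that were handed back to this CPU's softnet_data via
 * skb_attempt_defer_free(), so that freeing happens on the CPU that
 * allocated them.
 */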
6537static void skb_defer_free_flush(struct softnet_data *sd)
6538{
6539 struct sk_buff *skb, *next;
6540
6541 /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */
6542 if (!READ_ONCE(sd->defer_list))
6543 return;
6544
6545 spin_lock(&sd->defer_lock);
6546 skb = sd->defer_list;
6547 sd->defer_list = NULL;
6548 sd->defer_count = 0;
6549 spin_unlock(&sd->defer_lock);
6550
6551 while (skb != NULL) {
6552 next = skb->next;
6553 napi_consume_skb(skb, 1);
6554 skb = next;
6555 }
6556}
6557
6558#if defined(CONFIG_NET_RX_BUSY_POLL)
6559
6560static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
6561{
6562 if (!skip_schedule) {
6563 gro_normal_list(napi);
6564 __napi_schedule(napi);
6565 return;
6566 }
6567
6568 if (napi->gro_bitmask) {
6569 /* Flush packets that are too old.
6570 * If HZ < 1000, flush all packets.
6571 */
6572 napi_gro_flush(napi, HZ >= 1000);
6573 }
6574
6575 gro_normal_list(napi);
6576 clear_bit(NAPI_STATE_SCHED, &napi->state);
6577}
6578
6579enum {
6580 NAPI_F_PREFER_BUSY_POLL = 1,
6581 NAPI_F_END_ON_RESCHED = 2,
6582};
6583
6584static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
6585 unsigned flags, u16 budget)
6586{
6587 struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
6588 bool skip_schedule = false;
6589 unsigned long timeout;
6590 int rc;
6591
6592 /* Busy polling means there is a high chance the device driver's hard irq
6593 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
6594 * set in napi_schedule_prep().
6595 * Since we are about to call napi->poll() once more, we can safely
6596 * clear NAPI_STATE_MISSED.
6597 *
6598 * Note: x86 could use a single "lock and ..." instruction
6599 * to perform these two clear_bit() operations.
6600 */
6601 clear_bit(NAPI_STATE_MISSED, &napi->state);
6602 clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
6603
6604 local_bh_disable();
6605 bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
6606
6607 if (flags & NAPI_F_PREFER_BUSY_POLL) {
6608 napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi);
6609 timeout = napi_get_gro_flush_timeout(napi);
6610 if (napi->defer_hard_irqs_count && timeout) {
6611 hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
6612 skip_schedule = true;
6613 }
6614 }
6615
6616 /* All we really want here is to re-enable device interrupts.
6617 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
6618 */
6619 rc = napi->poll(napi, budget);
6620 /* We can't gro_normal_list() here, because napi->poll() might have
6621 * rearmed the napi (napi_complete_done()) in which case it could
6622 * already be running on another CPU.
6623 */
6624 trace_napi_poll(napi, rc, budget);
6625 netpoll_poll_unlock(have_poll_lock);
6626 if (rc == budget)
6627 __busy_poll_stop(napi, skip_schedule);
6628 bpf_net_ctx_clear(bpf_net_ctx);
6629 local_bh_enable();
6630}
6631
6632static void __napi_busy_loop(unsigned int napi_id,
6633 bool (*loop_end)(void *, unsigned long),
6634 void *loop_end_arg, unsigned flags, u16 budget)
6635{
6636 unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
6637 int (*napi_poll)(struct napi_struct *napi, int budget);
6638 struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
6639 void *have_poll_lock = NULL;
6640 struct napi_struct *napi;
6641
6642 WARN_ON_ONCE(!rcu_read_lock_held());
6643
6644restart:
6645 napi_poll = NULL;
6646
6647 napi = napi_by_id(napi_id);
6648 if (!napi)
6649 return;
6650
6651 if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6652 preempt_disable();
6653 for (;;) {
6654 int work = 0;
6655
6656 local_bh_disable();
6657 bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
6658 if (!napi_poll) {
6659 unsigned long val = READ_ONCE(napi->state);
6660
6661 /* If multiple threads are competing for this napi,
6662 * we avoid dirtying napi->state as much as we can.
6663 */
6664 if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
6665 NAPIF_STATE_IN_BUSY_POLL)) {
6666 if (flags & NAPI_F_PREFER_BUSY_POLL)
6667 set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
6668 goto count;
6669 }
6670 if (cmpxchg(&napi->state, val,
6671 val | NAPIF_STATE_IN_BUSY_POLL |
6672 NAPIF_STATE_SCHED) != val) {
6673 if (flags & NAPI_F_PREFER_BUSY_POLL)
6674 set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
6675 goto count;
6676 }
6677 have_poll_lock = netpoll_poll_lock(napi);
6678 napi_poll = napi->poll;
6679 }
6680 work = napi_poll(napi, budget);
6681 trace_napi_poll(napi, work, budget);
6682 gro_normal_list(napi);
6683count:
6684 if (work > 0)
6685 __NET_ADD_STATS(dev_net(napi->dev),
6686 LINUX_MIB_BUSYPOLLRXPACKETS, work);
6687 skb_defer_free_flush(this_cpu_ptr(&softnet_data));
6688 bpf_net_ctx_clear(bpf_net_ctx);
6689 local_bh_enable();
6690
6691 if (!loop_end || loop_end(loop_end_arg, start_time))
6692 break;
6693
6694 if (unlikely(need_resched())) {
6695 if (flags & NAPI_F_END_ON_RESCHED)
6696 break;
6697 if (napi_poll)
6698 busy_poll_stop(napi, have_poll_lock, flags, budget);
6699 if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6700 preempt_enable();
6701 rcu_read_unlock();
6702 cond_resched();
6703 rcu_read_lock();
6704 if (loop_end(loop_end_arg, start_time))
6705 return;
6706 goto restart;
6707 }
6708 cpu_relax();
6709 }
6710 if (napi_poll)
6711 busy_poll_stop(napi, have_poll_lock, flags, budget);
6712 if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6713 preempt_enable();
6714}
6715
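/* Variant of napi_busy_loop() for callers that already hold rcu_read_lock().
 * The loop never releases the RCU read lock, so it ends on need_resched()
 * (NAPI_F_END_ON_RESCHED) instead of rescheduling.
 */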
6716void napi_busy_loop_rcu(unsigned int napi_id,
6717 bool (*loop_end)(void *, unsigned long),
6718 void *loop_end_arg, bool prefer_busy_poll, u16 budget)
6719{
6720 unsigned flags = NAPI_F_END_ON_RESCHED;
6721
6722 if (prefer_busy_poll)
6723 flags |= NAPI_F_PREFER_BUSY_POLL;
6724
6725 __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
6726}
6727
6728void napi_busy_loop(unsigned int napi_id,
6729 bool (*loop_end)(void *, unsigned long),
6730 void *loop_end_arg, bool prefer_busy_poll, u16 budget)
6731{
6732 unsigned flags = prefer_busy_poll ? NAPI_F_PREFER_BUSY_POLL : 0;
6733
6734 rcu_read_lock();
6735 __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
6736 rcu_read_unlock();
6737}
6738EXPORT_SYMBOL(napi_busy_loop);
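
/* Illustrative usage sketch (editorial addition, not kernel code): the
 * loop_end callback is typically a simple deadline check. Assuming
 * busy_loop_timeout() from <net/busy_poll.h>, a caller could do:
 *
 *	static bool my_loop_end(void *arg, unsigned long start_time)
 *	{
 *		return busy_loop_timeout(start_time);
 *	}
 *
 *	napi_busy_loop(napi_id, my_loop_end, NULL, false, 64);
 *
 * where my_loop_end and the budget of 64 are only examples.
 */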
6739
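/* Arm the napi timer with the configured irq_suspend_timeout so device
 * interrupts stay masked while an application busy polls; napi_watchdog()
 * reschedules the napi when the timeout expires.
 */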
6740void napi_suspend_irqs(unsigned int napi_id)
6741{
6742 struct napi_struct *napi;
6743
6744 rcu_read_lock();
6745 napi = napi_by_id(napi_id);
6746 if (napi) {
6747 unsigned long timeout = napi_get_irq_suspend_timeout(napi);
6748
6749 if (timeout)
6750 hrtimer_start(&napi->timer, ns_to_ktime(timeout),
6751 HRTIMER_MODE_REL_PINNED);
6752 }
6753 rcu_read_unlock();
6754}
6755
6756void napi_resume_irqs(unsigned int napi_id)
6757{
6758 struct napi_struct *napi;
6759
6760 rcu_read_lock();
6761 napi = napi_by_id(napi_id);
6762 if (napi) {
6763 /* If irq_suspend_timeout is set to 0 between the call to
6764 * napi_suspend_irqs and now, the original value still
6765 * determines the safety timeout as intended and napi_watchdog
6766 * will resume irq processing.
6767 */
6768 if (napi_get_irq_suspend_timeout(napi)) {
6769 local_bh_disable();
6770 napi_schedule(napi);
6771 local_bh_enable();
6772 }
6773 }
6774 rcu_read_unlock();
6775}
6776
6777#endif /* CONFIG_NET_RX_BUSY_POLL */
6778
6779static void __napi_hash_add_with_id(struct napi_struct *napi,
6780 unsigned int napi_id)
6781{
6782 WRITE_ONCE(napi->napi_id, napi_id);
6783 hlist_add_head_rcu(&napi->napi_hash_node,
6784 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
6785}
6786
6787static void napi_hash_add_with_id(struct napi_struct *napi,
6788 unsigned int napi_id)
6789{
6790 unsigned long flags;
6791
6792 spin_lock_irqsave(&napi_hash_lock, flags);
6793 WARN_ON_ONCE(napi_by_id(napi_id));
6794 __napi_hash_add_with_id(napi, napi_id);
6795 spin_unlock_irqrestore(&napi_hash_lock, flags);
6796}
6797
6798static void napi_hash_add(struct napi_struct *napi)
6799{
6800 unsigned long flags;
6801
6802 if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
6803 return;
6804
6805 spin_lock_irqsave(&napi_hash_lock, flags);
6806
6807 /* 0..NR_CPUS range is reserved for sender_cpu use */
6808 do {
6809 if (unlikely(++napi_gen_id < MIN_NAPI_ID))
6810 napi_gen_id = MIN_NAPI_ID;
6811 } while (napi_by_id(napi_gen_id));
6812
6813 __napi_hash_add_with_id(napi, napi_gen_id);
6814
6815 spin_unlock_irqrestore(&napi_hash_lock, flags);
6816}
6817
6818/* Warning: the caller is responsible for making sure an RCU grace period
6819 * has elapsed before freeing the memory containing @napi.
6820 */
6821static void napi_hash_del(struct napi_struct *napi)
6822{
6823 unsigned long flags;
6824
6825 spin_lock_irqsave(&napi_hash_lock, flags);
6826
6827 hlist_del_init_rcu(&napi->napi_hash_node);
6828
6829 spin_unlock_irqrestore(&napi_hash_lock, flags);
6830}
6831
6832static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
6833{
6834 struct napi_struct *napi;
6835
6836 napi = container_of(timer, struct napi_struct, timer);
6837
6838 /* Note: we use a relaxed variant of napi_schedule_prep(), not setting
6839 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
6840 */
6841 if (!napi_disable_pending(napi) &&
6842 !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) {
6843 clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
6844 __napi_schedule_irqoff(napi);
6845 }
6846
6847 return HRTIMER_NORESTART;
6848}
6849
6850static void init_gro_hash(struct napi_struct *napi)
6851{
6852 int i;
6853
6854 for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6855 INIT_LIST_HEAD(&napi->gro_hash[i].list);
6856 napi->gro_hash[i].count = 0;
6857 }
6858 napi->gro_bitmask = 0;
6859}
6860
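/* Switch all NAPI instances of @dev between threaded and softirq polling.
 * Kthreads are created on demand when enabling threaded mode; the per-napi
 * NAPI_STATE_THREADED bit only takes effect at the next napi_schedule().
 */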
6861int dev_set_threaded(struct net_device *dev, bool threaded)
6862{
6863 struct napi_struct *napi;
6864 int err = 0;
6865
6866 netdev_assert_locked_or_invisible(dev);
6867
6868 if (dev->threaded == threaded)
6869 return 0;
6870
6871 if (threaded) {
6872 list_for_each_entry(napi, &dev->napi_list, dev_list) {
6873 if (!napi->thread) {
6874 err = napi_kthread_create(napi);
6875 if (err) {
6876 threaded = false;
6877 break;
6878 }
6879 }
6880 }
6881 }
6882
6883 WRITE_ONCE(dev->threaded, threaded);
6884
6885 /* Make sure kthread is created before THREADED bit
6886 * is set.
6887 */
6888 smp_mb__before_atomic();
6889
6890 /* Setting/unsetting threaded mode on a napi might not immediately
6891 * take effect, if the current napi instance is actively being
6892 * polled. In this case, the switch between threaded mode and
6893 * softirq mode will happen in the next round of napi_schedule().
6894 * This should not cause hiccups/stalls to the live traffic.
6895 */
6896 list_for_each_entry(napi, &dev->napi_list, dev_list)
6897 assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
6898
6899 return err;
6900}
6901EXPORT_SYMBOL(dev_set_threaded);
6902
6903/**
6904 * netif_queue_set_napi - Associate queue with the napi
6905 * @dev: device to which NAPI and queue belong
6906 * @queue_index: Index of queue
6907 * @type: queue type as RX or TX
6908 * @napi: NAPI context, pass NULL to clear previously set NAPI
6909 *
6910 * Associate the queue with its corresponding napi context. This should be done after
6911 * registering the NAPI handler for the queue-vector and the queues have been
6912 * mapped to the corresponding interrupt vector.
6913 */
6914void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
6915 enum netdev_queue_type type, struct napi_struct *napi)
6916{
6917 struct netdev_rx_queue *rxq;
6918 struct netdev_queue *txq;
6919
6920 if (WARN_ON_ONCE(napi && !napi->dev))
6921 return;
6922 if (dev->reg_state >= NETREG_REGISTERED)
6923 ASSERT_RTNL();
6924
6925 switch (type) {
6926 case NETDEV_QUEUE_TYPE_RX:
6927 rxq = __netif_get_rx_queue(dev, queue_index);
6928 rxq->napi = napi;
6929 return;
6930 case NETDEV_QUEUE_TYPE_TX:
6931 txq = netdev_get_tx_queue(dev, queue_index);
6932 txq->napi = napi;
6933 return;
6934 default:
6935 return;
6936 }
6937}
6938EXPORT_SYMBOL(netif_queue_set_napi);
6939
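/* Load the per-NAPI settings (defer_hard_irqs, gro_flush_timeout,
 * irq_suspend_timeout and the persistent NAPI ID) from the napi_config slot
 * when a napi is enabled; napi_save_config() below writes them back on
 * disable.
 */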
6940static void napi_restore_config(struct napi_struct *n)
6941{
6942 n->defer_hard_irqs = n->config->defer_hard_irqs;
6943 n->gro_flush_timeout = n->config->gro_flush_timeout;
6944 n->irq_suspend_timeout = n->config->irq_suspend_timeout;
6945 /* A NAPI ID might be stored in the config; if so, use it. If not, use
6946 * napi_hash_add() to generate one for us.
6947 */
6948 if (n->config->napi_id) {
6949 napi_hash_add_with_id(n, n->config->napi_id);
6950 } else {
6951 napi_hash_add(n);
6952 n->config->napi_id = n->napi_id;
6953 }
6954}
6955
6956static void napi_save_config(struct napi_struct *n)
6957{
6958 n->config->defer_hard_irqs = n->defer_hard_irqs;
6959 n->config->gro_flush_timeout = n->gro_flush_timeout;
6960 n->config->irq_suspend_timeout = n->irq_suspend_timeout;
6961 napi_hash_del(n);
6962}
6963
6964/* Netlink wants the NAPI list to be sorted by ID; if adding a NAPI which will
6965 * inherit an existing ID, try to insert it at the right position.
6966 */
6967static void
6968netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi)
6969{
6970 unsigned int new_id, pos_id;
6971 struct list_head *higher;
6972 struct napi_struct *pos;
6973
6974 new_id = UINT_MAX;
6975 if (napi->config && napi->config->napi_id)
6976 new_id = napi->config->napi_id;
6977
6978 higher = &dev->napi_list;
6979 list_for_each_entry(pos, &dev->napi_list, dev_list) {
6980 if (pos->napi_id >= MIN_NAPI_ID)
6981 pos_id = pos->napi_id;
6982 else if (pos->config)
6983 pos_id = pos->config->napi_id;
6984 else
6985 pos_id = UINT_MAX;
6986
6987 if (pos_id <= new_id)
6988 break;
6989 higher = &pos->dev_list;
6990 }
6991 list_add_rcu(&napi->dev_list, higher); /* adds after higher */
6992}
6993
6994/* Double check that napi_get_frags() allocates skbs with
6995 * skb->head being backed by slab, not a page fragment.
6996 * This is to make sure the bug fixed in 3226b158e67c
6997 * ("net: avoid 32 x truesize under-estimation for tiny skbs")
6998 * does not accidentally come back.
6999 */
7000static void napi_get_frags_check(struct napi_struct *napi)
7001{
7002 struct sk_buff *skb;
7003
7004 local_bh_disable();
7005 skb = napi_get_frags(napi);
7006 WARN_ON_ONCE(skb && skb->head_frag);
7007 napi_free_frags(napi);
7008 local_bh_enable();
7009}
7010
7011void netif_napi_add_weight_locked(struct net_device *dev,
7012 struct napi_struct *napi,
7013 int (*poll)(struct napi_struct *, int),
7014 int weight)
7015{
7016 netdev_assert_locked(dev);
7017 if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
7018 return;
7019
7020 INIT_LIST_HEAD(&napi->poll_list);
7021 INIT_HLIST_NODE(&napi->napi_hash_node);
7022 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
7023 napi->timer.function = napi_watchdog;
7024 init_gro_hash(napi);
7025 napi->skb = NULL;
7026 INIT_LIST_HEAD(&napi->rx_list);
7027 napi->rx_count = 0;
7028 napi->poll = poll;
7029 if (weight > NAPI_POLL_WEIGHT)
7030 netdev_err_once(dev, "%s() called with weight %d\n", __func__,
7031 weight);
7032 napi->weight = weight;
7033 napi->dev = dev;
7034#ifdef CONFIG_NETPOLL
7035 napi->poll_owner = -1;
7036#endif
7037 napi->list_owner = -1;
7038 set_bit(NAPI_STATE_SCHED, &napi->state);
7039 set_bit(NAPI_STATE_NPSVC, &napi->state);
7040 netif_napi_dev_list_add(dev, napi);
7041
7042 /* Default settings from sysfs are applied to all NAPIs. Any per-NAPI
7043 * configuration will be loaded in napi_enable().
7044 */
7045 napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
7046 napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout));
7047
7048 napi_get_frags_check(napi);
7049 /* Create kthread for this napi if dev->threaded is set.
7050 * Clear dev->threaded if kthread creation failed so that
7051 * threaded mode will not be enabled in napi_enable().
7052 */
7053 if (dev->threaded && napi_kthread_create(napi))
7054 dev->threaded = false;
7055 netif_napi_set_irq_locked(napi, -1);
7056}
7057EXPORT_SYMBOL(netif_napi_add_weight_locked);
7058
7059void napi_disable_locked(struct napi_struct *n)
7060{
7061 unsigned long val, new;
7062
7063 might_sleep();
7064 netdev_assert_locked(n->dev);
7065
7066 set_bit(NAPI_STATE_DISABLE, &n->state);
7067
7068 val = READ_ONCE(n->state);
7069 do {
7070 while (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) {
7071 usleep_range(20, 200);
7072 val = READ_ONCE(n->state);
7073 }
7074
7075 new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC;
7076 new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL);
7077 } while (!try_cmpxchg(&n->state, &val, new));
7078
7079 hrtimer_cancel(&n->timer);
7080
7081 if (n->config)
7082 napi_save_config(n);
7083 else
7084 napi_hash_del(n);
7085
7086 clear_bit(NAPI_STATE_DISABLE, &n->state);
7087}
7088EXPORT_SYMBOL(napi_disable_locked);
7089
7090/**
7091 * napi_disable() - prevent NAPI from scheduling
7092 * @n: NAPI context
7093 *
7094 * Stop NAPI from being scheduled on this context.
7095 * Waits until any outstanding processing completes.
7096 * Takes netdev_lock() for associated net_device.
7097 */
7098void napi_disable(struct napi_struct *n)
7099{
7100 netdev_lock(n->dev);
7101 napi_disable_locked(n);
7102 netdev_unlock(n->dev);
7103}
7104EXPORT_SYMBOL(napi_disable);
7105
7106void napi_enable_locked(struct napi_struct *n)
7107{
7108 unsigned long new, val = READ_ONCE(n->state);
7109
7110 if (n->config)
7111 napi_restore_config(n);
7112 else
7113 napi_hash_add(n);
7114
7115 do {
7116 BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));
7117
7118 new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC);
7119 if (n->dev->threaded && n->thread)
7120 new |= NAPIF_STATE_THREADED;
7121 } while (!try_cmpxchg(&n->state, &val, new));
7122}
7123EXPORT_SYMBOL(napi_enable_locked);
7124
7125/**
7126 * napi_enable() - enable NAPI scheduling
7127 * @n: NAPI context
7128 *
7129 * Enable scheduling of a NAPI instance.
7130 * Must be paired with napi_disable().
7131 * Takes netdev_lock() for associated net_device.
7132 */
7133void napi_enable(struct napi_struct *n)
7134{
7135 netdev_lock(n->dev);
7136 napi_enable_locked(n);
7137 netdev_unlock(n->dev);
7138}
7139EXPORT_SYMBOL(napi_enable);
7140
7141static void flush_gro_hash(struct napi_struct *napi)
7142{
7143 int i;
7144
7145 for (i = 0; i < GRO_HASH_BUCKETS; i++) {
7146 struct sk_buff *skb, *n;
7147
7148 list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
7149 kfree_skb(skb);
7150 napi->gro_hash[i].count = 0;
7151 }
7152}
7153
7154/* Must be called in process context */
7155void __netif_napi_del_locked(struct napi_struct *napi)
7156{
7157 netdev_assert_locked(napi->dev);
7158
7159 if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
7160 return;
7161
7162 if (napi->config) {
7163 napi->index = -1;
7164 napi->config = NULL;
7165 }
7166
7167 list_del_rcu(&napi->dev_list);
7168 napi_free_frags(napi);
7169
7170 flush_gro_hash(napi);
7171 napi->gro_bitmask = 0;
7172
7173 if (napi->thread) {
7174 kthread_stop(napi->thread);
7175 napi->thread = NULL;
7176 }
7177}
7178EXPORT_SYMBOL(__netif_napi_del_locked);
7179
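/* Run one ->poll() invocation for @n with its configured weight as budget.
 * Returns the amount of work done and sets *repoll when the full budget was
 * consumed and the instance should stay on the poll list.
 */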
7180static int __napi_poll(struct napi_struct *n, bool *repoll)
7181{
7182 int work, weight;
7183
7184 weight = n->weight;
7185
7186 /* This NAPI_STATE_SCHED test is for avoiding a race
7187 * with netpoll's poll_napi(). Only the entity which
7188 * obtains the lock and sees NAPI_STATE_SCHED set will
7189 * actually make the ->poll() call. Therefore we avoid
7190 * accidentally calling ->poll() when NAPI is not scheduled.
7191 */
7192 work = 0;
7193 if (napi_is_scheduled(n)) {
7194 work = n->poll(n, weight);
7195 trace_napi_poll(n, work, weight);
7196
7197 xdp_do_check_flushed(n);
7198 }
7199
7200 if (unlikely(work > weight))
7201 netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
7202 n->poll, work, weight);
7203
7204 if (likely(work < weight))
7205 return work;
7206
7207 /* Drivers must not modify the NAPI state if they
7208 * consume the entire weight. In such cases this code
7209 * still "owns" the NAPI instance and therefore can
7210 * move the instance around on the list at-will.
7211 */
7212 if (unlikely(napi_disable_pending(n))) {
7213 napi_complete(n);
7214 return work;
7215 }
7216
7217 /* The NAPI context has more processing work, but busy-polling
7218 * is preferred. Exit early.
7219 */
7220 if (napi_prefer_busy_poll(n)) {
7221 if (napi_complete_done(n, work)) {
7222 /* If timeout is not set, we need to make sure
7223 * that the NAPI is re-scheduled.
7224 */
7225 napi_schedule(n);
7226 }
7227 return work;
7228 }
7229
7230 if (n->gro_bitmask) {
7231 /* Flush packets that are too old.
7232 * If HZ < 1000, flush all packets.
7233 */
7234 napi_gro_flush(n, HZ >= 1000);
7235 }
7236
7237 gro_normal_list(n);
7238
7239 /* Some drivers may have called napi_schedule
7240 * prior to exhausting their budget.
7241 */
7242 if (unlikely(!list_empty(&n->poll_list))) {
7243 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
7244 n->dev ? n->dev->name : "backlog");
7245 return work;
7246 }
7247
7248 *repoll = true;
7249
7250 return work;
7251}
7252
7253static int napi_poll(struct napi_struct *n, struct list_head *repoll)
7254{
7255 bool do_repoll = false;
7256 void *have;
7257 int work;
7258
7259 list_del_init(&n->poll_list);
7260
7261 have = netpoll_poll_lock(n);
7262
7263 work = __napi_poll(n, &do_repoll);
7264
7265 if (do_repoll)
7266 list_add_tail(&n->poll_list, repoll);
7267
7268 netpoll_poll_unlock(have);
7269
7270 return work;
7271}
7272
7273static int napi_thread_wait(struct napi_struct *napi)
7274{
7275 set_current_state(TASK_INTERRUPTIBLE);
7276
7277 while (!kthread_should_stop()) {
7278 /* Test the SCHED_THREADED bit here to make sure the current
7279 * kthread owns this napi and can poll on it.
7280 * Testing SCHED bit is not enough because SCHED bit might be
7281 * set by some other busy poll thread or by napi_disable().
7282 */
7283 if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) {
7284 WARN_ON(!list_empty(&napi->poll_list));
7285 __set_current_state(TASK_RUNNING);
7286 return 0;
7287 }
7288
7289 schedule();
7290 set_current_state(TASK_INTERRUPTIBLE);
7291 }
7292 __set_current_state(TASK_RUNNING);
7293
7294 return -1;
7295}
7296
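/* Poll loop of a threaded NAPI kthread: repeatedly call __napi_poll() with
 * bottom halves disabled, flush deferred frees and pending RPS IPIs after
 * each round, and stop once the instance reports no more work to repoll.
 */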
7297static void napi_threaded_poll_loop(struct napi_struct *napi)
7298{
7299 struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
7300 struct softnet_data *sd;
7301 unsigned long last_qs = jiffies;
7302
7303 for (;;) {
7304 bool repoll = false;
7305 void *have;
7306
7307 local_bh_disable();
7308 bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
7309
7310 sd = this_cpu_ptr(&softnet_data);
7311 sd->in_napi_threaded_poll = true;
7312
7313 have = netpoll_poll_lock(napi);
7314 __napi_poll(napi, &repoll);
7315 netpoll_poll_unlock(have);
7316
7317 sd->in_napi_threaded_poll = false;
7318 barrier();
7319
7320 if (sd_has_rps_ipi_waiting(sd)) {
7321 local_irq_disable();
7322 net_rps_action_and_irq_enable(sd);
7323 }
7324 skb_defer_free_flush(sd);
7325 bpf_net_ctx_clear(bpf_net_ctx);
7326 local_bh_enable();
7327
7328 if (!repoll)
7329 break;
7330
7331 rcu_softirq_qs_periodic(last_qs);
7332 cond_resched();
7333 }
7334}
7335
7336static int napi_threaded_poll(void *data)
7337{
7338 struct napi_struct *napi = data;
7339
7340 while (!napi_thread_wait(napi))
7341 napi_threaded_poll_loop(napi);
7342
7343 return 0;
7344}
7345
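/* NET_RX_SOFTIRQ handler: poll the NAPI instances queued on this CPU's
 * softnet_data within the netdev_budget / netdev_budget_usecs limits, and
 * requeue whatever could not be finished (raising the softirq again).
 */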
7346static __latent_entropy void net_rx_action(void)
7347{
7348 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
7349 unsigned long time_limit = jiffies +
7350 usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs));
7351 struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
7352 int budget = READ_ONCE(net_hotdata.netdev_budget);
7353 LIST_HEAD(list);
7354 LIST_HEAD(repoll);
7355
7356 bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
7357start:
7358 sd->in_net_rx_action = true;
7359 local_irq_disable();
7360 list_splice_init(&sd->poll_list, &list);
7361 local_irq_enable();
7362
7363 for (;;) {
7364 struct napi_struct *n;
7365
7366 skb_defer_free_flush(sd);
7367
7368 if (list_empty(&list)) {
7369 if (list_empty(&repoll)) {
7370 sd->in_net_rx_action = false;
7371 barrier();
7372 /* We need to check if ____napi_schedule()
7373 * had refilled poll_list while
7374 * sd->in_net_rx_action was true.
7375 */
7376 if (!list_empty(&sd->poll_list))
7377 goto start;
7378 if (!sd_has_rps_ipi_waiting(sd))
7379 goto end;
7380 }
7381 break;
7382 }
7383
7384 n = list_first_entry(&list, struct napi_struct, poll_list);
7385 budget -= napi_poll(n, &repoll);
7386
7387 /* If the softirq window is exhausted then punt.
7388 * Allow this to run for 2 jiffies, which will allow
7389 * an average latency of 1.5/HZ.
7390 */
7391 if (unlikely(budget <= 0 ||
7392 time_after_eq(jiffies, time_limit))) {
7393 sd->time_squeeze++;
7394 break;
7395 }
7396 }
7397
7398 local_irq_disable();
7399
7400 list_splice_tail_init(&sd->poll_list, &list);
7401 list_splice_tail(&repoll, &list);
7402 list_splice(&list, &sd->poll_list);
7403 if (!list_empty(&sd->poll_list))
7404 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
7405 else
7406 sd->in_net_rx_action = false;
7407
7408 net_rps_action_and_irq_enable(sd);
7409end:
7410 bpf_net_ctx_clear(bpf_net_ctx);
7411}
7412
7413struct netdev_adjacent {
7414 struct net_device *dev;
7415 netdevice_tracker dev_tracker;
7416
7417 /* upper master flag, there can only be one master device per list */
7418 bool master;
7419
7420 /* lookup ignore flag */
7421 bool ignore;
7422
7423 /* counter for the number of times this device was added to us */
7424 u16 ref_nr;
7425
7426 /* private field for the users */
7427 void *private;
7428
7429 struct list_head list;
7430 struct rcu_head rcu;
7431};
7432
7433static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
7434 struct list_head *adj_list)
7435{
7436 struct netdev_adjacent *adj;
7437
7438 list_for_each_entry(adj, adj_list, list) {
7439 if (adj->dev == adj_dev)
7440 return adj;
7441 }
7442 return NULL;
7443}
7444
7445static int ____netdev_has_upper_dev(struct net_device *upper_dev,
7446 struct netdev_nested_priv *priv)
7447{
7448 struct net_device *dev = (struct net_device *)priv->data;
7449
7450 return upper_dev == dev;
7451}
7452
7453/**
7454 * netdev_has_upper_dev - Check if device is linked to an upper device
7455 * @dev: device
7456 * @upper_dev: upper device to check
7457 *
7458 * Find out if a device is linked to the specified upper device and return true
7459 * in case it is. Note that this checks only the immediate upper device,
7460 * not through a complete stack of devices. The caller must hold the RTNL lock.
7461 */
7462bool netdev_has_upper_dev(struct net_device *dev,
7463 struct net_device *upper_dev)
7464{
7465 struct netdev_nested_priv priv = {
7466 .data = (void *)upper_dev,
7467 };
7468
7469 ASSERT_RTNL();
7470
7471 return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
7472 &priv);
7473}
7474EXPORT_SYMBOL(netdev_has_upper_dev);
7475
7476/**
7477 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
7478 * @dev: device
7479 * @upper_dev: upper device to check
7480 *
7481 * Find out if a device is linked to the specified upper device and return true
7482 * in case it is. Note that this checks the entire upper device chain.
7483 * The caller must hold the RCU read lock.
7484 */
7485
7486bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
7487 struct net_device *upper_dev)
7488{
7489 struct netdev_nested_priv priv = {
7490 .data = (void *)upper_dev,
7491 };
7492
7493 return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
7494 &priv);
7495}
7496EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
7497
7498/**
7499 * netdev_has_any_upper_dev - Check if device is linked to some device
7500 * @dev: device
7501 *
7502 * Find out if a device is linked to an upper device and return true in case
7503 * it is. The caller must hold the RTNL lock.
7504 */
7505bool netdev_has_any_upper_dev(struct net_device *dev)
7506{
7507 ASSERT_RTNL();
7508
7509 return !list_empty(&dev->adj_list.upper);
7510}
7511EXPORT_SYMBOL(netdev_has_any_upper_dev);
7512
7513/**
7514 * netdev_master_upper_dev_get - Get master upper device
7515 * @dev: device
7516 *
7517 * Find a master upper device and return a pointer to it, or NULL in case
7518 * it's not there. The caller must hold the RTNL lock.
7519 */
7520struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
7521{
7522 struct netdev_adjacent *upper;
7523
7524 ASSERT_RTNL();
7525
7526 if (list_empty(&dev->adj_list.upper))
7527 return NULL;
7528
7529 upper = list_first_entry(&dev->adj_list.upper,
7530 struct netdev_adjacent, list);
7531 if (likely(upper->master))
7532 return upper->dev;
7533 return NULL;
7534}
7535EXPORT_SYMBOL(netdev_master_upper_dev_get);
7536
7537static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
7538{
7539 struct netdev_adjacent *upper;
7540
7541 ASSERT_RTNL();
7542
7543 if (list_empty(&dev->adj_list.upper))
7544 return NULL;
7545
7546 upper = list_first_entry(&dev->adj_list.upper,
7547 struct netdev_adjacent, list);
7548 if (likely(upper->master) && !upper->ignore)
7549 return upper->dev;
7550 return NULL;
7551}
7552
7553/**
7554 * netdev_has_any_lower_dev - Check if device is linked to some device
7555 * @dev: device
7556 *
7557 * Find out if a device is linked to a lower device and return true in case
7558 * it is. The caller must hold the RTNL lock.
7559 */
7560static bool netdev_has_any_lower_dev(struct net_device *dev)
7561{
7562 ASSERT_RTNL();
7563
7564 return !list_empty(&dev->adj_list.lower);
7565}
7566
7567void *netdev_adjacent_get_private(struct list_head *adj_list)
7568{
7569 struct netdev_adjacent *adj;
7570
7571 adj = list_entry(adj_list, struct netdev_adjacent, list);
7572
7573 return adj->private;
7574}
7575EXPORT_SYMBOL(netdev_adjacent_get_private);
7576
7577/**
7578 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
7579 * @dev: device
7580 * @iter: list_head ** of the current position
7581 *
7582 * Gets the next device from the dev's upper list, starting from iter
7583 * position. The caller must hold RCU read lock.
7584 */
7585struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
7586 struct list_head **iter)
7587{
7588 struct netdev_adjacent *upper;
7589
7590 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
7591
7592 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7593
7594 if (&upper->list == &dev->adj_list.upper)
7595 return NULL;
7596
7597 *iter = &upper->list;
7598
7599 return upper->dev;
7600}
7601EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
7602
7603static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
7604 struct list_head **iter,
7605 bool *ignore)
7606{
7607 struct netdev_adjacent *upper;
7608
7609 upper = list_entry((*iter)->next, struct netdev_adjacent, list);
7610
7611 if (&upper->list == &dev->adj_list.upper)
7612 return NULL;
7613
7614 *iter = &upper->list;
7615 *ignore = upper->ignore;
7616
7617 return upper->dev;
7618}
7619
7620static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
7621 struct list_head **iter)
7622{
7623 struct netdev_adjacent *upper;
7624
7625 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
7626
7627 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7628
7629 if (&upper->list == &dev->adj_list.upper)
7630 return NULL;
7631
7632 *iter = &upper->list;
7633
7634 return upper->dev;
7635}
7636
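/* Iterative depth-first walk over the whole upper-device graph, using an
 * explicit stack bounded by MAX_NEST_DEV and skipping adjacencies marked
 * 'ignore'; fn() is invoked for every device reached except @dev itself.
 */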
7637static int __netdev_walk_all_upper_dev(struct net_device *dev,
7638 int (*fn)(struct net_device *dev,
7639 struct netdev_nested_priv *priv),
7640 struct netdev_nested_priv *priv)
7641{
7642 struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7643 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7644 int ret, cur = 0;
7645 bool ignore;
7646
7647 now = dev;
7648 iter = &dev->adj_list.upper;
7649
7650 while (1) {
7651 if (now != dev) {
7652 ret = fn(now, priv);
7653 if (ret)
7654 return ret;
7655 }
7656
7657 next = NULL;
7658 while (1) {
7659 udev = __netdev_next_upper_dev(now, &iter, &ignore);
7660 if (!udev)
7661 break;
7662 if (ignore)
7663 continue;
7664
7665 next = udev;
7666 niter = &udev->adj_list.upper;
7667 dev_stack[cur] = now;
7668 iter_stack[cur++] = iter;
7669 break;
7670 }
7671
7672 if (!next) {
7673 if (!cur)
7674 return 0;
7675 next = dev_stack[--cur];
7676 niter = iter_stack[cur];
7677 }
7678
7679 now = next;
7680 iter = niter;
7681 }
7682
7683 return 0;
7684}
7685
7686int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
7687 int (*fn)(struct net_device *dev,
7688 struct netdev_nested_priv *priv),
7689 struct netdev_nested_priv *priv)
7690{
7691 struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7692 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7693 int ret, cur = 0;
7694
7695 now = dev;
7696 iter = &dev->adj_list.upper;
7697
7698 while (1) {
7699 if (now != dev) {
7700 ret = fn(now, priv);
7701 if (ret)
7702 return ret;
7703 }
7704
7705 next = NULL;
7706 while (1) {
7707 udev = netdev_next_upper_dev_rcu(now, &iter);
7708 if (!udev)
7709 break;
7710
7711 next = udev;
7712 niter = &udev->adj_list.upper;
7713 dev_stack[cur] = now;
7714 iter_stack[cur++] = iter;
7715 break;
7716 }
7717
7718 if (!next) {
7719 if (!cur)
7720 return 0;
7721 next = dev_stack[--cur];
7722 niter = iter_stack[cur];
7723 }
7724
7725 now = next;
7726 iter = niter;
7727 }
7728
7729 return 0;
7730}
7731EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
7732
7733static bool __netdev_has_upper_dev(struct net_device *dev,
7734 struct net_device *upper_dev)
7735{
7736 struct netdev_nested_priv priv = {
7737 .flags = 0,
7738 .data = (void *)upper_dev,
7739 };
7740
7741 ASSERT_RTNL();
7742
7743 return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
7744 &priv);
7745}
7746
7747/**
7748 * netdev_lower_get_next_private - Get the next ->private from the
7749 * lower neighbour list
7750 * @dev: device
7751 * @iter: list_head ** of the current position
7752 *
7753 * Gets the next netdev_adjacent->private from the dev's lower neighbour
7754 * list, starting from iter position. The caller must either hold the
7755 * RTNL lock or its own locking that guarantees that the neighbour lower
7756 * list will remain unchanged.
7757 */
7758void *netdev_lower_get_next_private(struct net_device *dev,
7759 struct list_head **iter)
7760{
7761 struct netdev_adjacent *lower;
7762
7763 lower = list_entry(*iter, struct netdev_adjacent, list);
7764
7765 if (&lower->list == &dev->adj_list.lower)
7766 return NULL;
7767
7768 *iter = lower->list.next;
7769
7770 return lower->private;
7771}
7772EXPORT_SYMBOL(netdev_lower_get_next_private);
7773
7774/**
7775 * netdev_lower_get_next_private_rcu - Get the next ->private from the
7776 * lower neighbour list, RCU
7777 * variant
7778 * @dev: device
7779 * @iter: list_head ** of the current position
7780 *
7781 * Gets the next netdev_adjacent->private from the dev's lower neighbour
7782 * list, starting from iter position. The caller must hold RCU read lock.
7783 */
7784void *netdev_lower_get_next_private_rcu(struct net_device *dev,
7785 struct list_head **iter)
7786{
7787 struct netdev_adjacent *lower;
7788
7789 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
7790
7791 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7792
7793 if (&lower->list == &dev->adj_list.lower)
7794 return NULL;
7795
7796 *iter = &lower->list;
7797
7798 return lower->private;
7799}
7800EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
7801
7802/**
7803 * netdev_lower_get_next - Get the next device from the lower neighbour
7804 * list
7805 * @dev: device
7806 * @iter: list_head ** of the current position
7807 *
7808 * Gets the next netdev_adjacent from the dev's lower neighbour
7809 * list, starting from iter position. The caller must hold RTNL lock or
7810 * its own locking that guarantees that the neighbour lower
7811 * list will remain unchanged.
7812 */
7813void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
7814{
7815 struct netdev_adjacent *lower;
7816
7817 lower = list_entry(*iter, struct netdev_adjacent, list);
7818
7819 if (&lower->list == &dev->adj_list.lower)
7820 return NULL;
7821
7822 *iter = lower->list.next;
7823
7824 return lower->dev;
7825}
7826EXPORT_SYMBOL(netdev_lower_get_next);
7827
7828static struct net_device *netdev_next_lower_dev(struct net_device *dev,
7829 struct list_head **iter)
7830{
7831 struct netdev_adjacent *lower;
7832
7833 lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7834
7835 if (&lower->list == &dev->adj_list.lower)
7836 return NULL;
7837
7838 *iter = &lower->list;
7839
7840 return lower->dev;
7841}
7842
7843static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
7844 struct list_head **iter,
7845 bool *ignore)
7846{
7847 struct netdev_adjacent *lower;
7848
7849 lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7850
7851 if (&lower->list == &dev->adj_list.lower)
7852 return NULL;
7853
7854 *iter = &lower->list;
7855 *ignore = lower->ignore;
7856
7857 return lower->dev;
7858}
7859
7860int netdev_walk_all_lower_dev(struct net_device *dev,
7861 int (*fn)(struct net_device *dev,
7862 struct netdev_nested_priv *priv),
7863 struct netdev_nested_priv *priv)
7864{
7865 struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7866 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7867 int ret, cur = 0;
7868
7869 now = dev;
7870 iter = &dev->adj_list.lower;
7871
7872 while (1) {
7873 if (now != dev) {
7874 ret = fn(now, priv);
7875 if (ret)
7876 return ret;
7877 }
7878
7879 next = NULL;
7880 while (1) {
7881 ldev = netdev_next_lower_dev(now, &iter);
7882 if (!ldev)
7883 break;
7884
7885 next = ldev;
7886 niter = &ldev->adj_list.lower;
7887 dev_stack[cur] = now;
7888 iter_stack[cur++] = iter;
7889 break;
7890 }
7891
7892 if (!next) {
7893 if (!cur)
7894 return 0;
7895 next = dev_stack[--cur];
7896 niter = iter_stack[cur];
7897 }
7898
7899 now = next;
7900 iter = niter;
7901 }
7902
7903 return 0;
7904}
7905EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
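
/*
 * Illustrative sketch (editor's addition): counting every device below a
 * given device in the stacking hierarchy with netdev_walk_all_lower_dev().
 * The callback and counter are hypothetical; only the API use is real.
 *
 *	static int example_count_one(struct net_device *ldev,
 *				     struct netdev_nested_priv *priv)
 *	{
 *		(*(unsigned int *)priv->data)++;
 *		return 0;
 *	}
 *
 *	unsigned int count = 0;
 *	struct netdev_nested_priv priv = { .data = &count };
 *
 *	ASSERT_RTNL();
 *	netdev_walk_all_lower_dev(dev, example_count_one, &priv);
 *
 * A non-zero return from the callback stops the walk and is propagated
 * back to the caller.
 */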
7906
7907static int __netdev_walk_all_lower_dev(struct net_device *dev,
7908 int (*fn)(struct net_device *dev,
7909 struct netdev_nested_priv *priv),
7910 struct netdev_nested_priv *priv)
7911{
7912 struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7913 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7914 int ret, cur = 0;
7915 bool ignore;
7916
7917 now = dev;
7918 iter = &dev->adj_list.lower;
7919
7920 while (1) {
7921 if (now != dev) {
7922 ret = fn(now, priv);
7923 if (ret)
7924 return ret;
7925 }
7926
7927 next = NULL;
7928 while (1) {
7929 ldev = __netdev_next_lower_dev(now, &iter, &ignore);
7930 if (!ldev)
7931 break;
7932 if (ignore)
7933 continue;
7934
7935 next = ldev;
7936 niter = &ldev->adj_list.lower;
7937 dev_stack[cur] = now;
7938 iter_stack[cur++] = iter;
7939 break;
7940 }
7941
7942 if (!next) {
7943 if (!cur)
7944 return 0;
7945 next = dev_stack[--cur];
7946 niter = iter_stack[cur];
7947 }
7948
7949 now = next;
7950 iter = niter;
7951 }
7952
7953 return 0;
7954}
7955
7956struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
7957 struct list_head **iter)
7958{
7959 struct netdev_adjacent *lower;
7960
7961 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7962 if (&lower->list == &dev->adj_list.lower)
7963 return NULL;
7964
7965 *iter = &lower->list;
7966
7967 return lower->dev;
7968}
7969EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
7970
7971static u8 __netdev_upper_depth(struct net_device *dev)
7972{
7973 struct net_device *udev;
7974 struct list_head *iter;
7975 u8 max_depth = 0;
7976 bool ignore;
7977
7978 for (iter = &dev->adj_list.upper,
7979 udev = __netdev_next_upper_dev(dev, &iter, &ignore);
7980 udev;
7981 udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
7982 if (ignore)
7983 continue;
7984 if (max_depth < udev->upper_level)
7985 max_depth = udev->upper_level;
7986 }
7987
7988 return max_depth;
7989}
7990
7991static u8 __netdev_lower_depth(struct net_device *dev)
7992{
7993 struct net_device *ldev;
7994 struct list_head *iter;
7995 u8 max_depth = 0;
7996 bool ignore;
7997
7998 for (iter = &dev->adj_list.lower,
7999 ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
8000 ldev;
8001 ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
8002 if (ignore)
8003 continue;
8004 if (max_depth < ldev->lower_level)
8005 max_depth = ldev->lower_level;
8006 }
8007
8008 return max_depth;
8009}
8010
8011static int __netdev_update_upper_level(struct net_device *dev,
8012 struct netdev_nested_priv *__unused)
8013{
8014 dev->upper_level = __netdev_upper_depth(dev) + 1;
8015 return 0;
8016}
8017
8018#ifdef CONFIG_LOCKDEP
8019static LIST_HEAD(net_unlink_list);
8020
8021static void net_unlink_todo(struct net_device *dev)
8022{
8023 if (list_empty(&dev->unlink_list))
8024 list_add_tail(&dev->unlink_list, &net_unlink_list);
8025}
8026#endif
8027
8028static int __netdev_update_lower_level(struct net_device *dev,
8029 struct netdev_nested_priv *priv)
8030{
8031 dev->lower_level = __netdev_lower_depth(dev) + 1;
8032
8033#ifdef CONFIG_LOCKDEP
8034 if (!priv)
8035 return 0;
8036
8037 if (priv->flags & NESTED_SYNC_IMM)
8038 dev->nested_level = dev->lower_level - 1;
8039 if (priv->flags & NESTED_SYNC_TODO)
8040 net_unlink_todo(dev);
8041#endif
8042 return 0;
8043}
8044
8045int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
8046 int (*fn)(struct net_device *dev,
8047 struct netdev_nested_priv *priv),
8048 struct netdev_nested_priv *priv)
8049{
8050 struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
8051 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
8052 int ret, cur = 0;
8053
8054 now = dev;
8055 iter = &dev->adj_list.lower;
8056
8057 while (1) {
8058 if (now != dev) {
8059 ret = fn(now, priv);
8060 if (ret)
8061 return ret;
8062 }
8063
8064 next = NULL;
8065 while (1) {
8066 ldev = netdev_next_lower_dev_rcu(now, &iter);
8067 if (!ldev)
8068 break;
8069
8070 next = ldev;
8071 niter = &ldev->adj_list.lower;
8072 dev_stack[cur] = now;
8073 iter_stack[cur++] = iter;
8074 break;
8075 }
8076
8077 if (!next) {
8078 if (!cur)
8079 return 0;
8080 next = dev_stack[--cur];
8081 niter = iter_stack[cur];
8082 }
8083
8084 now = next;
8085 iter = niter;
8086 }
8087
8088 return 0;
8089}
8090EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
8091
8092/**
8093 * netdev_lower_get_first_private_rcu - Get the first ->private from the
8094 * lower neighbour list, RCU
8095 * variant
8096 * @dev: device
8097 *
8098 * Gets the first netdev_adjacent->private from the dev's lower neighbour
8099 * list. The caller must hold RCU read lock.
8100 */
8101void *netdev_lower_get_first_private_rcu(struct net_device *dev)
8102{
8103 struct netdev_adjacent *lower;
8104
8105 lower = list_first_or_null_rcu(&dev->adj_list.lower,
8106 struct netdev_adjacent, list);
8107 if (lower)
8108 return lower->private;
8109 return NULL;
8110}
8111EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
8112
8113/**
8114 * netdev_master_upper_dev_get_rcu - Get master upper device
8115 * @dev: device
8116 *
8117 * Find a master upper device and return pointer to it or NULL in case
8118 * it's not there. The caller must hold the RCU read lock.
8119 */
8120struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
8121{
8122 struct netdev_adjacent *upper;
8123
8124 upper = list_first_or_null_rcu(&dev->adj_list.upper,
8125 struct netdev_adjacent, list);
8126 if (upper && likely(upper->master))
8127 return upper->dev;
8128 return NULL;
8129}
8130EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
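
/*
 * Illustrative sketch (editor's addition): looking up the master (e.g. a
 * bond or bridge) of a slave device from a context that only holds the
 * RCU read lock. "slave_dev" is hypothetical.
 *
 *	struct net_device *master;
 *
 *	rcu_read_lock();
 *	master = netdev_master_upper_dev_get_rcu(slave_dev);
 *	if (master)
 *		netdev_info(slave_dev, "enslaved to %s\n", master->name);
 *	rcu_read_unlock();
 *
 * The returned pointer is only guaranteed to stay valid inside the RCU
 * read-side critical section unless the caller takes its own reference.
 */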
8131
8132static int netdev_adjacent_sysfs_add(struct net_device *dev,
8133 struct net_device *adj_dev,
8134 struct list_head *dev_list)
8135{
8136 char linkname[IFNAMSIZ+7];
8137
8138 sprintf(linkname, dev_list == &dev->adj_list.upper ?
8139 "upper_%s" : "lower_%s", adj_dev->name);
8140 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
8141 linkname);
8142}
8143static void netdev_adjacent_sysfs_del(struct net_device *dev,
8144 char *name,
8145 struct list_head *dev_list)
8146{
8147 char linkname[IFNAMSIZ+7];
8148
8149 sprintf(linkname, dev_list == &dev->adj_list.upper ?
8150 "upper_%s" : "lower_%s", name);
8151 sysfs_remove_link(&(dev->dev.kobj), linkname);
8152}
8153
8154static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
8155 struct net_device *adj_dev,
8156 struct list_head *dev_list)
8157{
8158 return (dev_list == &dev->adj_list.upper ||
8159 dev_list == &dev->adj_list.lower) &&
8160 net_eq(dev_net(dev), dev_net(adj_dev));
8161}
8162
8163static int __netdev_adjacent_dev_insert(struct net_device *dev,
8164 struct net_device *adj_dev,
8165 struct list_head *dev_list,
8166 void *private, bool master)
8167{
8168 struct netdev_adjacent *adj;
8169 int ret;
8170
8171 adj = __netdev_find_adj(adj_dev, dev_list);
8172
8173 if (adj) {
8174 adj->ref_nr += 1;
8175 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
8176 dev->name, adj_dev->name, adj->ref_nr);
8177
8178 return 0;
8179 }
8180
8181 adj = kmalloc(sizeof(*adj), GFP_KERNEL);
8182 if (!adj)
8183 return -ENOMEM;
8184
8185 adj->dev = adj_dev;
8186 adj->master = master;
8187 adj->ref_nr = 1;
8188 adj->private = private;
8189 adj->ignore = false;
8190 netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL);
8191
8192 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
8193 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
8194
8195 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
8196 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
8197 if (ret)
8198 goto free_adj;
8199 }
8200
8201 /* Ensure that master link is always the first item in list. */
8202 if (master) {
8203 ret = sysfs_create_link(&(dev->dev.kobj),
8204 &(adj_dev->dev.kobj), "master");
8205 if (ret)
8206 goto remove_symlinks;
8207
8208 list_add_rcu(&adj->list, dev_list);
8209 } else {
8210 list_add_tail_rcu(&adj->list, dev_list);
8211 }
8212
8213 return 0;
8214
8215remove_symlinks:
8216 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
8217 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
8218free_adj:
8219 netdev_put(adj_dev, &adj->dev_tracker);
8220 kfree(adj);
8221
8222 return ret;
8223}
8224
8225static void __netdev_adjacent_dev_remove(struct net_device *dev,
8226 struct net_device *adj_dev,
8227 u16 ref_nr,
8228 struct list_head *dev_list)
8229{
8230 struct netdev_adjacent *adj;
8231
8232 pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
8233 dev->name, adj_dev->name, ref_nr);
8234
8235 adj = __netdev_find_adj(adj_dev, dev_list);
8236
8237 if (!adj) {
8238 pr_err("Adjacency does not exist for device %s from %s\n",
8239 dev->name, adj_dev->name);
8240 WARN_ON(1);
8241 return;
8242 }
8243
8244 if (adj->ref_nr > ref_nr) {
8245 pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
8246 dev->name, adj_dev->name, ref_nr,
8247 adj->ref_nr - ref_nr);
8248 adj->ref_nr -= ref_nr;
8249 return;
8250 }
8251
8252 if (adj->master)
8253 sysfs_remove_link(&(dev->dev.kobj), "master");
8254
8255 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
8256 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
8257
8258 list_del_rcu(&adj->list);
8259 pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
8260 adj_dev->name, dev->name, adj_dev->name);
8261 netdev_put(adj_dev, &adj->dev_tracker);
8262 kfree_rcu(adj, rcu);
8263}
8264
8265static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
8266 struct net_device *upper_dev,
8267 struct list_head *up_list,
8268 struct list_head *down_list,
8269 void *private, bool master)
8270{
8271 int ret;
8272
8273 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
8274 private, master);
8275 if (ret)
8276 return ret;
8277
8278 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
8279 private, false);
8280 if (ret) {
8281 __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
8282 return ret;
8283 }
8284
8285 return 0;
8286}
8287
8288static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
8289 struct net_device *upper_dev,
8290 u16 ref_nr,
8291 struct list_head *up_list,
8292 struct list_head *down_list)
8293{
8294 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
8295 __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
8296}
8297
8298static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
8299 struct net_device *upper_dev,
8300 void *private, bool master)
8301{
8302 return __netdev_adjacent_dev_link_lists(dev, upper_dev,
8303 &dev->adj_list.upper,
8304 &upper_dev->adj_list.lower,
8305 private, master);
8306}
8307
8308static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
8309 struct net_device *upper_dev)
8310{
8311 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
8312 &dev->adj_list.upper,
8313 &upper_dev->adj_list.lower);
8314}
8315
8316static int __netdev_upper_dev_link(struct net_device *dev,
8317 struct net_device *upper_dev, bool master,
8318 void *upper_priv, void *upper_info,
8319 struct netdev_nested_priv *priv,
8320 struct netlink_ext_ack *extack)
8321{
8322 struct netdev_notifier_changeupper_info changeupper_info = {
8323 .info = {
8324 .dev = dev,
8325 .extack = extack,
8326 },
8327 .upper_dev = upper_dev,
8328 .master = master,
8329 .linking = true,
8330 .upper_info = upper_info,
8331 };
8332 struct net_device *master_dev;
8333 int ret = 0;
8334
8335 ASSERT_RTNL();
8336
8337 if (dev == upper_dev)
8338 return -EBUSY;
8339
8340 /* To prevent loops, check that dev is not an upper device of upper_dev. */
8341 if (__netdev_has_upper_dev(upper_dev, dev))
8342 return -EBUSY;
8343
8344 if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
8345 return -EMLINK;
8346
8347 if (!master) {
8348 if (__netdev_has_upper_dev(dev, upper_dev))
8349 return -EEXIST;
8350 } else {
8351 master_dev = __netdev_master_upper_dev_get(dev);
8352 if (master_dev)
8353 return master_dev == upper_dev ? -EEXIST : -EBUSY;
8354 }
8355
8356 ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
8357 &changeupper_info.info);
8358 ret = notifier_to_errno(ret);
8359 if (ret)
8360 return ret;
8361
8362 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
8363 master);
8364 if (ret)
8365 return ret;
8366
8367 ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
8368 &changeupper_info.info);
8369 ret = notifier_to_errno(ret);
8370 if (ret)
8371 goto rollback;
8372
8373 __netdev_update_upper_level(dev, NULL);
8374 __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
8375
8376 __netdev_update_lower_level(upper_dev, priv);
8377 __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
8378 priv);
8379
8380 return 0;
8381
8382rollback:
8383 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
8384
8385 return ret;
8386}
8387
8388/**
8389 * netdev_upper_dev_link - Add a link to the upper device
8390 * @dev: device
8391 * @upper_dev: new upper device
8392 * @extack: netlink extended ack
8393 *
8394 * Adds a link to a device which is upper to this one. The caller must hold
8395 * the RTNL lock. On a failure a negative errno code is returned.
8396 * On success the reference counts are adjusted and the function
8397 * returns zero.
8398 */
8399int netdev_upper_dev_link(struct net_device *dev,
8400 struct net_device *upper_dev,
8401 struct netlink_ext_ack *extack)
8402{
8403 struct netdev_nested_priv priv = {
8404 .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
8405 .data = NULL,
8406 };
8407
8408 return __netdev_upper_dev_link(dev, upper_dev, false,
8409 NULL, NULL, &priv, extack);
8410}
8411EXPORT_SYMBOL(netdev_upper_dev_link);
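
/*
 * Illustrative sketch (editor's addition): a hypothetical stacking driver
 * linking its lower device under the virtual device it is creating, and
 * undoing the link on teardown. All of the names are made up.
 *
 *	static int example_attach(struct net_device *virt_dev,
 *				  struct net_device *lower_dev,
 *				  struct netlink_ext_ack *extack)
 *	{
 *		ASSERT_RTNL();
 *		return netdev_upper_dev_link(lower_dev, virt_dev, extack);
 *	}
 *
 * The matching teardown path calls
 * netdev_upper_dev_unlink(lower_dev, virt_dev).
 */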
8412
8413/**
8414 * netdev_master_upper_dev_link - Add a master link to the upper device
8415 * @dev: device
8416 * @upper_dev: new upper device
8417 * @upper_priv: upper device private
8418 * @upper_info: upper info to be passed down via notifier
8419 * @extack: netlink extended ack
8420 *
8421 * Adds a link to a device which is upper to this one. In this case, only
8422 * one master upper device can be linked, although other non-master devices
8423 * might be linked as well. The caller must hold the RTNL lock.
8424 * On a failure a negative errno code is returned. On success the reference
8425 * counts are adjusted and the function returns zero.
8426 */
8427int netdev_master_upper_dev_link(struct net_device *dev,
8428 struct net_device *upper_dev,
8429 void *upper_priv, void *upper_info,
8430 struct netlink_ext_ack *extack)
8431{
8432 struct netdev_nested_priv priv = {
8433 .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
8434 .data = NULL,
8435 };
8436
8437 return __netdev_upper_dev_link(dev, upper_dev, true,
8438 upper_priv, upper_info, &priv, extack);
8439}
8440EXPORT_SYMBOL(netdev_master_upper_dev_link);
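
/*
 * Illustrative sketch (editor's addition): how a bonding/team-style driver
 * might establish the master link when enslaving a port. The @upper_priv
 * pointer passed here is what the lower-list iterators above later hand
 * back via ->private. "bond_dev", "port_dev" and "struct example_port"
 * are hypothetical.
 *
 *	struct example_port *port = kzalloc(sizeof(*port), GFP_KERNEL);
 *	int err;
 *
 *	if (!port)
 *		return -ENOMEM;
 *	err = netdev_master_upper_dev_link(port_dev, bond_dev,
 *					   port, NULL, extack);
 *	if (err) {
 *		kfree(port);
 *		return err;
 *	}
 */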
8441
8442static void __netdev_upper_dev_unlink(struct net_device *dev,
8443 struct net_device *upper_dev,
8444 struct netdev_nested_priv *priv)
8445{
8446 struct netdev_notifier_changeupper_info changeupper_info = {
8447 .info = {
8448 .dev = dev,
8449 },
8450 .upper_dev = upper_dev,
8451 .linking = false,
8452 };
8453
8454 ASSERT_RTNL();
8455
8456 changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
8457
8458 call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
8459 &changeupper_info.info);
8460
8461 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
8462
8463 call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
8464 &changeupper_info.info);
8465
8466 __netdev_update_upper_level(dev, NULL);
8467 __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
8468
8469 __netdev_update_lower_level(upper_dev, priv);
8470 __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
8471 priv);
8472}
8473
8474/**
8475 * netdev_upper_dev_unlink - Removes a link to upper device
8476 * @dev: device
8477 * @upper_dev: upper device to unlink
8478 *
8479 * Removes a link to a device which is upper to this one. The caller must hold
8480 * the RTNL lock.
8481 */
8482void netdev_upper_dev_unlink(struct net_device *dev,
8483 struct net_device *upper_dev)
8484{
8485 struct netdev_nested_priv priv = {
8486 .flags = NESTED_SYNC_TODO,
8487 .data = NULL,
8488 };
8489
8490 __netdev_upper_dev_unlink(dev, upper_dev, &priv);
8491}
8492EXPORT_SYMBOL(netdev_upper_dev_unlink);
8493
8494static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
8495 struct net_device *lower_dev,
8496 bool val)
8497{
8498 struct netdev_adjacent *adj;
8499
8500 adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
8501 if (adj)
8502 adj->ignore = val;
8503
8504 adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
8505 if (adj)
8506 adj->ignore = val;
8507}
8508
8509static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
8510 struct net_device *lower_dev)
8511{
8512 __netdev_adjacent_dev_set(upper_dev, lower_dev, true);
8513}
8514
8515static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
8516 struct net_device *lower_dev)
8517{
8518 __netdev_adjacent_dev_set(upper_dev, lower_dev, false);
8519}
8520
8521int netdev_adjacent_change_prepare(struct net_device *old_dev,
8522 struct net_device *new_dev,
8523 struct net_device *dev,
8524 struct netlink_ext_ack *extack)
8525{
8526 struct netdev_nested_priv priv = {
8527 .flags = 0,
8528 .data = NULL,
8529 };
8530 int err;
8531
8532 if (!new_dev)
8533 return 0;
8534
8535 if (old_dev && new_dev != old_dev)
8536 netdev_adjacent_dev_disable(dev, old_dev);
8537 err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
8538 extack);
8539 if (err) {
8540 if (old_dev && new_dev != old_dev)
8541 netdev_adjacent_dev_enable(dev, old_dev);
8542 return err;
8543 }
8544
8545 return 0;
8546}
8547EXPORT_SYMBOL(netdev_adjacent_change_prepare);
8548
8549void netdev_adjacent_change_commit(struct net_device *old_dev,
8550 struct net_device *new_dev,
8551 struct net_device *dev)
8552{
8553 struct netdev_nested_priv priv = {
8554 .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
8555 .data = NULL,
8556 };
8557
8558 if (!new_dev || !old_dev)
8559 return;
8560
8561 if (new_dev == old_dev)
8562 return;
8563
8564 netdev_adjacent_dev_enable(dev, old_dev);
8565 __netdev_upper_dev_unlink(old_dev, dev, &priv);
8566}
8567EXPORT_SYMBOL(netdev_adjacent_change_commit);
8568
8569void netdev_adjacent_change_abort(struct net_device *old_dev,
8570 struct net_device *new_dev,
8571 struct net_device *dev)
8572{
8573 struct netdev_nested_priv priv = {
8574 .flags = 0,
8575 .data = NULL,
8576 };
8577
8578 if (!new_dev)
8579 return;
8580
8581 if (old_dev && new_dev != old_dev)
8582 netdev_adjacent_dev_enable(dev, old_dev);
8583
8584 __netdev_upper_dev_unlink(new_dev, dev, &priv);
8585}
8586EXPORT_SYMBOL(netdev_adjacent_change_abort);
8587
8588/**
8589 * netdev_bonding_info_change - Dispatch event about slave change
8590 * @dev: device
8591 * @bonding_info: info to dispatch
8592 *
8593 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
8594 * The caller must hold the RTNL lock.
8595 */
8596void netdev_bonding_info_change(struct net_device *dev,
8597 struct netdev_bonding_info *bonding_info)
8598{
8599 struct netdev_notifier_bonding_info info = {
8600 .info.dev = dev,
8601 };
8602
8603 memcpy(&info.bonding_info, bonding_info,
8604 sizeof(struct netdev_bonding_info));
8605 call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
8606 &info.info);
8607}
8608EXPORT_SYMBOL(netdev_bonding_info_change);
8609
8610static int netdev_offload_xstats_enable_l3(struct net_device *dev,
8611 struct netlink_ext_ack *extack)
8612{
8613 struct netdev_notifier_offload_xstats_info info = {
8614 .info.dev = dev,
8615 .info.extack = extack,
8616 .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
8617 };
8618 int err;
8619 int rc;
8620
8621 dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3),
8622 GFP_KERNEL);
8623 if (!dev->offload_xstats_l3)
8624 return -ENOMEM;
8625
8626 rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE,
8627 NETDEV_OFFLOAD_XSTATS_DISABLE,
8628 &info.info);
8629 err = notifier_to_errno(rc);
8630 if (err)
8631 goto free_stats;
8632
8633 return 0;
8634
8635free_stats:
8636 kfree(dev->offload_xstats_l3);
8637 dev->offload_xstats_l3 = NULL;
8638 return err;
8639}
8640
8641int netdev_offload_xstats_enable(struct net_device *dev,
8642 enum netdev_offload_xstats_type type,
8643 struct netlink_ext_ack *extack)
8644{
8645 ASSERT_RTNL();
8646
8647 if (netdev_offload_xstats_enabled(dev, type))
8648 return -EALREADY;
8649
8650 switch (type) {
8651 case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
8652 return netdev_offload_xstats_enable_l3(dev, extack);
8653 }
8654
8655 WARN_ON(1);
8656 return -EINVAL;
8657}
8658EXPORT_SYMBOL(netdev_offload_xstats_enable);
8659
8660static void netdev_offload_xstats_disable_l3(struct net_device *dev)
8661{
8662 struct netdev_notifier_offload_xstats_info info = {
8663 .info.dev = dev,
8664 .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
8665 };
8666
8667 call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE,
8668 &info.info);
8669 kfree(dev->offload_xstats_l3);
8670 dev->offload_xstats_l3 = NULL;
8671}
8672
8673int netdev_offload_xstats_disable(struct net_device *dev,
8674 enum netdev_offload_xstats_type type)
8675{
8676 ASSERT_RTNL();
8677
8678 if (!netdev_offload_xstats_enabled(dev, type))
8679 return -EALREADY;
8680
8681 switch (type) {
8682 case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
8683 netdev_offload_xstats_disable_l3(dev);
8684 return 0;
8685 }
8686
8687 WARN_ON(1);
8688 return -EINVAL;
8689}
8690EXPORT_SYMBOL(netdev_offload_xstats_disable);
8691
8692static void netdev_offload_xstats_disable_all(struct net_device *dev)
8693{
8694 netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3);
8695}
8696
8697static struct rtnl_hw_stats64 *
8698netdev_offload_xstats_get_ptr(const struct net_device *dev,
8699 enum netdev_offload_xstats_type type)
8700{
8701 switch (type) {
8702 case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
8703 return dev->offload_xstats_l3;
8704 }
8705
8706 WARN_ON(1);
8707 return NULL;
8708}
8709
8710bool netdev_offload_xstats_enabled(const struct net_device *dev,
8711 enum netdev_offload_xstats_type type)
8712{
8713 ASSERT_RTNL();
8714
8715 return netdev_offload_xstats_get_ptr(dev, type);
8716}
8717EXPORT_SYMBOL(netdev_offload_xstats_enabled);
8718
8719struct netdev_notifier_offload_xstats_ru {
8720 bool used;
8721};
8722
8723struct netdev_notifier_offload_xstats_rd {
8724 struct rtnl_hw_stats64 stats;
8725 bool used;
8726};
8727
8728static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest,
8729 const struct rtnl_hw_stats64 *src)
8730{
8731 dest->rx_packets += src->rx_packets;
8732 dest->tx_packets += src->tx_packets;
8733 dest->rx_bytes += src->rx_bytes;
8734 dest->tx_bytes += src->tx_bytes;
8735 dest->rx_errors += src->rx_errors;
8736 dest->tx_errors += src->tx_errors;
8737 dest->rx_dropped += src->rx_dropped;
8738 dest->tx_dropped += src->tx_dropped;
8739 dest->multicast += src->multicast;
8740}
8741
8742static int netdev_offload_xstats_get_used(struct net_device *dev,
8743 enum netdev_offload_xstats_type type,
8744 bool *p_used,
8745 struct netlink_ext_ack *extack)
8746{
8747 struct netdev_notifier_offload_xstats_ru report_used = {};
8748 struct netdev_notifier_offload_xstats_info info = {
8749 .info.dev = dev,
8750 .info.extack = extack,
8751 .type = type,
8752 .report_used = &report_used,
8753 };
8754 int rc;
8755
8756 WARN_ON(!netdev_offload_xstats_enabled(dev, type));
8757 rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED,
8758 &info.info);
8759 *p_used = report_used.used;
8760 return notifier_to_errno(rc);
8761}
8762
8763static int netdev_offload_xstats_get_stats(struct net_device *dev,
8764 enum netdev_offload_xstats_type type,
8765 struct rtnl_hw_stats64 *p_stats,
8766 bool *p_used,
8767 struct netlink_ext_ack *extack)
8768{
8769 struct netdev_notifier_offload_xstats_rd report_delta = {};
8770 struct netdev_notifier_offload_xstats_info info = {
8771 .info.dev = dev,
8772 .info.extack = extack,
8773 .type = type,
8774 .report_delta = &report_delta,
8775 };
8776 struct rtnl_hw_stats64 *stats;
8777 int rc;
8778
8779 stats = netdev_offload_xstats_get_ptr(dev, type);
8780 if (WARN_ON(!stats))
8781 return -EINVAL;
8782
8783 rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
8784 &info.info);
8785
8786 /* Cache whatever we got, even if there was an error, otherwise the
8787 * successful stats retrievals would get lost.
8788 */
8789 netdev_hw_stats64_add(stats, &report_delta.stats);
8790
8791 if (p_stats)
8792 *p_stats = *stats;
8793 *p_used = report_delta.used;
8794
8795 return notifier_to_errno(rc);
8796}
8797
8798int netdev_offload_xstats_get(struct net_device *dev,
8799 enum netdev_offload_xstats_type type,
8800 struct rtnl_hw_stats64 *p_stats, bool *p_used,
8801 struct netlink_ext_ack *extack)
8802{
8803 ASSERT_RTNL();
8804
8805 if (p_stats)
8806 return netdev_offload_xstats_get_stats(dev, type, p_stats,
8807 p_used, extack);
8808 else
8809 return netdev_offload_xstats_get_used(dev, type, p_used,
8810 extack);
8811}
8812EXPORT_SYMBOL(netdev_offload_xstats_get);
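
/*
 * Illustrative sketch (editor's addition): enabling hardware-offloaded L3
 * stats on a device and reading back the accumulated counters. Both calls
 * must run under the RTNL lock, as asserted above.
 *
 *	struct rtnl_hw_stats64 stats;
 *	bool used;
 *	int err;
 *
 *	err = netdev_offload_xstats_enable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3,
 *					   extack);
 *	if (err && err != -EALREADY)
 *		return err;
 *
 *	err = netdev_offload_xstats_get(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3,
 *					&stats, &used, extack);
 *	if (!err && used)
 *		netdev_info(dev, "hw rx_packets: %llu\n", stats.rx_packets);
 */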
8813
8814void
8815netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta,
8816 const struct rtnl_hw_stats64 *stats)
8817{
8818 report_delta->used = true;
8819 netdev_hw_stats64_add(&report_delta->stats, stats);
8820}
8821EXPORT_SYMBOL(netdev_offload_xstats_report_delta);
8822
8823void
8824netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used)
8825{
8826 report_used->used = true;
8827}
8828EXPORT_SYMBOL(netdev_offload_xstats_report_used);
8829
8830void netdev_offload_xstats_push_delta(struct net_device *dev,
8831 enum netdev_offload_xstats_type type,
8832 const struct rtnl_hw_stats64 *p_stats)
8833{
8834 struct rtnl_hw_stats64 *stats;
8835
8836 ASSERT_RTNL();
8837
8838 stats = netdev_offload_xstats_get_ptr(dev, type);
8839 if (WARN_ON(!stats))
8840 return;
8841
8842 netdev_hw_stats64_add(stats, p_stats);
8843}
8844EXPORT_SYMBOL(netdev_offload_xstats_push_delta);
8845
8846/**
8847 * netdev_get_xmit_slave - Get the xmit slave of master device
8848 * @dev: device
8849 * @skb: The packet
8850 * @all_slaves: assume all the slaves are active
8851 *
8852 * The reference counters are not incremented so the caller must be
8853 * careful with locks. The caller must hold the RCU read lock.
8854 * %NULL is returned if no slave is found.
8855 */
8856
8857struct net_device *netdev_get_xmit_slave(struct net_device *dev,
8858 struct sk_buff *skb,
8859 bool all_slaves)
8860{
8861 const struct net_device_ops *ops = dev->netdev_ops;
8862
8863 if (!ops->ndo_get_xmit_slave)
8864 return NULL;
8865 return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
8866}
8867EXPORT_SYMBOL(netdev_get_xmit_slave);
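
/*
 * Illustrative sketch (editor's addition): asking a LAG master which slave
 * it would transmit a given skb on, e.g. so an offloading driver can
 * program the corresponding egress port. Runs under the RCU read lock;
 * "bond_dev" and example_program_egress() are hypothetical.
 *
 *	struct net_device *slave;
 *
 *	rcu_read_lock();
 *	slave = netdev_get_xmit_slave(bond_dev, skb, false);
 *	if (slave)
 *		example_program_egress(slave);
 *	rcu_read_unlock();
 */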
8868
8869static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev,
8870 struct sock *sk)
8871{
8872 const struct net_device_ops *ops = dev->netdev_ops;
8873
8874 if (!ops->ndo_sk_get_lower_dev)
8875 return NULL;
8876 return ops->ndo_sk_get_lower_dev(dev, sk);
8877}
8878
8879/**
8880 * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket
8881 * @dev: device
8882 * @sk: the socket
8883 *
8884 * %NULL is returned if no lower device is found.
8885 */
8886
8887struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev,
8888 struct sock *sk)
8889{
8890 struct net_device *lower;
8891
8892 lower = netdev_sk_get_lower_dev(dev, sk);
8893 while (lower) {
8894 dev = lower;
8895 lower = netdev_sk_get_lower_dev(dev, sk);
8896 }
8897
8898 return dev;
8899}
8900EXPORT_SYMBOL(netdev_sk_get_lowest_dev);
8901
8902static void netdev_adjacent_add_links(struct net_device *dev)
8903{
8904 struct netdev_adjacent *iter;
8905
8906 struct net *net = dev_net(dev);
8907
8908 list_for_each_entry(iter, &dev->adj_list.upper, list) {
8909 if (!net_eq(net, dev_net(iter->dev)))
8910 continue;
8911 netdev_adjacent_sysfs_add(iter->dev, dev,
8912 &iter->dev->adj_list.lower);
8913 netdev_adjacent_sysfs_add(dev, iter->dev,
8914 &dev->adj_list.upper);
8915 }
8916
8917 list_for_each_entry(iter, &dev->adj_list.lower, list) {
8918 if (!net_eq(net, dev_net(iter->dev)))
8919 continue;
8920 netdev_adjacent_sysfs_add(iter->dev, dev,
8921 &iter->dev->adj_list.upper);
8922 netdev_adjacent_sysfs_add(dev, iter->dev,
8923 &dev->adj_list.lower);
8924 }
8925}
8926
8927static void netdev_adjacent_del_links(struct net_device *dev)
8928{
8929 struct netdev_adjacent *iter;
8930
8931 struct net *net = dev_net(dev);
8932
8933 list_for_each_entry(iter, &dev->adj_list.upper, list) {
8934 if (!net_eq(net, dev_net(iter->dev)))
8935 continue;
8936 netdev_adjacent_sysfs_del(iter->dev, dev->name,
8937 &iter->dev->adj_list.lower);
8938 netdev_adjacent_sysfs_del(dev, iter->dev->name,
8939 &dev->adj_list.upper);
8940 }
8941
8942 list_for_each_entry(iter, &dev->adj_list.lower, list) {
8943 if (!net_eq(net, dev_net(iter->dev)))
8944 continue;
8945 netdev_adjacent_sysfs_del(iter->dev, dev->name,
8946 &iter->dev->adj_list.upper);
8947 netdev_adjacent_sysfs_del(dev, iter->dev->name,
8948 &dev->adj_list.lower);
8949 }
8950}
8951
8952void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
8953{
8954 struct netdev_adjacent *iter;
8955
8956 struct net *net = dev_net(dev);
8957
8958 list_for_each_entry(iter, &dev->adj_list.upper, list) {
8959 if (!net_eq(net, dev_net(iter->dev)))
8960 continue;
8961 netdev_adjacent_sysfs_del(iter->dev, oldname,
8962 &iter->dev->adj_list.lower);
8963 netdev_adjacent_sysfs_add(iter->dev, dev,
8964 &iter->dev->adj_list.lower);
8965 }
8966
8967 list_for_each_entry(iter, &dev->adj_list.lower, list) {
8968 if (!net_eq(net, dev_net(iter->dev)))
8969 continue;
8970 netdev_adjacent_sysfs_del(iter->dev, oldname,
8971 &iter->dev->adj_list.upper);
8972 netdev_adjacent_sysfs_add(iter->dev, dev,
8973 &iter->dev->adj_list.upper);
8974 }
8975}
8976
8977void *netdev_lower_dev_get_private(struct net_device *dev,
8978 struct net_device *lower_dev)
8979{
8980 struct netdev_adjacent *lower;
8981
8982 if (!lower_dev)
8983 return NULL;
8984 lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
8985 if (!lower)
8986 return NULL;
8987
8988 return lower->private;
8989}
8990EXPORT_SYMBOL(netdev_lower_dev_get_private);
8991
8992
8993/**
8994 * netdev_lower_state_changed - Dispatch event about lower device state change
8995 * @lower_dev: device
8996 * @lower_state_info: state to dispatch
8997 *
8998 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
8999 * The caller must hold the RTNL lock.
9000 */
9001void netdev_lower_state_changed(struct net_device *lower_dev,
9002 void *lower_state_info)
9003{
9004 struct netdev_notifier_changelowerstate_info changelowerstate_info = {
9005 .info.dev = lower_dev,
9006 };
9007
9008 ASSERT_RTNL();
9009 changelowerstate_info.lower_state_info = lower_state_info;
9010 call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
9011 &changelowerstate_info.info);
9012}
9013EXPORT_SYMBOL(netdev_lower_state_changed);
9014
9015static void dev_change_rx_flags(struct net_device *dev, int flags)
9016{
9017 const struct net_device_ops *ops = dev->netdev_ops;
9018
9019 if (ops->ndo_change_rx_flags)
9020 ops->ndo_change_rx_flags(dev, flags);
9021}
9022
9023static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
9024{
9025 unsigned int old_flags = dev->flags;
9026 unsigned int promiscuity, flags;
9027 kuid_t uid;
9028 kgid_t gid;
9029
9030 ASSERT_RTNL();
9031
9032 promiscuity = dev->promiscuity + inc;
9033 if (promiscuity == 0) {
9034 /*
9035 * Avoid overflow.
9036 * If inc causes overflow, untouch promisc and return error.
9037 */
9038 if (unlikely(inc > 0)) {
9039 netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n");
9040 return -EOVERFLOW;
9041 }
9042 flags = old_flags & ~IFF_PROMISC;
9043 } else {
9044 flags = old_flags | IFF_PROMISC;
9045 }
9046 WRITE_ONCE(dev->promiscuity, promiscuity);
9047 if (flags != old_flags) {
9048 WRITE_ONCE(dev->flags, flags);
9049 netdev_info(dev, "%s promiscuous mode\n",
9050 dev->flags & IFF_PROMISC ? "entered" : "left");
9051 if (audit_enabled) {
9052 current_uid_gid(&uid, &gid);
9053 audit_log(audit_context(), GFP_ATOMIC,
9054 AUDIT_ANOM_PROMISCUOUS,
9055 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
9056 dev->name, (dev->flags & IFF_PROMISC),
9057 (old_flags & IFF_PROMISC),
9058 from_kuid(&init_user_ns, audit_get_loginuid(current)),
9059 from_kuid(&init_user_ns, uid),
9060 from_kgid(&init_user_ns, gid),
9061 audit_get_sessionid(current));
9062 }
9063
9064 dev_change_rx_flags(dev, IFF_PROMISC);
9065 }
9066 if (notify)
9067 __dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL);
9068 return 0;
9069}
9070
9071/**
9072 * dev_set_promiscuity - update promiscuity count on a device
9073 * @dev: device
9074 * @inc: modifier
9075 *
9076 * Add or remove promiscuity from a device. While the count in the device
9077 * remains above zero the interface remains promiscuous. Once it hits zero
9078 * the device reverts back to normal filtering operation. A negative inc
9079 * value is used to drop promiscuity on the device.
9080 * Return 0 if successful or a negative errno code on error.
9081 */
9082int dev_set_promiscuity(struct net_device *dev, int inc)
9083{
9084 unsigned int old_flags = dev->flags;
9085 int err;
9086
9087 err = __dev_set_promiscuity(dev, inc, true);
9088 if (err < 0)
9089 return err;
9090 if (dev->flags != old_flags)
9091 dev_set_rx_mode(dev);
9092 return err;
9093}
9094EXPORT_SYMBOL(dev_set_promiscuity);
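
/*
 * Illustrative sketch (editor's addition): a feature that needs to see all
 * frames takes a promiscuity reference while it is active and drops it on
 * teardown, rather than toggling IFF_PROMISC directly. The count-based API
 * lets several such users coexist on one device.
 *
 *	ASSERT_RTNL();
 *	err = dev_set_promiscuity(dev, 1);
 *	if (err)
 *		return err;
 *
 * and later, on teardown:
 *
 *	dev_set_promiscuity(dev, -1);
 */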
9095
9096static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
9097{
9098 unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
9099 unsigned int allmulti, flags;
9100
9101 ASSERT_RTNL();
9102
9103 allmulti = dev->allmulti + inc;
9104 if (allmulti == 0) {
9105 /*
9106 * Avoid overflow.
9107 * If inc causes overflow, untouch allmulti and return error.
9108 */
9109 if (unlikely(inc > 0)) {
9110 netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n");
9111 return -EOVERFLOW;
9112 }
9113 flags = old_flags & ~IFF_ALLMULTI;
9114 } else {
9115 flags = old_flags | IFF_ALLMULTI;
9116 }
9117 WRITE_ONCE(dev->allmulti, allmulti);
9118 if (flags != old_flags) {
9119 WRITE_ONCE(dev->flags, flags);
9120 netdev_info(dev, "%s allmulticast mode\n",
9121 dev->flags & IFF_ALLMULTI ? "entered" : "left");
9122 dev_change_rx_flags(dev, IFF_ALLMULTI);
9123 dev_set_rx_mode(dev);
9124 if (notify)
9125 __dev_notify_flags(dev, old_flags,
9126 dev->gflags ^ old_gflags, 0, NULL);
9127 }
9128 return 0;
9129}
9130
9131/**
9132 * dev_set_allmulti - update allmulti count on a device
9133 * @dev: device
9134 * @inc: modifier
9135 *
9136 * Add or remove reception of all multicast frames on a device. While the
9137 * count in the device remains above zero the interface remains listening
9138 * to all multicast frames. Once it hits zero the device reverts back to normal
9139 * filtering operation. A negative @inc value is used to drop the counter
9140 * when releasing a resource needing all multicasts.
9141 * Return 0 if successful or a negative errno code on error.
9142 */
9143
9144int dev_set_allmulti(struct net_device *dev, int inc)
9145{
9146 return __dev_set_allmulti(dev, inc, true);
9147}
9148EXPORT_SYMBOL(dev_set_allmulti);
9149
9150/*
9151 * Upload unicast and multicast address lists to device and
9152 * configure RX filtering. When the device doesn't support unicast
9153 * filtering it is put in promiscuous mode while unicast addresses
9154 * are present.
9155 */
9156void __dev_set_rx_mode(struct net_device *dev)
9157{
9158 const struct net_device_ops *ops = dev->netdev_ops;
9159
9160 /* dev_open will call this function so the list will stay sane. */
9161 if (!(dev->flags&IFF_UP))
9162 return;
9163
9164 if (!netif_device_present(dev))
9165 return;
9166
9167 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
9168 /* Unicast address changes may only happen under the rtnl,
9169 * therefore calling __dev_set_promiscuity here is safe.
9170 */
9171 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
9172 __dev_set_promiscuity(dev, 1, false);
9173 dev->uc_promisc = true;
9174 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
9175 __dev_set_promiscuity(dev, -1, false);
9176 dev->uc_promisc = false;
9177 }
9178 }
9179
9180 if (ops->ndo_set_rx_mode)
9181 ops->ndo_set_rx_mode(dev);
9182}
9183
9184void dev_set_rx_mode(struct net_device *dev)
9185{
9186 netif_addr_lock_bh(dev);
9187 __dev_set_rx_mode(dev);
9188 netif_addr_unlock_bh(dev);
9189}
9190
9191/**
9192 * dev_get_flags - get flags reported to userspace
9193 * @dev: device
9194 *
9195 * Get the combination of flag bits exported through APIs to userspace.
9196 */
9197unsigned int dev_get_flags(const struct net_device *dev)
9198{
9199 unsigned int flags;
9200
9201 flags = (READ_ONCE(dev->flags) & ~(IFF_PROMISC |
9202 IFF_ALLMULTI |
9203 IFF_RUNNING |
9204 IFF_LOWER_UP |
9205 IFF_DORMANT)) |
9206 (READ_ONCE(dev->gflags) & (IFF_PROMISC |
9207 IFF_ALLMULTI));
9208
9209 if (netif_running(dev)) {
9210 if (netif_oper_up(dev))
9211 flags |= IFF_RUNNING;
9212 if (netif_carrier_ok(dev))
9213 flags |= IFF_LOWER_UP;
9214 if (netif_dormant(dev))
9215 flags |= IFF_DORMANT;
9216 }
9217
9218 return flags;
9219}
9220EXPORT_SYMBOL(dev_get_flags);
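
/*
 * Illustrative sketch (editor's addition): reading the userspace view of a
 * device's flags and testing operational state, much as an ioctl or
 * netlink handler would.
 *
 *	unsigned int flags = dev_get_flags(dev);
 *
 *	if ((flags & (IFF_UP | IFF_RUNNING)) == (IFF_UP | IFF_RUNNING))
 *		netdev_dbg(dev, "administratively up and operational\n");
 */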
9221
9222int __dev_change_flags(struct net_device *dev, unsigned int flags,
9223 struct netlink_ext_ack *extack)
9224{
9225 unsigned int old_flags = dev->flags;
9226 int ret;
9227
9228 ASSERT_RTNL();
9229
9230 /*
9231 * Set the flags on our device.
9232 */
9233
9234 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
9235 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
9236 IFF_AUTOMEDIA)) |
9237 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
9238 IFF_ALLMULTI));
9239
9240 /*
9241 * Load in the correct multicast list now the flags have changed.
9242 */
9243
9244 if ((old_flags ^ flags) & IFF_MULTICAST)
9245 dev_change_rx_flags(dev, IFF_MULTICAST);
9246
9247 dev_set_rx_mode(dev);
9248
9249 /*
9250 * Have we downed the interface? We handle IFF_UP ourselves
9251 * according to user attempts to set it, rather than blindly
9252 * setting it.
9253 */
9254
9255 ret = 0;
9256 if ((old_flags ^ flags) & IFF_UP) {
9257 if (old_flags & IFF_UP)
9258 __dev_close(dev);
9259 else
9260 ret = __dev_open(dev, extack);
9261 }
9262
9263 if ((flags ^ dev->gflags) & IFF_PROMISC) {
9264 int inc = (flags & IFF_PROMISC) ? 1 : -1;
9265 unsigned int old_flags = dev->flags;
9266
9267 dev->gflags ^= IFF_PROMISC;
9268
9269 if (__dev_set_promiscuity(dev, inc, false) >= 0)
9270 if (dev->flags != old_flags)
9271 dev_set_rx_mode(dev);
9272 }
9273
9274 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
9275 * is important. Some (broken) drivers set IFF_PROMISC when
9276 * IFF_ALLMULTI is requested, without asking us and without reporting.
9277 */
9278 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
9279 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
9280
9281 dev->gflags ^= IFF_ALLMULTI;
9282 __dev_set_allmulti(dev, inc, false);
9283 }
9284
9285 return ret;
9286}
9287
9288void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
9289 unsigned int gchanges, u32 portid,
9290 const struct nlmsghdr *nlh)
9291{
9292 unsigned int changes = dev->flags ^ old_flags;
9293
9294 if (gchanges)
9295 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh);
9296
9297 if (changes & IFF_UP) {
9298 if (dev->flags & IFF_UP)
9299 call_netdevice_notifiers(NETDEV_UP, dev);
9300 else
9301 call_netdevice_notifiers(NETDEV_DOWN, dev);
9302 }
9303
9304 if (dev->flags & IFF_UP &&
9305 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
9306 struct netdev_notifier_change_info change_info = {
9307 .info = {
9308 .dev = dev,
9309 },
9310 .flags_changed = changes,
9311 };
9312
9313 call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
9314 }
9315}
9316
9317/**
9318 * dev_change_flags - change device settings
9319 * @dev: device
9320 * @flags: device state flags
9321 * @extack: netlink extended ack
9322 *
9323 * Change settings on a device based on the supplied state flags. The
9324 * flags are in the userspace-exported format.
9325 */
9326int dev_change_flags(struct net_device *dev, unsigned int flags,
9327 struct netlink_ext_ack *extack)
9328{
9329 int ret;
9330 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
9331
9332 ret = __dev_change_flags(dev, flags, extack);
9333 if (ret < 0)
9334 return ret;
9335
9336 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
9337 __dev_notify_flags(dev, old_flags, changes, 0, NULL);
9338 return ret;
9339}
9340EXPORT_SYMBOL(dev_change_flags);
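
/*
 * Illustrative sketch (editor's addition): bringing an interface up the
 * way the SIOCSIFFLAGS path does, by reading the current flags and
 * setting IFF_UP while holding the RTNL lock.
 *
 *	int err;
 *
 *	rtnl_lock();
 *	err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP, NULL);
 *	rtnl_unlock();
 */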
9341
9342int __dev_set_mtu(struct net_device *dev, int new_mtu)
9343{
9344 const struct net_device_ops *ops = dev->netdev_ops;
9345
9346 if (ops->ndo_change_mtu)
9347 return ops->ndo_change_mtu(dev, new_mtu);
9348
9349 /* Pairs with all the lockless reads of dev->mtu in the stack */
9350 WRITE_ONCE(dev->mtu, new_mtu);
9351 return 0;
9352}
9353EXPORT_SYMBOL(__dev_set_mtu);
9354
9355int dev_validate_mtu(struct net_device *dev, int new_mtu,
9356 struct netlink_ext_ack *extack)
9357{
9358 /* MTU must be positive, and in range */
9359 if (new_mtu < 0 || new_mtu < dev->min_mtu) {
9360 NL_SET_ERR_MSG(extack, "mtu less than device minimum");
9361 return -EINVAL;
9362 }
9363
9364 if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
9365 NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
9366 return -EINVAL;
9367 }
9368 return 0;
9369}
9370
9371/**
9372 * dev_set_mtu_ext - Change maximum transfer unit
9373 * @dev: device
9374 * @new_mtu: new transfer unit
9375 * @extack: netlink extended ack
9376 *
9377 * Change the maximum transfer size of the network device.
9378 */
9379int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
9380 struct netlink_ext_ack *extack)
9381{
9382 int err, orig_mtu;
9383
9384 if (new_mtu == dev->mtu)
9385 return 0;
9386
9387 err = dev_validate_mtu(dev, new_mtu, extack);
9388 if (err)
9389 return err;
9390
9391 if (!netif_device_present(dev))
9392 return -ENODEV;
9393
9394 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
9395 err = notifier_to_errno(err);
9396 if (err)
9397 return err;
9398
9399 orig_mtu = dev->mtu;
9400 err = __dev_set_mtu(dev, new_mtu);
9401
9402 if (!err) {
9403 err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
9404 orig_mtu);
9405 err = notifier_to_errno(err);
9406 if (err) {
9407 /* setting mtu back and notifying everyone again,
9408 * so that they have a chance to revert changes.
9409 */
9410 __dev_set_mtu(dev, orig_mtu);
9411 call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
9412 new_mtu);
9413 }
9414 }
9415 return err;
9416}
9417
9418int dev_set_mtu(struct net_device *dev, int new_mtu)
9419{
9420 struct netlink_ext_ack extack;
9421 int err;
9422
9423 memset(&extack, 0, sizeof(extack));
9424 err = dev_set_mtu_ext(dev, new_mtu, &extack);
9425 if (err && extack._msg)
9426 net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
9427 return err;
9428}
9429EXPORT_SYMBOL(dev_set_mtu);
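
/*
 * Illustrative sketch (editor's addition): changing a device's MTU from a
 * control path. Range validation against dev->min_mtu/max_mtu and the
 * PRECHANGEMTU/CHANGEMTU notifier round trip are handled inside
 * dev_set_mtu_ext() above.
 *
 *	int err;
 *
 *	ASSERT_RTNL();
 *	err = dev_set_mtu(dev, 9000);
 *	if (err)
 *		netdev_warn(dev, "failed to set jumbo MTU: %d\n", err);
 */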
9430
9431/**
9432 * dev_change_tx_queue_len - Change TX queue length of a netdevice
9433 * @dev: device
9434 * @new_len: new tx queue length
9435 */
9436int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
9437{
9438 unsigned int orig_len = dev->tx_queue_len;
9439 int res;
9440
9441 if (new_len != (unsigned int)new_len)
9442 return -ERANGE;
9443
9444 if (new_len != orig_len) {
9445 WRITE_ONCE(dev->tx_queue_len, new_len);
9446 res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
9447 res = notifier_to_errno(res);
9448 if (res)
9449 goto err_rollback;
9450 res = dev_qdisc_change_tx_queue_len(dev);
9451 if (res)
9452 goto err_rollback;
9453 }
9454
9455 return 0;
9456
9457err_rollback:
9458 netdev_err(dev, "refused to change device tx_queue_len\n");
9459 WRITE_ONCE(dev->tx_queue_len, orig_len);
9460 return res;
9461}
9462
9463/**
9464 * dev_set_group - Change group this device belongs to
9465 * @dev: device
9466 * @new_group: group this device should belong to
9467 */
9468void dev_set_group(struct net_device *dev, int new_group)
9469{
9470 dev->group = new_group;
9471}
9472
9473/**
9474 * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
9475 * @dev: device
9476 * @addr: new address
9477 * @extack: netlink extended ack
9478 */
9479int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
9480 struct netlink_ext_ack *extack)
9481{
9482 struct netdev_notifier_pre_changeaddr_info info = {
9483 .info.dev = dev,
9484 .info.extack = extack,
9485 .dev_addr = addr,
9486 };
9487 int rc;
9488
9489 rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
9490 return notifier_to_errno(rc);
9491}
9492EXPORT_SYMBOL(dev_pre_changeaddr_notify);
9493
9494/**
9495 * dev_set_mac_address - Change Media Access Control Address
9496 * @dev: device
9497 * @sa: new address
9498 * @extack: netlink extended ack
9499 *
9500 * Change the hardware (MAC) address of the device
9501 */
9502int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
9503 struct netlink_ext_ack *extack)
9504{
9505 const struct net_device_ops *ops = dev->netdev_ops;
9506 int err;
9507
9508 if (!ops->ndo_set_mac_address)
9509 return -EOPNOTSUPP;
9510 if (sa->sa_family != dev->type)
9511 return -EINVAL;
9512 if (!netif_device_present(dev))
9513 return -ENODEV;
9514 err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
9515 if (err)
9516 return err;
9517 if (memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) {
9518 err = ops->ndo_set_mac_address(dev, sa);
9519 if (err)
9520 return err;
9521 }
9522 dev->addr_assign_type = NET_ADDR_SET;
9523 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
9524 add_device_randomness(dev->dev_addr, dev->addr_len);
9525 return 0;
9526}
9527EXPORT_SYMBOL(dev_set_mac_address);
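
/*
 * Illustrative sketch (editor's addition): setting an Ethernet device's
 * MAC address through this interface. The sockaddr family must match
 * dev->type and the caller holds the RTNL lock; "new_mac" is a
 * hypothetical buffer of dev->addr_len bytes.
 *
 *	struct sockaddr sa;
 *	int err;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_mac, dev->addr_len);
 *
 *	ASSERT_RTNL();
 *	err = dev_set_mac_address(dev, &sa, NULL);
 */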
9528
9529DECLARE_RWSEM(dev_addr_sem);
9530
9531int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
9532 struct netlink_ext_ack *extack)
9533{
9534 int ret;
9535
9536 down_write(&dev_addr_sem);
9537 ret = dev_set_mac_address(dev, sa, extack);
9538 up_write(&dev_addr_sem);
9539 return ret;
9540}
9541EXPORT_SYMBOL(dev_set_mac_address_user);
9542
9543int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
9544{
9545 size_t size = sizeof(sa->sa_data_min);
9546 struct net_device *dev;
9547 int ret = 0;
9548
9549 down_read(&dev_addr_sem);
9550 rcu_read_lock();
9551
9552 dev = dev_get_by_name_rcu(net, dev_name);
9553 if (!dev) {
9554 ret = -ENODEV;
9555 goto unlock;
9556 }
9557 if (!dev->addr_len)
9558 memset(sa->sa_data, 0, size);
9559 else
9560 memcpy(sa->sa_data, dev->dev_addr,
9561 min_t(size_t, size, dev->addr_len));
9562 sa->sa_family = dev->type;
9563
9564unlock:
9565 rcu_read_unlock();
9566 up_read(&dev_addr_sem);
9567 return ret;
9568}
9569EXPORT_SYMBOL(dev_get_mac_address);
9570
9571/**
9572 * dev_change_carrier - Change device carrier
9573 * @dev: device
9574 * @new_carrier: new value
9575 *
9576 * Change device carrier
9577 */
9578int dev_change_carrier(struct net_device *dev, bool new_carrier)
9579{
9580 const struct net_device_ops *ops = dev->netdev_ops;
9581
9582 if (!ops->ndo_change_carrier)
9583 return -EOPNOTSUPP;
9584 if (!netif_device_present(dev))
9585 return -ENODEV;
9586 return ops->ndo_change_carrier(dev, new_carrier);
9587}
9588
9589/**
9590 * dev_get_phys_port_id - Get device physical port ID
9591 * @dev: device
9592 * @ppid: port ID
9593 *
9594 * Get device physical port ID
9595 */
9596int dev_get_phys_port_id(struct net_device *dev,
9597 struct netdev_phys_item_id *ppid)
9598{
9599 const struct net_device_ops *ops = dev->netdev_ops;
9600
9601 if (!ops->ndo_get_phys_port_id)
9602 return -EOPNOTSUPP;
9603 return ops->ndo_get_phys_port_id(dev, ppid);
9604}
9605
9606/**
9607 * dev_get_phys_port_name - Get device physical port name
9608 * @dev: device
9609 * @name: port name
9610 * @len: limit of bytes to copy to name
9611 *
9612 * Get device physical port name
9613 */
9614int dev_get_phys_port_name(struct net_device *dev,
9615 char *name, size_t len)
9616{
9617 const struct net_device_ops *ops = dev->netdev_ops;
9618 int err;
9619
9620 if (ops->ndo_get_phys_port_name) {
9621 err = ops->ndo_get_phys_port_name(dev, name, len);
9622 if (err != -EOPNOTSUPP)
9623 return err;
9624 }
9625 return devlink_compat_phys_port_name_get(dev, name, len);
9626}
9627
9628/**
9629 * dev_get_port_parent_id - Get the device's port parent identifier
9630 * @dev: network device
9631 * @ppid: pointer to a storage for the port's parent identifier
9632 * @recurse: allow/disallow recursion to lower devices
9633 *
9634 * Get the device's port parent identifier
9635 */
9636int dev_get_port_parent_id(struct net_device *dev,
9637 struct netdev_phys_item_id *ppid,
9638 bool recurse)
9639{
9640 const struct net_device_ops *ops = dev->netdev_ops;
9641 struct netdev_phys_item_id first = { };
9642 struct net_device *lower_dev;
9643 struct list_head *iter;
9644 int err;
9645
9646 if (ops->ndo_get_port_parent_id) {
9647 err = ops->ndo_get_port_parent_id(dev, ppid);
9648 if (err != -EOPNOTSUPP)
9649 return err;
9650 }
9651
9652 err = devlink_compat_switch_id_get(dev, ppid);
9653 if (!recurse || err != -EOPNOTSUPP)
9654 return err;
9655
9656 netdev_for_each_lower_dev(dev, lower_dev, iter) {
9657 err = dev_get_port_parent_id(lower_dev, ppid, true);
9658 if (err)
9659 break;
9660 if (!first.id_len)
9661 first = *ppid;
9662 else if (memcmp(&first, ppid, sizeof(*ppid)))
9663 return -EOPNOTSUPP;
9664 }
9665
9666 return err;
9667}
9668EXPORT_SYMBOL(dev_get_port_parent_id);
9669
9670/**
9671 * netdev_port_same_parent_id - Indicate if two network devices have
9672 * the same port parent identifier
9673 * @a: first network device
9674 * @b: second network device
9675 */
9676bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
9677{
9678 struct netdev_phys_item_id a_id = { };
9679 struct netdev_phys_item_id b_id = { };
9680
9681 if (dev_get_port_parent_id(a, &a_id, true) ||
9682 dev_get_port_parent_id(b, &b_id, true))
9683 return false;
9684
9685 return netdev_phys_item_id_same(&a_id, &b_id);
9686}
9687EXPORT_SYMBOL(netdev_port_same_parent_id);
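
/*
 * Illustrative sketch (editor's addition): a switchdev-style driver using
 * the parent-ID helpers, typically under the RTNL lock, to decide whether
 * two ports sit on the same physical switch and can therefore be
 * offloaded into one bridge. The helper names are hypothetical.
 *
 *	if (netdev_port_same_parent_id(port_dev, other_dev))
 *		example_offload_bridge_port(port_dev);
 *	else
 *		netdev_dbg(port_dev, "different switches, software fallback\n");
 */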
9688
9689/**
9690 * dev_change_proto_down - set carrier according to proto_down.
9691 *
9692 * @dev: device
9693 * @proto_down: new value
9694 */
9695int dev_change_proto_down(struct net_device *dev, bool proto_down)
9696{
9697 if (!dev->change_proto_down)
9698 return -EOPNOTSUPP;
9699 if (!netif_device_present(dev))
9700 return -ENODEV;
9701 if (proto_down)
9702 netif_carrier_off(dev);
9703 else
9704 netif_carrier_on(dev);
9705 WRITE_ONCE(dev->proto_down, proto_down);
9706 return 0;
9707}
9708
9709/**
9710 * dev_change_proto_down_reason - update the proto_down reason bits
9711 *
9712 * @dev: device
9713 * @mask: proto down mask
9714 * @value: proto down value
9715 */
9716void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
9717 u32 value)
9718{
9719 u32 proto_down_reason;
9720 int b;
9721
9722 if (!mask) {
9723 proto_down_reason = value;
9724 } else {
9725 proto_down_reason = dev->proto_down_reason;
9726 for_each_set_bit(b, &mask, 32) {
9727 if (value & (1 << b))
9728 proto_down_reason |= BIT(b);
9729 else
9730 proto_down_reason &= ~BIT(b);
9731 }
9732 }
9733 WRITE_ONCE(dev->proto_down_reason, proto_down_reason);
9734}
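
/*
 * Worked example (editor's addition): with the current reason word being
 * 0b0110, a call with mask = 0b0011 and value = 0b0001 only touches bits
 * 0 and 1: bit 0 is set, bit 1 is cleared, bit 2 is left alone, giving
 * 0b0101. Passing mask == 0 replaces the whole reason word with @value.
 *
 *	dev_change_proto_down_reason(dev, BIT(0) | BIT(1), BIT(0));
 */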
9735
9736struct bpf_xdp_link {
9737 struct bpf_link link;
9738 struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
9739 int flags;
9740};
9741
9742static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
9743{
9744 if (flags & XDP_FLAGS_HW_MODE)
9745 return XDP_MODE_HW;
9746 if (flags & XDP_FLAGS_DRV_MODE)
9747 return XDP_MODE_DRV;
9748 if (flags & XDP_FLAGS_SKB_MODE)
9749 return XDP_MODE_SKB;
9750 return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
9751}
9752
9753static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
9754{
9755 switch (mode) {
9756 case XDP_MODE_SKB:
9757 return generic_xdp_install;
9758 case XDP_MODE_DRV:
9759 case XDP_MODE_HW:
9760 return dev->netdev_ops->ndo_bpf;
9761 default:
9762 return NULL;
9763 }
9764}
9765
9766static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
9767 enum bpf_xdp_mode mode)
9768{
9769 return dev->xdp_state[mode].link;
9770}
9771
9772static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
9773 enum bpf_xdp_mode mode)
9774{
9775 struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
9776
9777 if (link)
9778 return link->link.prog;
9779 return dev->xdp_state[mode].prog;
9780}
9781
9782u8 dev_xdp_prog_count(struct net_device *dev)
9783{
9784 u8 count = 0;
9785 int i;
9786
9787 for (i = 0; i < __MAX_XDP_MODE; i++)
9788 if (dev->xdp_state[i].prog || dev->xdp_state[i].link)
9789 count++;
9790 return count;
9791}
9792EXPORT_SYMBOL_GPL(dev_xdp_prog_count);
9793
9794u8 dev_xdp_sb_prog_count(struct net_device *dev)
9795{
9796 u8 count = 0;
9797 int i;
9798
9799 for (i = 0; i < __MAX_XDP_MODE; i++)
9800 if (dev->xdp_state[i].prog &&
9801 !dev->xdp_state[i].prog->aux->xdp_has_frags)
9802 count++;
9803 return count;
9804}
9805
9806int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf)
9807{
9808 if (!dev->netdev_ops->ndo_bpf)
9809 return -EOPNOTSUPP;
9810
9811 if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED &&
9812 bpf->command == XDP_SETUP_PROG &&
9813 bpf->prog && !bpf->prog->aux->xdp_has_frags) {
9814 NL_SET_ERR_MSG(bpf->extack,
9815 "unable to propagate XDP to device using tcp-data-split");
9816 return -EBUSY;
9817 }
9818
9819 if (dev_get_min_mp_channel_count(dev)) {
9820 NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using memory provider");
9821 return -EBUSY;
9822 }
9823
9824 return dev->netdev_ops->ndo_bpf(dev, bpf);
9825}
9826EXPORT_SYMBOL_GPL(dev_xdp_propagate);
9827
9828u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
9829{
9830 struct bpf_prog *prog = dev_xdp_prog(dev, mode);
9831
9832 return prog ? prog->aux->id : 0;
9833}
9834
9835static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
9836 struct bpf_xdp_link *link)
9837{
9838 dev->xdp_state[mode].link = link;
9839 dev->xdp_state[mode].prog = NULL;
9840}
9841
9842static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
9843 struct bpf_prog *prog)
9844{
9845 dev->xdp_state[mode].link = NULL;
9846 dev->xdp_state[mode].prog = prog;
9847}
9848
9849static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
9850 bpf_op_t bpf_op, struct netlink_ext_ack *extack,
9851 u32 flags, struct bpf_prog *prog)
9852{
9853 struct netdev_bpf xdp;
9854 int err;
9855
9856 if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED &&
9857 prog && !prog->aux->xdp_has_frags) {
9858 NL_SET_ERR_MSG(extack, "unable to install XDP to device using tcp-data-split");
9859 return -EBUSY;
9860 }
9861
9862 if (dev_get_min_mp_channel_count(dev)) {
9863 NL_SET_ERR_MSG(extack, "unable to install XDP to device using memory provider");
9864 return -EBUSY;
9865 }
9866
9867 memset(&xdp, 0, sizeof(xdp));
9868 xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
9869 xdp.extack = extack;
9870 xdp.flags = flags;
9871 xdp.prog = prog;
9872
9873 /* Drivers assume refcnt is already incremented (i.e., prog pointer is
9874 * "moved" into driver), so they don't increment it on their own, but
9875 * they do decrement refcnt when program is detached or replaced.
9876 * Given net_device also owns link/prog, we need to bump refcnt here
9877 * to prevent drivers from underflowing it.
9878 */
9879 if (prog)
9880 bpf_prog_inc(prog);
9881 err = bpf_op(dev, &xdp);
9882 if (err) {
9883 if (prog)
9884 bpf_prog_put(prog);
9885 return err;
9886 }
9887
9888 if (mode != XDP_MODE_HW)
9889 bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
9890
9891 return 0;
9892}
9893
9894static void dev_xdp_uninstall(struct net_device *dev)
9895{
9896 struct bpf_xdp_link *link;
9897 struct bpf_prog *prog;
9898 enum bpf_xdp_mode mode;
9899 bpf_op_t bpf_op;
9900
9901 ASSERT_RTNL();
9902
9903 for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
9904 prog = dev_xdp_prog(dev, mode);
9905 if (!prog)
9906 continue;
9907
9908 bpf_op = dev_xdp_bpf_op(dev, mode);
9909 if (!bpf_op)
9910 continue;
9911
9912 WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
9913
9914 /* auto-detach link from net device */
9915 link = dev_xdp_link(dev, mode);
9916 if (link)
9917 link->dev = NULL;
9918 else
9919 bpf_prog_put(prog);
9920
9921 dev_xdp_set_link(dev, mode, NULL);
9922 }
9923}
9924
9925static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
9926 struct bpf_xdp_link *link, struct bpf_prog *new_prog,
9927 struct bpf_prog *old_prog, u32 flags)
9928{
9929 unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
9930 struct bpf_prog *cur_prog;
9931 struct net_device *upper;
9932 struct list_head *iter;
9933 enum bpf_xdp_mode mode;
9934 bpf_op_t bpf_op;
9935 int err;
9936
9937 ASSERT_RTNL();
9938
9939 /* either link or prog attachment, never both */
9940 if (link && (new_prog || old_prog))
9941 return -EINVAL;
9942 /* link supports only XDP mode flags */
9943 if (link && (flags & ~XDP_FLAGS_MODES)) {
9944 NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
9945 return -EINVAL;
9946 }
9947 /* just one XDP mode bit should be set, zero defaults to drv/skb mode */
9948 if (num_modes > 1) {
9949 NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
9950 return -EINVAL;
9951 }
9952 /* avoid ambiguity if offload + drv/skb mode progs are both loaded */
9953 if (!num_modes && dev_xdp_prog_count(dev) > 1) {
9954 NL_SET_ERR_MSG(extack,
9955 "More than one program loaded, unset mode is ambiguous");
9956 return -EINVAL;
9957 }
9958 /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
9959 if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
9960 NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
9961 return -EINVAL;
9962 }
9963
9964 mode = dev_xdp_mode(dev, flags);
9965 /* can't replace attached link */
9966 if (dev_xdp_link(dev, mode)) {
9967 NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
9968 return -EBUSY;
9969 }
9970
9971 /* don't allow if an upper device already has a program */
9972 netdev_for_each_upper_dev_rcu(dev, upper, iter) {
9973 if (dev_xdp_prog_count(upper) > 0) {
9974 NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program");
9975 return -EEXIST;
9976 }
9977 }
9978
9979 cur_prog = dev_xdp_prog(dev, mode);
9980 /* can't replace attached prog with link */
9981 if (link && cur_prog) {
9982 NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
9983 return -EBUSY;
9984 }
9985 if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
9986 NL_SET_ERR_MSG(extack, "Active program does not match expected");
9987 return -EEXIST;
9988 }
9989
9990 /* put effective new program into new_prog */
9991 if (link)
9992 new_prog = link->link.prog;
9993
9994 if (new_prog) {
9995 bool offload = mode == XDP_MODE_HW;
9996 enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
9997 ? XDP_MODE_DRV : XDP_MODE_SKB;
9998
9999 if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
10000 NL_SET_ERR_MSG(extack, "XDP program already attached");
10001 return -EBUSY;
10002 }
10003 if (!offload && dev_xdp_prog(dev, other_mode)) {
10004 NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
10005 return -EEXIST;
10006 }
10007 if (!offload && bpf_prog_is_offloaded(new_prog->aux)) {
10008 NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported");
10009 return -EINVAL;
10010 }
10011 if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) {
10012 NL_SET_ERR_MSG(extack, "Program bound to different device");
10013 return -EINVAL;
10014 }
10015 if (bpf_prog_is_dev_bound(new_prog->aux) && mode == XDP_MODE_SKB) {
10016 NL_SET_ERR_MSG(extack, "Can't attach device-bound programs in generic mode");
10017 return -EINVAL;
10018 }
10019 if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
10020 NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
10021 return -EINVAL;
10022 }
10023 if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
10024 NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
10025 return -EINVAL;
10026 }
10027 }
10028
10029 /* don't call drivers if the effective program didn't change */
10030 if (new_prog != cur_prog) {
10031 bpf_op = dev_xdp_bpf_op(dev, mode);
10032 if (!bpf_op) {
10033 NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
10034 return -EOPNOTSUPP;
10035 }
10036
10037 err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
10038 if (err)
10039 return err;
10040 }
10041
10042 if (link)
10043 dev_xdp_set_link(dev, mode, link);
10044 else
10045 dev_xdp_set_prog(dev, mode, new_prog);
10046 if (cur_prog)
10047 bpf_prog_put(cur_prog);
10048
10049 return 0;
10050}
10051
10052static int dev_xdp_attach_link(struct net_device *dev,
10053 struct netlink_ext_ack *extack,
10054 struct bpf_xdp_link *link)
10055{
10056 return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
10057}
10058
10059static int dev_xdp_detach_link(struct net_device *dev,
10060 struct netlink_ext_ack *extack,
10061 struct bpf_xdp_link *link)
10062{
10063 enum bpf_xdp_mode mode;
10064 bpf_op_t bpf_op;
10065
10066 ASSERT_RTNL();
10067
10068 mode = dev_xdp_mode(dev, link->flags);
10069 if (dev_xdp_link(dev, mode) != link)
10070 return -EINVAL;
10071
10072 bpf_op = dev_xdp_bpf_op(dev, mode);
10073 WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
10074 dev_xdp_set_link(dev, mode, NULL);
10075 return 0;
10076}
10077
10078static void bpf_xdp_link_release(struct bpf_link *link)
10079{
10080 struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
10081
10082 rtnl_lock();
10083
10084 /* if racing with net_device's tear down, xdp_link->dev might be
10085 * already NULL, in which case link was already auto-detached
10086 */
10087 if (xdp_link->dev) {
10088 WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
10089 xdp_link->dev = NULL;
10090 }
10091
10092 rtnl_unlock();
10093}
10094
10095static int bpf_xdp_link_detach(struct bpf_link *link)
10096{
10097 bpf_xdp_link_release(link);
10098 return 0;
10099}
10100
10101static void bpf_xdp_link_dealloc(struct bpf_link *link)
10102{
10103 struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
10104
10105 kfree(xdp_link);
10106}
10107
10108static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
10109 struct seq_file *seq)
10110{
10111 struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
10112 u32 ifindex = 0;
10113
10114 rtnl_lock();
10115 if (xdp_link->dev)
10116 ifindex = xdp_link->dev->ifindex;
10117 rtnl_unlock();
10118
10119 seq_printf(seq, "ifindex:\t%u\n", ifindex);
10120}
10121
10122static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
10123 struct bpf_link_info *info)
10124{
10125 struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
10126 u32 ifindex = 0;
10127
10128 rtnl_lock();
10129 if (xdp_link->dev)
10130 ifindex = xdp_link->dev->ifindex;
10131 rtnl_unlock();
10132
10133 info->xdp.ifindex = ifindex;
10134 return 0;
10135}
10136
10137static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
10138 struct bpf_prog *old_prog)
10139{
10140 struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
10141 enum bpf_xdp_mode mode;
10142 bpf_op_t bpf_op;
10143 int err = 0;
10144
10145 rtnl_lock();
10146
10147 /* link might have been auto-released already, so fail */
10148 if (!xdp_link->dev) {
10149 err = -ENOLINK;
10150 goto out_unlock;
10151 }
10152
10153 if (old_prog && link->prog != old_prog) {
10154 err = -EPERM;
10155 goto out_unlock;
10156 }
10157 old_prog = link->prog;
10158 if (old_prog->type != new_prog->type ||
10159 old_prog->expected_attach_type != new_prog->expected_attach_type) {
10160 err = -EINVAL;
10161 goto out_unlock;
10162 }
10163
10164 if (old_prog == new_prog) {
10165 /* no-op, don't disturb drivers */
10166 bpf_prog_put(new_prog);
10167 goto out_unlock;
10168 }
10169
10170 mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
10171 bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
10172 err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
10173 xdp_link->flags, new_prog);
10174 if (err)
10175 goto out_unlock;
10176
10177 old_prog = xchg(&link->prog, new_prog);
10178 bpf_prog_put(old_prog);
10179
10180out_unlock:
10181 rtnl_unlock();
10182 return err;
10183}
10184
10185static const struct bpf_link_ops bpf_xdp_link_lops = {
10186 .release = bpf_xdp_link_release,
10187 .dealloc = bpf_xdp_link_dealloc,
10188 .detach = bpf_xdp_link_detach,
10189 .show_fdinfo = bpf_xdp_link_show_fdinfo,
10190 .fill_link_info = bpf_xdp_link_fill_link_info,
10191 .update_prog = bpf_xdp_link_update,
10192};
10193
10194int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
10195{
10196 struct net *net = current->nsproxy->net_ns;
10197 struct bpf_link_primer link_primer;
10198 struct netlink_ext_ack extack = {};
10199 struct bpf_xdp_link *link;
10200 struct net_device *dev;
10201 int err, fd;
10202
10203 rtnl_lock();
10204 dev = dev_get_by_index(net, attr->link_create.target_ifindex);
10205 if (!dev) {
10206 rtnl_unlock();
10207 return -EINVAL;
10208 }
10209
10210 link = kzalloc(sizeof(*link), GFP_USER);
10211 if (!link) {
10212 err = -ENOMEM;
10213 goto unlock;
10214 }
10215
10216 bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
10217 link->dev = dev;
10218 link->flags = attr->link_create.flags;
10219
10220 err = bpf_link_prime(&link->link, &link_primer);
10221 if (err) {
10222 kfree(link);
10223 goto unlock;
10224 }
10225
10226 err = dev_xdp_attach_link(dev, &extack, link);
10227 rtnl_unlock();
10228
10229 if (err) {
10230 link->dev = NULL;
10231 bpf_link_cleanup(&link_primer);
10232 trace_bpf_xdp_link_attach_failed(extack._msg);
10233 goto out_put_dev;
10234 }
10235
10236 fd = bpf_link_settle(&link_primer);
10237 /* link itself doesn't hold dev's refcnt to not complicate shutdown */
10238 dev_put(dev);
10239 return fd;
10240
10241unlock:
10242 rtnl_unlock();
10243
10244out_put_dev:
10245 dev_put(dev);
10246 return err;
10247}
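
/* Example (illustrative sketch): the user-space counterpart of
 * bpf_xdp_link_attach() is normally reached through libbpf, roughly:
 *
 *	struct bpf_link *link;
 *
 *	link = bpf_program__attach_xdp(prog, ifindex);
 *	if (!link)
 *		return -errno;
 *
 * which issues a BPF_LINK_CREATE syscall that ends up here. The program
 * stays attached until the link is destroyed or its last FD is closed.
 */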
10248
10249/**
10250 * dev_change_xdp_fd - set or clear a bpf program for a device rx path
10251 * @dev: device
10252 * @extack: netlink extended ack
10253 * @fd: new program fd or negative value to clear
10254 * @expected_fd: old program fd that userspace expects to replace or clear
10255 * @flags: xdp-related flags
10256 *
10257 * Set or clear a bpf program for a device
10258 */
10259int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
10260 int fd, int expected_fd, u32 flags)
10261{
10262 enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
10263 struct bpf_prog *new_prog = NULL, *old_prog = NULL;
10264 int err;
10265
10266 ASSERT_RTNL();
10267
10268 if (fd >= 0) {
10269 new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
10270 mode != XDP_MODE_SKB);
10271 if (IS_ERR(new_prog))
10272 return PTR_ERR(new_prog);
10273 }
10274
10275 if (expected_fd >= 0) {
10276 old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
10277 mode != XDP_MODE_SKB);
10278 if (IS_ERR(old_prog)) {
10279 err = PTR_ERR(old_prog);
10280 old_prog = NULL;
10281 goto err_out;
10282 }
10283 }
10284
10285 err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
10286
10287err_out:
10288 if (err && new_prog)
10289 bpf_prog_put(new_prog);
10290 if (old_prog)
10291 bpf_prog_put(old_prog);
10292 return err;
10293}
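
/* Example (illustrative sketch): rtnetlink is the usual caller, but any
 * in-kernel user holding RTNL could detach whatever native-mode program is
 * installed by passing negative fds:
 *
 *	ASSERT_RTNL();
 *	err = dev_change_xdp_fd(dev, extack, -1, -1, XDP_FLAGS_DRV_MODE);
 *
 * A negative @fd clears the program and a negative @expected_fd means no
 * particular old program is expected.
 */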
10294
10295u32 dev_get_min_mp_channel_count(const struct net_device *dev)
10296{
10297 int i;
10298
10299 ASSERT_RTNL();
10300
10301 for (i = dev->real_num_rx_queues - 1; i >= 0; i--)
10302 if (dev->_rx[i].mp_params.mp_priv)
10303 /* The channel count is the idx plus 1. */
10304 return i + 1;
10305
10306 return 0;
10307}
10308
10309/**
10310 * dev_index_reserve() - allocate an ifindex in a namespace
10311 * @net: the applicable net namespace
10312 * @ifindex: requested ifindex, pass %0 to get one allocated
10313 *
10314 * Allocate an ifindex for a new device. Caller must either use the ifindex
10315 * to store the device (via list_netdevice()) or call dev_index_release()
10316 * to give the index up.
10317 *
10318 * Return: a suitable unique value for a new device interface number or -errno.
10319 */
10320static int dev_index_reserve(struct net *net, u32 ifindex)
10321{
10322 int err;
10323
10324 if (ifindex > INT_MAX) {
10325 DEBUG_NET_WARN_ON_ONCE(1);
10326 return -EINVAL;
10327 }
10328
10329 if (!ifindex)
10330 err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL,
10331 xa_limit_31b, &net->ifindex, GFP_KERNEL);
10332 else
10333 err = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL);
10334 if (err < 0)
10335 return err;
10336
10337 return ifindex;
10338}
10339
10340static void dev_index_release(struct net *net, int ifindex)
10341{
10342	/* Expect only unused indexes; unlist_netdevice() removes the used ones */
10343 WARN_ON(xa_erase(&net->dev_by_index, ifindex));
10344}
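
/* Example (illustrative sketch): the expected pairing of the two helpers
 * above. Passing 0 asks the core to pick an index, and an index that never
 * reaches list_netdevice() must be handed back; finish_setup() is a
 * hypothetical stand-in for the rest of registration.
 *
 *	ifindex = dev_index_reserve(net, 0);
 *	if (ifindex < 0)
 *		return ifindex;
 *	err = finish_setup(dev);
 *	if (err) {
 *		dev_index_release(net, ifindex);
 *		return err;
 *	}
 */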
10345
10346static bool from_cleanup_net(void)
10347{
10348#ifdef CONFIG_NET_NS
10349 return current == cleanup_net_task;
10350#else
10351 return false;
10352#endif
10353}
10354
10355/* Delayed registration/unregistration */
10356LIST_HEAD(net_todo_list);
10357DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
10358atomic_t dev_unreg_count = ATOMIC_INIT(0);
10359
10360static void net_set_todo(struct net_device *dev)
10361{
10362 list_add_tail(&dev->todo_list, &net_todo_list);
10363}
10364
10365static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
10366 struct net_device *upper, netdev_features_t features)
10367{
10368 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
10369 netdev_features_t feature;
10370 int feature_bit;
10371
10372 for_each_netdev_feature(upper_disables, feature_bit) {
10373 feature = __NETIF_F_BIT(feature_bit);
10374 if (!(upper->wanted_features & feature)
10375 && (features & feature)) {
10376 netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
10377 &feature, upper->name);
10378 features &= ~feature;
10379 }
10380 }
10381
10382 return features;
10383}
10384
10385static void netdev_sync_lower_features(struct net_device *upper,
10386 struct net_device *lower, netdev_features_t features)
10387{
10388 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
10389 netdev_features_t feature;
10390 int feature_bit;
10391
10392 for_each_netdev_feature(upper_disables, feature_bit) {
10393 feature = __NETIF_F_BIT(feature_bit);
10394 if (!(features & feature) && (lower->features & feature)) {
10395 netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
10396 &feature, lower->name);
10397 lower->wanted_features &= ~feature;
10398 __netdev_update_features(lower);
10399
10400 if (unlikely(lower->features & feature))
10401 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
10402 &feature, lower->name);
10403 else
10404 netdev_features_change(lower);
10405 }
10406 }
10407}
10408
10409static bool netdev_has_ip_or_hw_csum(netdev_features_t features)
10410{
10411 netdev_features_t ip_csum_mask = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
10412 bool ip_csum = (features & ip_csum_mask) == ip_csum_mask;
10413 bool hw_csum = features & NETIF_F_HW_CSUM;
10414
10415 return ip_csum || hw_csum;
10416}
10417
10418static netdev_features_t netdev_fix_features(struct net_device *dev,
10419 netdev_features_t features)
10420{
10421 /* Fix illegal checksum combinations */
10422 if ((features & NETIF_F_HW_CSUM) &&
10423 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
10424 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
10425 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
10426 }
10427
10428 /* TSO requires that SG is present as well. */
10429 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
10430 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
10431 features &= ~NETIF_F_ALL_TSO;
10432 }
10433
10434 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
10435 !(features & NETIF_F_IP_CSUM)) {
10436 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
10437 features &= ~NETIF_F_TSO;
10438 features &= ~NETIF_F_TSO_ECN;
10439 }
10440
10441 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
10442 !(features & NETIF_F_IPV6_CSUM)) {
10443 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
10444 features &= ~NETIF_F_TSO6;
10445 }
10446
10447 /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
10448 if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
10449 features &= ~NETIF_F_TSO_MANGLEID;
10450
10451 /* TSO ECN requires that TSO is present as well. */
10452 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
10453 features &= ~NETIF_F_TSO_ECN;
10454
10455 /* Software GSO depends on SG. */
10456 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
10457 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
10458 features &= ~NETIF_F_GSO;
10459 }
10460
10461 /* GSO partial features require GSO partial be set */
10462 if ((features & dev->gso_partial_features) &&
10463 !(features & NETIF_F_GSO_PARTIAL)) {
10464 netdev_dbg(dev,
10465 "Dropping partially supported GSO features since no GSO partial.\n");
10466 features &= ~dev->gso_partial_features;
10467 }
10468
10469 if (!(features & NETIF_F_RXCSUM)) {
10470 /* NETIF_F_GRO_HW implies doing RXCSUM since every packet
10471 * successfully merged by hardware must also have the
10472 * checksum verified by hardware. If the user does not
10473 * want to enable RXCSUM, logically, we should disable GRO_HW.
10474 */
10475 if (features & NETIF_F_GRO_HW) {
10476 netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
10477 features &= ~NETIF_F_GRO_HW;
10478 }
10479 }
10480
10481 /* LRO/HW-GRO features cannot be combined with RX-FCS */
10482 if (features & NETIF_F_RXFCS) {
10483 if (features & NETIF_F_LRO) {
10484 netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
10485 features &= ~NETIF_F_LRO;
10486 }
10487
10488 if (features & NETIF_F_GRO_HW) {
10489 netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
10490 features &= ~NETIF_F_GRO_HW;
10491 }
10492 }
10493
10494 if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) {
10495 netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n");
10496 features &= ~NETIF_F_LRO;
10497 }
10498
10499 if ((features & NETIF_F_HW_TLS_TX) && !netdev_has_ip_or_hw_csum(features)) {
10500 netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
10501 features &= ~NETIF_F_HW_TLS_TX;
10502 }
10503
10504 if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
10505 netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n");
10506 features &= ~NETIF_F_HW_TLS_RX;
10507 }
10508
10509 if ((features & NETIF_F_GSO_UDP_L4) && !netdev_has_ip_or_hw_csum(features)) {
10510 netdev_dbg(dev, "Dropping USO feature since no CSUM feature.\n");
10511 features &= ~NETIF_F_GSO_UDP_L4;
10512 }
10513
10514 return features;
10515}
10516
10517int __netdev_update_features(struct net_device *dev)
10518{
10519 struct net_device *upper, *lower;
10520 netdev_features_t features;
10521 struct list_head *iter;
10522 int err = -1;
10523
10524 ASSERT_RTNL();
10525
10526 features = netdev_get_wanted_features(dev);
10527
10528 if (dev->netdev_ops->ndo_fix_features)
10529 features = dev->netdev_ops->ndo_fix_features(dev, features);
10530
10531 /* driver might be less strict about feature dependencies */
10532 features = netdev_fix_features(dev, features);
10533
10534 /* some features can't be enabled if they're off on an upper device */
10535 netdev_for_each_upper_dev_rcu(dev, upper, iter)
10536 features = netdev_sync_upper_features(dev, upper, features);
10537
10538 if (dev->features == features)
10539 goto sync_lower;
10540
10541 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
10542 &dev->features, &features);
10543
10544 if (dev->netdev_ops->ndo_set_features)
10545 err = dev->netdev_ops->ndo_set_features(dev, features);
10546 else
10547 err = 0;
10548
10549 if (unlikely(err < 0)) {
10550 netdev_err(dev,
10551 "set_features() failed (%d); wanted %pNF, left %pNF\n",
10552 err, &features, &dev->features);
10553 /* return non-0 since some features might have changed and
10554 * it's better to fire a spurious notification than miss it
10555 */
10556 return -1;
10557 }
10558
10559sync_lower:
10560 /* some features must be disabled on lower devices when disabled
10561 * on an upper device (think: bonding master or bridge)
10562 */
10563 netdev_for_each_lower_dev(dev, lower, iter)
10564 netdev_sync_lower_features(dev, lower, features);
10565
10566 if (!err) {
10567 netdev_features_t diff = features ^ dev->features;
10568
10569 if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
10570 /* udp_tunnel_{get,drop}_rx_info both need
10571 * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
10572 * device, or they won't do anything.
10573 * Thus we need to update dev->features
10574 * *before* calling udp_tunnel_get_rx_info,
10575 * but *after* calling udp_tunnel_drop_rx_info.
10576 */
10577 if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
10578 dev->features = features;
10579 udp_tunnel_get_rx_info(dev);
10580 } else {
10581 udp_tunnel_drop_rx_info(dev);
10582 }
10583 }
10584
10585 if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
10586 if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
10587 dev->features = features;
10588 err |= vlan_get_rx_ctag_filter_info(dev);
10589 } else {
10590 vlan_drop_rx_ctag_filter_info(dev);
10591 }
10592 }
10593
10594 if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
10595 if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
10596 dev->features = features;
10597 err |= vlan_get_rx_stag_filter_info(dev);
10598 } else {
10599 vlan_drop_rx_stag_filter_info(dev);
10600 }
10601 }
10602
10603 dev->features = features;
10604 }
10605
10606 return err < 0 ? 0 : 1;
10607}
10608
10609/**
10610 * netdev_update_features - recalculate device features
10611 * @dev: the device to check
10612 *
10613 * Recalculate dev->features set and send notifications if it
10614 * has changed. Should be called whenever driver- or hardware-dependent
10615 * conditions that influence the features might have changed.
10616 */
10617void netdev_update_features(struct net_device *dev)
10618{
10619 if (__netdev_update_features(dev))
10620 netdev_features_change(dev);
10621}
10622EXPORT_SYMBOL(netdev_update_features);
10623
10624/**
10625 * netdev_change_features - recalculate device features
10626 * @dev: the device to check
10627 *
10628 * Recalculate dev->features set and send notifications even
10629 * if they have not changed. Should be called instead of
10630 * netdev_update_features() if dev->vlan_features might also have
10631 * changed, so that the changes can be propagated to stacked
10632 * VLAN devices.
10633 */
10634void netdev_change_features(struct net_device *dev)
10635{
10636 __netdev_update_features(dev);
10637 netdev_features_change(dev);
10638}
10639EXPORT_SYMBOL(netdev_change_features);
10640
10641/**
10642 * netif_stacked_transfer_operstate - transfer operstate
10643 * @rootdev: the root or lower level device to transfer state from
10644 * @dev: the device to transfer operstate to
10645 *
10646 * Transfer operational state from root to device. This is normally
10647 * called when a stacking relationship exists between the root
10648 * device and the device (a leaf device).
10649 */
10650void netif_stacked_transfer_operstate(const struct net_device *rootdev,
10651 struct net_device *dev)
10652{
10653 if (rootdev->operstate == IF_OPER_DORMANT)
10654 netif_dormant_on(dev);
10655 else
10656 netif_dormant_off(dev);
10657
10658 if (rootdev->operstate == IF_OPER_TESTING)
10659 netif_testing_on(dev);
10660 else
10661 netif_testing_off(dev);
10662
10663 if (netif_carrier_ok(rootdev))
10664 netif_carrier_on(dev);
10665 else
10666 netif_carrier_off(dev);
10667}
10668EXPORT_SYMBOL(netif_stacked_transfer_operstate);
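
/* Example (illustrative sketch): stacking drivers such as VLAN or macvlan
 * call this from their netdevice notifier when the lower device changes
 * state; the relevant part of such a handler looks roughly like:
 *
 *	case NETDEV_CHANGE:
 *	case NETDEV_UP:
 *	case NETDEV_DOWN:
 *		netif_stacked_transfer_operstate(lower_dev, upper_dev);
 *		break;
 *
 * where lower_dev comes from netdev_notifier_info_to_dev() and upper_dev
 * is the stacked device the driver tracks on top of it.
 */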
10669
10670static int netif_alloc_rx_queues(struct net_device *dev)
10671{
10672 unsigned int i, count = dev->num_rx_queues;
10673 struct netdev_rx_queue *rx;
10674 size_t sz = count * sizeof(*rx);
10675 int err = 0;
10676
10677 BUG_ON(count < 1);
10678
10679 rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
10680 if (!rx)
10681 return -ENOMEM;
10682
10683 dev->_rx = rx;
10684
10685 for (i = 0; i < count; i++) {
10686 rx[i].dev = dev;
10687
10688 /* XDP RX-queue setup */
10689 err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0);
10690 if (err < 0)
10691 goto err_rxq_info;
10692 }
10693 return 0;
10694
10695err_rxq_info:
10696	/* Roll back successful registrations and free other resources */
10697 while (i--)
10698 xdp_rxq_info_unreg(&rx[i].xdp_rxq);
10699 kvfree(dev->_rx);
10700 dev->_rx = NULL;
10701 return err;
10702}
10703
10704static void netif_free_rx_queues(struct net_device *dev)
10705{
10706 unsigned int i, count = dev->num_rx_queues;
10707
10708	/* netif_alloc_rx_queues() allocation failed; resources were already unregistered */
10709 if (!dev->_rx)
10710 return;
10711
10712 for (i = 0; i < count; i++)
10713 xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
10714
10715 kvfree(dev->_rx);
10716}
10717
10718static void netdev_init_one_queue(struct net_device *dev,
10719 struct netdev_queue *queue, void *_unused)
10720{
10721 /* Initialize queue lock */
10722 spin_lock_init(&queue->_xmit_lock);
10723 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
10724 queue->xmit_lock_owner = -1;
10725 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
10726 queue->dev = dev;
10727#ifdef CONFIG_BQL
10728 dql_init(&queue->dql, HZ);
10729#endif
10730}
10731
10732static void netif_free_tx_queues(struct net_device *dev)
10733{
10734 kvfree(dev->_tx);
10735}
10736
10737static int netif_alloc_netdev_queues(struct net_device *dev)
10738{
10739 unsigned int count = dev->num_tx_queues;
10740 struct netdev_queue *tx;
10741 size_t sz = count * sizeof(*tx);
10742
10743 if (count < 1 || count > 0xffff)
10744 return -EINVAL;
10745
10746 tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
10747 if (!tx)
10748 return -ENOMEM;
10749
10750 dev->_tx = tx;
10751
10752 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
10753 spin_lock_init(&dev->tx_global_lock);
10754
10755 return 0;
10756}
10757
10758void netif_tx_stop_all_queues(struct net_device *dev)
10759{
10760 unsigned int i;
10761
10762 for (i = 0; i < dev->num_tx_queues; i++) {
10763 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
10764
10765 netif_tx_stop_queue(txq);
10766 }
10767}
10768EXPORT_SYMBOL(netif_tx_stop_all_queues);
10769
10770static int netdev_do_alloc_pcpu_stats(struct net_device *dev)
10771{
10772 void __percpu *v;
10773
10774	/* Drivers implementing ndo_get_peer_dev must support tstats
10775 * accounting, so that skb_do_redirect() can bump the dev's
10776 * RX stats upon network namespace switch.
10777 */
10778 if (dev->netdev_ops->ndo_get_peer_dev &&
10779 dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS)
10780 return -EOPNOTSUPP;
10781
10782 switch (dev->pcpu_stat_type) {
10783 case NETDEV_PCPU_STAT_NONE:
10784 return 0;
10785 case NETDEV_PCPU_STAT_LSTATS:
10786 v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
10787 break;
10788 case NETDEV_PCPU_STAT_TSTATS:
10789 v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
10790 break;
10791 case NETDEV_PCPU_STAT_DSTATS:
10792 v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats);
10793 break;
10794 default:
10795 return -EINVAL;
10796 }
10797
10798 return v ? 0 : -ENOMEM;
10799}
10800
10801static void netdev_do_free_pcpu_stats(struct net_device *dev)
10802{
10803 switch (dev->pcpu_stat_type) {
10804 case NETDEV_PCPU_STAT_NONE:
10805 return;
10806 case NETDEV_PCPU_STAT_LSTATS:
10807 free_percpu(dev->lstats);
10808 break;
10809 case NETDEV_PCPU_STAT_TSTATS:
10810 free_percpu(dev->tstats);
10811 break;
10812 case NETDEV_PCPU_STAT_DSTATS:
10813 free_percpu(dev->dstats);
10814 break;
10815 }
10816}
10817
10818static void netdev_free_phy_link_topology(struct net_device *dev)
10819{
10820 struct phy_link_topology *topo = dev->link_topo;
10821
10822 if (IS_ENABLED(CONFIG_PHYLIB) && topo) {
10823 xa_destroy(&topo->phys);
10824 kfree(topo);
10825 dev->link_topo = NULL;
10826 }
10827}
10828
10829/**
10830 * register_netdevice() - register a network device
10831 * @dev: device to register
10832 *
10833 * Take a prepared network device structure and make it externally accessible.
10834 * A %NETDEV_REGISTER message is sent to the netdev notifier chain.
10835 * Callers must hold the rtnl lock - you may want register_netdev()
10836 * instead of this.
10837 */
10838int register_netdevice(struct net_device *dev)
10839{
10840 int ret;
10841 struct net *net = dev_net(dev);
10842
10843 BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
10844 NETDEV_FEATURE_COUNT);
10845 BUG_ON(dev_boot_phase);
10846 ASSERT_RTNL();
10847
10848 might_sleep();
10849
10850	/* When net_devices are persistent, this will be fatal. */
10851 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
10852 BUG_ON(!net);
10853
10854 ret = ethtool_check_ops(dev->ethtool_ops);
10855 if (ret)
10856 return ret;
10857
10858 /* rss ctx ID 0 is reserved for the default context, start from 1 */
10859 xa_init_flags(&dev->ethtool->rss_ctx, XA_FLAGS_ALLOC1);
10860 mutex_init(&dev->ethtool->rss_lock);
10861
10862 spin_lock_init(&dev->addr_list_lock);
10863 netdev_set_addr_lockdep_class(dev);
10864
10865 ret = dev_get_valid_name(net, dev, dev->name);
10866 if (ret < 0)
10867 goto out;
10868
10869 ret = -ENOMEM;
10870 dev->name_node = netdev_name_node_head_alloc(dev);
10871 if (!dev->name_node)
10872 goto out;
10873
10874 /* Init, if this function is available */
10875 if (dev->netdev_ops->ndo_init) {
10876 ret = dev->netdev_ops->ndo_init(dev);
10877 if (ret) {
10878 if (ret > 0)
10879 ret = -EIO;
10880 goto err_free_name;
10881 }
10882 }
10883
10884 if (((dev->hw_features | dev->features) &
10885 NETIF_F_HW_VLAN_CTAG_FILTER) &&
10886 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
10887 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
10888 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
10889 ret = -EINVAL;
10890 goto err_uninit;
10891 }
10892
10893 ret = netdev_do_alloc_pcpu_stats(dev);
10894 if (ret)
10895 goto err_uninit;
10896
10897 ret = dev_index_reserve(net, dev->ifindex);
10898 if (ret < 0)
10899 goto err_free_pcpu;
10900 dev->ifindex = ret;
10901
10902 /* Transfer changeable features to wanted_features and enable
10903 * software offloads (GSO and GRO).
10904 */
10905 dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
10906 dev->features |= NETIF_F_SOFT_FEATURES;
10907
10908 if (dev->udp_tunnel_nic_info) {
10909 dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
10910 dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
10911 }
10912
10913 dev->wanted_features = dev->features & dev->hw_features;
10914
10915 if (!(dev->flags & IFF_LOOPBACK))
10916 dev->hw_features |= NETIF_F_NOCACHE_COPY;
10917
10918 /* If IPv4 TCP segmentation offload is supported we should also
10919 * allow the device to enable segmenting the frame with the option
10920 * of ignoring a static IP ID value. This doesn't enable the
10921 * feature itself but allows the user to enable it later.
10922 */
10923 if (dev->hw_features & NETIF_F_TSO)
10924 dev->hw_features |= NETIF_F_TSO_MANGLEID;
10925 if (dev->vlan_features & NETIF_F_TSO)
10926 dev->vlan_features |= NETIF_F_TSO_MANGLEID;
10927 if (dev->mpls_features & NETIF_F_TSO)
10928 dev->mpls_features |= NETIF_F_TSO_MANGLEID;
10929 if (dev->hw_enc_features & NETIF_F_TSO)
10930 dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
10931
10932 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
10933 */
10934 dev->vlan_features |= NETIF_F_HIGHDMA;
10935
10936 /* Make NETIF_F_SG inheritable to tunnel devices.
10937 */
10938 dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
10939
10940 /* Make NETIF_F_SG inheritable to MPLS.
10941 */
10942 dev->mpls_features |= NETIF_F_SG;
10943
10944 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
10945 ret = notifier_to_errno(ret);
10946 if (ret)
10947 goto err_ifindex_release;
10948
10949 ret = netdev_register_kobject(dev);
10950
10951 netdev_lock(dev);
10952 WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED);
10953 netdev_unlock(dev);
10954
10955 if (ret)
10956 goto err_uninit_notify;
10957
10958 __netdev_update_features(dev);
10959
10960 /*
10961	 * Default initial state at registration is that the
10962 * device is present.
10963 */
10964
10965 set_bit(__LINK_STATE_PRESENT, &dev->state);
10966
10967 linkwatch_init_dev(dev);
10968
10969 dev_init_scheduler(dev);
10970
10971 netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL);
10972 list_netdevice(dev);
10973
10974 add_device_randomness(dev->dev_addr, dev->addr_len);
10975
10976	/* If the device has a permanent device address, the driver should
10977 * set dev_addr and also addr_assign_type should be set to
10978 * NET_ADDR_PERM (default value).
10979 */
10980 if (dev->addr_assign_type == NET_ADDR_PERM)
10981 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
10982
10983 /* Notify protocols, that a new device appeared. */
10984 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
10985 ret = notifier_to_errno(ret);
10986 if (ret) {
10987 /* Expect explicit free_netdev() on failure */
10988 dev->needs_free_netdev = false;
10989 unregister_netdevice_queue(dev, NULL);
10990 goto out;
10991 }
10992 /*
10993 * Prevent userspace races by waiting until the network
10994 * device is fully setup before sending notifications.
10995 */
10996 if (!dev->rtnl_link_ops ||
10997 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
10998 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
10999
11000out:
11001 return ret;
11002
11003err_uninit_notify:
11004 call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
11005err_ifindex_release:
11006 dev_index_release(net, dev->ifindex);
11007err_free_pcpu:
11008 netdev_do_free_pcpu_stats(dev);
11009err_uninit:
11010 if (dev->netdev_ops->ndo_uninit)
11011 dev->netdev_ops->ndo_uninit(dev);
11012 if (dev->priv_destructor)
11013 dev->priv_destructor(dev);
11014err_free_name:
11015 netdev_name_node_free(dev->name_node);
11016 goto out;
11017}
11018EXPORT_SYMBOL(register_netdevice);
11019
11020/* Initialize the core of a dummy net device.
11021 * This performs the setup steps dummy netdevs need, which normal netdevs
11022 * get by going through register_netdevice().
11023 */
11024static void init_dummy_netdev(struct net_device *dev)
11025{
11026 /* make sure we BUG if trying to hit standard
11027 * register/unregister code path
11028 */
11029 dev->reg_state = NETREG_DUMMY;
11030
11031 /* a dummy interface is started by default */
11032 set_bit(__LINK_STATE_PRESENT, &dev->state);
11033 set_bit(__LINK_STATE_START, &dev->state);
11034
11035	/* Note: We don't allocate pcpu_refcnt for dummy devices,
11036	 * because users of this 'device' don't need to change
11037 * its refcount.
11038 */
11039}
11040
11041/**
11042 * register_netdev - register a network device
11043 * @dev: device to register
11044 *
11045 * Take a completed network device structure and add it to the kernel
11046 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
11047 * chain. 0 is returned on success. A negative errno code is returned
11048 * on a failure to set up the device, or if the name is a duplicate.
11049 *
11050 * This is a wrapper around register_netdevice that takes the rtnl semaphore
11051 * and expands the device name if you passed a format string to
11052 * alloc_netdev.
11053 */
11054int register_netdev(struct net_device *dev)
11055{
11056 struct net *net = dev_net(dev);
11057 int err;
11058
11059 if (rtnl_net_lock_killable(net))
11060 return -EINTR;
11061
11062 err = register_netdevice(dev);
11063
11064 rtnl_net_unlock(net);
11065
11066 return err;
11067}
11068EXPORT_SYMBOL(register_netdev);
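
/* Example (illustrative sketch): a minimal driver probe path around
 * register_netdev(); struct foo_priv and foo_netdev_ops are hypothetical.
 *
 *	dev = alloc_etherdev(sizeof(struct foo_priv));
 *	if (!dev)
 *		return -ENOMEM;
 *	SET_NETDEV_DEV(dev, parent);
 *	dev->netdev_ops = &foo_netdev_ops;
 *	eth_hw_addr_random(dev);
 *
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 *
 * On removal the driver calls unregister_netdev() and then free_netdev().
 */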
11069
11070int netdev_refcnt_read(const struct net_device *dev)
11071{
11072#ifdef CONFIG_PCPU_DEV_REFCNT
11073 int i, refcnt = 0;
11074
11075 for_each_possible_cpu(i)
11076 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
11077 return refcnt;
11078#else
11079 return refcount_read(&dev->dev_refcnt);
11080#endif
11081}
11082EXPORT_SYMBOL(netdev_refcnt_read);
11083
11084int netdev_unregister_timeout_secs __read_mostly = 10;
11085
11086#define WAIT_REFS_MIN_MSECS 1
11087#define WAIT_REFS_MAX_MSECS 250
11088/**
11089 * netdev_wait_allrefs_any - wait until all references are gone.
11090 * @list: list of net_devices to wait on
11091 *
11092 * This is called when unregistering network devices.
11093 *
11094 * Any protocol or device that holds a reference should register
11095 * for netdevice notification, and cleanup and put back the
11096 * reference if they receive an UNREGISTER event.
11097 * We can get stuck here if buggy protocols don't correctly
11098 * call dev_put.
11099 */
11100static struct net_device *netdev_wait_allrefs_any(struct list_head *list)
11101{
11102 unsigned long rebroadcast_time, warning_time;
11103 struct net_device *dev;
11104 int wait = 0;
11105
11106 rebroadcast_time = warning_time = jiffies;
11107
11108 list_for_each_entry(dev, list, todo_list)
11109 if (netdev_refcnt_read(dev) == 1)
11110 return dev;
11111
11112 while (true) {
11113 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
11114 rtnl_lock();
11115
11116 /* Rebroadcast unregister notification */
11117 list_for_each_entry(dev, list, todo_list)
11118 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
11119
11120 __rtnl_unlock();
11121 rcu_barrier();
11122 rtnl_lock();
11123
11124 list_for_each_entry(dev, list, todo_list)
11125 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
11126 &dev->state)) {
11127 /* We must not have linkwatch events
11128 * pending on unregister. If this
11129 * happens, we simply run the queue
11130 * unscheduled, resulting in a noop
11131 * for this device.
11132 */
11133 linkwatch_run_queue();
11134 break;
11135 }
11136
11137 __rtnl_unlock();
11138
11139 rebroadcast_time = jiffies;
11140 }
11141
11142 rcu_barrier();
11143
11144 if (!wait) {
11145 wait = WAIT_REFS_MIN_MSECS;
11146 } else {
11147 msleep(wait);
11148 wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
11149 }
11150
11151 list_for_each_entry(dev, list, todo_list)
11152 if (netdev_refcnt_read(dev) == 1)
11153 return dev;
11154
11155 if (time_after(jiffies, warning_time +
11156 READ_ONCE(netdev_unregister_timeout_secs) * HZ)) {
11157 list_for_each_entry(dev, list, todo_list) {
11158 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
11159 dev->name, netdev_refcnt_read(dev));
11160 ref_tracker_dir_print(&dev->refcnt_tracker, 10);
11161 }
11162
11163 warning_time = jiffies;
11164 }
11165 }
11166}
11167
11168/* The sequence is:
11169 *
11170 * rtnl_lock();
11171 * ...
11172 * register_netdevice(x1);
11173 * register_netdevice(x2);
11174 * ...
11175 * unregister_netdevice(y1);
11176 * unregister_netdevice(y2);
11177 * ...
11178 * rtnl_unlock();
11179 * free_netdev(y1);
11180 * free_netdev(y2);
11181 *
11182 * We are invoked by rtnl_unlock().
11183 * This allows us to deal with problems:
11184 * 1) We can delete sysfs objects which invoke hotplug
11185 * without deadlocking with linkwatch via keventd.
11186 * 2) Since we run with the RTNL semaphore not held, we can sleep
11187 * safely in order to wait for the netdev refcnt to drop to zero.
11188 *
11189 * We must not return until all unregister events added during
11190 * the interval the lock was held have been completed.
11191 */
11192void netdev_run_todo(void)
11193{
11194 struct net_device *dev, *tmp;
11195 struct list_head list;
11196 int cnt;
11197#ifdef CONFIG_LOCKDEP
11198 struct list_head unlink_list;
11199
11200 list_replace_init(&net_unlink_list, &unlink_list);
11201
11202 while (!list_empty(&unlink_list)) {
11203 struct net_device *dev = list_first_entry(&unlink_list,
11204 struct net_device,
11205 unlink_list);
11206 list_del_init(&dev->unlink_list);
11207 dev->nested_level = dev->lower_level - 1;
11208 }
11209#endif
11210
11211 /* Snapshot list, allow later requests */
11212 list_replace_init(&net_todo_list, &list);
11213
11214 __rtnl_unlock();
11215
11216 /* Wait for rcu callbacks to finish before next phase */
11217 if (!list_empty(&list))
11218 rcu_barrier();
11219
11220 list_for_each_entry_safe(dev, tmp, &list, todo_list) {
11221 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
11222 netdev_WARN(dev, "run_todo but not unregistering\n");
11223 list_del(&dev->todo_list);
11224 continue;
11225 }
11226
11227 netdev_lock(dev);
11228 WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED);
11229 netdev_unlock(dev);
11230 linkwatch_sync_dev(dev);
11231 }
11232
11233 cnt = 0;
11234 while (!list_empty(&list)) {
11235 dev = netdev_wait_allrefs_any(&list);
11236 list_del(&dev->todo_list);
11237
11238 /* paranoia */
11239 BUG_ON(netdev_refcnt_read(dev) != 1);
11240 BUG_ON(!list_empty(&dev->ptype_all));
11241 BUG_ON(!list_empty(&dev->ptype_specific));
11242 WARN_ON(rcu_access_pointer(dev->ip_ptr));
11243 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
11244
11245 netdev_do_free_pcpu_stats(dev);
11246 if (dev->priv_destructor)
11247 dev->priv_destructor(dev);
11248 if (dev->needs_free_netdev)
11249 free_netdev(dev);
11250
11251 cnt++;
11252
11253 /* Free network device */
11254 kobject_put(&dev->dev.kobj);
11255 }
11256 if (cnt && atomic_sub_and_test(cnt, &dev_unreg_count))
11257 wake_up(&netdev_unregistering_wq);
11258}
11259
11260/* Collate per-cpu network dstats statistics
11261 *
11262 * Read per-cpu network statistics from dev->dstats and populate the related
11263 * fields in @s.
11264 */
11265static void dev_fetch_dstats(struct rtnl_link_stats64 *s,
11266 const struct pcpu_dstats __percpu *dstats)
11267{
11268 int cpu;
11269
11270 for_each_possible_cpu(cpu) {
11271 u64 rx_packets, rx_bytes, rx_drops;
11272 u64 tx_packets, tx_bytes, tx_drops;
11273 const struct pcpu_dstats *stats;
11274 unsigned int start;
11275
11276 stats = per_cpu_ptr(dstats, cpu);
11277 do {
11278 start = u64_stats_fetch_begin(&stats->syncp);
11279 rx_packets = u64_stats_read(&stats->rx_packets);
11280 rx_bytes = u64_stats_read(&stats->rx_bytes);
11281 rx_drops = u64_stats_read(&stats->rx_drops);
11282 tx_packets = u64_stats_read(&stats->tx_packets);
11283 tx_bytes = u64_stats_read(&stats->tx_bytes);
11284 tx_drops = u64_stats_read(&stats->tx_drops);
11285 } while (u64_stats_fetch_retry(&stats->syncp, start));
11286
11287 s->rx_packets += rx_packets;
11288 s->rx_bytes += rx_bytes;
11289 s->rx_dropped += rx_drops;
11290 s->tx_packets += tx_packets;
11291 s->tx_bytes += tx_bytes;
11292 s->tx_dropped += tx_drops;
11293 }
11294}
11295
11296/* ndo_get_stats64 implementation for dstats-based accounting.
11297 *
11298 * Populate @s from dev->stats and dev->dstats. This is used internally by the
11299 * core for NETDEV_PCPU_STAT_DSTATS-type stats collection.
11300 */
11301static void dev_get_dstats64(const struct net_device *dev,
11302 struct rtnl_link_stats64 *s)
11303{
11304 netdev_stats_to_stats64(s, &dev->stats);
11305 dev_fetch_dstats(s, dev->dstats);
11306}
11307
11308/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
11309 * all the same fields in the same order as net_device_stats, with only
11310 * the type differing, but rtnl_link_stats64 may have additional fields
11311 * at the end for newer counters.
11312 */
11313void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
11314 const struct net_device_stats *netdev_stats)
11315{
11316 size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t);
11317 const atomic_long_t *src = (atomic_long_t *)netdev_stats;
11318 u64 *dst = (u64 *)stats64;
11319
11320 BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
11321 for (i = 0; i < n; i++)
11322 dst[i] = (unsigned long)atomic_long_read(&src[i]);
11323 /* zero out counters that only exist in rtnl_link_stats64 */
11324 memset((char *)stats64 + n * sizeof(u64), 0,
11325 sizeof(*stats64) - n * sizeof(u64));
11326}
11327EXPORT_SYMBOL(netdev_stats_to_stats64);
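
/* Example (illustrative sketch): a driver that still accounts into the
 * legacy dev->stats block can reuse the helper from its own
 * ndo_get_stats64() and then fold in 64-bit private counters; foo_priv and
 * its hw_rx_missed field are hypothetical.
 *
 *	static void foo_get_stats64(struct net_device *dev,
 *				    struct rtnl_link_stats64 *s)
 *	{
 *		struct foo_priv *priv = netdev_priv(dev);
 *
 *		netdev_stats_to_stats64(s, &dev->stats);
 *		s->rx_missed_errors += priv->hw_rx_missed;
 *	}
 */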
11328
11329static __cold struct net_device_core_stats __percpu *netdev_core_stats_alloc(
11330 struct net_device *dev)
11331{
11332 struct net_device_core_stats __percpu *p;
11333
11334 p = alloc_percpu_gfp(struct net_device_core_stats,
11335 GFP_ATOMIC | __GFP_NOWARN);
11336
11337 if (p && cmpxchg(&dev->core_stats, NULL, p))
11338 free_percpu(p);
11339
11340 /* This READ_ONCE() pairs with the cmpxchg() above */
11341 return READ_ONCE(dev->core_stats);
11342}
11343
11344noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset)
11345{
11346 /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
11347 struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats);
11348 unsigned long __percpu *field;
11349
11350 if (unlikely(!p)) {
11351 p = netdev_core_stats_alloc(dev);
11352 if (!p)
11353 return;
11354 }
11355
11356 field = (unsigned long __percpu *)((void __percpu *)p + offset);
11357 this_cpu_inc(*field);
11358}
11359EXPORT_SYMBOL_GPL(netdev_core_stats_inc);
11360
11361/**
11362 * dev_get_stats - get network device statistics
11363 * @dev: device to get statistics from
11364 * @storage: place to store stats
11365 *
11366 * Get network statistics from device. Return @storage.
11367 * The device driver may provide its own method by setting
11368 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
11369 * otherwise the internal statistics structure is used.
11370 */
11371struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
11372 struct rtnl_link_stats64 *storage)
11373{
11374 const struct net_device_ops *ops = dev->netdev_ops;
11375 const struct net_device_core_stats __percpu *p;
11376
11377 /*
11378 * IPv{4,6} and udp tunnels share common stat helpers and use
11379	 * different stat types (NETDEV_PCPU_STAT_TSTATS vs
11380 * NETDEV_PCPU_STAT_DSTATS). Ensure the accounting is consistent.
11381 */
11382 BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_bytes) !=
11383 offsetof(struct pcpu_dstats, rx_bytes));
11384 BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_packets) !=
11385 offsetof(struct pcpu_dstats, rx_packets));
11386 BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_bytes) !=
11387 offsetof(struct pcpu_dstats, tx_bytes));
11388 BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_packets) !=
11389 offsetof(struct pcpu_dstats, tx_packets));
11390
11391 if (ops->ndo_get_stats64) {
11392 memset(storage, 0, sizeof(*storage));
11393 ops->ndo_get_stats64(dev, storage);
11394 } else if (ops->ndo_get_stats) {
11395 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
11396 } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) {
11397 dev_get_tstats64(dev, storage);
11398 } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_DSTATS) {
11399 dev_get_dstats64(dev, storage);
11400 } else {
11401 netdev_stats_to_stats64(storage, &dev->stats);
11402 }
11403
11404 /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
11405 p = READ_ONCE(dev->core_stats);
11406 if (p) {
11407 const struct net_device_core_stats *core_stats;
11408 int i;
11409
11410 for_each_possible_cpu(i) {
11411 core_stats = per_cpu_ptr(p, i);
11412 storage->rx_dropped += READ_ONCE(core_stats->rx_dropped);
11413 storage->tx_dropped += READ_ONCE(core_stats->tx_dropped);
11414 storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler);
11415 storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped);
11416 }
11417 }
11418 return storage;
11419}
11420EXPORT_SYMBOL(dev_get_stats);
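
/* Example (illustrative sketch): in-kernel readers use an on-stack
 * rtnl_link_stats64 and let dev_get_stats() pick the right backend:
 *
 *	struct rtnl_link_stats64 stats;
 *
 *	dev_get_stats(dev, &stats);
 *	pr_debug("%s: rx %llu tx %llu bytes\n",
 *		 dev->name, stats.rx_bytes, stats.tx_bytes);
 *
 * The caller is expected to keep @dev alive (reference, RCU or RTNL) for
 * the duration of the call.
 */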
11421
11422/**
11423 * dev_fetch_sw_netstats - get per-cpu network device statistics
11424 * @s: place to store stats
11425 * @netstats: per-cpu network stats to read from
11426 *
11427 * Read per-cpu network statistics and populate the related fields in @s.
11428 */
11429void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
11430 const struct pcpu_sw_netstats __percpu *netstats)
11431{
11432 int cpu;
11433
11434 for_each_possible_cpu(cpu) {
11435 u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
11436 const struct pcpu_sw_netstats *stats;
11437 unsigned int start;
11438
11439 stats = per_cpu_ptr(netstats, cpu);
11440 do {
11441 start = u64_stats_fetch_begin(&stats->syncp);
11442 rx_packets = u64_stats_read(&stats->rx_packets);
11443 rx_bytes = u64_stats_read(&stats->rx_bytes);
11444 tx_packets = u64_stats_read(&stats->tx_packets);
11445 tx_bytes = u64_stats_read(&stats->tx_bytes);
11446 } while (u64_stats_fetch_retry(&stats->syncp, start));
11447
11448 s->rx_packets += rx_packets;
11449 s->rx_bytes += rx_bytes;
11450 s->tx_packets += tx_packets;
11451 s->tx_bytes += tx_bytes;
11452 }
11453}
11454EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);
11455
11456/**
11457 * dev_get_tstats64 - ndo_get_stats64 implementation
11458 * @dev: device to get statistics from
11459 * @s: place to store stats
11460 *
11461 * Populate @s from dev->stats and dev->tstats. Can be used as
11462 * ndo_get_stats64() callback.
11463 */
11464void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s)
11465{
11466 netdev_stats_to_stats64(s, &dev->stats);
11467 dev_fetch_sw_netstats(s, dev->tstats);
11468}
11469EXPORT_SYMBOL_GPL(dev_get_tstats64);
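
/* Example (illustrative sketch): a virtual driver using per-cpu tstats can
 * let the core do the bookkeeping by selecting the stat type and pointing
 * ndo_get_stats64 at the helper above; the foo_* names are hypothetical.
 *
 *	static const struct net_device_ops foo_netdev_ops = {
 *		.ndo_start_xmit	 = foo_xmit,
 *		.ndo_get_stats64 = dev_get_tstats64,
 *	};
 *
 *	static void foo_setup(struct net_device *dev)
 *	{
 *		dev->netdev_ops = &foo_netdev_ops;
 *		dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
 *	}
 *
 * with the datapath bumping the counters through dev_sw_netstats_tx_add()
 * and dev_sw_netstats_rx_add().
 */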
11470
11471struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
11472{
11473 struct netdev_queue *queue = dev_ingress_queue(dev);
11474
11475#ifdef CONFIG_NET_CLS_ACT
11476 if (queue)
11477 return queue;
11478 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
11479 if (!queue)
11480 return NULL;
11481 netdev_init_one_queue(dev, queue, NULL);
11482 RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
11483 RCU_INIT_POINTER(queue->qdisc_sleeping, &noop_qdisc);
11484 rcu_assign_pointer(dev->ingress_queue, queue);
11485#endif
11486 return queue;
11487}
11488
11489static const struct ethtool_ops default_ethtool_ops;
11490
11491void netdev_set_default_ethtool_ops(struct net_device *dev,
11492 const struct ethtool_ops *ops)
11493{
11494 if (dev->ethtool_ops == &default_ethtool_ops)
11495 dev->ethtool_ops = ops;
11496}
11497EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
11498
11499/**
11500 * netdev_sw_irq_coalesce_default_on() - enable SW IRQ coalescing by default
11501 * @dev: netdev to enable the IRQ coalescing on
11502 *
11503 * Sets a conservative default for SW IRQ coalescing. Users can use
11504 * sysfs attributes to override the default values.
11505 */
11506void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
11507{
11508 WARN_ON(dev->reg_state == NETREG_REGISTERED);
11509
11510 if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
11511 netdev_set_gro_flush_timeout(dev, 20000);
11512 netdev_set_defer_hard_irqs(dev, 1);
11513 }
11514}
11515EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on);
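
/* Example (illustrative sketch): drivers opt in before registration, since
 * the helper warns if the device is already registered.
 *
 *	dev = alloc_etherdev(sizeof(struct foo_priv));
 *	if (!dev)
 *		return -ENOMEM;
 *	netdev_sw_irq_coalesce_default_on(dev);
 *	err = register_netdev(dev);
 *
 * Users may later override gro_flush_timeout and napi_defer_hard_irqs
 * through the per-device sysfs attributes.
 */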
11516
11517/**
11518 * alloc_netdev_mqs - allocate network device
11519 * @sizeof_priv: size of private data to allocate space for
11520 * @name: device name format string
11521 * @name_assign_type: origin of device name
11522 * @setup: callback to initialize device
11523 * @txqs: the number of TX subqueues to allocate
11524 * @rxqs: the number of RX subqueues to allocate
11525 *
11526 * Allocates a struct net_device with private data area for driver use
11527 * and performs basic initialization. Also allocates subqueue structs
11528 * for each queue on the device.
11529 */
11530struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
11531 unsigned char name_assign_type,
11532 void (*setup)(struct net_device *),
11533 unsigned int txqs, unsigned int rxqs)
11534{
11535 struct net_device *dev;
11536 size_t napi_config_sz;
11537 unsigned int maxqs;
11538
11539 BUG_ON(strlen(name) >= sizeof(dev->name));
11540
11541 if (txqs < 1) {
11542 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
11543 return NULL;
11544 }
11545
11546 if (rxqs < 1) {
11547 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
11548 return NULL;
11549 }
11550
11551 maxqs = max(txqs, rxqs);
11552
11553 dev = kvzalloc(struct_size(dev, priv, sizeof_priv),
11554 GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
11555 if (!dev)
11556 return NULL;
11557
11558 dev->priv_len = sizeof_priv;
11559
11560 ref_tracker_dir_init(&dev->refcnt_tracker, 128, name);
11561#ifdef CONFIG_PCPU_DEV_REFCNT
11562 dev->pcpu_refcnt = alloc_percpu(int);
11563 if (!dev->pcpu_refcnt)
11564 goto free_dev;
11565 __dev_hold(dev);
11566#else
11567 refcount_set(&dev->dev_refcnt, 1);
11568#endif
11569
11570 if (dev_addr_init(dev))
11571 goto free_pcpu;
11572
11573 dev_mc_init(dev);
11574 dev_uc_init(dev);
11575
11576 dev_net_set(dev, &init_net);
11577
11578 dev->gso_max_size = GSO_LEGACY_MAX_SIZE;
11579 dev->xdp_zc_max_segs = 1;
11580 dev->gso_max_segs = GSO_MAX_SEGS;
11581 dev->gro_max_size = GRO_LEGACY_MAX_SIZE;
11582 dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE;
11583 dev->gro_ipv4_max_size = GRO_LEGACY_MAX_SIZE;
11584 dev->tso_max_size = TSO_LEGACY_MAX_SIZE;
11585 dev->tso_max_segs = TSO_MAX_SEGS;
11586 dev->upper_level = 1;
11587 dev->lower_level = 1;
11588#ifdef CONFIG_LOCKDEP
11589 dev->nested_level = 0;
11590 INIT_LIST_HEAD(&dev->unlink_list);
11591#endif
11592
11593 INIT_LIST_HEAD(&dev->napi_list);
11594 INIT_LIST_HEAD(&dev->unreg_list);
11595 INIT_LIST_HEAD(&dev->close_list);
11596 INIT_LIST_HEAD(&dev->link_watch_list);
11597 INIT_LIST_HEAD(&dev->adj_list.upper);
11598 INIT_LIST_HEAD(&dev->adj_list.lower);
11599 INIT_LIST_HEAD(&dev->ptype_all);
11600 INIT_LIST_HEAD(&dev->ptype_specific);
11601 INIT_LIST_HEAD(&dev->net_notifier_list);
11602#ifdef CONFIG_NET_SCHED
11603 hash_init(dev->qdisc_hash);
11604#endif
11605
11606 mutex_init(&dev->lock);
11607
11608 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
11609 setup(dev);
11610
11611 if (!dev->tx_queue_len) {
11612 dev->priv_flags |= IFF_NO_QUEUE;
11613 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
11614 }
11615
11616 dev->num_tx_queues = txqs;
11617 dev->real_num_tx_queues = txqs;
11618 if (netif_alloc_netdev_queues(dev))
11619 goto free_all;
11620
11621 dev->num_rx_queues = rxqs;
11622 dev->real_num_rx_queues = rxqs;
11623 if (netif_alloc_rx_queues(dev))
11624 goto free_all;
11625 dev->ethtool = kzalloc(sizeof(*dev->ethtool), GFP_KERNEL_ACCOUNT);
11626 if (!dev->ethtool)
11627 goto free_all;
11628
11629 dev->cfg = kzalloc(sizeof(*dev->cfg), GFP_KERNEL_ACCOUNT);
11630 if (!dev->cfg)
11631 goto free_all;
11632 dev->cfg_pending = dev->cfg;
11633
11634 napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config));
11635 dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT);
11636 if (!dev->napi_config)
11637 goto free_all;
11638
11639 strscpy(dev->name, name);
11640 dev->name_assign_type = name_assign_type;
11641 dev->group = INIT_NETDEV_GROUP;
11642 if (!dev->ethtool_ops)
11643 dev->ethtool_ops = &default_ethtool_ops;
11644
11645 nf_hook_netdev_init(dev);
11646
11647 return dev;
11648
11649free_all:
11650 free_netdev(dev);
11651 return NULL;
11652
11653free_pcpu:
11654#ifdef CONFIG_PCPU_DEV_REFCNT
11655 free_percpu(dev->pcpu_refcnt);
11656free_dev:
11657#endif
11658 kvfree(dev);
11659 return NULL;
11660}
11661EXPORT_SYMBOL(alloc_netdev_mqs);
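
/* Example (illustrative sketch): allocating a multi-queue device with four
 * TX and four RX queues; "foo%d" lets the core number the interface and
 * foo_setup() is the driver's hypothetical setup callback.
 *
 *	dev = alloc_netdev_mqs(sizeof(struct foo_priv), "foo%d",
 *			       NET_NAME_UNKNOWN, foo_setup, 4, 4);
 *	if (!dev)
 *		return -ENOMEM;
 *
 * Single-queue callers normally go through the alloc_netdev() or
 * alloc_etherdev() wrappers instead.
 */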
11662
11663static void netdev_napi_exit(struct net_device *dev)
11664{
11665 if (!list_empty(&dev->napi_list)) {
11666 struct napi_struct *p, *n;
11667
11668 netdev_lock(dev);
11669 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
11670 __netif_napi_del_locked(p);
11671 netdev_unlock(dev);
11672
11673 synchronize_net();
11674 }
11675
11676 kvfree(dev->napi_config);
11677}
11678
11679/**
11680 * free_netdev - free network device
11681 * @dev: device
11682 *
11683 * This function does the last stage of destroying an allocated device
11684 * interface. The reference to the device object is released. If this
11685 * is the last reference then it will be freed. Must be called in process
11686 * context.
11687 */
11688void free_netdev(struct net_device *dev)
11689{
11690 might_sleep();
11691
11692	/* When called immediately after register_netdevice() failed, the unwind
11693 * handling may still be dismantling the device. Handle that case by
11694 * deferring the free.
11695 */
11696 if (dev->reg_state == NETREG_UNREGISTERING) {
11697 ASSERT_RTNL();
11698 dev->needs_free_netdev = true;
11699 return;
11700 }
11701
11702 WARN_ON(dev->cfg != dev->cfg_pending);
11703 kfree(dev->cfg);
11704 kfree(dev->ethtool);
11705 netif_free_tx_queues(dev);
11706 netif_free_rx_queues(dev);
11707
11708 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
11709
11710 /* Flush device addresses */
11711 dev_addr_flush(dev);
11712
11713 netdev_napi_exit(dev);
11714
11715 ref_tracker_dir_exit(&dev->refcnt_tracker);
11716#ifdef CONFIG_PCPU_DEV_REFCNT
11717 free_percpu(dev->pcpu_refcnt);
11718 dev->pcpu_refcnt = NULL;
11719#endif
11720 free_percpu(dev->core_stats);
11721 dev->core_stats = NULL;
11722 free_percpu(dev->xdp_bulkq);
11723 dev->xdp_bulkq = NULL;
11724
11725 netdev_free_phy_link_topology(dev);
11726
11727 mutex_destroy(&dev->lock);
11728
11729 /* Compatibility with error handling in drivers */
11730 if (dev->reg_state == NETREG_UNINITIALIZED ||
11731 dev->reg_state == NETREG_DUMMY) {
11732 kvfree(dev);
11733 return;
11734 }
11735
11736 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
11737 WRITE_ONCE(dev->reg_state, NETREG_RELEASED);
11738
11739	/* will be freed via device release */
11740 put_device(&dev->dev);
11741}
11742EXPORT_SYMBOL(free_netdev);
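
/*
 * Illustrative sketch (not part of this file): free_netdev() is the
 * counterpart of the allocation above, both on the probe error path and
 * after unregistration. "struct my_priv" is a hypothetical private struct.
 *
 *	netdev = alloc_etherdev(sizeof(struct my_priv));
 *	if (!netdev)
 *		return -ENOMEM;
 *	err = register_netdev(netdev);
 *	if (err) {
 *		free_netdev(netdev);	(never registered, freed immediately)
 *		return err;
 *	}
 */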
11743
11744/**
11745 * alloc_netdev_dummy - Allocate and initialize a dummy net device.
11746 * @sizeof_priv: size of private data to allocate space for
11747 *
11748 * Return: the allocated net_device on success, NULL otherwise
11749 */
11750struct net_device *alloc_netdev_dummy(int sizeof_priv)
11751{
11752 return alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN,
11753 init_dummy_netdev);
11754}
11755EXPORT_SYMBOL_GPL(alloc_netdev_dummy);
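
/*
 * Illustrative sketch (not part of this file): a driver that wants NAPI
 * without exposing a real interface can host the napi_struct on a dummy
 * netdev. "priv" and my_poll() are hypothetical.
 *
 *	priv->napi_dev = alloc_netdev_dummy(0);
 *	if (!priv->napi_dev)
 *		return -ENOMEM;
 *	netif_napi_add(priv->napi_dev, &priv->napi, my_poll);
 *	(teardown: free_netdev(priv->napi_dev))
 */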
11756
11757/**
11758 * synchronize_net - Synchronize with packet receive processing
11759 *
11760 * Wait for packets currently being received to be done.
11761 * Does not block later packets from starting.
11762 */
11763void synchronize_net(void)
11764{
11765 might_sleep();
11766 if (from_cleanup_net() || rtnl_is_locked())
11767 synchronize_rcu_expedited();
11768 else
11769 synchronize_rcu();
11770}
11771EXPORT_SYMBOL(synchronize_net);
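
/*
 * Illustrative sketch (not part of this file) of the publish/retire pattern
 * synchronize_net() is meant for; "my_ptr" and "old" stand for hypothetical
 * RCU-protected state reachable from the packet path.
 *
 *	old = rtnl_dereference(my_ptr);
 *	RCU_INIT_POINTER(my_ptr, NULL);
 *	synchronize_net();	(no receiver can still be using "old")
 *	kfree(old);
 */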
11772
11773static void netdev_rss_contexts_free(struct net_device *dev)
11774{
11775 struct ethtool_rxfh_context *ctx;
11776 unsigned long context;
11777
11778 mutex_lock(&dev->ethtool->rss_lock);
11779 xa_for_each(&dev->ethtool->rss_ctx, context, ctx) {
11780 struct ethtool_rxfh_param rxfh;
11781
11782 rxfh.indir = ethtool_rxfh_context_indir(ctx);
11783 rxfh.key = ethtool_rxfh_context_key(ctx);
11784 rxfh.hfunc = ctx->hfunc;
11785 rxfh.input_xfrm = ctx->input_xfrm;
11786 rxfh.rss_context = context;
11787 rxfh.rss_delete = true;
11788
11789 xa_erase(&dev->ethtool->rss_ctx, context);
11790 if (dev->ethtool_ops->create_rxfh_context)
11791 dev->ethtool_ops->remove_rxfh_context(dev, ctx,
11792 context, NULL);
11793 else
11794 dev->ethtool_ops->set_rxfh(dev, &rxfh, NULL);
11795 kfree(ctx);
11796 }
11797 xa_destroy(&dev->ethtool->rss_ctx);
11798 mutex_unlock(&dev->ethtool->rss_lock);
11799}
11800
11801/**
11802 * unregister_netdevice_queue - remove device from the kernel
11803 * @dev: device
11804 * @head: list
11805 *
11806 * This function shuts down a device interface and removes it
11807 * from the kernel tables.
11808 * If @head is not NULL, the device is queued to be unregistered later.
11809 *
11810 * Callers must hold the rtnl semaphore. You may want
11811 * unregister_netdev() instead of this.
11812 */
11813
11814void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
11815{
11816 ASSERT_RTNL();
11817
11818 if (head) {
11819 list_move_tail(&dev->unreg_list, head);
11820 } else {
11821 LIST_HEAD(single);
11822
11823 list_add(&dev->unreg_list, &single);
11824 unregister_netdevice_many(&single);
11825 }
11826}
11827EXPORT_SYMBOL(unregister_netdevice_queue);
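
/*
 * Illustrative sketch (not part of this file): batching several removals
 * under one RTNL section; should_remove() is a hypothetical predicate.
 *
 *	LIST_HEAD(kill_list);
 *
 *	for_each_netdev(net, dev)
 *		if (should_remove(dev))
 *			unregister_netdevice_queue(dev, &kill_list);
 *	unregister_netdevice_many(&kill_list);
 */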
11828
11829void unregister_netdevice_many_notify(struct list_head *head,
11830 u32 portid, const struct nlmsghdr *nlh)
11831{
11832 struct net_device *dev, *tmp;
11833 LIST_HEAD(close_head);
11834 int cnt = 0;
11835
11836 BUG_ON(dev_boot_phase);
11837 ASSERT_RTNL();
11838
11839 if (list_empty(head))
11840 return;
11841
11842 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
11843		/* Some devices call us without ever having been
11844		 * registered, as part of their initialization unwind.
11845		 * Remove those devices and proceed with the remaining.
11846		 */
11847 if (dev->reg_state == NETREG_UNINITIALIZED) {
11848 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
11849 dev->name, dev);
11850
11851 WARN_ON(1);
11852 list_del(&dev->unreg_list);
11853 continue;
11854 }
11855 dev->dismantle = true;
11856 BUG_ON(dev->reg_state != NETREG_REGISTERED);
11857 }
11858
11859 /* If device is running, close it first. */
11860 list_for_each_entry(dev, head, unreg_list)
11861 list_add_tail(&dev->close_list, &close_head);
11862 dev_close_many(&close_head, true);
11863
11864 list_for_each_entry(dev, head, unreg_list) {
11865 /* And unlink it from device chain. */
11866 unlist_netdevice(dev);
11867 netdev_lock(dev);
11868 WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING);
11869 netdev_unlock(dev);
11870 }
11871 flush_all_backlogs();
11872
11873 synchronize_net();
11874
11875 list_for_each_entry(dev, head, unreg_list) {
11876 struct sk_buff *skb = NULL;
11877
11878 /* Shutdown queueing discipline. */
11879		/* Shut down the queueing discipline. */
11880 dev_tcx_uninstall(dev);
11881 dev_xdp_uninstall(dev);
11882 bpf_dev_bound_netdev_unregister(dev);
11883 dev_dmabuf_uninstall(dev);
11884
11885 netdev_offload_xstats_disable_all(dev);
11886
11887		/* Notify protocols that we are about to destroy
11888		 * this device. They should clean up all their state.
11889 */
11890 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
11891
11892 if (!dev->rtnl_link_ops ||
11893 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
11894 skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
11895 GFP_KERNEL, NULL, 0,
11896 portid, nlh);
11897
11898 /*
11899 * Flush the unicast and multicast chains
11900 */
11901 dev_uc_flush(dev);
11902 dev_mc_flush(dev);
11903
11904 netdev_name_node_alt_flush(dev);
11905 netdev_name_node_free(dev->name_node);
11906
11907 netdev_rss_contexts_free(dev);
11908
11909 call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
11910
11911 if (dev->netdev_ops->ndo_uninit)
11912 dev->netdev_ops->ndo_uninit(dev);
11913
11914 mutex_destroy(&dev->ethtool->rss_lock);
11915
11916 net_shaper_flush_netdev(dev);
11917
11918 if (skb)
11919 rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh);
11920
11921		/* Notifier chain MUST have detached all our upper/lower devices. */
11922 WARN_ON(netdev_has_any_upper_dev(dev));
11923 WARN_ON(netdev_has_any_lower_dev(dev));
11924
11925 /* Remove entries from kobject tree */
11926 netdev_unregister_kobject(dev);
11927#ifdef CONFIG_XPS
11928 /* Remove XPS queueing entries */
11929 netif_reset_xps_queues_gt(dev, 0);
11930#endif
11931 }
11932
11933 synchronize_net();
11934
11935 list_for_each_entry(dev, head, unreg_list) {
11936 netdev_put(dev, &dev->dev_registered_tracker);
11937 net_set_todo(dev);
11938 cnt++;
11939 }
11940 atomic_add(cnt, &dev_unreg_count);
11941
11942 list_del(head);
11943}
11944
11945/**
11946 * unregister_netdevice_many - unregister many devices
11947 * @head: list of devices
11948 *
11949 * Note: As most callers use a stack-allocated list_head,
11950 * we force a list_del() to make sure the stack won't be corrupted later.
11951 */
11952void unregister_netdevice_many(struct list_head *head)
11953{
11954 unregister_netdevice_many_notify(head, 0, NULL);
11955}
11956EXPORT_SYMBOL(unregister_netdevice_many);
11957
11958/**
11959 * unregister_netdev - remove device from the kernel
11960 * @dev: device
11961 *
11962 * This function shuts down a device interface and removes it
11963 * from the kernel tables.
11964 *
11965 * This is just a wrapper for unregister_netdevice that takes
11966 * the rtnl semaphore. In general you want to use this and not
11967 * unregister_netdevice.
11968 */
11969void unregister_netdev(struct net_device *dev)
11970{
11971 rtnl_net_dev_lock(dev);
11972 unregister_netdevice(dev);
11973 rtnl_net_dev_unlock(dev);
11974}
11975EXPORT_SYMBOL(unregister_netdev);
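
/*
 * Illustrative sketch (not part of this file): a typical driver remove
 * callback; my_remove() and my_get_drvdata() stand in for the bus-specific
 * hooks.
 *
 *	static void my_remove(struct my_bus_device *bdev)
 *	{
 *		struct net_device *netdev = my_get_drvdata(bdev);
 *
 *		unregister_netdev(netdev);	(takes the rtnl lock itself)
 *		free_netdev(netdev);
 *	}
 */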
11976
11977/**
11978 * __dev_change_net_namespace - move device to a different network namespace
11979 * @dev: device
11980 * @net: network namespace
11981 * @pat: If not NULL name pattern to try if the current device name
11982 * is already taken in the destination network namespace.
11983 * @new_ifindex: If not zero, specifies device index in the target
11984 * namespace.
11985 *
11986 * This function shuts down a device interface and moves it
11987 * to a new network namespace. On success 0 is returned, on
11988 * a failure a negative errno code is returned.
11989 *
11990 * Callers must hold the rtnl semaphore.
11991 */
11992
11993int __dev_change_net_namespace(struct net_device *dev, struct net *net,
11994 const char *pat, int new_ifindex)
11995{
11996 struct netdev_name_node *name_node;
11997 struct net *net_old = dev_net(dev);
11998 char new_name[IFNAMSIZ] = {};
11999 int err, new_nsid;
12000
12001 ASSERT_RTNL();
12002
12003 /* Don't allow namespace local devices to be moved. */
12004 err = -EINVAL;
12005 if (dev->netns_local)
12006 goto out;
12007
12008 /* Ensure the device has been registered */
12009 if (dev->reg_state != NETREG_REGISTERED)
12010 goto out;
12011
12012	/* Get out if there is nothing to do */
12013 err = 0;
12014 if (net_eq(net_old, net))
12015 goto out;
12016
12017 /* Pick the destination device name, and ensure
12018 * we can use it in the destination network namespace.
12019 */
12020 err = -EEXIST;
12021 if (netdev_name_in_use(net, dev->name)) {
12022 /* We get here if we can't use the current device name */
12023 if (!pat)
12024 goto out;
12025 err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST);
12026 if (err < 0)
12027 goto out;
12028 }
12029 /* Check that none of the altnames conflicts. */
12030 err = -EEXIST;
12031 netdev_for_each_altname(dev, name_node)
12032 if (netdev_name_in_use(net, name_node->name))
12033 goto out;
12034
12035 /* Check that new_ifindex isn't used yet. */
12036 if (new_ifindex) {
12037 err = dev_index_reserve(net, new_ifindex);
12038 if (err < 0)
12039 goto out;
12040 } else {
12041		/* If there is an ifindex conflict, assign a new one */
12042 err = dev_index_reserve(net, dev->ifindex);
12043 if (err == -EBUSY)
12044 err = dev_index_reserve(net, 0);
12045 if (err < 0)
12046 goto out;
12047 new_ifindex = err;
12048 }
12049
12050 /*
12051	 * And now a mini version of register_netdevice()/unregister_netdevice().
12052 */
12053
12054	/* If device is running, close it first. */
12055 dev_close(dev);
12056
12057 /* And unlink it from device chain */
12058 unlist_netdevice(dev);
12059
12060 synchronize_net();
12061
12062 /* Shutdown queueing discipline. */
12063	/* Shut down the queueing discipline. */
12064
12065	/* Notify protocols that we are about to destroy
12066	 * this device. They should clean up all their state.
12067 *
12068 * Note that dev->reg_state stays at NETREG_REGISTERED.
12069	 * This is intentional: this way 8021q and macvlan know
12070 * the device is just moving and can keep their slaves up.
12071 */
12072 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
12073 rcu_barrier();
12074
12075 new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
12076
12077 rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
12078 new_ifindex);
12079
12080 /*
12081 * Flush the unicast and multicast chains
12082 */
12083 dev_uc_flush(dev);
12084 dev_mc_flush(dev);
12085
12086 /* Send a netdev-removed uevent to the old namespace */
12087 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
12088 netdev_adjacent_del_links(dev);
12089
12090 /* Move per-net netdevice notifiers that are following the netdevice */
12091 move_netdevice_notifiers_dev_net(dev, net);
12092
12093 /* Actually switch the network namespace */
12094 dev_net_set(dev, net);
12095 dev->ifindex = new_ifindex;
12096
12097 if (new_name[0]) {
12098 /* Rename the netdev to prepared name */
12099 write_seqlock_bh(&netdev_rename_lock);
12100 strscpy(dev->name, new_name, IFNAMSIZ);
12101 write_sequnlock_bh(&netdev_rename_lock);
12102 }
12103
12104 /* Fixup kobjects */
12105 dev_set_uevent_suppress(&dev->dev, 1);
12106 err = device_rename(&dev->dev, dev->name);
12107 dev_set_uevent_suppress(&dev->dev, 0);
12108 WARN_ON(err);
12109
12110 /* Send a netdev-add uevent to the new namespace */
12111 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
12112 netdev_adjacent_add_links(dev);
12113
12114 /* Adapt owner in case owning user namespace of target network
12115 * namespace is different from the original one.
12116 */
12117 err = netdev_change_owner(dev, net_old, net);
12118 WARN_ON(err);
12119
12120 /* Add the device back in the hashes */
12121 list_netdevice(dev);
12122
12123 /* Notify protocols, that a new device appeared. */
12124 call_netdevice_notifiers(NETDEV_REGISTER, dev);
12125
12126 /*
12127 * Prevent userspace races by waiting until the network
12128	 * device is fully set up before sending notifications.
12129 */
12130 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
12131
12132 synchronize_net();
12133 err = 0;
12134out:
12135 return err;
12136}
12137EXPORT_SYMBOL_GPL(__dev_change_net_namespace);
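
/*
 * Illustrative sketch (not part of this file): in-kernel users normally go
 * through dev_change_net_namespace() with the rtnl lock held; "target_net"
 * is a hypothetical destination namespace.
 *
 *	rtnl_lock();
 *	err = dev_change_net_namespace(dev, target_net, "eth%d");
 *	rtnl_unlock();
 */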
12138
12139static int dev_cpu_dead(unsigned int oldcpu)
12140{
12141 struct sk_buff **list_skb;
12142 struct sk_buff *skb;
12143 unsigned int cpu;
12144 struct softnet_data *sd, *oldsd, *remsd = NULL;
12145
12146 local_irq_disable();
12147 cpu = smp_processor_id();
12148 sd = &per_cpu(softnet_data, cpu);
12149 oldsd = &per_cpu(softnet_data, oldcpu);
12150
12151 /* Find end of our completion_queue. */
12152 list_skb = &sd->completion_queue;
12153 while (*list_skb)
12154 list_skb = &(*list_skb)->next;
12155 /* Append completion queue from offline CPU. */
12156 *list_skb = oldsd->completion_queue;
12157 oldsd->completion_queue = NULL;
12158
12159 /* Append output queue from offline CPU. */
12160 if (oldsd->output_queue) {
12161 *sd->output_queue_tailp = oldsd->output_queue;
12162 sd->output_queue_tailp = oldsd->output_queue_tailp;
12163 oldsd->output_queue = NULL;
12164 oldsd->output_queue_tailp = &oldsd->output_queue;
12165 }
12166	/* Append NAPI poll list from offline CPU, with one exception:
12167	 * process_backlog() must be called by the CPU owning the per-CPU
12168	 * backlog. We properly handle process_queue & input_pkt_queue later.
12169 */
12170 while (!list_empty(&oldsd->poll_list)) {
12171 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
12172 struct napi_struct,
12173 poll_list);
12174
12175 list_del_init(&napi->poll_list);
12176 if (napi->poll == process_backlog)
12177 napi->state &= NAPIF_STATE_THREADED;
12178 else
12179 ____napi_schedule(sd, napi);
12180 }
12181
12182 raise_softirq_irqoff(NET_TX_SOFTIRQ);
12183 local_irq_enable();
12184
12185 if (!use_backlog_threads()) {
12186#ifdef CONFIG_RPS
12187 remsd = oldsd->rps_ipi_list;
12188 oldsd->rps_ipi_list = NULL;
12189#endif
12190		/* send out pending IPIs on the offline CPU */
12191 net_rps_send_ipi(remsd);
12192 }
12193
12194 /* Process offline CPU's input_pkt_queue */
12195 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
12196 netif_rx(skb);
12197 rps_input_queue_head_incr(oldsd);
12198 }
12199 while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
12200 netif_rx(skb);
12201 rps_input_queue_head_incr(oldsd);
12202 }
12203
12204 return 0;
12205}
12206
12207/**
12208 * netdev_increment_features - increment feature set by one
12209 * @all: current feature set
12210 * @one: new feature set
12211 * @mask: mask feature set
12212 *
12213 * Computes a new feature set after adding a device with feature set
12214 * @one to the master device with current feature set @all. Will not
12215 * enable anything that is off in @mask. Returns the new feature set.
12216 */
12217netdev_features_t netdev_increment_features(netdev_features_t all,
12218 netdev_features_t one, netdev_features_t mask)
12219{
12220 if (mask & NETIF_F_HW_CSUM)
12221 mask |= NETIF_F_CSUM_MASK;
12222 mask |= NETIF_F_VLAN_CHALLENGED;
12223
12224 all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
12225 all &= one | ~NETIF_F_ALL_FOR_ALL;
12226
12227 /* If one device supports hw checksumming, set for all. */
12228 if (all & NETIF_F_HW_CSUM)
12229 all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
12230
12231 return all;
12232}
12233EXPORT_SYMBOL(netdev_increment_features);
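
/*
 * Illustrative sketch (not part of this file): an upper device (bond,
 * bridge, team, ...) typically folds each lower device into its feature
 * set with this helper; MY_FEATURE_MASK is a hypothetical mask.
 *
 *	netdev_features_t features = MY_FEATURE_MASK;
 *	struct net_device *lower;
 *	struct list_head *iter;
 *
 *	netdev_for_each_lower_dev(master, lower, iter)
 *		features = netdev_increment_features(features,
 *						     lower->features,
 *						     MY_FEATURE_MASK);
 */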
12234
12235static struct hlist_head * __net_init netdev_create_hash(void)
12236{
12237 int i;
12238 struct hlist_head *hash;
12239
12240 hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
12241 if (hash != NULL)
12242 for (i = 0; i < NETDEV_HASHENTRIES; i++)
12243 INIT_HLIST_HEAD(&hash[i]);
12244
12245 return hash;
12246}
12247
12248/* Initialize per network namespace state */
12249static int __net_init netdev_init(struct net *net)
12250{
12251 BUILD_BUG_ON(GRO_HASH_BUCKETS >
12252 8 * sizeof_field(struct napi_struct, gro_bitmask));
12253
12254 INIT_LIST_HEAD(&net->dev_base_head);
12255
12256 net->dev_name_head = netdev_create_hash();
12257 if (net->dev_name_head == NULL)
12258 goto err_name;
12259
12260 net->dev_index_head = netdev_create_hash();
12261 if (net->dev_index_head == NULL)
12262 goto err_idx;
12263
12264 xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC1);
12265
12266 RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
12267
12268 return 0;
12269
12270err_idx:
12271 kfree(net->dev_name_head);
12272err_name:
12273 return -ENOMEM;
12274}
12275
12276/**
12277 * netdev_drivername - network driver for the device
12278 * @dev: network device
12279 *
12280 * Determine network driver for device.
12281 */
12282const char *netdev_drivername(const struct net_device *dev)
12283{
12284 const struct device_driver *driver;
12285 const struct device *parent;
12286 const char *empty = "";
12287
12288 parent = dev->dev.parent;
12289 if (!parent)
12290 return empty;
12291
12292 driver = parent->driver;
12293 if (driver && driver->name)
12294 return driver->name;
12295 return empty;
12296}
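
/*
 * Illustrative usage (not part of this file); the message text is
 * hypothetical.
 *
 *	pr_warn("%s: TX timeout on %s\n", netdev_drivername(dev), dev->name);
 */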
12297
12298static void __netdev_printk(const char *level, const struct net_device *dev,
12299 struct va_format *vaf)
12300{
12301 if (dev && dev->dev.parent) {
12302 dev_printk_emit(level[1] - '0',
12303 dev->dev.parent,
12304 "%s %s %s%s: %pV",
12305 dev_driver_string(dev->dev.parent),
12306 dev_name(dev->dev.parent),
12307 netdev_name(dev), netdev_reg_state(dev),
12308 vaf);
12309 } else if (dev) {
12310 printk("%s%s%s: %pV",
12311 level, netdev_name(dev), netdev_reg_state(dev), vaf);
12312 } else {
12313 printk("%s(NULL net_device): %pV", level, vaf);
12314 }
12315}
12316
12317void netdev_printk(const char *level, const struct net_device *dev,
12318 const char *format, ...)
12319{
12320 struct va_format vaf;
12321 va_list args;
12322
12323 va_start(args, format);
12324
12325 vaf.fmt = format;
12326 vaf.va = &args;
12327
12328 __netdev_printk(level, dev, &vaf);
12329
12330 va_end(args);
12331}
12332EXPORT_SYMBOL(netdev_printk);
12333
12334#define define_netdev_printk_level(func, level) \
12335void func(const struct net_device *dev, const char *fmt, ...) \
12336{ \
12337 struct va_format vaf; \
12338 va_list args; \
12339 \
12340 va_start(args, fmt); \
12341 \
12342 vaf.fmt = fmt; \
12343 vaf.va = &args; \
12344 \
12345 __netdev_printk(level, dev, &vaf); \
12346 \
12347 va_end(args); \
12348} \
12349EXPORT_SYMBOL(func);
12350
12351define_netdev_printk_level(netdev_emerg, KERN_EMERG);
12352define_netdev_printk_level(netdev_alert, KERN_ALERT);
12353define_netdev_printk_level(netdev_crit, KERN_CRIT);
12354define_netdev_printk_level(netdev_err, KERN_ERR);
12355define_netdev_printk_level(netdev_warn, KERN_WARNING);
12356define_netdev_printk_level(netdev_notice, KERN_NOTICE);
12357define_netdev_printk_level(netdev_info, KERN_INFO);
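
/*
 * Illustrative usage (not part of this file) of the helpers generated
 * above; the messages, "err" and "speed" are hypothetical.
 *
 *	netdev_err(dev, "failed to restart TX queues: %d\n", err);
 *	netdev_info(dev, "link up, %u Mbps\n", speed);
 */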
12358
12359static void __net_exit netdev_exit(struct net *net)
12360{
12361 kfree(net->dev_name_head);
12362 kfree(net->dev_index_head);
12363 xa_destroy(&net->dev_by_index);
12364 if (net != &init_net)
12365 WARN_ON_ONCE(!list_empty(&net->dev_base_head));
12366}
12367
12368static struct pernet_operations __net_initdata netdev_net_ops = {
12369 .init = netdev_init,
12370 .exit = netdev_exit,
12371};
12372
12373static void __net_exit default_device_exit_net(struct net *net)
12374{
12375 struct netdev_name_node *name_node, *tmp;
12376 struct net_device *dev, *aux;
12377 /*
12378 * Push all migratable network devices back to the
12379 * initial network namespace
12380 */
12381 ASSERT_RTNL();
12382 for_each_netdev_safe(net, dev, aux) {
12383 int err;
12384 char fb_name[IFNAMSIZ];
12385
12386		/* Ignore unmovable devices (e.g. loopback) */
12387 if (dev->netns_local)
12388 continue;
12389
12390 /* Leave virtual devices for the generic cleanup */
12391 if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
12392 continue;
12393
12394 /* Push remaining network devices to init_net */
12395 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
12396 if (netdev_name_in_use(&init_net, fb_name))
12397 snprintf(fb_name, IFNAMSIZ, "dev%%d");
12398
12399 netdev_for_each_altname_safe(dev, name_node, tmp)
12400 if (netdev_name_in_use(&init_net, name_node->name))
12401 __netdev_name_node_alt_destroy(name_node);
12402
12403 err = dev_change_net_namespace(dev, &init_net, fb_name);
12404 if (err) {
12405 pr_emerg("%s: failed to move %s to init_net: %d\n",
12406 __func__, dev->name, err);
12407 BUG();
12408 }
12409 }
12410}
12411
12412static void __net_exit default_device_exit_batch(struct list_head *net_list)
12413{
12414	/* At exit all network devices must be removed from a network
12415 * namespace. Do this in the reverse order of registration.
12416 * Do this across as many network namespaces as possible to
12417 * improve batching efficiency.
12418 */
12419 struct net_device *dev;
12420 struct net *net;
12421 LIST_HEAD(dev_kill_list);
12422
12423 rtnl_lock();
12424 list_for_each_entry(net, net_list, exit_list) {
12425 default_device_exit_net(net);
12426 cond_resched();
12427 }
12428
12429 list_for_each_entry(net, net_list, exit_list) {
12430 for_each_netdev_reverse(net, dev) {
12431 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
12432 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
12433 else
12434 unregister_netdevice_queue(dev, &dev_kill_list);
12435 }
12436 }
12437 unregister_netdevice_many(&dev_kill_list);
12438 rtnl_unlock();
12439}
12440
12441static struct pernet_operations __net_initdata default_device_ops = {
12442 .exit_batch = default_device_exit_batch,
12443};
12444
12445static void __init net_dev_struct_check(void)
12446{
12447 /* TX read-mostly hotpath */
12448 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags_fast);
12449 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops);
12450 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops);
12451 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx);
12452 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, real_num_tx_queues);
12453 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_size);
12454 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_ipv4_max_size);
12455 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_segs);
12456 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_partial_features);
12457 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, num_tc);
12458 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, mtu);
12459 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, needed_headroom);
12460 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tc_to_txq);
12461#ifdef CONFIG_XPS
12462 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, xps_maps);
12463#endif
12464#ifdef CONFIG_NETFILTER_EGRESS
12465 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, nf_hooks_egress);
12466#endif
12467#ifdef CONFIG_NET_XGRESS
12468 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tcx_egress);
12469#endif
12470 CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 160);
12471
12472 /* TXRX read-mostly hotpath */
12473 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, lstats);
12474 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, state);
12475 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags);
12476 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len);
12477 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features);
12478 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr);
12479 CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 46);
12480
12481 /* RX read-mostly hotpath */
12482 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific);
12483 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex);
12484 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues);
12485 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
12486 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size);
12487 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size);
12488 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler);
12489 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler_data);
12490 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, nd_net);
12491#ifdef CONFIG_NETPOLL
12492 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, npinfo);
12493#endif
12494#ifdef CONFIG_NET_XGRESS
12495 CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
12496#endif
12497 CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 92);
12498}
12499
12500/*
12501 * Initialize the DEV module. At boot time this walks the device list and
12502 * unhooks any devices that fail to initialise (normally hardware not
12503 * present) and leaves us with a valid list of present and active devices.
12504 *
12505 */
12506
12507/* We allocate 256 pages for each CPU if PAGE_SHIFT is 12 */
12508#define SYSTEM_PERCPU_PAGE_POOL_SIZE ((1 << 20) / PAGE_SIZE)
12509
12510static int net_page_pool_create(int cpuid)
12511{
12512#if IS_ENABLED(CONFIG_PAGE_POOL)
12513 struct page_pool_params page_pool_params = {
12514 .pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE,
12515 .flags = PP_FLAG_SYSTEM_POOL,
12516 .nid = cpu_to_mem(cpuid),
12517 };
12518 struct page_pool *pp_ptr;
12519 int err;
12520
12521 pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid);
12522 if (IS_ERR(pp_ptr))
12523 return -ENOMEM;
12524
12525 err = xdp_reg_page_pool(pp_ptr);
12526 if (err) {
12527 page_pool_destroy(pp_ptr);
12528 return err;
12529 }
12530
12531 per_cpu(system_page_pool, cpuid) = pp_ptr;
12532#endif
12533 return 0;
12534}
12535
12536static int backlog_napi_should_run(unsigned int cpu)
12537{
12538 struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
12539 struct napi_struct *napi = &sd->backlog;
12540
12541 return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
12542}
12543
12544static void run_backlog_napi(unsigned int cpu)
12545{
12546 struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
12547
12548 napi_threaded_poll_loop(&sd->backlog);
12549}
12550
12551static void backlog_napi_setup(unsigned int cpu)
12552{
12553 struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
12554 struct napi_struct *napi = &sd->backlog;
12555
12556 napi->thread = this_cpu_read(backlog_napi);
12557 set_bit(NAPI_STATE_THREADED, &napi->state);
12558}
12559
12560static struct smp_hotplug_thread backlog_threads = {
12561 .store = &backlog_napi,
12562 .thread_should_run = backlog_napi_should_run,
12563 .thread_fn = run_backlog_napi,
12564 .thread_comm = "backlog_napi/%u",
12565 .setup = backlog_napi_setup,
12566};
12567
12568/*
12569 * This is called single-threaded during boot, so no need
12570 * to take the rtnl semaphore.
12571 */
12572static int __init net_dev_init(void)
12573{
12574 int i, rc = -ENOMEM;
12575
12576 BUG_ON(!dev_boot_phase);
12577
12578 net_dev_struct_check();
12579
12580 if (dev_proc_init())
12581 goto out;
12582
12583 if (netdev_kobject_init())
12584 goto out;
12585
12586 for (i = 0; i < PTYPE_HASH_SIZE; i++)
12587 INIT_LIST_HEAD(&ptype_base[i]);
12588
12589 if (register_pernet_subsys(&netdev_net_ops))
12590 goto out;
12591
12592 /*
12593 * Initialise the packet receive queues.
12594 */
12595
12596 flush_backlogs_fallback = flush_backlogs_alloc();
12597 if (!flush_backlogs_fallback)
12598 goto out;
12599
12600 for_each_possible_cpu(i) {
12601 struct softnet_data *sd = &per_cpu(softnet_data, i);
12602
12603 skb_queue_head_init(&sd->input_pkt_queue);
12604 skb_queue_head_init(&sd->process_queue);
12605#ifdef CONFIG_XFRM_OFFLOAD
12606 skb_queue_head_init(&sd->xfrm_backlog);
12607#endif
12608 INIT_LIST_HEAD(&sd->poll_list);
12609 sd->output_queue_tailp = &sd->output_queue;
12610#ifdef CONFIG_RPS
12611 INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
12612 sd->cpu = i;
12613#endif
12614 INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
12615 spin_lock_init(&sd->defer_lock);
12616
12617 init_gro_hash(&sd->backlog);
12618 sd->backlog.poll = process_backlog;
12619 sd->backlog.weight = weight_p;
12620 INIT_LIST_HEAD(&sd->backlog.poll_list);
12621
12622 if (net_page_pool_create(i))
12623 goto out;
12624 }
12625 if (use_backlog_threads())
12626 smpboot_register_percpu_thread(&backlog_threads);
12627
12628 dev_boot_phase = 0;
12629
12630	/* The loopback device is special: if any other network device
12631	 * is present in a network namespace, the loopback device must
12632	 * be present too. Since we now dynamically allocate and free the
12633	 * loopback device, ensure this invariant is maintained by
12634	 * keeping the loopback device as the first device on the
12635	 * list of network devices, ensuring the loopback device
12636	 * is the first device that appears and the last network device
12637	 * that disappears.
12638 */
12639 if (register_pernet_device(&loopback_net_ops))
12640 goto out;
12641
12642 if (register_pernet_device(&default_device_ops))
12643 goto out;
12644
12645 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
12646 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
12647
12648 rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
12649 NULL, dev_cpu_dead);
12650 WARN_ON(rc < 0);
12651 rc = 0;
12652
12653 /* avoid static key IPIs to isolated CPUs */
12654 if (housekeeping_enabled(HK_TYPE_MISC))
12655 net_enable_timestamp();
12656out:
12657 if (rc < 0) {
12658 for_each_possible_cpu(i) {
12659 struct page_pool *pp_ptr;
12660
12661 pp_ptr = per_cpu(system_page_pool, i);
12662 if (!pp_ptr)
12663 continue;
12664
12665 xdp_unreg_page_pool(pp_ptr);
12666 page_pool_destroy(pp_ptr);
12667 per_cpu(system_page_pool, i) = NULL;
12668 }
12669 }
12670
12671 return rc;
12672}
12673
12674subsys_initcall(net_dev_init);