net/core/dev.c at v5.8-rc2 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / core / dev.c
at v5.8-rc2 272 kB view raw
    1// SPDX-License-Identifier: GPL-2.0-or-later
    2/*
    3 *      NET3    Protocol independent device support routines.
    4 *
    5 *	Derived from the non IP parts of dev.c 1.0.19
    6 *              Authors:	Ross Biro
    7 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
    8 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
    9 *
   10 *	Additional Authors:
   11 *		Florian la Roche <rzsfl@rz.uni-sb.de>
   12 *		Alan Cox <gw4pts@gw4pts.ampr.org>
   13 *		David Hinds <dahinds@users.sourceforge.net>
   14 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
   15 *		Adam Sulmicki <adam@cfar.umd.edu>
   16 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
   17 *
   18 *	Changes:
   19 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
   20 *                                      to 2 if register_netdev gets called
   21 *                                      before net_dev_init & also removed a
   22 *                                      few lines of code in the process.
   23 *		Alan Cox	:	device private ioctl copies fields back.
   24 *		Alan Cox	:	Transmit queue code does relevant
   25 *					stunts to keep the queue safe.
   26 *		Alan Cox	:	Fixed double lock.
   27 *		Alan Cox	:	Fixed promisc NULL pointer trap
   28 *		????????	:	Support the full private ioctl range
   29 *		Alan Cox	:	Moved ioctl permission check into
   30 *					drivers
   31 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
   32 *		Alan Cox	:	100 backlog just doesn't cut it when
   33 *					you start doing multicast video 8)
   34 *		Alan Cox	:	Rewrote net_bh and list manager.
   35 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
   36 *		Alan Cox	:	Took out transmit every packet pass
   37 *					Saved a few bytes in the ioctl handler
   38 *		Alan Cox	:	Network driver sets packet type before
   39 *					calling netif_rx. Saves a function
   40 *					call a packet.
   41 *		Alan Cox	:	Hashed net_bh()
   42 *		Richard Kooijman:	Timestamp fixes.
   43 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
   44 *		Alan Cox	:	Device lock protection.
   45 *              Alan Cox        :       Fixed nasty side effect of device close
   46 *					changes.
   47 *		Rudi Cilibrasi	:	Pass the right thing to
   48 *					set_mac_address()
   49 *		Dave Miller	:	32bit quantity for the device lock to
   50 *					make it work out on a Sparc.
   51 *		Bjorn Ekwall	:	Added KERNELD hack.
   52 *		Alan Cox	:	Cleaned up the backlog initialise.
   53 *		Craig Metz	:	SIOCGIFCONF fix if space for under
   54 *					1 device.
   55 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
   56 *					is no device open function.
   57 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
   58 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
   59 *		Cyrus Durgin	:	Cleaned for KMOD
   60 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
   61 *					A network device unload needs to purge
   62 *					the backlog queue.
   63 *	Paul Rusty Russell	:	SIOCSIFNAME
   64 *              Pekka Riikonen  :	Netdev boot-time settings code
   65 *              Andrew Morton   :       Make unregister_netdevice wait
   66 *                                      indefinitely on dev->refcnt
   67 *              J Hadi Salim    :       - Backlog queue sampling
   68 *				        - netif_rx() feedback
   69 */
   70
   71#include <linux/uaccess.h>
   72#include <linux/bitops.h>
   73#include <linux/capability.h>
   74#include <linux/cpu.h>
   75#include <linux/types.h>
   76#include <linux/kernel.h>
   77#include <linux/hash.h>
   78#include <linux/slab.h>
   79#include <linux/sched.h>
   80#include <linux/sched/mm.h>
   81#include <linux/mutex.h>
   82#include <linux/rwsem.h>
   83#include <linux/string.h>
   84#include <linux/mm.h>
   85#include <linux/socket.h>
   86#include <linux/sockios.h>
   87#include <linux/errno.h>
   88#include <linux/interrupt.h>
   89#include <linux/if_ether.h>
   90#include <linux/netdevice.h>
   91#include <linux/etherdevice.h>
   92#include <linux/ethtool.h>
   93#include <linux/skbuff.h>
   94#include <linux/bpf.h>
   95#include <linux/bpf_trace.h>
   96#include <net/net_namespace.h>
   97#include <net/sock.h>
   98#include <net/busy_poll.h>
   99#include <linux/rtnetlink.h>
  100#include <linux/stat.h>
  101#include <net/dst.h>
  102#include <net/dst_metadata.h>
  103#include <net/pkt_sched.h>
  104#include <net/pkt_cls.h>
  105#include <net/checksum.h>
  106#include <net/xfrm.h>
  107#include <linux/highmem.h>
  108#include <linux/init.h>
  109#include <linux/module.h>
  110#include <linux/netpoll.h>
  111#include <linux/rcupdate.h>
  112#include <linux/delay.h>
  113#include <net/iw_handler.h>
  114#include <asm/current.h>
  115#include <linux/audit.h>
  116#include <linux/dmaengine.h>
  117#include <linux/err.h>
  118#include <linux/ctype.h>
  119#include <linux/if_arp.h>
  120#include <linux/if_vlan.h>
  121#include <linux/ip.h>
  122#include <net/ip.h>
  123#include <net/mpls.h>
  124#include <linux/ipv6.h>
  125#include <linux/in.h>
  126#include <linux/jhash.h>
  127#include <linux/random.h>
  128#include <trace/events/napi.h>
  129#include <trace/events/net.h>
  130#include <trace/events/skb.h>
  131#include <linux/inetdevice.h>
  132#include <linux/cpu_rmap.h>
  133#include <linux/static_key.h>
  134#include <linux/hashtable.h>
  135#include <linux/vmalloc.h>
  136#include <linux/if_macvlan.h>
  137#include <linux/errqueue.h>
  138#include <linux/hrtimer.h>
  139#include <linux/netfilter_ingress.h>
  140#include <linux/crash_dump.h>
  141#include <linux/sctp.h>
  142#include <net/udp_tunnel.h>
  143#include <linux/net_namespace.h>
  144#include <linux/indirect_call_wrapper.h>
  145#include <net/devlink.h>
  146
  147#include "net-sysfs.h"
  148
  149#define MAX_GRO_SKBS 8
  150
  151/* This should be increased if a protocol with a bigger head is added. */
  152#define GRO_MAX_HEAD (MAX_HEADER + 128)
  153
  154static DEFINE_SPINLOCK(ptype_lock);
  155static DEFINE_SPINLOCK(offload_lock);
  156struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
  157struct list_head ptype_all __read_mostly;	/* Taps */
  158static struct list_head offload_base __read_mostly;
  159
  160static int netif_rx_internal(struct sk_buff *skb);
  161static int call_netdevice_notifiers_info(unsigned long val,
  162					 struct netdev_notifier_info *info);
  163static int call_netdevice_notifiers_extack(unsigned long val,
  164					   struct net_device *dev,
  165					   struct netlink_ext_ack *extack);
  166static struct napi_struct *napi_by_id(unsigned int napi_id);
  167
  168/*
  169 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
  170 * semaphore.
  171 *
  172 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
  173 *
  174 * Writers must hold the rtnl semaphore while they loop through the
  175 * dev_base_head list, and hold dev_base_lock for writing when they do the
  176 * actual updates.  This allows pure readers to access the list even
  177 * while a writer is preparing to update it.
  178 *
  179 * To put it another way, dev_base_lock is held for writing only to
  180 * protect against pure readers; the rtnl semaphore provides the
  181 * protection against other writers.
  182 *
  183 * See, for example usages, register_netdevice() and
  184 * unregister_netdevice(), which must be called with the rtnl
  185 * semaphore held.
  186 */
  187DEFINE_RWLOCK(dev_base_lock);
  188EXPORT_SYMBOL(dev_base_lock);
  189
  190static DEFINE_MUTEX(ifalias_mutex);
  191
  192/* protects napi_hash addition/deletion and napi_gen_id */
  193static DEFINE_SPINLOCK(napi_hash_lock);
  194
  195static unsigned int napi_gen_id = NR_CPUS;
  196static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
  197
  198static DECLARE_RWSEM(devnet_rename_sem);
  199
  200static inline void dev_base_seq_inc(struct net *net)
  201{
  202	while (++net->dev_base_seq == 0)
  203		;
  204}
  205
  206static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
  207{
  208	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
  209
  210	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
  211}
  212
  213static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
  214{
  215	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
  216}
  217
  218static inline void rps_lock(struct softnet_data *sd)
  219{
  220#ifdef CONFIG_RPS
  221	spin_lock(&sd->input_pkt_queue.lock);
  222#endif
  223}
  224
  225static inline void rps_unlock(struct softnet_data *sd)
  226{
  227#ifdef CONFIG_RPS
  228	spin_unlock(&sd->input_pkt_queue.lock);
  229#endif
  230}
  231
  232static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
  233						       const char *name)
  234{
  235	struct netdev_name_node *name_node;
  236
  237	name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
  238	if (!name_node)
  239		return NULL;
  240	INIT_HLIST_NODE(&name_node->hlist);
  241	name_node->dev = dev;
  242	name_node->name = name;
  243	return name_node;
  244}
  245
  246static struct netdev_name_node *
  247netdev_name_node_head_alloc(struct net_device *dev)
  248{
  249	struct netdev_name_node *name_node;
  250
  251	name_node = netdev_name_node_alloc(dev, dev->name);
  252	if (!name_node)
  253		return NULL;
  254	INIT_LIST_HEAD(&name_node->list);
  255	return name_node;
  256}
  257
  258static void netdev_name_node_free(struct netdev_name_node *name_node)
  259{
  260	kfree(name_node);
  261}
  262
  263static void netdev_name_node_add(struct net *net,
  264				 struct netdev_name_node *name_node)
  265{
  266	hlist_add_head_rcu(&name_node->hlist,
  267			   dev_name_hash(net, name_node->name));
  268}
  269
  270static void netdev_name_node_del(struct netdev_name_node *name_node)
  271{
  272	hlist_del_rcu(&name_node->hlist);
  273}
  274
  275static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
  276							const char *name)
  277{
  278	struct hlist_head *head = dev_name_hash(net, name);
  279	struct netdev_name_node *name_node;
  280
  281	hlist_for_each_entry(name_node, head, hlist)
  282		if (!strcmp(name_node->name, name))
  283			return name_node;
  284	return NULL;
  285}
  286
  287static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
  288							    const char *name)
  289{
  290	struct hlist_head *head = dev_name_hash(net, name);
  291	struct netdev_name_node *name_node;
  292
  293	hlist_for_each_entry_rcu(name_node, head, hlist)
  294		if (!strcmp(name_node->name, name))
  295			return name_node;
  296	return NULL;
  297}
  298
  299int netdev_name_node_alt_create(struct net_device *dev, const char *name)
  300{
  301	struct netdev_name_node *name_node;
  302	struct net *net = dev_net(dev);
  303
  304	name_node = netdev_name_node_lookup(net, name);
  305	if (name_node)
  306		return -EEXIST;
  307	name_node = netdev_name_node_alloc(dev, name);
  308	if (!name_node)
  309		return -ENOMEM;
  310	netdev_name_node_add(net, name_node);
  311	/* The node that holds dev->name acts as a head of per-device list. */
  312	list_add_tail(&name_node->list, &dev->name_node->list);
  313
  314	return 0;
  315}
  316EXPORT_SYMBOL(netdev_name_node_alt_create);
  317
  318static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
  319{
  320	list_del(&name_node->list);
  321	netdev_name_node_del(name_node);
  322	kfree(name_node->name);
  323	netdev_name_node_free(name_node);
  324}
  325
  326int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
  327{
  328	struct netdev_name_node *name_node;
  329	struct net *net = dev_net(dev);
  330
  331	name_node = netdev_name_node_lookup(net, name);
  332	if (!name_node)
  333		return -ENOENT;
  334	/* lookup might have found our primary name or a name belonging
  335	 * to another device.
  336	 */
  337	if (name_node == dev->name_node || name_node->dev != dev)
  338		return -EINVAL;
  339
  340	__netdev_name_node_alt_destroy(name_node);
  341
  342	return 0;
  343}
  344EXPORT_SYMBOL(netdev_name_node_alt_destroy);
  345
  346static void netdev_name_node_alt_flush(struct net_device *dev)
  347{
  348	struct netdev_name_node *name_node, *tmp;
  349
  350	list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
  351		__netdev_name_node_alt_destroy(name_node);
  352}
  353
  354/* Device list insertion */
  355static void list_netdevice(struct net_device *dev)
  356{
  357	struct net *net = dev_net(dev);
  358
  359	ASSERT_RTNL();
  360
  361	write_lock_bh(&dev_base_lock);
  362	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
  363	netdev_name_node_add(net, dev->name_node);
  364	hlist_add_head_rcu(&dev->index_hlist,
  365			   dev_index_hash(net, dev->ifindex));
  366	write_unlock_bh(&dev_base_lock);
  367
  368	dev_base_seq_inc(net);
  369}
  370
  371/* Device list removal
  372 * caller must respect a RCU grace period before freeing/reusing dev
  373 */
  374static void unlist_netdevice(struct net_device *dev)
  375{
  376	ASSERT_RTNL();
  377
  378	/* Unlink dev from the device chain */
  379	write_lock_bh(&dev_base_lock);
  380	list_del_rcu(&dev->dev_list);
  381	netdev_name_node_del(dev->name_node);
  382	hlist_del_rcu(&dev->index_hlist);
  383	write_unlock_bh(&dev_base_lock);
  384
  385	dev_base_seq_inc(dev_net(dev));
  386}
  387
  388/*
  389 *	Our notifier list
  390 */
  391
  392static RAW_NOTIFIER_HEAD(netdev_chain);
  393
  394/*
  395 *	Device drivers call our routines to queue packets here. We empty the
  396 *	queue in the local softnet handler.
  397 */
  398
  399DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
  400EXPORT_PER_CPU_SYMBOL(softnet_data);
  401
  402#ifdef CONFIG_LOCKDEP
  403/*
  404 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
  405 * according to dev->type
  406 */
  407static const unsigned short netdev_lock_type[] = {
  408	 ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
  409	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
  410	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
  411	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
  412	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
  413	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
  414	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
  415	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
  416	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
  417	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
  418	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
  419	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
  420	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
  421	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
  422	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
  423
  424static const char *const netdev_lock_name[] = {
  425	"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
  426	"_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
  427	"_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
  428	"_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
  429	"_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
  430	"_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
  431	"_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
  432	"_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
  433	"_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
  434	"_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
  435	"_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
  436	"_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
  437	"_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
  438	"_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
  439	"_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
  440
  441static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
  442static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
  443
  444static inline unsigned short netdev_lock_pos(unsigned short dev_type)
  445{
  446	int i;
  447
  448	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
  449		if (netdev_lock_type[i] == dev_type)
  450			return i;
  451	/* the last key is used by default */
  452	return ARRAY_SIZE(netdev_lock_type) - 1;
  453}
  454
  455static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
  456						 unsigned short dev_type)
  457{
  458	int i;
  459
  460	i = netdev_lock_pos(dev_type);
  461	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
  462				   netdev_lock_name[i]);
  463}
  464
  465static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
  466{
  467	int i;
  468
  469	i = netdev_lock_pos(dev->type);
  470	lockdep_set_class_and_name(&dev->addr_list_lock,
  471				   &netdev_addr_lock_key[i],
  472				   netdev_lock_name[i]);
  473}
  474#else
  475static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
  476						 unsigned short dev_type)
  477{
  478}
  479
  480static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
  481{
  482}
  483#endif
  484
  485/*******************************************************************************
  486 *
  487 *		Protocol management and registration routines
  488 *
  489 *******************************************************************************/
  490
  491
  492/*
  493 *	Add a protocol ID to the list. Now that the input handler is
  494 *	smarter we can dispense with all the messy stuff that used to be
  495 *	here.
  496 *
  497 *	BEWARE!!! Protocol handlers, mangling input packets,
  498 *	MUST BE last in hash buckets and checking protocol handlers
  499 *	MUST start from promiscuous ptype_all chain in net_bh.
  500 *	It is true now, do not change it.
  501 *	Explanation follows: if protocol handler, mangling packet, will
  502 *	be the first on list, it is not able to sense, that packet
  503 *	is cloned and should be copied-on-write, so that it will
  504 *	change it and subsequent readers will get broken packet.
  505 *							--ANK (980803)
  506 */
  507
  508static inline struct list_head *ptype_head(const struct packet_type *pt)
  509{
  510	if (pt->type == htons(ETH_P_ALL))
  511		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
  512	else
  513		return pt->dev ? &pt->dev->ptype_specific :
  514				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
  515}
  516
  517/**
  518 *	dev_add_pack - add packet handler
  519 *	@pt: packet type declaration
  520 *
  521 *	Add a protocol handler to the networking stack. The passed &packet_type
  522 *	is linked into kernel lists and may not be freed until it has been
  523 *	removed from the kernel lists.
  524 *
  525 *	This call does not sleep therefore it can not
  526 *	guarantee all CPU's that are in middle of receiving packets
  527 *	will see the new packet type (until the next received packet).
  528 */
  529
  530void dev_add_pack(struct packet_type *pt)
  531{
  532	struct list_head *head = ptype_head(pt);
  533
  534	spin_lock(&ptype_lock);
  535	list_add_rcu(&pt->list, head);
  536	spin_unlock(&ptype_lock);
  537}
  538EXPORT_SYMBOL(dev_add_pack);
  539
  540/**
  541 *	__dev_remove_pack	 - remove packet handler
  542 *	@pt: packet type declaration
  543 *
  544 *	Remove a protocol handler that was previously added to the kernel
  545 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
  546 *	from the kernel lists and can be freed or reused once this function
  547 *	returns.
  548 *
  549 *      The packet type might still be in use by receivers
  550 *	and must not be freed until after all the CPU's have gone
  551 *	through a quiescent state.
  552 */
  553void __dev_remove_pack(struct packet_type *pt)
  554{
  555	struct list_head *head = ptype_head(pt);
  556	struct packet_type *pt1;
  557
  558	spin_lock(&ptype_lock);
  559
  560	list_for_each_entry(pt1, head, list) {
  561		if (pt == pt1) {
  562			list_del_rcu(&pt->list);
  563			goto out;
  564		}
  565	}
  566
  567	pr_warn("dev_remove_pack: %p not found\n", pt);
  568out:
  569	spin_unlock(&ptype_lock);
  570}
  571EXPORT_SYMBOL(__dev_remove_pack);
  572
  573/**
  574 *	dev_remove_pack	 - remove packet handler
  575 *	@pt: packet type declaration
  576 *
  577 *	Remove a protocol handler that was previously added to the kernel
  578 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
  579 *	from the kernel lists and can be freed or reused once this function
  580 *	returns.
  581 *
  582 *	This call sleeps to guarantee that no CPU is looking at the packet
  583 *	type after return.
  584 */
  585void dev_remove_pack(struct packet_type *pt)
  586{
  587	__dev_remove_pack(pt);
  588
  589	synchronize_net();
  590}
  591EXPORT_SYMBOL(dev_remove_pack);
  592
  593
  594/**
  595 *	dev_add_offload - register offload handlers
  596 *	@po: protocol offload declaration
  597 *
  598 *	Add protocol offload handlers to the networking stack. The passed
  599 *	&proto_offload is linked into kernel lists and may not be freed until
  600 *	it has been removed from the kernel lists.
  601 *
  602 *	This call does not sleep therefore it can not
  603 *	guarantee all CPU's that are in middle of receiving packets
  604 *	will see the new offload handlers (until the next received packet).
  605 */
  606void dev_add_offload(struct packet_offload *po)
  607{
  608	struct packet_offload *elem;
  609
  610	spin_lock(&offload_lock);
  611	list_for_each_entry(elem, &offload_base, list) {
  612		if (po->priority < elem->priority)
  613			break;
  614	}
  615	list_add_rcu(&po->list, elem->list.prev);
  616	spin_unlock(&offload_lock);
  617}
  618EXPORT_SYMBOL(dev_add_offload);
  619
  620/**
  621 *	__dev_remove_offload	 - remove offload handler
  622 *	@po: packet offload declaration
  623 *
  624 *	Remove a protocol offload handler that was previously added to the
  625 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
  626 *	is removed from the kernel lists and can be freed or reused once this
  627 *	function returns.
  628 *
  629 *      The packet type might still be in use by receivers
  630 *	and must not be freed until after all the CPU's have gone
  631 *	through a quiescent state.
  632 */
  633static void __dev_remove_offload(struct packet_offload *po)
  634{
  635	struct list_head *head = &offload_base;
  636	struct packet_offload *po1;
  637
  638	spin_lock(&offload_lock);
  639
  640	list_for_each_entry(po1, head, list) {
  641		if (po == po1) {
  642			list_del_rcu(&po->list);
  643			goto out;
  644		}
  645	}
  646
  647	pr_warn("dev_remove_offload: %p not found\n", po);
  648out:
  649	spin_unlock(&offload_lock);
  650}
  651
  652/**
  653 *	dev_remove_offload	 - remove packet offload handler
  654 *	@po: packet offload declaration
  655 *
  656 *	Remove a packet offload handler that was previously added to the kernel
  657 *	offload handlers by dev_add_offload(). The passed &offload_type is
  658 *	removed from the kernel lists and can be freed or reused once this
  659 *	function returns.
  660 *
  661 *	This call sleeps to guarantee that no CPU is looking at the packet
  662 *	type after return.
  663 */
  664void dev_remove_offload(struct packet_offload *po)
  665{
  666	__dev_remove_offload(po);
  667
  668	synchronize_net();
  669}
  670EXPORT_SYMBOL(dev_remove_offload);
  671
  672/******************************************************************************
  673 *
  674 *		      Device Boot-time Settings Routines
  675 *
  676 ******************************************************************************/
  677
  678/* Boot time configuration table */
  679static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
  680
  681/**
  682 *	netdev_boot_setup_add	- add new setup entry
  683 *	@name: name of the device
  684 *	@map: configured settings for the device
  685 *
  686 *	Adds new setup entry to the dev_boot_setup list.  The function
  687 *	returns 0 on error and 1 on success.  This is a generic routine to
  688 *	all netdevices.
  689 */
  690static int netdev_boot_setup_add(char *name, struct ifmap *map)
  691{
  692	struct netdev_boot_setup *s;
  693	int i;
  694
  695	s = dev_boot_setup;
  696	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
  697		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
  698			memset(s[i].name, 0, sizeof(s[i].name));
  699			strlcpy(s[i].name, name, IFNAMSIZ);
  700			memcpy(&s[i].map, map, sizeof(s[i].map));
  701			break;
  702		}
  703	}
  704
  705	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
  706}
  707
  708/**
  709 * netdev_boot_setup_check	- check boot time settings
  710 * @dev: the netdevice
  711 *
  712 * Check boot time settings for the device.
  713 * The found settings are set for the device to be used
  714 * later in the device probing.
  715 * Returns 0 if no settings found, 1 if they are.
  716 */
  717int netdev_boot_setup_check(struct net_device *dev)
  718{
  719	struct netdev_boot_setup *s = dev_boot_setup;
  720	int i;
  721
  722	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
  723		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
  724		    !strcmp(dev->name, s[i].name)) {
  725			dev->irq = s[i].map.irq;
  726			dev->base_addr = s[i].map.base_addr;
  727			dev->mem_start = s[i].map.mem_start;
  728			dev->mem_end = s[i].map.mem_end;
  729			return 1;
  730		}
  731	}
  732	return 0;
  733}
  734EXPORT_SYMBOL(netdev_boot_setup_check);
  735
  736
  737/**
  738 * netdev_boot_base	- get address from boot time settings
  739 * @prefix: prefix for network device
  740 * @unit: id for network device
  741 *
  742 * Check boot time settings for the base address of device.
  743 * The found settings are set for the device to be used
  744 * later in the device probing.
  745 * Returns 0 if no settings found.
  746 */
  747unsigned long netdev_boot_base(const char *prefix, int unit)
  748{
  749	const struct netdev_boot_setup *s = dev_boot_setup;
  750	char name[IFNAMSIZ];
  751	int i;
  752
  753	sprintf(name, "%s%d", prefix, unit);
  754
  755	/*
  756	 * If device already registered then return base of 1
  757	 * to indicate not to probe for this interface
  758	 */
  759	if (__dev_get_by_name(&init_net, name))
  760		return 1;
  761
  762	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
  763		if (!strcmp(name, s[i].name))
  764			return s[i].map.base_addr;
  765	return 0;
  766}
  767
  768/*
  769 * Saves at boot time configured settings for any netdevice.
  770 */
  771int __init netdev_boot_setup(char *str)
  772{
  773	int ints[5];
  774	struct ifmap map;
  775
  776	str = get_options(str, ARRAY_SIZE(ints), ints);
  777	if (!str || !*str)
  778		return 0;
  779
  780	/* Save settings */
  781	memset(&map, 0, sizeof(map));
  782	if (ints[0] > 0)
  783		map.irq = ints[1];
  784	if (ints[0] > 1)
  785		map.base_addr = ints[2];
  786	if (ints[0] > 2)
  787		map.mem_start = ints[3];
  788	if (ints[0] > 3)
  789		map.mem_end = ints[4];
  790
  791	/* Add new entry to the list */
  792	return netdev_boot_setup_add(str, &map);
  793}
  794
  795__setup("netdev=", netdev_boot_setup);
  796
  797/*******************************************************************************
  798 *
  799 *			    Device Interface Subroutines
  800 *
  801 *******************************************************************************/
  802
  803/**
  804 *	dev_get_iflink	- get 'iflink' value of a interface
  805 *	@dev: targeted interface
  806 *
  807 *	Indicates the ifindex the interface is linked to.
  808 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
  809 */
  810
  811int dev_get_iflink(const struct net_device *dev)
  812{
  813	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
  814		return dev->netdev_ops->ndo_get_iflink(dev);
  815
  816	return dev->ifindex;
  817}
  818EXPORT_SYMBOL(dev_get_iflink);
  819
  820/**
  821 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
  822 *	@dev: targeted interface
  823 *	@skb: The packet.
  824 *
  825 *	For better visibility of tunnel traffic OVS needs to retrieve
  826 *	egress tunnel information for a packet. Following API allows
  827 *	user to get this info.
  828 */
  829int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
  830{
  831	struct ip_tunnel_info *info;
  832
  833	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
  834		return -EINVAL;
  835
  836	info = skb_tunnel_info_unclone(skb);
  837	if (!info)
  838		return -ENOMEM;
  839	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
  840		return -EINVAL;
  841
  842	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
  843}
  844EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
  845
  846/**
  847 *	__dev_get_by_name	- find a device by its name
  848 *	@net: the applicable net namespace
  849 *	@name: name to find
  850 *
  851 *	Find an interface by name. Must be called under RTNL semaphore
  852 *	or @dev_base_lock. If the name is found a pointer to the device
  853 *	is returned. If the name is not found then %NULL is returned. The
  854 *	reference counters are not incremented so the caller must be
  855 *	careful with locks.
  856 */
  857
  858struct net_device *__dev_get_by_name(struct net *net, const char *name)
  859{
  860	struct netdev_name_node *node_name;
  861
  862	node_name = netdev_name_node_lookup(net, name);
  863	return node_name ? node_name->dev : NULL;
  864}
  865EXPORT_SYMBOL(__dev_get_by_name);
  866
  867/**
  868 * dev_get_by_name_rcu	- find a device by its name
  869 * @net: the applicable net namespace
  870 * @name: name to find
  871 *
  872 * Find an interface by name.
  873 * If the name is found a pointer to the device is returned.
  874 * If the name is not found then %NULL is returned.
  875 * The reference counters are not incremented so the caller must be
  876 * careful with locks. The caller must hold RCU lock.
  877 */
  878
  879struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
  880{
  881	struct netdev_name_node *node_name;
  882
  883	node_name = netdev_name_node_lookup_rcu(net, name);
  884	return node_name ? node_name->dev : NULL;
  885}
  886EXPORT_SYMBOL(dev_get_by_name_rcu);
  887
  888/**
  889 *	dev_get_by_name		- find a device by its name
  890 *	@net: the applicable net namespace
  891 *	@name: name to find
  892 *
  893 *	Find an interface by name. This can be called from any
  894 *	context and does its own locking. The returned handle has
  895 *	the usage count incremented and the caller must use dev_put() to
  896 *	release it when it is no longer needed. %NULL is returned if no
  897 *	matching device is found.
  898 */
  899
  900struct net_device *dev_get_by_name(struct net *net, const char *name)
  901{
  902	struct net_device *dev;
  903
  904	rcu_read_lock();
  905	dev = dev_get_by_name_rcu(net, name);
  906	if (dev)
  907		dev_hold(dev);
  908	rcu_read_unlock();
  909	return dev;
  910}
  911EXPORT_SYMBOL(dev_get_by_name);
  912
  913/**
  914 *	__dev_get_by_index - find a device by its ifindex
  915 *	@net: the applicable net namespace
  916 *	@ifindex: index of device
  917 *
  918 *	Search for an interface by index. Returns %NULL if the device
  919 *	is not found or a pointer to the device. The device has not
  920 *	had its reference counter increased so the caller must be careful
  921 *	about locking. The caller must hold either the RTNL semaphore
  922 *	or @dev_base_lock.
  923 */
  924
  925struct net_device *__dev_get_by_index(struct net *net, int ifindex)
  926{
  927	struct net_device *dev;
  928	struct hlist_head *head = dev_index_hash(net, ifindex);
  929
  930	hlist_for_each_entry(dev, head, index_hlist)
  931		if (dev->ifindex == ifindex)
  932			return dev;
  933
  934	return NULL;
  935}
  936EXPORT_SYMBOL(__dev_get_by_index);
  937
  938/**
  939 *	dev_get_by_index_rcu - find a device by its ifindex
  940 *	@net: the applicable net namespace
  941 *	@ifindex: index of device
  942 *
  943 *	Search for an interface by index. Returns %NULL if the device
  944 *	is not found or a pointer to the device. The device has not
  945 *	had its reference counter increased so the caller must be careful
  946 *	about locking. The caller must hold RCU lock.
  947 */
  948
  949struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
  950{
  951	struct net_device *dev;
  952	struct hlist_head *head = dev_index_hash(net, ifindex);
  953
  954	hlist_for_each_entry_rcu(dev, head, index_hlist)
  955		if (dev->ifindex == ifindex)
  956			return dev;
  957
  958	return NULL;
  959}
  960EXPORT_SYMBOL(dev_get_by_index_rcu);
  961
  962
  963/**
  964 *	dev_get_by_index - find a device by its ifindex
  965 *	@net: the applicable net namespace
  966 *	@ifindex: index of device
  967 *
  968 *	Search for an interface by index. Returns NULL if the device
  969 *	is not found or a pointer to the device. The device returned has
  970 *	had a reference added and the pointer is safe until the user calls
  971 *	dev_put to indicate they have finished with it.
  972 */
  973
  974struct net_device *dev_get_by_index(struct net *net, int ifindex)
  975{
  976	struct net_device *dev;
  977
  978	rcu_read_lock();
  979	dev = dev_get_by_index_rcu(net, ifindex);
  980	if (dev)
  981		dev_hold(dev);
  982	rcu_read_unlock();
  983	return dev;
  984}
  985EXPORT_SYMBOL(dev_get_by_index);
  986
  987/**
  988 *	dev_get_by_napi_id - find a device by napi_id
  989 *	@napi_id: ID of the NAPI struct
  990 *
  991 *	Search for an interface by NAPI ID. Returns %NULL if the device
  992 *	is not found or a pointer to the device. The device has not had
  993 *	its reference counter increased so the caller must be careful
  994 *	about locking. The caller must hold RCU lock.
  995 */
  996
  997struct net_device *dev_get_by_napi_id(unsigned int napi_id)
  998{
  999	struct napi_struct *napi;
 1000
 1001	WARN_ON_ONCE(!rcu_read_lock_held());
 1002
 1003	if (napi_id < MIN_NAPI_ID)
 1004		return NULL;
 1005
 1006	napi = napi_by_id(napi_id);
 1007
 1008	return napi ? napi->dev : NULL;
 1009}
 1010EXPORT_SYMBOL(dev_get_by_napi_id);
 1011
 1012/**
 1013 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 1014 *	@net: network namespace
 1015 *	@name: a pointer to the buffer where the name will be stored.
 1016 *	@ifindex: the ifindex of the interface to get the name from.
 1017 */
 1018int netdev_get_name(struct net *net, char *name, int ifindex)
 1019{
 1020	struct net_device *dev;
 1021	int ret;
 1022
 1023	down_read(&devnet_rename_sem);
 1024	rcu_read_lock();
 1025
 1026	dev = dev_get_by_index_rcu(net, ifindex);
 1027	if (!dev) {
 1028		ret = -ENODEV;
 1029		goto out;
 1030	}
 1031
 1032	strcpy(name, dev->name);
 1033
 1034	ret = 0;
 1035out:
 1036	rcu_read_unlock();
 1037	up_read(&devnet_rename_sem);
 1038	return ret;
 1039}
 1040
 1041/**
 1042 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 1043 *	@net: the applicable net namespace
 1044 *	@type: media type of device
 1045 *	@ha: hardware address
 1046 *
 1047 *	Search for an interface by MAC address. Returns NULL if the device
 1048 *	is not found or a pointer to the device.
 1049 *	The caller must hold RCU or RTNL.
 1050 *	The returned device has not had its ref count increased
 1051 *	and the caller must therefore be careful about locking
 1052 *
 1053 */
 1054
 1055struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 1056				       const char *ha)
 1057{
 1058	struct net_device *dev;
 1059
 1060	for_each_netdev_rcu(net, dev)
 1061		if (dev->type == type &&
 1062		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 1063			return dev;
 1064
 1065	return NULL;
 1066}
 1067EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 1068
 1069struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 1070{
 1071	struct net_device *dev;
 1072
 1073	ASSERT_RTNL();
 1074	for_each_netdev(net, dev)
 1075		if (dev->type == type)
 1076			return dev;
 1077
 1078	return NULL;
 1079}
 1080EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 1081
 1082struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 1083{
 1084	struct net_device *dev, *ret = NULL;
 1085
 1086	rcu_read_lock();
 1087	for_each_netdev_rcu(net, dev)
 1088		if (dev->type == type) {
 1089			dev_hold(dev);
 1090			ret = dev;
 1091			break;
 1092		}
 1093	rcu_read_unlock();
 1094	return ret;
 1095}
 1096EXPORT_SYMBOL(dev_getfirstbyhwtype);
 1097
 1098/**
 1099 *	__dev_get_by_flags - find any device with given flags
 1100 *	@net: the applicable net namespace
 1101 *	@if_flags: IFF_* values
 1102 *	@mask: bitmask of bits in if_flags to check
 1103 *
 1104 *	Search for any interface with the given flags. Returns NULL if a device
 1105 *	is not found or a pointer to the device. Must be called inside
 1106 *	rtnl_lock(), and result refcount is unchanged.
 1107 */
 1108
 1109struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 1110				      unsigned short mask)
 1111{
 1112	struct net_device *dev, *ret;
 1113
 1114	ASSERT_RTNL();
 1115
 1116	ret = NULL;
 1117	for_each_netdev(net, dev) {
 1118		if (((dev->flags ^ if_flags) & mask) == 0) {
 1119			ret = dev;
 1120			break;
 1121		}
 1122	}
 1123	return ret;
 1124}
 1125EXPORT_SYMBOL(__dev_get_by_flags);
 1126
 1127/**
 1128 *	dev_valid_name - check if name is okay for network device
 1129 *	@name: name string
 1130 *
 1131 *	Network device names need to be valid file names to
 1132 *	to allow sysfs to work.  We also disallow any kind of
 1133 *	whitespace.
 1134 */
 1135bool dev_valid_name(const char *name)
 1136{
 1137	if (*name == '\0')
 1138		return false;
 1139	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
 1140		return false;
 1141	if (!strcmp(name, ".") || !strcmp(name, ".."))
 1142		return false;
 1143
 1144	while (*name) {
 1145		if (*name == '/' || *name == ':' || isspace(*name))
 1146			return false;
 1147		name++;
 1148	}
 1149	return true;
 1150}
 1151EXPORT_SYMBOL(dev_valid_name);
 1152
 1153/**
 1154 *	__dev_alloc_name - allocate a name for a device
 1155 *	@net: network namespace to allocate the device name in
 1156 *	@name: name format string
 1157 *	@buf:  scratch buffer and result name string
 1158 *
 1159 *	Passed a format string - eg "lt%d" it will try and find a suitable
 1160 *	id. It scans list of devices to build up a free map, then chooses
 1161 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 1162 *	while allocating the name and adding the device in order to avoid
 1163 *	duplicates.
 1164 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 1165 *	Returns the number of the unit assigned or a negative errno code.
 1166 */
 1167
 1168static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 1169{
 1170	int i = 0;
 1171	const char *p;
 1172	const int max_netdevices = 8*PAGE_SIZE;
 1173	unsigned long *inuse;
 1174	struct net_device *d;
 1175
 1176	if (!dev_valid_name(name))
 1177		return -EINVAL;
 1178
 1179	p = strchr(name, '%');
 1180	if (p) {
 1181		/*
 1182		 * Verify the string as this thing may have come from
 1183		 * the user.  There must be either one "%d" and no other "%"
 1184		 * characters.
 1185		 */
 1186		if (p[1] != 'd' || strchr(p + 2, '%'))
 1187			return -EINVAL;
 1188
 1189		/* Use one page as a bit array of possible slots */
 1190		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 1191		if (!inuse)
 1192			return -ENOMEM;
 1193
 1194		for_each_netdev(net, d) {
 1195			if (!sscanf(d->name, name, &i))
 1196				continue;
 1197			if (i < 0 || i >= max_netdevices)
 1198				continue;
 1199
 1200			/*  avoid cases where sscanf is not exact inverse of printf */
 1201			snprintf(buf, IFNAMSIZ, name, i);
 1202			if (!strncmp(buf, d->name, IFNAMSIZ))
 1203				set_bit(i, inuse);
 1204		}
 1205
 1206		i = find_first_zero_bit(inuse, max_netdevices);
 1207		free_page((unsigned long) inuse);
 1208	}
 1209
 1210	snprintf(buf, IFNAMSIZ, name, i);
 1211	if (!__dev_get_by_name(net, buf))
 1212		return i;
 1213
 1214	/* It is possible to run out of possible slots
 1215	 * when the name is long and there isn't enough space left
 1216	 * for the digits, or if all bits are used.
 1217	 */
 1218	return -ENFILE;
 1219}
 1220
 1221static int dev_alloc_name_ns(struct net *net,
 1222			     struct net_device *dev,
 1223			     const char *name)
 1224{
 1225	char buf[IFNAMSIZ];
 1226	int ret;
 1227
 1228	BUG_ON(!net);
 1229	ret = __dev_alloc_name(net, name, buf);
 1230	if (ret >= 0)
 1231		strlcpy(dev->name, buf, IFNAMSIZ);
 1232	return ret;
 1233}
 1234
 1235/**
 1236 *	dev_alloc_name - allocate a name for a device
 1237 *	@dev: device
 1238 *	@name: name format string
 1239 *
 1240 *	Passed a format string - eg "lt%d" it will try and find a suitable
 1241 *	id. It scans list of devices to build up a free map, then chooses
 1242 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 1243 *	while allocating the name and adding the device in order to avoid
 1244 *	duplicates.
 1245 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 1246 *	Returns the number of the unit assigned or a negative errno code.
 1247 */
 1248
 1249int dev_alloc_name(struct net_device *dev, const char *name)
 1250{
 1251	return dev_alloc_name_ns(dev_net(dev), dev, name);
 1252}
 1253EXPORT_SYMBOL(dev_alloc_name);
 1254
 1255static int dev_get_valid_name(struct net *net, struct net_device *dev,
 1256			      const char *name)
 1257{
 1258	BUG_ON(!net);
 1259
 1260	if (!dev_valid_name(name))
 1261		return -EINVAL;
 1262
 1263	if (strchr(name, '%'))
 1264		return dev_alloc_name_ns(net, dev, name);
 1265	else if (__dev_get_by_name(net, name))
 1266		return -EEXIST;
 1267	else if (dev->name != name)
 1268		strlcpy(dev->name, name, IFNAMSIZ);
 1269
 1270	return 0;
 1271}
 1272
 1273/**
 1274 *	dev_change_name - change name of a device
 1275 *	@dev: device
 1276 *	@newname: name (or format string) must be at least IFNAMSIZ
 1277 *
 1278 *	Change name of a device, can pass format strings "eth%d".
 1279 *	for wildcarding.
 1280 */
 1281int dev_change_name(struct net_device *dev, const char *newname)
 1282{
 1283	unsigned char old_assign_type;
 1284	char oldname[IFNAMSIZ];
 1285	int err = 0;
 1286	int ret;
 1287	struct net *net;
 1288
 1289	ASSERT_RTNL();
 1290	BUG_ON(!dev_net(dev));
 1291
 1292	net = dev_net(dev);
 1293
 1294	/* Some auto-enslaved devices e.g. failover slaves are
 1295	 * special, as userspace might rename the device after
 1296	 * the interface had been brought up and running since
 1297	 * the point kernel initiated auto-enslavement. Allow
 1298	 * live name change even when these slave devices are
 1299	 * up and running.
 1300	 *
 1301	 * Typically, users of these auto-enslaving devices
 1302	 * don't actually care about slave name change, as
 1303	 * they are supposed to operate on master interface
 1304	 * directly.
 1305	 */
 1306	if (dev->flags & IFF_UP &&
 1307	    likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
 1308		return -EBUSY;
 1309
 1310	down_write(&devnet_rename_sem);
 1311
 1312	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
 1313		up_write(&devnet_rename_sem);
 1314		return 0;
 1315	}
 1316
 1317	memcpy(oldname, dev->name, IFNAMSIZ);
 1318
 1319	err = dev_get_valid_name(net, dev, newname);
 1320	if (err < 0) {
 1321		up_write(&devnet_rename_sem);
 1322		return err;
 1323	}
 1324
 1325	if (oldname[0] && !strchr(oldname, '%'))
 1326		netdev_info(dev, "renamed from %s\n", oldname);
 1327
 1328	old_assign_type = dev->name_assign_type;
 1329	dev->name_assign_type = NET_NAME_RENAMED;
 1330
 1331rollback:
 1332	ret = device_rename(&dev->dev, dev->name);
 1333	if (ret) {
 1334		memcpy(dev->name, oldname, IFNAMSIZ);
 1335		dev->name_assign_type = old_assign_type;
 1336		up_write(&devnet_rename_sem);
 1337		return ret;
 1338	}
 1339
 1340	up_write(&devnet_rename_sem);
 1341
 1342	netdev_adjacent_rename_links(dev, oldname);
 1343
 1344	write_lock_bh(&dev_base_lock);
 1345	netdev_name_node_del(dev->name_node);
 1346	write_unlock_bh(&dev_base_lock);
 1347
 1348	synchronize_rcu();
 1349
 1350	write_lock_bh(&dev_base_lock);
 1351	netdev_name_node_add(net, dev->name_node);
 1352	write_unlock_bh(&dev_base_lock);
 1353
 1354	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
 1355	ret = notifier_to_errno(ret);
 1356
 1357	if (ret) {
 1358		/* err >= 0 after dev_alloc_name() or stores the first errno */
 1359		if (err >= 0) {
 1360			err = ret;
 1361			down_write(&devnet_rename_sem);
 1362			memcpy(dev->name, oldname, IFNAMSIZ);
 1363			memcpy(oldname, newname, IFNAMSIZ);
 1364			dev->name_assign_type = old_assign_type;
 1365			old_assign_type = NET_NAME_RENAMED;
 1366			goto rollback;
 1367		} else {
 1368			pr_err("%s: name change rollback failed: %d\n",
 1369			       dev->name, ret);
 1370		}
 1371	}
 1372
 1373	return err;
 1374}
 1375
 1376/**
 1377 *	dev_set_alias - change ifalias of a device
 1378 *	@dev: device
 1379 *	@alias: name up to IFALIASZ
 1380 *	@len: limit of bytes to copy from info
 1381 *
 1382 *	Set ifalias for a device,
 1383 */
 1384int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
 1385{
 1386	struct dev_ifalias *new_alias = NULL;
 1387
 1388	if (len >= IFALIASZ)
 1389		return -EINVAL;
 1390
 1391	if (len) {
 1392		new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
 1393		if (!new_alias)
 1394			return -ENOMEM;
 1395
 1396		memcpy(new_alias->ifalias, alias, len);
 1397		new_alias->ifalias[len] = 0;
 1398	}
 1399
 1400	mutex_lock(&ifalias_mutex);
 1401	new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
 1402					mutex_is_locked(&ifalias_mutex));
 1403	mutex_unlock(&ifalias_mutex);
 1404
 1405	if (new_alias)
 1406		kfree_rcu(new_alias, rcuhead);
 1407
 1408	return len;
 1409}
 1410EXPORT_SYMBOL(dev_set_alias);
 1411
 1412/**
 1413 *	dev_get_alias - get ifalias of a device
 1414 *	@dev: device
 1415 *	@name: buffer to store name of ifalias
 1416 *	@len: size of buffer
 1417 *
 1418 *	get ifalias for a device.  Caller must make sure dev cannot go
 1419 *	away,  e.g. rcu read lock or own a reference count to device.
 1420 */
 1421int dev_get_alias(const struct net_device *dev, char *name, size_t len)
 1422{
 1423	const struct dev_ifalias *alias;
 1424	int ret = 0;
 1425
 1426	rcu_read_lock();
 1427	alias = rcu_dereference(dev->ifalias);
 1428	if (alias)
 1429		ret = snprintf(name, len, "%s", alias->ifalias);
 1430	rcu_read_unlock();
 1431
 1432	return ret;
 1433}
 1434
 1435/**
 1436 *	netdev_features_change - device changes features
 1437 *	@dev: device to cause notification
 1438 *
 1439 *	Called to indicate a device has changed features.
 1440 */
 1441void netdev_features_change(struct net_device *dev)
 1442{
 1443	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
 1444}
 1445EXPORT_SYMBOL(netdev_features_change);
 1446
 1447/**
 1448 *	netdev_state_change - device changes state
 1449 *	@dev: device to cause notification
 1450 *
 1451 *	Called to indicate a device has changed state. This function calls
 1452 *	the notifier chains for netdev_chain and sends a NEWLINK message
 1453 *	to the routing socket.
 1454 */
 1455void netdev_state_change(struct net_device *dev)
 1456{
 1457	if (dev->flags & IFF_UP) {
 1458		struct netdev_notifier_change_info change_info = {
 1459			.info.dev = dev,
 1460		};
 1461
 1462		call_netdevice_notifiers_info(NETDEV_CHANGE,
 1463					      &change_info.info);
 1464		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
 1465	}
 1466}
 1467EXPORT_SYMBOL(netdev_state_change);
 1468
 1469/**
 1470 * netdev_notify_peers - notify network peers about existence of @dev
 1471 * @dev: network device
 1472 *
 1473 * Generate traffic such that interested network peers are aware of
 1474 * @dev, such as by generating a gratuitous ARP. This may be used when
 1475 * a device wants to inform the rest of the network about some sort of
 1476 * reconfiguration such as a failover event or virtual machine
 1477 * migration.
 1478 */
 1479void netdev_notify_peers(struct net_device *dev)
 1480{
 1481	rtnl_lock();
 1482	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
 1483	call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
 1484	rtnl_unlock();
 1485}
 1486EXPORT_SYMBOL(netdev_notify_peers);
 1487
 1488static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
 1489{
 1490	const struct net_device_ops *ops = dev->netdev_ops;
 1491	int ret;
 1492
 1493	ASSERT_RTNL();
 1494
 1495	if (!netif_device_present(dev))
 1496		return -ENODEV;
 1497
 1498	/* Block netpoll from trying to do any rx path servicing.
 1499	 * If we don't do this there is a chance ndo_poll_controller
 1500	 * or ndo_poll may be running while we open the device
 1501	 */
 1502	netpoll_poll_disable(dev);
 1503
 1504	ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
 1505	ret = notifier_to_errno(ret);
 1506	if (ret)
 1507		return ret;
 1508
 1509	set_bit(__LINK_STATE_START, &dev->state);
 1510
 1511	if (ops->ndo_validate_addr)
 1512		ret = ops->ndo_validate_addr(dev);
 1513
 1514	if (!ret && ops->ndo_open)
 1515		ret = ops->ndo_open(dev);
 1516
 1517	netpoll_poll_enable(dev);
 1518
 1519	if (ret)
 1520		clear_bit(__LINK_STATE_START, &dev->state);
 1521	else {
 1522		dev->flags |= IFF_UP;
 1523		dev_set_rx_mode(dev);
 1524		dev_activate(dev);
 1525		add_device_randomness(dev->dev_addr, dev->addr_len);
 1526	}
 1527
 1528	return ret;
 1529}
 1530
 1531/**
 1532 *	dev_open	- prepare an interface for use.
 1533 *	@dev: device to open
 1534 *	@extack: netlink extended ack
 1535 *
 1536 *	Takes a device from down to up state. The device's private open
 1537 *	function is invoked and then the multicast lists are loaded. Finally
 1538 *	the device is moved into the up state and a %NETDEV_UP message is
 1539 *	sent to the netdev notifier chain.
 1540 *
 1541 *	Calling this function on an active interface is a nop. On a failure
 1542 *	a negative errno code is returned.
 1543 */
 1544int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
 1545{
 1546	int ret;
 1547
 1548	if (dev->flags & IFF_UP)
 1549		return 0;
 1550
 1551	ret = __dev_open(dev, extack);
 1552	if (ret < 0)
 1553		return ret;
 1554
 1555	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
 1556	call_netdevice_notifiers(NETDEV_UP, dev);
 1557
 1558	return ret;
 1559}
 1560EXPORT_SYMBOL(dev_open);
 1561
/*
 * Take every device linked on @head (through its close_list member) down,
 * without sending NETDEV_DOWN / RTM_NEWLINK; callers do that themselves.
 * RTNL must be held and the function may sleep.
 *
 * The work is done in three passes over the list: announce
 * NETDEV_GOING_DOWN and clear __LINK_STATE_START on each device, deactivate
 * all devices in one batch, then call each driver's ndo_stop and clear
 * IFF_UP. Netpoll is kept disabled on a device from the first pass until
 * the last pass has finished with it.
 */
static void __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	ASSERT_RTNL();
	might_sleep();

	list_for_each_entry(dev, head, close_list) {
		/* Temporarily disable netpoll until the interface is down */
		netpoll_poll_disable(dev);

		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * can be even on different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of its
		 * napi_struct instances on this device.
		 */
		smp_mb__after_atomic(); /* Commit netif_running(). */
	}

	dev_deactivate_many(head);

	list_for_each_entry(dev, head, close_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 *	Call the device specific close. This cannot fail.
		 *	Only if device is UP
		 *
		 *	We allow it to be called even after a DETACH hot-plug
		 *	event.
		 */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		dev->flags &= ~IFF_UP;
		/* Pairs with the netpoll_poll_disable() in the first pass. */
		netpoll_poll_enable(dev);
	}
}
 1605
 1606static void __dev_close(struct net_device *dev)
 1607{
 1608	LIST_HEAD(single);
 1609
 1610	list_add(&dev->close_list, &single);
 1611	__dev_close_many(&single);
 1612	list_del(&single);
 1613}
 1614
 1615void dev_close_many(struct list_head *head, bool unlink)
 1616{
 1617	struct net_device *dev, *tmp;
 1618
 1619	/* Remove the devices that don't need to be closed */
 1620	list_for_each_entry_safe(dev, tmp, head, close_list)
 1621		if (!(dev->flags & IFF_UP))
 1622			list_del_init(&dev->close_list);
 1623
 1624	__dev_close_many(head);
 1625
 1626	list_for_each_entry_safe(dev, tmp, head, close_list) {
 1627		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
 1628		call_netdevice_notifiers(NETDEV_DOWN, dev);
 1629		if (unlink)
 1630			list_del_init(&dev->close_list);
 1631	}
 1632}
 1633EXPORT_SYMBOL(dev_close_many);
 1634
 1635/**
 1636 *	dev_close - shutdown an interface.
 1637 *	@dev: device to shutdown
 1638 *
 1639 *	This function moves an active device into down state. A
 1640 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 1641 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 1642 *	chain.
 1643 */
 1644void dev_close(struct net_device *dev)
 1645{
 1646	if (dev->flags & IFF_UP) {
 1647		LIST_HEAD(single);
 1648
 1649		list_add(&dev->close_list, &single);
 1650		dev_close_many(&single, true);
 1651		list_del(&single);
 1652	}
 1653}
 1654EXPORT_SYMBOL(dev_close);
 1655
 1656
 1657/**
 1658 *	dev_disable_lro - disable Large Receive Offload on a device
 1659 *	@dev: device
 1660 *
 1661 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 1662 *	called under RTNL.  This is needed if received packets may be
 1663 *	forwarded to another interface.
 1664 */
 1665void dev_disable_lro(struct net_device *dev)
 1666{
 1667	struct net_device *lower_dev;
 1668	struct list_head *iter;
 1669
 1670	dev->wanted_features &= ~NETIF_F_LRO;
 1671	netdev_update_features(dev);
 1672
 1673	if (unlikely(dev->features & NETIF_F_LRO))
 1674		netdev_WARN(dev, "failed to disable LRO!\n");
 1675
 1676	netdev_for_each_lower_dev(dev, lower_dev, iter)
 1677		dev_disable_lro(lower_dev);
 1678}
 1679EXPORT_SYMBOL(dev_disable_lro);
 1680
 1681/**
 1682 *	dev_disable_gro_hw - disable HW Generic Receive Offload on a device
 1683 *	@dev: device
 1684 *
 1685 *	Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
 1686 *	called under RTNL.  This is needed if Generic XDP is installed on
 1687 *	the device.
 1688 */
 1689static void dev_disable_gro_hw(struct net_device *dev)
 1690{
 1691	dev->wanted_features &= ~NETIF_F_GRO_HW;
 1692	netdev_update_features(dev);
 1693
 1694	if (unlikely(dev->features & NETIF_F_GRO_HW))
 1695		netdev_WARN(dev, "failed to disable GRO_HW!\n");
 1696}
 1697
 1698const char *netdev_cmd_to_name(enum netdev_cmd cmd)
 1699{
 1700#define N(val) 						\
 1701	case NETDEV_##val:				\
 1702		return "NETDEV_" __stringify(val);
 1703	switch (cmd) {
 1704	N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
 1705	N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
 1706	N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
 1707	N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
 1708	N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
 1709	N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
 1710	N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
 1711	N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
 1712	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
 1713	N(PRE_CHANGEADDR)
 1714	}
 1715#undef N
 1716	return "UNKNOWN_NETDEV_EVENT";
 1717}
 1718EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
 1719
 1720static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
 1721				   struct net_device *dev)
 1722{
 1723	struct netdev_notifier_info info = {
 1724		.dev = dev,
 1725	};
 1726
 1727	return nb->notifier_call(nb, val, &info);
 1728}
 1729
 1730static int call_netdevice_register_notifiers(struct notifier_block *nb,
 1731					     struct net_device *dev)
 1732{
 1733	int err;
 1734
 1735	err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
 1736	err = notifier_to_errno(err);
 1737	if (err)
 1738		return err;
 1739
 1740	if (!(dev->flags & IFF_UP))
 1741		return 0;
 1742
 1743	call_netdevice_notifier(nb, NETDEV_UP, dev);
 1744	return 0;
 1745}
 1746
 1747static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
 1748						struct net_device *dev)
 1749{
 1750	if (dev->flags & IFF_UP) {
 1751		call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
 1752					dev);
 1753		call_netdevice_notifier(nb, NETDEV_DOWN, dev);
 1754	}
 1755	call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
 1756}
 1757
 1758static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
 1759						 struct net *net)
 1760{
 1761	struct net_device *dev;
 1762	int err;
 1763
 1764	for_each_netdev(net, dev) {
 1765		err = call_netdevice_register_notifiers(nb, dev);
 1766		if (err)
 1767			goto rollback;
 1768	}
 1769	return 0;
 1770
 1771rollback:
 1772	for_each_netdev_continue_reverse(net, dev)
 1773		call_netdevice_unregister_notifiers(nb, dev);
 1774	return err;
 1775}
 1776
 1777static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
 1778						    struct net *net)
 1779{
 1780	struct net_device *dev;
 1781
 1782	for_each_netdev(net, dev)
 1783		call_netdevice_unregister_notifiers(nb, dev);
 1784}
 1785
/*
 * Starts out non-zero. While it is non-zero, the notifier registration
 * functions below link the notifier into its chain but do not replay
 * register/up events for existing devices.
 * NOTE(review): presumably cleared once core networking init has finished;
 * confirm against the init code, which is not in view here.
 */
static int dev_boot_phase = 1;
 1787
 1788/**
 1789 * register_netdevice_notifier - register a network notifier block
 1790 * @nb: notifier
 1791 *
 1792 * Register a notifier to be called when network device events occur.
 1793 * The notifier passed is linked into the kernel structures and must
 1794 * not be reused until it has been unregistered. A negative errno code
 1795 * is returned on a failure.
 1796 *
 1797 * When registered all registration and up events are replayed
 1798 * to the new notifier to allow device to have a race free
 1799 * view of the network device list.
 1800 */
 1801
 1802int register_netdevice_notifier(struct notifier_block *nb)
 1803{
 1804	struct net *net;
 1805	int err;
 1806
 1807	/* Close race with setup_net() and cleanup_net() */
 1808	down_write(&pernet_ops_rwsem);
 1809	rtnl_lock();
 1810	err = raw_notifier_chain_register(&netdev_chain, nb);
 1811	if (err)
 1812		goto unlock;
 1813	if (dev_boot_phase)
 1814		goto unlock;
 1815	for_each_net(net) {
 1816		err = call_netdevice_register_net_notifiers(nb, net);
 1817		if (err)
 1818			goto rollback;
 1819	}
 1820
 1821unlock:
 1822	rtnl_unlock();
 1823	up_write(&pernet_ops_rwsem);
 1824	return err;
 1825
 1826rollback:
 1827	for_each_net_continue_reverse(net)
 1828		call_netdevice_unregister_net_notifiers(nb, net);
 1829
 1830	raw_notifier_chain_unregister(&netdev_chain, nb);
 1831	goto unlock;
 1832}
 1833EXPORT_SYMBOL(register_netdevice_notifier);
 1834
 1835/**
 1836 * unregister_netdevice_notifier - unregister a network notifier block
 1837 * @nb: notifier
 1838 *
 1839 * Unregister a notifier previously registered by
 1840 * register_netdevice_notifier(). The notifier is unlinked into the
 1841 * kernel structures and may then be reused. A negative errno code
 1842 * is returned on a failure.
 1843 *
 1844 * After unregistering unregister and down device events are synthesized
 1845 * for all devices on the device list to the removed notifier to remove
 1846 * the need for special case cleanup code.
 1847 */
 1848
 1849int unregister_netdevice_notifier(struct notifier_block *nb)
 1850{
 1851	struct net *net;
 1852	int err;
 1853
 1854	/* Close race with setup_net() and cleanup_net() */
 1855	down_write(&pernet_ops_rwsem);
 1856	rtnl_lock();
 1857	err = raw_notifier_chain_unregister(&netdev_chain, nb);
 1858	if (err)
 1859		goto unlock;
 1860
 1861	for_each_net(net)
 1862		call_netdevice_unregister_net_notifiers(nb, net);
 1863
 1864unlock:
 1865	rtnl_unlock();
 1866	up_write(&pernet_ops_rwsem);
 1867	return err;
 1868}
 1869EXPORT_SYMBOL(unregister_netdevice_notifier);
 1870
 1871static int __register_netdevice_notifier_net(struct net *net,
 1872					     struct notifier_block *nb,
 1873					     bool ignore_call_fail)
 1874{
 1875	int err;
 1876
 1877	err = raw_notifier_chain_register(&net->netdev_chain, nb);
 1878	if (err)
 1879		return err;
 1880	if (dev_boot_phase)
 1881		return 0;
 1882
 1883	err = call_netdevice_register_net_notifiers(nb, net);
 1884	if (err && !ignore_call_fail)
 1885		goto chain_unregister;
 1886
 1887	return 0;
 1888
 1889chain_unregister:
 1890	raw_notifier_chain_unregister(&net->netdev_chain, nb);
 1891	return err;
 1892}
 1893
 1894static int __unregister_netdevice_notifier_net(struct net *net,
 1895					       struct notifier_block *nb)
 1896{
 1897	int err;
 1898
 1899	err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
 1900	if (err)
 1901		return err;
 1902
 1903	call_netdevice_unregister_net_notifiers(nb, net);
 1904	return 0;
 1905}
 1906
 1907/**
 1908 * register_netdevice_notifier_net - register a per-netns network notifier block
 1909 * @net: network namespace
 1910 * @nb: notifier
 1911 *
 1912 * Register a notifier to be called when network device events occur.
 1913 * The notifier passed is linked into the kernel structures and must
 1914 * not be reused until it has been unregistered. A negative errno code
 1915 * is returned on a failure.
 1916 *
 1917 * When registered all registration and up events are replayed
 1918 * to the new notifier to allow device to have a race free
 1919 * view of the network device list.
 1920 */
 1921
 1922int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
 1923{
 1924	int err;
 1925
 1926	rtnl_lock();
 1927	err = __register_netdevice_notifier_net(net, nb, false);
 1928	rtnl_unlock();
 1929	return err;
 1930}
 1931EXPORT_SYMBOL(register_netdevice_notifier_net);
 1932
 1933/**
 1934 * unregister_netdevice_notifier_net - unregister a per-netns
 1935 *                                     network notifier block
 1936 * @net: network namespace
 1937 * @nb: notifier
 1938 *
 1939 * Unregister a notifier previously registered by
 1940 * register_netdevice_notifier(). The notifier is unlinked into the
 1941 * kernel structures and may then be reused. A negative errno code
 1942 * is returned on a failure.
 1943 *
 1944 * After unregistering unregister and down device events are synthesized
 1945 * for all devices on the device list to the removed notifier to remove
 1946 * the need for special case cleanup code.
 1947 */
 1948
 1949int unregister_netdevice_notifier_net(struct net *net,
 1950				      struct notifier_block *nb)
 1951{
 1952	int err;
 1953
 1954	rtnl_lock();
 1955	err = __unregister_netdevice_notifier_net(net, nb);
 1956	rtnl_unlock();
 1957	return err;
 1958}
 1959EXPORT_SYMBOL(unregister_netdevice_notifier_net);
 1960
 1961int register_netdevice_notifier_dev_net(struct net_device *dev,
 1962					struct notifier_block *nb,
 1963					struct netdev_net_notifier *nn)
 1964{
 1965	int err;
 1966
 1967	rtnl_lock();
 1968	err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
 1969	if (!err) {
 1970		nn->nb = nb;
 1971		list_add(&nn->list, &dev->net_notifier_list);
 1972	}
 1973	rtnl_unlock();
 1974	return err;
 1975}
 1976EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
 1977
 1978int unregister_netdevice_notifier_dev_net(struct net_device *dev,
 1979					  struct notifier_block *nb,
 1980					  struct netdev_net_notifier *nn)
 1981{
 1982	int err;
 1983
 1984	rtnl_lock();
 1985	list_del(&nn->list);
 1986	err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
 1987	rtnl_unlock();
 1988	return err;
 1989}
 1990EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
 1991
 1992static void move_netdevice_notifiers_dev_net(struct net_device *dev,
 1993					     struct net *net)
 1994{
 1995	struct netdev_net_notifier *nn;
 1996
 1997	list_for_each_entry(nn, &dev->net_notifier_list, list) {
 1998		__unregister_netdevice_notifier_net(dev_net(dev), nn->nb);
 1999		__register_netdevice_notifier_net(net, nn->nb, true);
 2000	}
 2001}
 2002
 2003/**
 2004 *	call_netdevice_notifiers_info - call all network notifier blocks
 2005 *	@val: value passed unmodified to notifier function
 2006 *	@info: notifier information data
 2007 *
 2008 *	Call all network notifier blocks.  Parameters and return value
 2009 *	are as for raw_notifier_call_chain().
 2010 */
 2011
 2012static int call_netdevice_notifiers_info(unsigned long val,
 2013					 struct netdev_notifier_info *info)
 2014{
 2015	struct net *net = dev_net(info->dev);
 2016	int ret;
 2017
 2018	ASSERT_RTNL();
 2019
 2020	/* Run per-netns notifier block chain first, then run the global one.
 2021	 * Hopefully, one day, the global one is going to be removed after
 2022	 * all notifier block registrators get converted to be per-netns.
 2023	 */
 2024	ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
 2025	if (ret & NOTIFY_STOP_MASK)
 2026		return ret;
 2027	return raw_notifier_call_chain(&netdev_chain, val, info);
 2028}
 2029
 2030static int call_netdevice_notifiers_extack(unsigned long val,
 2031					   struct net_device *dev,
 2032					   struct netlink_ext_ack *extack)
 2033{
 2034	struct netdev_notifier_info info = {
 2035		.dev = dev,
 2036		.extack = extack,
 2037	};
 2038
 2039	return call_netdevice_notifiers_info(val, &info);
 2040}
 2041
 2042/**
 2043 *	call_netdevice_notifiers - call all network notifier blocks
 2044 *      @val: value passed unmodified to notifier function
 2045 *      @dev: net_device pointer passed unmodified to notifier function
 2046 *
 2047 *	Call all network notifier blocks.  Parameters and return value
 2048 *	are as for raw_notifier_call_chain().
 2049 */
 2050
 2051int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
 2052{
 2053	return call_netdevice_notifiers_extack(val, dev, NULL);
 2054}
 2055EXPORT_SYMBOL(call_netdevice_notifiers);
 2056
 2057/**
 2058 *	call_netdevice_notifiers_mtu - call all network notifier blocks
 2059 *	@val: value passed unmodified to notifier function
 2060 *	@dev: net_device pointer passed unmodified to notifier function
 2061 *	@arg: additional u32 argument passed to the notifier function
 2062 *
 2063 *	Call all network notifier blocks.  Parameters and return value
 2064 *	are as for raw_notifier_call_chain().
 2065 */
 2066static int call_netdevice_notifiers_mtu(unsigned long val,
 2067					struct net_device *dev, u32 arg)
 2068{
 2069	struct netdev_notifier_info_ext info = {
 2070		.info.dev = dev,
 2071		.ext.mtu = arg,
 2072	};
 2073
 2074	BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
 2075
 2076	return call_netdevice_notifiers_info(val, &info.info);
 2077}
 2078
 2079#ifdef CONFIG_NET_INGRESS
 2080static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
 2081
 2082void net_inc_ingress_queue(void)
 2083{
 2084	static_branch_inc(&ingress_needed_key);
 2085}
 2086EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
 2087
 2088void net_dec_ingress_queue(void)
 2089{
 2090	static_branch_dec(&ingress_needed_key);
 2091}
 2092EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
 2093#endif
 2094
 2095#ifdef CONFIG_NET_EGRESS
 2096static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
 2097
 2098void net_inc_egress_queue(void)
 2099{
 2100	static_branch_inc(&egress_needed_key);
 2101}
 2102EXPORT_SYMBOL_GPL(net_inc_egress_queue);
 2103
 2104void net_dec_egress_queue(void)
 2105{
 2106	static_branch_dec(&egress_needed_key);
 2107}
 2108EXPORT_SYMBOL_GPL(net_dec_egress_queue);
 2109#endif
 2110
 2111static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
 2112#ifdef CONFIG_JUMP_LABEL
 2113static atomic_t netstamp_needed_deferred;
 2114static atomic_t netstamp_wanted;
 2115static void netstamp_clear(struct work_struct *work)
 2116{
 2117	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
 2118	int wanted;
 2119
 2120	wanted = atomic_add_return(deferred, &netstamp_wanted);
 2121	if (wanted > 0)
 2122		static_branch_enable(&netstamp_needed_key);
 2123	else
 2124		static_branch_disable(&netstamp_needed_key);
 2125}
 2126static DECLARE_WORK(netstamp_work, netstamp_clear);
 2127#endif
 2128
 2129void net_enable_timestamp(void)
 2130{
 2131#ifdef CONFIG_JUMP_LABEL
 2132	int wanted;
 2133
 2134	while (1) {
 2135		wanted = atomic_read(&netstamp_wanted);
 2136		if (wanted <= 0)
 2137			break;
 2138		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
 2139			return;
 2140	}
 2141	atomic_inc(&netstamp_needed_deferred);
 2142	schedule_work(&netstamp_work);
 2143#else
 2144	static_branch_inc(&netstamp_needed_key);
 2145#endif
 2146}
 2147EXPORT_SYMBOL(net_enable_timestamp);
 2148
 2149void net_disable_timestamp(void)
 2150{
 2151#ifdef CONFIG_JUMP_LABEL
 2152	int wanted;
 2153
 2154	while (1) {
 2155		wanted = atomic_read(&netstamp_wanted);
 2156		if (wanted <= 1)
 2157			break;
 2158		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
 2159			return;
 2160	}
 2161	atomic_dec(&netstamp_needed_deferred);
 2162	schedule_work(&netstamp_work);
 2163#else
 2164	static_branch_dec(&netstamp_needed_key);
 2165#endif
 2166}
 2167EXPORT_SYMBOL(net_disable_timestamp);
 2168
 2169static inline void net_timestamp_set(struct sk_buff *skb)
 2170{
 2171	skb->tstamp = 0;
 2172	if (static_branch_unlikely(&netstamp_needed_key))
 2173		__net_timestamp(skb);
 2174}
 2175
 2176#define net_timestamp_check(COND, SKB)				\
 2177	if (static_branch_unlikely(&netstamp_needed_key)) {	\
 2178		if ((COND) && !(SKB)->tstamp)			\
 2179			__net_timestamp(SKB);			\
 2180	}							\
 2181
 2182bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
 2183{
 2184	unsigned int len;
 2185
 2186	if (!(dev->flags & IFF_UP))
 2187		return false;
 2188
 2189	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
 2190	if (skb->len <= len)
 2191		return true;
 2192
 2193	/* if TSO is enabled, we don't care about the length as the packet
 2194	 * could be forwarded without being segmented before
 2195	 */
 2196	if (skb_is_gso(skb))
 2197		return true;
 2198
 2199	return false;
 2200}
 2201EXPORT_SYMBOL_GPL(is_skb_forwardable);
 2202
 2203int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 2204{
 2205	int ret = ____dev_forward_skb(dev, skb);
 2206
 2207	if (likely(!ret)) {
 2208		skb->protocol = eth_type_trans(skb, dev);
 2209		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 2210	}
 2211
 2212	return ret;
 2213}
 2214EXPORT_SYMBOL_GPL(__dev_forward_skb);
 2215
 2216/**
 2217 * dev_forward_skb - loopback an skb to another netif
 2218 *
 2219 * @dev: destination network device
 2220 * @skb: buffer to forward
 2221 *
 2222 * return values:
 2223 *	NET_RX_SUCCESS	(no congestion)
 2224 *	NET_RX_DROP     (packet was dropped, but freed)
 2225 *
 2226 * dev_forward_skb can be used for injecting an skb from the
 2227 * start_xmit function of one device into the receive queue
 2228 * of another device.
 2229 *
 2230 * The receiving device may be in another namespace, so
 2231 * we have to clear all information in the skb that could
 2232 * impact namespace isolation.
 2233 */
 2234int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 2235{
 2236	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
 2237}
 2238EXPORT_SYMBOL_GPL(dev_forward_skb);
 2239
 2240static inline int deliver_skb(struct sk_buff *skb,
 2241			      struct packet_type *pt_prev,
 2242			      struct net_device *orig_dev)
 2243{
 2244	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
 2245		return -ENOMEM;
 2246	refcount_inc(&skb->users);
 2247	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 2248}
 2249
 2250static inline void deliver_ptype_list_skb(struct sk_buff *skb,
 2251					  struct packet_type **pt,
 2252					  struct net_device *orig_dev,
 2253					  __be16 type,
 2254					  struct list_head *ptype_list)
 2255{
 2256	struct packet_type *ptype, *pt_prev = *pt;
 2257
 2258	list_for_each_entry_rcu(ptype, ptype_list, list) {
 2259		if (ptype->type != type)
 2260			continue;
 2261		if (pt_prev)
 2262			deliver_skb(skb, pt_prev, orig_dev);
 2263		pt_prev = ptype;
 2264	}
 2265	*pt = pt_prev;
 2266}
 2267
 2268static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
 2269{
 2270	if (!ptype->af_packet_priv || !skb->sk)
 2271		return false;
 2272
 2273	if (ptype->id_match)
 2274		return ptype->id_match(ptype, skb->sk);
 2275	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
 2276		return true;
 2277
 2278	return false;
 2279}
 2280
 2281/**
 2282 * dev_nit_active - return true if any network interface taps are in use
 2283 *
 2284 * @dev: network device to check for the presence of taps
 2285 */
 2286bool dev_nit_active(struct net_device *dev)
 2287{
 2288	return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
 2289}
 2290EXPORT_SYMBOL_GPL(dev_nit_active);
 2291
 2292/*
 2293 *	Support routine. Sends outgoing frames to any network
 2294 *	taps currently in use.
 2295 */
 2296
 2297void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 2298{
 2299	struct packet_type *ptype;
 2300	struct sk_buff *skb2 = NULL;
 2301	struct packet_type *pt_prev = NULL;
 2302	struct list_head *ptype_list = &ptype_all;
 2303
 2304	rcu_read_lock();
 2305again:
 2306	list_for_each_entry_rcu(ptype, ptype_list, list) {
 2307		if (ptype->ignore_outgoing)
 2308			continue;
 2309
 2310		/* Never send packets back to the socket
 2311		 * they originated from - MvS (miquels@drinkel.ow.org)
 2312		 */
 2313		if (skb_loop_sk(ptype, skb))
 2314			continue;
 2315
 2316		if (pt_prev) {
 2317			deliver_skb(skb2, pt_prev, skb->dev);
 2318			pt_prev = ptype;
 2319			continue;
 2320		}
 2321
 2322		/* need to clone skb, done only once */
 2323		skb2 = skb_clone(skb, GFP_ATOMIC);
 2324		if (!skb2)
 2325			goto out_unlock;
 2326
 2327		net_timestamp_set(skb2);
 2328
 2329		/* skb->nh should be correctly
 2330		 * set by sender, so that the second statement is
 2331		 * just protection against buggy protocols.
 2332		 */
 2333		skb_reset_mac_header(skb2);
 2334
 2335		if (skb_network_header(skb2) < skb2->data ||
 2336		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
 2337			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
 2338					     ntohs(skb2->protocol),
 2339					     dev->name);
 2340			skb_reset_network_header(skb2);
 2341		}
 2342
 2343		skb2->transport_header = skb2->network_header;
 2344		skb2->pkt_type = PACKET_OUTGOING;
 2345		pt_prev = ptype;
 2346	}
 2347
 2348	if (ptype_list == &ptype_all) {
 2349		ptype_list = &dev->ptype_all;
 2350		goto again;
 2351	}
 2352out_unlock:
 2353	if (pt_prev) {
 2354		if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
 2355			pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
 2356		else
 2357			kfree_skb(skb2);
 2358	}
 2359	rcu_read_unlock();
 2360}
 2361EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
 2362
 2363/**
 2364 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 2365 * @dev: Network device
 2366 * @txq: number of queues available
 2367 *
 2368 * If real_num_tx_queues is changed the tc mappings may no longer be
 2369 * valid. To resolve this verify the tc mapping remains valid and if
 2370 * not NULL the mapping. With no priorities mapping to this
 2371 * offset/count pair it will no longer be used. In the worst case TC0
 2372 * is invalid nothing can be done so disable priority mappings. If is
 2373 * expected that drivers will fix this mapping if they can before
 2374 * calling netif_set_real_num_tx_queues.
 2375 */
 2376static void netif_setup_tc(struct net_device *dev, unsigned int txq)
 2377{
 2378	int i;
 2379	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
 2380
 2381	/* If TC0 is invalidated disable TC mapping */
 2382	if (tc->offset + tc->count > txq) {
 2383		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
 2384		dev->num_tc = 0;
 2385		return;
 2386	}
 2387
 2388	/* Invalidated prio to tc mappings set to TC0 */
 2389	for (i = 1; i < TC_BITMASK + 1; i++) {
 2390		int q = netdev_get_prio_tc_map(dev, i);
 2391
 2392		tc = &dev->tc_to_txq[q];
 2393		if (tc->offset + tc->count > txq) {
 2394			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
 2395				i, q);
 2396			netdev_set_prio_tc_map(dev, i, 0);
 2397		}
 2398	}
 2399}
 2400
 2401int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
 2402{
 2403	if (dev->num_tc) {
 2404		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
 2405		int i;
 2406
 2407		/* walk through the TCs and see if it falls into any of them */
 2408		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
 2409			if ((txq - tc->offset) < tc->count)
 2410				return i;
 2411		}
 2412
 2413		/* didn't find it, just return -1 to indicate no match */
 2414		return -1;
 2415	}
 2416
 2417	return 0;
 2418}
 2419EXPORT_SYMBOL(netdev_txq_to_tc);
 2420
 2421#ifdef CONFIG_XPS
 2422struct static_key xps_needed __read_mostly;
 2423EXPORT_SYMBOL(xps_needed);
 2424struct static_key xps_rxqs_needed __read_mostly;
 2425EXPORT_SYMBOL(xps_rxqs_needed);
 2426static DEFINE_MUTEX(xps_map_mutex);
 2427#define xmap_dereference(P)		\
 2428	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
 2429
 2430static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
 2431			     int tci, u16 index)
 2432{
 2433	struct xps_map *map = NULL;
 2434	int pos;
 2435
 2436	if (dev_maps)
 2437		map = xmap_dereference(dev_maps->attr_map[tci]);
 2438	if (!map)
 2439		return false;
 2440
 2441	for (pos = map->len; pos--;) {
 2442		if (map->queues[pos] != index)
 2443			continue;
 2444
 2445		if (map->len > 1) {
 2446			map->queues[pos] = map->queues[--map->len];
 2447			break;
 2448		}
 2449
 2450		RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
 2451		kfree_rcu(map, rcu);
 2452		return false;
 2453	}
 2454
 2455	return true;
 2456}
 2457
 2458static bool remove_xps_queue_cpu(struct net_device *dev,
 2459				 struct xps_dev_maps *dev_maps,
 2460				 int cpu, u16 offset, u16 count)
 2461{
 2462	int num_tc = dev->num_tc ? : 1;
 2463	bool active = false;
 2464	int tci;
 2465
 2466	for (tci = cpu * num_tc; num_tc--; tci++) {
 2467		int i, j;
 2468
 2469		for (i = count, j = offset; i--; j++) {
 2470			if (!remove_xps_queue(dev_maps, tci, j))
 2471				break;
 2472		}
 2473
 2474		active |= i < 0;
 2475	}
 2476
 2477	return active;
 2478}
 2479
 2480static void reset_xps_maps(struct net_device *dev,
 2481			   struct xps_dev_maps *dev_maps,
 2482			   bool is_rxqs_map)
 2483{
 2484	if (is_rxqs_map) {
 2485		static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
 2486		RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
 2487	} else {
 2488		RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
 2489	}
 2490	static_key_slow_dec_cpuslocked(&xps_needed);
 2491	kfree_rcu(dev_maps, rcu);
 2492}
 2493
 2494static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
 2495			   struct xps_dev_maps *dev_maps, unsigned int nr_ids,
 2496			   u16 offset, u16 count, bool is_rxqs_map)
 2497{
 2498	bool active = false;
 2499	int i, j;
 2500
 2501	for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
 2502	     j < nr_ids;)
 2503		active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
 2504					       count);
 2505	if (!active)
 2506		reset_xps_maps(dev, dev_maps, is_rxqs_map);
 2507
 2508	if (!is_rxqs_map) {
 2509		for (i = offset + (count - 1); count--; i--) {
 2510			netdev_queue_numa_node_write(
 2511				netdev_get_tx_queue(dev, i),
 2512				NUMA_NO_NODE);
 2513		}
 2514	}
 2515}
 2516
 2517static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
 2518				   u16 count)
 2519{
 2520	const unsigned long *possible_mask = NULL;
 2521	struct xps_dev_maps *dev_maps;
 2522	unsigned int nr_ids;
 2523
 2524	if (!static_key_false(&xps_needed))
 2525		return;
 2526
 2527	cpus_read_lock();
 2528	mutex_lock(&xps_map_mutex);
 2529
 2530	if (static_key_false(&xps_rxqs_needed)) {
 2531		dev_maps = xmap_dereference(dev->xps_rxqs_map);
 2532		if (dev_maps) {
 2533			nr_ids = dev->num_rx_queues;
 2534			clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
 2535				       offset, count, true);
 2536		}
 2537	}
 2538
 2539	dev_maps = xmap_dereference(dev->xps_cpus_map);
 2540	if (!dev_maps)
 2541		goto out_no_maps;
 2542
 2543	if (num_possible_cpus() > 1)
 2544		possible_mask = cpumask_bits(cpu_possible_mask);
 2545	nr_ids = nr_cpu_ids;
 2546	clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
 2547		       false);
 2548
 2549out_no_maps:
 2550	mutex_unlock(&xps_map_mutex);
 2551	cpus_read_unlock();
 2552}
 2553
 2554static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
 2555{
 2556	netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
 2557}
 2558
 2559static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
 2560				      u16 index, bool is_rxqs_map)
 2561{
 2562	struct xps_map *new_map;
 2563	int alloc_len = XPS_MIN_MAP_ALLOC;
 2564	int i, pos;
 2565
 2566	for (pos = 0; map && pos < map->len; pos++) {
 2567		if (map->queues[pos] != index)
 2568			continue;
 2569		return map;
 2570	}
 2571
 2572	/* Need to add tx-queue to this CPU's/rx-queue's existing map */
 2573	if (map) {
 2574		if (pos < map->alloc_len)
 2575			return map;
 2576
 2577		alloc_len = map->alloc_len * 2;
 2578	}
 2579
 2580	/* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
 2581	 *  map
 2582	 */
 2583	if (is_rxqs_map)
 2584		new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
 2585	else
 2586		new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
 2587				       cpu_to_node(attr_index));
 2588	if (!new_map)
 2589		return NULL;
 2590
 2591	for (i = 0; i < pos; i++)
 2592		new_map->queues[i] = map->queues[i];
 2593	new_map->alloc_len = alloc_len;
 2594	new_map->len = pos;
 2595
 2596	return new_map;
 2597}
 2598
 2599/* Must be called under cpus_read_lock */
 2600int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 2601			  u16 index, bool is_rxqs_map)
 2602{
 2603	const unsigned long *online_mask = NULL, *possible_mask = NULL;
 2604	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
 2605	int i, j, tci, numa_node_id = -2;
 2606	int maps_sz, num_tc = 1, tc = 0;
 2607	struct xps_map *map, *new_map;
 2608	bool active = false;
 2609	unsigned int nr_ids;
 2610
 2611	if (dev->num_tc) {
 2612		/* Do not allow XPS on subordinate device directly */
 2613		num_tc = dev->num_tc;
 2614		if (num_tc < 0)
 2615			return -EINVAL;
 2616
 2617		/* If queue belongs to subordinate dev use its map */
 2618		dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
 2619
 2620		tc = netdev_txq_to_tc(dev, index);
 2621		if (tc < 0)
 2622			return -EINVAL;
 2623	}
 2624
 2625	mutex_lock(&xps_map_mutex);
 2626	if (is_rxqs_map) {
 2627		maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
 2628		dev_maps = xmap_dereference(dev->xps_rxqs_map);
 2629		nr_ids = dev->num_rx_queues;
 2630	} else {
 2631		maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
 2632		if (num_possible_cpus() > 1) {
 2633			online_mask = cpumask_bits(cpu_online_mask);
 2634			possible_mask = cpumask_bits(cpu_possible_mask);
 2635		}
 2636		dev_maps = xmap_dereference(dev->xps_cpus_map);
 2637		nr_ids = nr_cpu_ids;
 2638	}
 2639
 2640	if (maps_sz < L1_CACHE_BYTES)
 2641		maps_sz = L1_CACHE_BYTES;
 2642
 2643	/* allocate memory for queue storage */
 2644	for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
 2645	     j < nr_ids;) {
 2646		if (!new_dev_maps)
 2647			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
 2648		if (!new_dev_maps) {
 2649			mutex_unlock(&xps_map_mutex);
 2650			return -ENOMEM;
 2651		}
 2652
 2653		tci = j * num_tc + tc;
 2654		map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
 2655				 NULL;
 2656
 2657		map = expand_xps_map(map, j, index, is_rxqs_map);
 2658		if (!map)
 2659			goto error;
 2660
 2661		RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 2662	}
 2663
 2664	if (!new_dev_maps)
 2665		goto out_no_new_maps;
 2666
 2667	if (!dev_maps) {
 2668		/* Increment static keys at most once per type */
 2669		static_key_slow_inc_cpuslocked(&xps_needed);
 2670		if (is_rxqs_map)
 2671			static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
 2672	}
 2673
 2674	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 2675	     j < nr_ids;) {
 2676		/* copy maps belonging to foreign traffic classes */
 2677		for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
 2678			/* fill in the new device map from the old device map */
 2679			map = xmap_dereference(dev_maps->attr_map[tci]);
 2680			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 2681		}
 2682
 2683		/* We need to explicitly update tci as prevous loop
 2684		 * could break out early if dev_maps is NULL.
 2685		 */
 2686		tci = j * num_tc + tc;
 2687
 2688		if (netif_attr_test_mask(j, mask, nr_ids) &&
 2689		    netif_attr_test_online(j, online_mask, nr_ids)) {
 2690			/* add tx-queue to CPU/rx-queue maps */
 2691			int pos = 0;
 2692
 2693			map = xmap_dereference(new_dev_maps->attr_map[tci]);
 2694			while ((pos < map->len) && (map->queues[pos] != index))
 2695				pos++;
 2696
 2697			if (pos == map->len)
 2698				map->queues[map->len++] = index;
 2699#ifdef CONFIG_NUMA
 2700			if (!is_rxqs_map) {
 2701				if (numa_node_id == -2)
 2702					numa_node_id = cpu_to_node(j);
 2703				else if (numa_node_id != cpu_to_node(j))
 2704					numa_node_id = -1;
 2705			}
 2706#endif
 2707		} else if (dev_maps) {
 2708			/* fill in the new device map from the old device map */
 2709			map = xmap_dereference(dev_maps->attr_map[tci]);
 2710			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 2711		}
 2712
 2713		/* copy maps belonging to foreign traffic classes */
 2714		for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
 2715			/* fill in the new device map from the old device map */
 2716			map = xmap_dereference(dev_maps->attr_map[tci]);
 2717			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 2718		}
 2719	}
 2720
 2721	if (is_rxqs_map)
 2722		rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
 2723	else
 2724		rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
 2725
 2726	/* Cleanup old maps */
 2727	if (!dev_maps)
 2728		goto out_no_old_maps;
 2729
 2730	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 2731	     j < nr_ids;) {
 2732		for (i = num_tc, tci = j * num_tc; i--; tci++) {
 2733			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
 2734			map = xmap_dereference(dev_maps->attr_map[tci]);
 2735			if (map && map != new_map)
 2736				kfree_rcu(map, rcu);
 2737		}
 2738	}
 2739
 2740	kfree_rcu(dev_maps, rcu);
 2741
 2742out_no_old_maps:
 2743	dev_maps = new_dev_maps;
 2744	active = true;
 2745
 2746out_no_new_maps:
 2747	if (!is_rxqs_map) {
 2748		/* update Tx queue numa node */
 2749		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
 2750					     (numa_node_id >= 0) ?
 2751					     numa_node_id : NUMA_NO_NODE);
 2752	}
 2753
 2754	if (!dev_maps)
 2755		goto out_no_maps;
 2756
 2757	/* removes tx-queue from unused CPUs/rx-queues */
 2758	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 2759	     j < nr_ids;) {
 2760		for (i = tc, tci = j * num_tc; i--; tci++)
 2761			active |= remove_xps_queue(dev_maps, tci, index);
 2762		if (!netif_attr_test_mask(j, mask, nr_ids) ||
 2763		    !netif_attr_test_online(j, online_mask, nr_ids))
 2764			active |= remove_xps_queue(dev_maps, tci, index);
 2765		for (i = num_tc - tc, tci++; --i; tci++)
 2766			active |= remove_xps_queue(dev_maps, tci, index);
 2767	}
 2768
 2769	/* free map if not active */
 2770	if (!active)
 2771		reset_xps_maps(dev, dev_maps, is_rxqs_map);
 2772
 2773out_no_maps:
 2774	mutex_unlock(&xps_map_mutex);
 2775
 2776	return 0;
 2777error:
 2778	/* remove any maps that we added */
 2779	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 2780	     j < nr_ids;) {
 2781		for (i = num_tc, tci = j * num_tc; i--; tci++) {
 2782			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
 2783			map = dev_maps ?
 2784			      xmap_dereference(dev_maps->attr_map[tci]) :
 2785			      NULL;
 2786			if (new_map && new_map != map)
 2787				kfree(new_map);
 2788		}
 2789	}
 2790
 2791	mutex_unlock(&xps_map_mutex);
 2792
 2793	kfree(new_dev_maps);
 2794	return -ENOMEM;
 2795}
 2796EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
 2797
 2798int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
 2799			u16 index)
 2800{
 2801	int ret;
 2802
 2803	cpus_read_lock();
 2804	ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
 2805	cpus_read_unlock();
 2806
 2807	return ret;
 2808}
 2809EXPORT_SYMBOL(netif_set_xps_queue);
 2810
 2811#endif
 2812static void netdev_unbind_all_sb_channels(struct net_device *dev)
 2813{
 2814	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
 2815
 2816	/* Unbind any subordinate channels */
 2817	while (txq-- != &dev->_tx[0]) {
 2818		if (txq->sb_dev)
 2819			netdev_unbind_sb_channel(dev, txq->sb_dev);
 2820	}
 2821}
 2822
 2823void netdev_reset_tc(struct net_device *dev)
 2824{
 2825#ifdef CONFIG_XPS
 2826	netif_reset_xps_queues_gt(dev, 0);
 2827#endif
 2828	netdev_unbind_all_sb_channels(dev);
 2829
 2830	/* Reset TC configuration of device */
 2831	dev->num_tc = 0;
 2832	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
 2833	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
 2834}
 2835EXPORT_SYMBOL(netdev_reset_tc);
 2836
 2837int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
 2838{
 2839	if (tc >= dev->num_tc)
 2840		return -EINVAL;
 2841
 2842#ifdef CONFIG_XPS
 2843	netif_reset_xps_queues(dev, offset, count);
 2844#endif
 2845	dev->tc_to_txq[tc].count = count;
 2846	dev->tc_to_txq[tc].offset = offset;
 2847	return 0;
 2848}
 2849EXPORT_SYMBOL(netdev_set_tc_queue);
 2850
 2851int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
 2852{
 2853	if (num_tc > TC_MAX_QUEUE)
 2854		return -EINVAL;
 2855
 2856#ifdef CONFIG_XPS
 2857	netif_reset_xps_queues_gt(dev, 0);
 2858#endif
 2859	netdev_unbind_all_sb_channels(dev);
 2860
 2861	dev->num_tc = num_tc;
 2862	return 0;
 2863}
 2864EXPORT_SYMBOL(netdev_set_num_tc);
 2865
 2866void netdev_unbind_sb_channel(struct net_device *dev,
 2867			      struct net_device *sb_dev)
 2868{
 2869	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
 2870
 2871#ifdef CONFIG_XPS
 2872	netif_reset_xps_queues_gt(sb_dev, 0);
 2873#endif
 2874	memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
 2875	memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
 2876
 2877	while (txq-- != &dev->_tx[0]) {
 2878		if (txq->sb_dev == sb_dev)
 2879			txq->sb_dev = NULL;
 2880	}
 2881}
 2882EXPORT_SYMBOL(netdev_unbind_sb_channel);
 2883
 2884int netdev_bind_sb_channel_queue(struct net_device *dev,
 2885				 struct net_device *sb_dev,
 2886				 u8 tc, u16 count, u16 offset)
 2887{
 2888	/* Make certain the sb_dev and dev are already configured */
 2889	if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
 2890		return -EINVAL;
 2891
 2892	/* We cannot hand out queues we don't have */
 2893	if ((offset + count) > dev->real_num_tx_queues)
 2894		return -EINVAL;
 2895
 2896	/* Record the mapping */
 2897	sb_dev->tc_to_txq[tc].count = count;
 2898	sb_dev->tc_to_txq[tc].offset = offset;
 2899
 2900	/* Provide a way for Tx queue to find the tc_to_txq map or
 2901	 * XPS map for itself.
 2902	 */
 2903	while (count--)
 2904		netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
 2905
 2906	return 0;
 2907}
 2908EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
 2909
 2910int netdev_set_sb_channel(struct net_device *dev, u16 channel)
 2911{
 2912	/* Do not use a multiqueue device to represent a subordinate channel */
 2913	if (netif_is_multiqueue(dev))
 2914		return -ENODEV;
 2915
 2916	/* We allow channels 1 - 32767 to be used for subordinate channels.
 2917	 * Channel 0 is meant to be "native" mode and used only to represent
 2918	 * the main root device. We allow writing 0 to reset the device back
 2919	 * to normal mode after being used as a subordinate channel.
 2920	 */
 2921	if (channel > S16_MAX)
 2922		return -EINVAL;
 2923
 2924	dev->num_tc = -channel;
 2925
 2926	return 0;
 2927}
 2928EXPORT_SYMBOL(netdev_set_sb_channel);
 2929
 2930/*
 2931 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 2932 * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
 2933 */
 2934int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
 2935{
 2936	bool disabling;
 2937	int rc;
 2938
 2939	disabling = txq < dev->real_num_tx_queues;
 2940
 2941	if (txq < 1 || txq > dev->num_tx_queues)
 2942		return -EINVAL;
 2943
 2944	if (dev->reg_state == NETREG_REGISTERED ||
 2945	    dev->reg_state == NETREG_UNREGISTERING) {
 2946		ASSERT_RTNL();
 2947
 2948		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
 2949						  txq);
 2950		if (rc)
 2951			return rc;
 2952
 2953		if (dev->num_tc)
 2954			netif_setup_tc(dev, txq);
 2955
 2956		dev->real_num_tx_queues = txq;
 2957
 2958		if (disabling) {
 2959			synchronize_net();
 2960			qdisc_reset_all_tx_gt(dev, txq);
 2961#ifdef CONFIG_XPS
 2962			netif_reset_xps_queues_gt(dev, txq);
 2963#endif
 2964		}
 2965	} else {
 2966		dev->real_num_tx_queues = txq;
 2967	}
 2968
 2969	return 0;
 2970}
 2971EXPORT_SYMBOL(netif_set_real_num_tx_queues);
 2972
 2973#ifdef CONFIG_SYSFS
 2974/**
 2975 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 2976 *	@dev: Network device
 2977 *	@rxq: Actual number of RX queues
 2978 *
 2979 *	This must be called either with the rtnl_lock held or before
 2980 *	registration of the net device.  Returns 0 on success, or a
 2981 *	negative error code.  If called before registration, it always
 2982 *	succeeds.
 2983 */
 2984int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
 2985{
 2986	int rc;
 2987
 2988	if (rxq < 1 || rxq > dev->num_rx_queues)
 2989		return -EINVAL;
 2990
 2991	if (dev->reg_state == NETREG_REGISTERED) {
 2992		ASSERT_RTNL();
 2993
 2994		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
 2995						  rxq);
 2996		if (rc)
 2997			return rc;
 2998	}
 2999
 3000	dev->real_num_rx_queues = rxq;
 3001	return 0;
 3002}
 3003EXPORT_SYMBOL(netif_set_real_num_rx_queues);
 3004#endif
 3005
 3006/**
 3007 * netif_get_num_default_rss_queues - default number of RSS queues
 3008 *
 3009 * This routine should set an upper limit on the number of RSS queues
 3010 * used by default by multiqueue devices.
 3011 */
 3012int netif_get_num_default_rss_queues(void)
 3013{
 3014	return is_kdump_kernel() ?
 3015		1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
 3016}
 3017EXPORT_SYMBOL(netif_get_num_default_rss_queues);
 3018
 3019static void __netif_reschedule(struct Qdisc *q)
 3020{
 3021	struct softnet_data *sd;
 3022	unsigned long flags;
 3023
 3024	local_irq_save(flags);
 3025	sd = this_cpu_ptr(&softnet_data);
 3026	q->next_sched = NULL;
 3027	*sd->output_queue_tailp = q;
 3028	sd->output_queue_tailp = &q->next_sched;
 3029	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 3030	local_irq_restore(flags);
 3031}
 3032
 3033void __netif_schedule(struct Qdisc *q)
 3034{
 3035	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
 3036		__netif_reschedule(q);
 3037}
 3038EXPORT_SYMBOL(__netif_schedule);
 3039
 3040struct dev_kfree_skb_cb {
 3041	enum skb_free_reason reason;
 3042};
 3043
 3044static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
 3045{
 3046	return (struct dev_kfree_skb_cb *)skb->cb;
 3047}
 3048
 3049void netif_schedule_queue(struct netdev_queue *txq)
 3050{
 3051	rcu_read_lock();
 3052	if (!netif_xmit_stopped(txq)) {
 3053		struct Qdisc *q = rcu_dereference(txq->qdisc);
 3054
 3055		__netif_schedule(q);
 3056	}
 3057	rcu_read_unlock();
 3058}
 3059EXPORT_SYMBOL(netif_schedule_queue);
 3060
 3061void netif_tx_wake_queue(struct netdev_queue *dev_queue)
 3062{
 3063	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
 3064		struct Qdisc *q;
 3065
 3066		rcu_read_lock();
 3067		q = rcu_dereference(dev_queue->qdisc);
 3068		__netif_schedule(q);
 3069		rcu_read_unlock();
 3070	}
 3071}
 3072EXPORT_SYMBOL(netif_tx_wake_queue);
 3073
/*
 * __dev_kfree_skb_irq - drop a reference on @skb and defer the actual free
 * @skb: buffer to release; NULL is accepted and ignored
 * @reason: stored in skb->cb for whoever drains the completion queue
 *
 * If this call releases the last reference, the skb is pushed onto this
 * CPU's softnet_data.completion_queue and NET_TX_SOFTIRQ is raised.  The
 * skb itself is not freed here.
 */
void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
{
	unsigned long flags;

	if (unlikely(!skb))
		return;

	if (likely(refcount_read(&skb->users) == 1)) {
		/* Sole holder: zero the count directly instead of decrementing. */
		smp_rmb();
		refcount_set(&skb->users, 0);
	} else if (likely(!refcount_dec_and_test(&skb->users))) {
		/* Other references remain, so there is nothing to queue. */
		return;
	}
	get_kfree_skb_cb(skb)->reason = reason;
	/* The per-CPU list is modified with local interrupts disabled. */
	local_irq_save(flags);
	skb->next = __this_cpu_read(softnet_data.completion_queue);
	__this_cpu_write(softnet_data.completion_queue, skb);
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(__dev_kfree_skb_irq);
 3095
 3096void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
 3097{
 3098	if (in_irq() || irqs_disabled())
 3099		__dev_kfree_skb_irq(skb, reason);
 3100	else
 3101		dev_kfree_skb(skb);
 3102}
 3103EXPORT_SYMBOL(__dev_kfree_skb_any);
 3104
 3105
 3106/**
 3107 * netif_device_detach - mark device as removed
 3108 * @dev: network device
 3109 *
 3110 * Mark device as removed from system and therefore no longer available.
 3111 */
 3112void netif_device_detach(struct net_device *dev)
 3113{
 3114	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
 3115	    netif_running(dev)) {
 3116		netif_tx_stop_all_queues(dev);
 3117	}
 3118}
 3119EXPORT_SYMBOL(netif_device_detach);
 3120
 3121/**
 3122 * netif_device_attach - mark device as attached
 3123 * @dev: network device
 3124 *
 3125 * Mark device as attached from system and restart if needed.
 3126 */
 3127void netif_device_attach(struct net_device *dev)
 3128{
 3129	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
 3130	    netif_running(dev)) {
 3131		netif_tx_wake_all_queues(dev);
 3132		__netdev_watchdog_up(dev);
 3133	}
 3134}
 3135EXPORT_SYMBOL(netif_device_attach);
 3136
 3137/*
 3138 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
 3139 * to be used as a distribution range.
 3140 */
 3141static u16 skb_tx_hash(const struct net_device *dev,
 3142		       const struct net_device *sb_dev,
 3143		       struct sk_buff *skb)
 3144{
 3145	u32 hash;
 3146	u16 qoffset = 0;
 3147	u16 qcount = dev->real_num_tx_queues;
 3148
 3149	if (dev->num_tc) {
 3150		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
 3151
 3152		qoffset = sb_dev->tc_to_txq[tc].offset;
 3153		qcount = sb_dev->tc_to_txq[tc].count;
 3154	}
 3155
 3156	if (skb_rx_queue_recorded(skb)) {
 3157		hash = skb_get_rx_queue(skb);
 3158		if (hash >= qoffset)
 3159			hash -= qoffset;
 3160		while (unlikely(hash >= qcount))
 3161			hash -= qcount;
 3162		return hash + qoffset;
 3163	}
 3164
 3165	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
 3166}
 3167
 3168static void skb_warn_bad_offload(const struct sk_buff *skb)
 3169{
 3170	static const netdev_features_t null_features;
 3171	struct net_device *dev = skb->dev;
 3172	const char *name = "";
 3173
 3174	if (!net_ratelimit())
 3175		return;
 3176
 3177	if (dev) {
 3178		if (dev->dev.parent)
 3179			name = dev_driver_string(dev->dev.parent);
 3180		else
 3181			name = netdev_name(dev);
 3182	}
 3183	skb_dump(KERN_WARNING, skb, false);
 3184	WARN(1, "%s: caps=(%pNF, %pNF)\n",
 3185	     name, dev ? &dev->features : &null_features,
 3186	     skb->sk ? &skb->sk->sk_route_caps : &null_features);
 3187}
 3188
 3189/*
 3190 * Invalidate hardware checksum when packet is to be mangled, and
 3191 * complete checksum manually on outgoing path.
 3192 */
 3193int skb_checksum_help(struct sk_buff *skb)
 3194{
 3195	__wsum csum;
 3196	int ret = 0, offset;
 3197
 3198	if (skb->ip_summed == CHECKSUM_COMPLETE)
 3199		goto out_set_summed;
 3200
 3201	if (unlikely(skb_shinfo(skb)->gso_size)) {
 3202		skb_warn_bad_offload(skb);
 3203		return -EINVAL;
 3204	}
 3205
 3206	/* Before computing a checksum, we should make sure no frag could
 3207	 * be modified by an external entity : checksum could be wrong.
 3208	 */
 3209	if (skb_has_shared_frag(skb)) {
 3210		ret = __skb_linearize(skb);
 3211		if (ret)
 3212			goto out;
 3213	}
 3214
 3215	offset = skb_checksum_start_offset(skb);
 3216	BUG_ON(offset >= skb_headlen(skb));
 3217	csum = skb_checksum(skb, offset, skb->len - offset, 0);
 3218
 3219	offset += skb->csum_offset;
 3220	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
 3221
 3222	ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
 3223	if (ret)
 3224		goto out;
 3225
 3226	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
 3227out_set_summed:
 3228	skb->ip_summed = CHECKSUM_NONE;
 3229out:
 3230	return ret;
 3231}
 3232EXPORT_SYMBOL(skb_checksum_help);
 3233
 3234int skb_crc32c_csum_help(struct sk_buff *skb)
 3235{
 3236	__le32 crc32c_csum;
 3237	int ret = 0, offset, start;
 3238
 3239	if (skb->ip_summed != CHECKSUM_PARTIAL)
 3240		goto out;
 3241
 3242	if (unlikely(skb_is_gso(skb)))
 3243		goto out;
 3244
 3245	/* Before computing a checksum, we should make sure no frag could
 3246	 * be modified by an external entity : checksum could be wrong.
 3247	 */
 3248	if (unlikely(skb_has_shared_frag(skb))) {
 3249		ret = __skb_linearize(skb);
 3250		if (ret)
 3251			goto out;
 3252	}
 3253	start = skb_checksum_start_offset(skb);
 3254	offset = start + offsetof(struct sctphdr, checksum);
 3255	if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
 3256		ret = -EINVAL;
 3257		goto out;
 3258	}
 3259
 3260	ret = skb_ensure_writable(skb, offset + sizeof(__le32));
 3261	if (ret)
 3262		goto out;
 3263
 3264	crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
 3265						  skb->len - start, ~(__u32)0,
 3266						  crc32c_csum_stub));
 3267	*(__le32 *)(skb->data + offset) = crc32c_csum;
 3268	skb->ip_summed = CHECKSUM_NONE;
 3269	skb->csum_not_inet = 0;
 3270out:
 3271	return ret;
 3272}
 3273
 3274__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
 3275{
 3276	__be16 type = skb->protocol;
 3277
 3278	/* Tunnel gso handlers can set protocol to ethernet. */
 3279	if (type == htons(ETH_P_TEB)) {
 3280		struct ethhdr *eth;
 3281
 3282		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
 3283			return 0;
 3284
 3285		eth = (struct ethhdr *)skb->data;
 3286		type = eth->h_proto;
 3287	}
 3288
 3289	return __vlan_get_protocol(skb, type, depth);
 3290}
 3291
 3292/**
 3293 *	skb_mac_gso_segment - mac layer segmentation handler.
 3294 *	@skb: buffer to segment
 3295 *	@features: features for the output path (see dev->features)
 3296 */
 3297struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
 3298				    netdev_features_t features)
 3299{
 3300	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
 3301	struct packet_offload *ptype;
 3302	int vlan_depth = skb->mac_len;
 3303	__be16 type = skb_network_protocol(skb, &vlan_depth);
 3304
 3305	if (unlikely(!type))
 3306		return ERR_PTR(-EINVAL);
 3307
 3308	__skb_pull(skb, vlan_depth);
 3309
 3310	rcu_read_lock();
 3311	list_for_each_entry_rcu(ptype, &offload_base, list) {
 3312		if (ptype->type == type && ptype->callbacks.gso_segment) {
 3313			segs = ptype->callbacks.gso_segment(skb, features);
 3314			break;
 3315		}
 3316	}
 3317	rcu_read_unlock();
 3318
 3319	__skb_push(skb, skb->data - skb_mac_header(skb));
 3320
 3321	return segs;
 3322}
 3323EXPORT_SYMBOL(skb_mac_gso_segment);
 3324
 3325
 3326/* openvswitch calls this on rx path, so we need a different check.
 3327 */
 3328static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
 3329{
 3330	if (tx_path)
 3331		return skb->ip_summed != CHECKSUM_PARTIAL &&
 3332		       skb->ip_summed != CHECKSUM_UNNECESSARY;
 3333
 3334	return skb->ip_summed == CHECKSUM_NONE;
 3335}
 3336
 3337/**
 3338 *	__skb_gso_segment - Perform segmentation on skb.
 3339 *	@skb: buffer to segment
 3340 *	@features: features for the output path (see dev->features)
 3341 *	@tx_path: whether it is called in TX path
 3342 *
 3343 *	This function segments the given skb and returns a list of segments.
 3344 *
 3345 *	It may return NULL if the skb requires no segmentation.  This is
 3346 *	only possible when GSO is used for verifying header integrity.
 3347 *
 3348 *	Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
 3349 */
 3350struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
 3351				  netdev_features_t features, bool tx_path)
 3352{
 3353	struct sk_buff *segs;
 3354
 3355	if (unlikely(skb_needs_check(skb, tx_path))) {
 3356		int err;
 3357
 3358		/* We're going to init ->check field in TCP or UDP header */
 3359		err = skb_cow_head(skb, 0);
 3360		if (err < 0)
 3361			return ERR_PTR(err);
 3362	}
 3363
 3364	/* Only report GSO partial support if it will enable us to
 3365	 * support segmentation on this frame without needing additional
 3366	 * work.
 3367	 */
 3368	if (features & NETIF_F_GSO_PARTIAL) {
 3369		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
 3370		struct net_device *dev = skb->dev;
 3371
 3372		partial_features |= dev->features & dev->gso_partial_features;
 3373		if (!skb_gso_ok(skb, features | partial_features))
 3374			features &= ~NETIF_F_GSO_PARTIAL;
 3375	}
 3376
 3377	BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
 3378		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
 3379
 3380	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
 3381	SKB_GSO_CB(skb)->encap_level = 0;
 3382
 3383	skb_reset_mac_header(skb);
 3384	skb_reset_mac_len(skb);
 3385
 3386	segs = skb_mac_gso_segment(skb, features);
 3387
 3388	if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
 3389		skb_warn_bad_offload(skb);
 3390
 3391	return segs;
 3392}
 3393EXPORT_SYMBOL(__skb_gso_segment);
 3394
 3395/* Take action when hardware reception checksum errors are detected. */
 3396#ifdef CONFIG_BUG
 3397void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
 3398{
 3399	if (net_ratelimit()) {
 3400		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
 3401		skb_dump(KERN_ERR, skb, true);
 3402		dump_stack();
 3403	}
 3404}
 3405EXPORT_SYMBOL(netdev_rx_csum_fault);
 3406#endif
 3407
 3408/* XXX: check that highmem exists at all on the given machine. */
 3409static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 3410{
 3411#ifdef CONFIG_HIGHMEM
 3412	int i;
 3413
 3414	if (!(dev->features & NETIF_F_HIGHDMA)) {
 3415		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 3416			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 3417
 3418			if (PageHighMem(skb_frag_page(frag)))
 3419				return 1;
 3420		}
 3421	}
 3422#endif
 3423	return 0;
 3424}
 3425
 3426/* If MPLS offload request, verify we are testing hardware MPLS features
 3427 * instead of standard features for the netdev.
 3428 */
 3429#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
 3430static netdev_features_t net_mpls_features(struct sk_buff *skb,
 3431					   netdev_features_t features,
 3432					   __be16 type)
 3433{
 3434	if (eth_p_mpls(type))
 3435		features &= skb->dev->mpls_features;
 3436
 3437	return features;
 3438}
 3439#else
 3440static netdev_features_t net_mpls_features(struct sk_buff *skb,
 3441					   netdev_features_t features,
 3442					   __be16 type)
 3443{
 3444	return features;
 3445}
 3446#endif
 3447
 3448static netdev_features_t harmonize_features(struct sk_buff *skb,
 3449	netdev_features_t features)
 3450{
 3451	int tmp;
 3452	__be16 type;
 3453
 3454	type = skb_network_protocol(skb, &tmp);
 3455	features = net_mpls_features(skb, features, type);
 3456
 3457	if (skb->ip_summed != CHECKSUM_NONE &&
 3458	    !can_checksum_protocol(features, type)) {
 3459		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
 3460	}
 3461	if (illegal_highdma(skb->dev, skb))
 3462		features &= ~NETIF_F_SG;
 3463
 3464	return features;
 3465}
 3466
/*
 * Features-check callback that imposes no restriction: @features is
 * returned unchanged, @skb and @dev are not looked at.
 */
netdev_features_t passthru_features_check(struct sk_buff *skb,
					  struct net_device *dev,
					  netdev_features_t features)
{
	return features;
}
EXPORT_SYMBOL(passthru_features_check);
 3474
/*
 * Default features check, used by netif_skb_features() when the driver
 * provides no ndo_features_check: defers to vlan_features_check().
 * @dev is not used.
 */
static netdev_features_t dflt_features_check(struct sk_buff *skb,
					     struct net_device *dev,
					     netdev_features_t features)
{
	return vlan_features_check(skb, features);
}
 3481
 3482static netdev_features_t gso_features_check(const struct sk_buff *skb,
 3483					    struct net_device *dev,
 3484					    netdev_features_t features)
 3485{
 3486	u16 gso_segs = skb_shinfo(skb)->gso_segs;
 3487
 3488	if (gso_segs > dev->gso_max_segs)
 3489		return features & ~NETIF_F_GSO_MASK;
 3490
 3491	/* Support for GSO partial features requires software
 3492	 * intervention before we can actually process the packets
 3493	 * so we need to strip support for any partial features now
 3494	 * and we can pull them back in after we have partially
 3495	 * segmented the frame.
 3496	 */
 3497	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
 3498		features &= ~dev->gso_partial_features;
 3499
 3500	/* Make sure to clear the IPv4 ID mangling feature if the
 3501	 * IPv4 header has the potential to be fragmented.
 3502	 */
 3503	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
 3504		struct iphdr *iph = skb->encapsulation ?
 3505				    inner_ip_hdr(skb) : ip_hdr(skb);
 3506
 3507		if (!(iph->frag_off & htons(IP_DF)))
 3508			features &= ~NETIF_F_TSO_MANGLEID;
 3509	}
 3510
 3511	return features;
 3512}
 3513
 3514netdev_features_t netif_skb_features(struct sk_buff *skb)
 3515{
 3516	struct net_device *dev = skb->dev;
 3517	netdev_features_t features = dev->features;
 3518
 3519	if (skb_is_gso(skb))
 3520		features = gso_features_check(skb, dev, features);
 3521
 3522	/* If encapsulation offload request, verify we are testing
 3523	 * hardware encapsulation features instead of standard
 3524	 * features for the netdev
 3525	 */
 3526	if (skb->encapsulation)
 3527		features &= dev->hw_enc_features;
 3528
 3529	if (skb_vlan_tagged(skb))
 3530		features = netdev_intersect_features(features,
 3531						     dev->vlan_features |
 3532						     NETIF_F_HW_VLAN_CTAG_TX |
 3533						     NETIF_F_HW_VLAN_STAG_TX);
 3534
 3535	if (dev->netdev_ops->ndo_features_check)
 3536		features &= dev->netdev_ops->ndo_features_check(skb, dev,
 3537								features);
 3538	else
 3539		features &= dflt_features_check(skb, dev, features);
 3540
 3541	return harmonize_features(skb, features);
 3542}
 3543EXPORT_SYMBOL(netif_skb_features);
 3544
 3545static int xmit_one(struct sk_buff *skb, struct net_device *dev,
 3546		    struct netdev_queue *txq, bool more)
 3547{
 3548	unsigned int len;
 3549	int rc;
 3550
 3551	if (dev_nit_active(dev))
 3552		dev_queue_xmit_nit(skb, dev);
 3553
 3554	len = skb->len;
 3555	trace_net_dev_start_xmit(skb, dev);
 3556	rc = netdev_start_xmit(skb, dev, txq, more);
 3557	trace_net_dev_xmit(skb, rc, dev, len);
 3558
 3559	return rc;
 3560}
 3561
 3562struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
 3563				    struct netdev_queue *txq, int *ret)
 3564{
 3565	struct sk_buff *skb = first;
 3566	int rc = NETDEV_TX_OK;
 3567
 3568	while (skb) {
 3569		struct sk_buff *next = skb->next;
 3570
 3571		skb_mark_not_on_list(skb);
 3572		rc = xmit_one(skb, dev, txq, next != NULL);
 3573		if (unlikely(!dev_xmit_complete(rc))) {
 3574			skb->next = next;
 3575			goto out;
 3576		}
 3577
 3578		skb = next;
 3579		if (netif_tx_queue_stopped(txq) && skb) {
 3580			rc = NETDEV_TX_BUSY;
 3581			break;
 3582		}
 3583	}
 3584
 3585out:
 3586	*ret = rc;
 3587	return skb;
 3588}
 3589
 3590static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
 3591					  netdev_features_t features)
 3592{
 3593	if (skb_vlan_tag_present(skb) &&
 3594	    !vlan_hw_offload_capable(features, skb->vlan_proto))
 3595		skb = __vlan_hwaccel_push_inside(skb);
 3596	return skb;
 3597}
 3598
 3599int skb_csum_hwoffload_help(struct sk_buff *skb,
 3600			    const netdev_features_t features)
 3601{
 3602	if (unlikely(skb->csum_not_inet))
 3603		return !!(features & NETIF_F_SCTP_CRC) ? 0 :
 3604			skb_crc32c_csum_help(skb);
 3605
 3606	return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
 3607}
 3608EXPORT_SYMBOL(skb_csum_hwoffload_help);
 3609
 3610static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
 3611{
 3612	netdev_features_t features;
 3613
 3614	features = netif_skb_features(skb);
 3615	skb = validate_xmit_vlan(skb, features);
 3616	if (unlikely(!skb))
 3617		goto out_null;
 3618
 3619	skb = sk_validate_xmit_skb(skb, dev);
 3620	if (unlikely(!skb))
 3621		goto out_null;
 3622
 3623	if (netif_needs_gso(skb, features)) {
 3624		struct sk_buff *segs;
 3625
 3626		segs = skb_gso_segment(skb, features);
 3627		if (IS_ERR(segs)) {
 3628			goto out_kfree_skb;
 3629		} else if (segs) {
 3630			consume_skb(skb);
 3631			skb = segs;
 3632		}
 3633	} else {
 3634		if (skb_needs_linearize(skb, features) &&
 3635		    __skb_linearize(skb))
 3636			goto out_kfree_skb;
 3637
 3638		/* If packet is not checksummed and device does not
 3639		 * support checksumming for this protocol, complete
 3640		 * checksumming here.
 3641		 */
 3642		if (skb->ip_summed == CHECKSUM_PARTIAL) {
 3643			if (skb->encapsulation)
 3644				skb_set_inner_transport_header(skb,
 3645							       skb_checksum_start_offset(skb));
 3646			else
 3647				skb_set_transport_header(skb,
 3648							 skb_checksum_start_offset(skb));
 3649			if (skb_csum_hwoffload_help(skb, features))
 3650				goto out_kfree_skb;
 3651		}
 3652	}
 3653
 3654	skb = validate_xmit_xfrm(skb, features, again);
 3655
 3656	return skb;
 3657
 3658out_kfree_skb:
 3659	kfree_skb(skb);
 3660out_null:
 3661	atomic_long_inc(&dev->tx_dropped);
 3662	return NULL;
 3663}
 3664
 3665struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
 3666{
 3667	struct sk_buff *next, *head = NULL, *tail;
 3668
 3669	for (; skb != NULL; skb = next) {
 3670		next = skb->next;
 3671		skb_mark_not_on_list(skb);
 3672
 3673		/* in case skb wont be segmented, point to itself */
 3674		skb->prev = skb;
 3675
 3676		skb = validate_xmit_skb(skb, dev, again);
 3677		if (!skb)
 3678			continue;
 3679
 3680		if (!head)
 3681			head = skb;
 3682		else
 3683			tail->next = skb;
 3684		/* If skb was segmented, skb->prev points to
 3685		 * the last segment. If not, it still contains skb.
 3686		 */
 3687		tail = skb->prev;
 3688	}
 3689	return head;
 3690}
 3691EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
 3692
 3693static void qdisc_pkt_len_init(struct sk_buff *skb)
 3694{
 3695	const struct skb_shared_info *shinfo = skb_shinfo(skb);
 3696
 3697	qdisc_skb_cb(skb)->pkt_len = skb->len;
 3698
 3699	/* To get more precise estimation of bytes sent on wire,
 3700	 * we add to pkt_len the headers size of all segments
 3701	 */
 3702	if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
 3703		unsigned int hdr_len;
 3704		u16 gso_segs = shinfo->gso_segs;
 3705
 3706		/* mac layer + network layer */
 3707		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
 3708
 3709		/* + transport layer */
 3710		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
 3711			const struct tcphdr *th;
 3712			struct tcphdr _tcphdr;
 3713
 3714			th = skb_header_pointer(skb, skb_transport_offset(skb),
 3715						sizeof(_tcphdr), &_tcphdr);
 3716			if (likely(th))
 3717				hdr_len += __tcp_hdrlen(th);
 3718		} else {
 3719			struct udphdr _udphdr;
 3720
 3721			if (skb_header_pointer(skb, skb_transport_offset(skb),
 3722					       sizeof(_udphdr), &_udphdr))
 3723				hdr_len += sizeof(struct udphdr);
 3724		}
 3725
 3726		if (shinfo->gso_type & SKB_GSO_DODGY)
 3727			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
 3728						shinfo->gso_size);
 3729
 3730		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
 3731	}
 3732}
 3733
/*
 * Hand @skb to qdisc @q for device @dev / tx queue @txq and return the
 * resulting NET_XMIT_* code.
 *
 * For TCQ_F_NOLOCK qdiscs the skb is enqueued and the qdisc run without
 * taking the root lock.  Otherwise, under the root lock, the skb is either
 * dropped (qdisc deactivated), transmitted directly (bypass-capable qdisc
 * that is empty and not running) or enqueued.  Skbs collected on the
 * to_free list are freed only after the root lock has been released.
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	struct sk_buff *to_free = NULL;
	bool contended;
	int rc;

	qdisc_calculate_pkt_len(skb, q);

	if (q->flags & TCQ_F_NOLOCK) {
		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
		qdisc_run(q);

		if (unlikely(to_free))
			kfree_skb_list(to_free);
		return rc;
	}

	/*
	 * Heuristic to force contended enqueues to serialize on a
	 * separate lock before trying to get qdisc main lock.
	 * This permits qdisc->running owner to get the lock more
	 * often and dequeue packets faster.
	 */
	contended = qdisc_is_running(q);
	if (unlikely(contended))
		spin_lock(&q->busylock);

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		__qdisc_drop(skb, &to_free);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   qdisc_run_begin(q)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */

		qdisc_bstats_update(q, skb);

		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
			/* Release busylock before the (longer) qdisc run. */
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		}

		qdisc_run_end(q);
		rc = NET_XMIT_SUCCESS;
	} else {
		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
		if (qdisc_run_begin(q)) {
			/* Release busylock before the (longer) qdisc run. */
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
			qdisc_run_end(q);
		}
	}
	spin_unlock(root_lock);
	if (unlikely(to_free))
		kfree_skb_list(to_free);
	/* Still set only if busylock was taken and not released above. */
	if (unlikely(contended))
		spin_unlock(&q->busylock);
	return rc;
}
 3806
 3807#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 3808static void skb_update_prio(struct sk_buff *skb)
 3809{
 3810	const struct netprio_map *map;
 3811	const struct sock *sk;
 3812	unsigned int prioidx;
 3813
 3814	if (skb->priority)
 3815		return;
 3816	map = rcu_dereference_bh(skb->dev->priomap);
 3817	if (!map)
 3818		return;
 3819	sk = skb_to_full_sk(skb);
 3820	if (!sk)
 3821		return;
 3822
 3823	prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
 3824
 3825	if (prioidx < map->priomap_len)
 3826		skb->priority = map->priomap[prioidx];
 3827}
 3828#else
 3829#define skb_update_prio(skb)
 3830#endif
 3831
/**
 *	dev_loopback_xmit - loop back @skb
 *	@net: network namespace this loopback is happening in
 *	@sk:  sk needed to be a netfilter okfn
 *	@skb: buffer to transmit
 *
 *	Marks @skb as a looped-back packet whose checksum needs no
 *	verification and feeds it to the receive path via netif_rx_ni().
 *	@net and @sk are not used by the body.  Always returns 0.
 */
int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb_reset_mac_header(skb);
	/* Advance skb->data to the network header. */
	__skb_pull(skb, skb_network_offset(skb));
	skb->pkt_type = PACKET_LOOPBACK;
	skb->ip_summed = CHECKSUM_UNNECESSARY;
	/* A looped-back skb is expected to carry a dst. */
	WARN_ON(!skb_dst(skb));
	skb_dst_force(skb);
	netif_rx_ni(skb);
	return 0;
}
EXPORT_SYMBOL(dev_loopback_xmit);
 3850
 3851#ifdef CONFIG_NET_EGRESS
 3852static struct sk_buff *
 3853sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 3854{
 3855	struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
 3856	struct tcf_result cl_res;
 3857
 3858	if (!miniq)
 3859		return skb;
 3860
 3861	/* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
 3862	mini_qdisc_bstats_cpu_update(miniq, skb);
 3863
 3864	switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
 3865	case TC_ACT_OK:
 3866	case TC_ACT_RECLASSIFY:
 3867		skb->tc_index = TC_H_MIN(cl_res.classid);
 3868		break;
 3869	case TC_ACT_SHOT:
 3870		mini_qdisc_qstats_cpu_drop(miniq);
 3871		*ret = NET_XMIT_DROP;
 3872		kfree_skb(skb);
 3873		return NULL;
 3874	case TC_ACT_STOLEN:
 3875	case TC_ACT_QUEUED:
 3876	case TC_ACT_TRAP:
 3877		*ret = NET_XMIT_SUCCESS;
 3878		consume_skb(skb);
 3879		return NULL;
 3880	case TC_ACT_REDIRECT:
 3881		/* No need to push/pop skb's mac_header here on egress! */
 3882		skb_do_redirect(skb);
 3883		*ret = NET_XMIT_SUCCESS;
 3884		return NULL;
 3885	default:
 3886		break;
 3887	}
 3888
 3889	return skb;
 3890}
 3891#endif /* CONFIG_NET_EGRESS */
 3892
 3893#ifdef CONFIG_XPS
 3894static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
 3895			       struct xps_dev_maps *dev_maps, unsigned int tci)
 3896{
 3897	struct xps_map *map;
 3898	int queue_index = -1;
 3899
 3900	if (dev->num_tc) {
 3901		tci *= dev->num_tc;
 3902		tci += netdev_get_prio_tc_map(dev, skb->priority);
 3903	}
 3904
 3905	map = rcu_dereference(dev_maps->attr_map[tci]);
 3906	if (map) {
 3907		if (map->len == 1)
 3908			queue_index = map->queues[0];
 3909		else
 3910			queue_index = map->queues[reciprocal_scale(
 3911						skb_get_hash(skb), map->len)];
 3912		if (unlikely(queue_index >= dev->real_num_tx_queues))
 3913			queue_index = -1;
 3914	}
 3915	return queue_index;
 3916}
 3917#endif
 3918
 3919static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
 3920			 struct sk_buff *skb)
 3921{
 3922#ifdef CONFIG_XPS
 3923	struct xps_dev_maps *dev_maps;
 3924	struct sock *sk = skb->sk;
 3925	int queue_index = -1;
 3926
 3927	if (!static_key_false(&xps_needed))
 3928		return -1;
 3929
 3930	rcu_read_lock();
 3931	if (!static_key_false(&xps_rxqs_needed))
 3932		goto get_cpus_map;
 3933
 3934	dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
 3935	if (dev_maps) {
 3936		int tci = sk_rx_queue_get(sk);
 3937
 3938		if (tci >= 0 && tci < dev->num_rx_queues)
 3939			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
 3940							  tci);
 3941	}
 3942
 3943get_cpus_map:
 3944	if (queue_index < 0) {
 3945		dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
 3946		if (dev_maps) {
 3947			unsigned int tci = skb->sender_cpu - 1;
 3948
 3949			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
 3950							  tci);
 3951		}
 3952	}
 3953	rcu_read_unlock();
 3954
 3955	return queue_index;
 3956#else
 3957	return -1;
 3958#endif
 3959}
 3960
/*
 * Queue selection helper that always picks tx queue 0; none of the
 * arguments are looked at.
 */
u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
		     struct net_device *sb_dev)
{
	return 0;
}
EXPORT_SYMBOL(dev_pick_tx_zero);
 3967
 3968u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
 3969		       struct net_device *sb_dev)
 3970{
 3971	return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
 3972}
 3973EXPORT_SYMBOL(dev_pick_tx_cpu_id);
 3974
 3975u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
 3976		     struct net_device *sb_dev)
 3977{
 3978	struct sock *sk = skb->sk;
 3979	int queue_index = sk_tx_queue_get(sk);
 3980
 3981	sb_dev = sb_dev ? : dev;
 3982
 3983	if (queue_index < 0 || skb->ooo_okay ||
 3984	    queue_index >= dev->real_num_tx_queues) {
 3985		int new_index = get_xps_queue(dev, sb_dev, skb);
 3986
 3987		if (new_index < 0)
 3988			new_index = skb_tx_hash(dev, sb_dev, skb);
 3989
 3990		if (queue_index != new_index && sk &&
 3991		    sk_fullsock(sk) &&
 3992		    rcu_access_pointer(sk->sk_dst_cache))
 3993			sk_tx_queue_set(sk, new_index);
 3994
 3995		queue_index = new_index;
 3996	}
 3997
 3998	return queue_index;
 3999}
 4000EXPORT_SYMBOL(netdev_pick_tx);
 4001
 4002struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
 4003					 struct sk_buff *skb,
 4004					 struct net_device *sb_dev)
 4005{
 4006	int queue_index = 0;
 4007
 4008#ifdef CONFIG_XPS
 4009	u32 sender_cpu = skb->sender_cpu - 1;
 4010
 4011	if (sender_cpu >= (u32)NR_CPUS)
 4012		skb->sender_cpu = raw_smp_processor_id() + 1;
 4013#endif
 4014
 4015	if (dev->real_num_tx_queues != 1) {
 4016		const struct net_device_ops *ops = dev->netdev_ops;
 4017
 4018		if (ops->ndo_select_queue)
 4019			queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
 4020		else
 4021			queue_index = netdev_pick_tx(dev, skb, sb_dev);
 4022
 4023		queue_index = netdev_cap_txqueue(dev, queue_index);
 4024	}
 4025
 4026	skb_set_queue_mapping(skb, queue_index);
 4027	return netdev_get_tx_queue(dev, queue_index);
 4028}
 4029
/**
 *	__dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *	@sb_dev: subordinate device used for L2 forwarding offload
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 */
static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;
	bool again = false;

	skb_reset_mac_header(skb);

	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	skb_update_prio(skb);

	qdisc_pkt_len_init(skb);
#ifdef CONFIG_NET_CLS_ACT
	skb->tc_at_ingress = 0;
# ifdef CONFIG_NET_EGRESS
	if (static_branch_unlikely(&egress_needed_key)) {
		/* A NULL return means the egress hook took the skb;
		 * rc then holds the code to report.
		 */
		skb = sch_handle_egress(skb, &rc, dev);
		if (!skb)
			goto out;
	}
# endif
#endif
	/* If device/qdisc don't need skb->dst, release it right now while
	 * it's hot in this cpu cache.
	 */
	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
		skb_dst_drop(skb);
	else
		skb_dst_force(skb);

	txq = netdev_core_pick_tx(dev, skb, sb_dev);
	q = rcu_dereference_bh(txq->qdisc);

	trace_net_dev_queue(skb);
	/* A qdisc with an enqueue method does the queueing for us. */
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. This is the common case for software
	 * devices: loopback, all sorts of tunnels...
	 *
	 * Really, it is unlikely that netif_tx_lock protection is necessary
	 * here (e.g. loopback and IP tunnels are clean, ignoring statistics
	 * counters).  However, it is possible that they rely on the
	 * protection provided by us here.
	 *
	 * Check this and take the lock. It is not prone to deadlocks.
	 * Alternatively, use the noqueue qdisc; that is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		if (txq->xmit_lock_owner != cpu) {
			if (dev_xmit_recursion())
				goto recursion_alert;

			skb = validate_xmit_skb(skb, dev, &again);
			if (!skb)
				goto out;

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_xmit_stopped(txq)) {
				dev_xmit_recursion_inc();
				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
				dev_xmit_recursion_dec();
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
					     dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
recursion_alert:
			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
					     dev->name);
		}
	}

	/* Reached when the device is down, the transmit did not complete,
	 * or recursion was detected: count the drop and free the skb(s).
	 */
	rc = -ENETDOWN;
	rcu_read_unlock_bh();

	atomic_long_inc(&dev->tx_dropped);
	kfree_skb_list(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
 4161
 4162int dev_queue_xmit(struct sk_buff *skb)
 4163{
 4164	return __dev_queue_xmit(skb, NULL);
 4165}
 4166EXPORT_SYMBOL(dev_queue_xmit);
 4167
 4168int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
 4169{
 4170	return __dev_queue_xmit(skb, sb_dev);
 4171}
 4172EXPORT_SYMBOL(dev_queue_xmit_accel);
 4173
 4174int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
 4175{
 4176	struct net_device *dev = skb->dev;
 4177	struct sk_buff *orig_skb = skb;
 4178	struct netdev_queue *txq;
 4179	int ret = NETDEV_TX_BUSY;
 4180	bool again = false;
 4181
 4182	if (unlikely(!netif_running(dev) ||
 4183		     !netif_carrier_ok(dev)))
 4184		goto drop;
 4185
 4186	skb = validate_xmit_skb_list(skb, dev, &again);
 4187	if (skb != orig_skb)
 4188		goto drop;
 4189
 4190	skb_set_queue_mapping(skb, queue_id);
 4191	txq = skb_get_tx_queue(dev, skb);
 4192
 4193	local_bh_disable();
 4194
 4195	HARD_TX_LOCK(dev, txq, smp_processor_id());
 4196	if (!netif_xmit_frozen_or_drv_stopped(txq))
 4197		ret = netdev_start_xmit(skb, dev, txq, false);
 4198	HARD_TX_UNLOCK(dev, txq);
 4199
 4200	local_bh_enable();
 4201
 4202	if (!dev_xmit_complete(ret))
 4203		kfree_skb(skb);
 4204
 4205	return ret;
 4206drop:
 4207	atomic_long_inc(&dev->tx_dropped);
 4208	kfree_skb_list(skb);
 4209	return NET_XMIT_DROP;
 4210}
 4211EXPORT_SYMBOL(dev_direct_xmit);
 4212
 4213/*************************************************************************
 4214 *			Receiver routines
 4215 *************************************************************************/
 4216
/* Limit on a per-CPU input_pkt_queue: enqueue_to_backlog() drops the
 * packet once the queue already holds more than this many skbs, and
 * skb_flow_limit() starts working at half this length.
 */
int netdev_max_backlog __read_mostly = 1000;
EXPORT_SYMBOL(netdev_max_backlog);

/* Handed to net_timestamp_check() as-is in netif_rx_internal() and
 * negated in __netif_receive_skb_core().
 */
int netdev_tstamp_prequeue __read_mostly = 1;
int netdev_budget __read_mostly = 300;
/* Must be at least 2 jiffies to guarantee 1 jiffy timeout */
unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
int weight_p __read_mostly = 64;           /* old backlog weight */
int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
int dev_rx_weight __read_mostly = 64;
int dev_tx_weight __read_mostly = 64;
/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
int gro_normal_batch __read_mostly = 8;
 4231
 4232/* Called with irq disabled */
 4233static inline void ____napi_schedule(struct softnet_data *sd,
 4234				     struct napi_struct *napi)
 4235{
 4236	list_add_tail(&napi->poll_list, &sd->poll_list);
 4237	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 4238}
 4239
 4240#ifdef CONFIG_RPS
 4241
/* One global table that all flow-based protocols share. */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
/* Splits a sock flow table entry: get_rps_cpu() takes the masked bits as
 * the desired CPU and requires the remaining bits to match the skb hash.
 */
u32 rps_cpu_mask __read_mostly;
EXPORT_SYMBOL(rps_cpu_mask);

/* Static key checked by netif_rx_internal() before it takes the RPS path. */
struct static_key_false rps_needed __read_mostly;
EXPORT_SYMBOL(rps_needed);
struct static_key_false rfs_needed __read_mostly;
EXPORT_SYMBOL(rfs_needed);
 4252
 4253static struct rps_dev_flow *
 4254set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 4255	    struct rps_dev_flow *rflow, u16 next_cpu)
 4256{
 4257	if (next_cpu < nr_cpu_ids) {
 4258#ifdef CONFIG_RFS_ACCEL
 4259		struct netdev_rx_queue *rxqueue;
 4260		struct rps_dev_flow_table *flow_table;
 4261		struct rps_dev_flow *old_rflow;
 4262		u32 flow_id;
 4263		u16 rxq_index;
 4264		int rc;
 4265
 4266		/* Should we steer this flow to a different hardware queue? */
 4267		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
 4268		    !(dev->features & NETIF_F_NTUPLE))
 4269			goto out;
 4270		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
 4271		if (rxq_index == skb_get_rx_queue(skb))
 4272			goto out;
 4273
 4274		rxqueue = dev->_rx + rxq_index;
 4275		flow_table = rcu_dereference(rxqueue->rps_flow_table);
 4276		if (!flow_table)
 4277			goto out;
 4278		flow_id = skb_get_hash(skb) & flow_table->mask;
 4279		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
 4280							rxq_index, flow_id);
 4281		if (rc < 0)
 4282			goto out;
 4283		old_rflow = rflow;
 4284		rflow = &flow_table->flows[flow_id];
 4285		rflow->filter = rc;
 4286		if (old_rflow->filter == rflow->filter)
 4287			old_rflow->filter = RPS_NO_FILTER;
 4288	out:
 4289#endif
 4290		rflow->last_qtail =
 4291			per_cpu(softnet_data, next_cpu).input_queue_head;
 4292	}
 4293
 4294	rflow->cpu = next_cpu;
 4295	return rflow;
 4296}
 4297
 4298/*
 4299 * get_rps_cpu is called from netif_receive_skb and returns the target
 4300 * CPU from the RPS map of the receiving queue for a given skb.
 4301 * rcu_read_lock must be held on entry.
 4302 */
 4303static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 4304		       struct rps_dev_flow **rflowp)
 4305{
 4306	const struct rps_sock_flow_table *sock_flow_table;
 4307	struct netdev_rx_queue *rxqueue = dev->_rx;
 4308	struct rps_dev_flow_table *flow_table;
 4309	struct rps_map *map;
 4310	int cpu = -1;
 4311	u32 tcpu;
 4312	u32 hash;
 4313
 4314	if (skb_rx_queue_recorded(skb)) {
 4315		u16 index = skb_get_rx_queue(skb);
 4316
 4317		if (unlikely(index >= dev->real_num_rx_queues)) {
 4318			WARN_ONCE(dev->real_num_rx_queues > 1,
 4319				  "%s received packet on queue %u, but number "
 4320				  "of RX queues is %u\n",
 4321				  dev->name, index, dev->real_num_rx_queues);
 4322			goto done;
 4323		}
 4324		rxqueue += index;
 4325	}
 4326
 4327	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
 4328
 4329	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 4330	map = rcu_dereference(rxqueue->rps_map);
 4331	if (!flow_table && !map)
 4332		goto done;
 4333
 4334	skb_reset_network_header(skb);
 4335	hash = skb_get_hash(skb);
 4336	if (!hash)
 4337		goto done;
 4338
 4339	sock_flow_table = rcu_dereference(rps_sock_flow_table);
 4340	if (flow_table && sock_flow_table) {
 4341		struct rps_dev_flow *rflow;
 4342		u32 next_cpu;
 4343		u32 ident;
 4344
 4345		/* First check into global flow table if there is a match */
 4346		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
 4347		if ((ident ^ hash) & ~rps_cpu_mask)
 4348			goto try_rps;
 4349
 4350		next_cpu = ident & rps_cpu_mask;
 4351
 4352		/* OK, now we know there is a match,
 4353		 * we can look at the local (per receive queue) flow table
 4354		 */
 4355		rflow = &flow_table->flows[hash & flow_table->mask];
 4356		tcpu = rflow->cpu;
 4357
 4358		/*
 4359		 * If the desired CPU (where last recvmsg was done) is
 4360		 * different from current CPU (one in the rx-queue flow
 4361		 * table entry), switch if one of the following holds:
 4362		 *   - Current CPU is unset (>= nr_cpu_ids).
 4363		 *   - Current CPU is offline.
 4364		 *   - The current CPU's queue tail has advanced beyond the
 4365		 *     last packet that was enqueued using this table entry.
 4366		 *     This guarantees that all previous packets for the flow
 4367		 *     have been dequeued, thus preserving in order delivery.
 4368		 */
 4369		if (unlikely(tcpu != next_cpu) &&
 4370		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
 4371		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
 4372		      rflow->last_qtail)) >= 0)) {
 4373			tcpu = next_cpu;
 4374			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
 4375		}
 4376
 4377		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
 4378			*rflowp = rflow;
 4379			cpu = tcpu;
 4380			goto done;
 4381		}
 4382	}
 4383
 4384try_rps:
 4385
 4386	if (map) {
 4387		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
 4388		if (cpu_online(tcpu)) {
 4389			cpu = tcpu;
 4390			goto done;
 4391		}
 4392	}
 4393
 4394done:
 4395	return cpu;
 4396}
 4397
 4398#ifdef CONFIG_RFS_ACCEL
 4399
 4400/**
 4401 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
 4402 * @dev: Device on which the filter was set
 4403 * @rxq_index: RX queue index
 4404 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
 4405 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
 4406 *
 4407 * Drivers that implement ndo_rx_flow_steer() should periodically call
 4408 * this function for each installed filter and remove the filters for
 4409 * which it returns %true.
 4410 */
 4411bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
 4412			 u32 flow_id, u16 filter_id)
 4413{
 4414	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
 4415	struct rps_dev_flow_table *flow_table;
 4416	struct rps_dev_flow *rflow;
 4417	bool expire = true;
 4418	unsigned int cpu;
 4419
 4420	rcu_read_lock();
 4421	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 4422	if (flow_table && flow_id <= flow_table->mask) {
 4423		rflow = &flow_table->flows[flow_id];
 4424		cpu = READ_ONCE(rflow->cpu);
 4425		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
 4426		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
 4427			   rflow->last_qtail) <
 4428		     (int)(10 * flow_table->mask)))
 4429			expire = false;
 4430	}
 4431	rcu_read_unlock();
 4432	return expire;
 4433}
 4434EXPORT_SYMBOL(rps_may_expire_flow);
 4435
 4436#endif /* CONFIG_RFS_ACCEL */
 4437
 4438/* Called from hardirq (IPI) context */
 4439static void rps_trigger_softirq(void *data)
 4440{
 4441	struct softnet_data *sd = data;
 4442
 4443	____napi_schedule(sd, &sd->backlog);
 4444	sd->received_rps++;
 4445}
 4446
 4447#endif /* CONFIG_RPS */
 4448
 4449/*
 4450 * Check if this softnet_data structure is another cpu one
 4451 * If yes, queue it to our IPI list and return 1
 4452 * If no, return 0
 4453 */
 4454static int rps_ipi_queued(struct softnet_data *sd)
 4455{
 4456#ifdef CONFIG_RPS
 4457	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
 4458
 4459	if (sd != mysd) {
 4460		sd->rps_ipi_next = mysd->rps_ipi_list;
 4461		mysd->rps_ipi_list = sd;
 4462
 4463		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 4464		return 1;
 4465	}
 4466#endif /* CONFIG_RPS */
 4467	return 0;
 4468}
 4469
#ifdef CONFIG_NET_FLOW_LIMIT
/* Defaults to 4096 (1 << 12).
 * NOTE(review): presumably the bucket count used when a sd_flow_limit
 * table is allocated - the allocation site is not in this part of the
 * file, confirm there.
 */
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif
 4473
 4474static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
 4475{
 4476#ifdef CONFIG_NET_FLOW_LIMIT
 4477	struct sd_flow_limit *fl;
 4478	struct softnet_data *sd;
 4479	unsigned int old_flow, new_flow;
 4480
 4481	if (qlen < (netdev_max_backlog >> 1))
 4482		return false;
 4483
 4484	sd = this_cpu_ptr(&softnet_data);
 4485
 4486	rcu_read_lock();
 4487	fl = rcu_dereference(sd->flow_limit);
 4488	if (fl) {
 4489		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
 4490		old_flow = fl->history[fl->history_head];
 4491		fl->history[fl->history_head] = new_flow;
 4492
 4493		fl->history_head++;
 4494		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
 4495
 4496		if (likely(fl->buckets[old_flow]))
 4497			fl->buckets[old_flow]--;
 4498
 4499		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
 4500			fl->count++;
 4501			rcu_read_unlock();
 4502			return true;
 4503		}
 4504	}
 4505	rcu_read_unlock();
 4506#endif
 4507	return false;
 4508}
 4509
 4510/*
 4511 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
 4512 * queue (may be a remote CPU queue).
 4513 */
 4514static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 4515			      unsigned int *qtail)
 4516{
 4517	struct softnet_data *sd;
 4518	unsigned long flags;
 4519	unsigned int qlen;
 4520
 4521	sd = &per_cpu(softnet_data, cpu);
 4522
 4523	local_irq_save(flags);
 4524
 4525	rps_lock(sd);
 4526	if (!netif_running(skb->dev))
 4527		goto drop;
 4528	qlen = skb_queue_len(&sd->input_pkt_queue);
 4529	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
 4530		if (qlen) {
 4531enqueue:
 4532			__skb_queue_tail(&sd->input_pkt_queue, skb);
 4533			input_queue_tail_incr_save(sd, qtail);
 4534			rps_unlock(sd);
 4535			local_irq_restore(flags);
 4536			return NET_RX_SUCCESS;
 4537		}
 4538
 4539		/* Schedule NAPI for backlog device
 4540		 * We can use non atomic operation since we own the queue lock
 4541		 */
 4542		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
 4543			if (!rps_ipi_queued(sd))
 4544				____napi_schedule(sd, &sd->backlog);
 4545		}
 4546		goto enqueue;
 4547	}
 4548
 4549drop:
 4550	sd->dropped++;
 4551	rps_unlock(sd);
 4552
 4553	local_irq_restore(flags);
 4554
 4555	atomic_long_inc(&skb->dev->rx_dropped);
 4556	kfree_skb(skb);
 4557	return NET_RX_DROP;
 4558}
 4559
 4560static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
 4561{
 4562	struct net_device *dev = skb->dev;
 4563	struct netdev_rx_queue *rxqueue;
 4564
 4565	rxqueue = dev->_rx;
 4566
 4567	if (skb_rx_queue_recorded(skb)) {
 4568		u16 index = skb_get_rx_queue(skb);
 4569
 4570		if (unlikely(index >= dev->real_num_rx_queues)) {
 4571			WARN_ONCE(dev->real_num_rx_queues > 1,
 4572				  "%s received packet on queue %u, but number "
 4573				  "of RX queues is %u\n",
 4574				  dev->name, index, dev->real_num_rx_queues);
 4575
 4576			return rxqueue; /* Return first rxqueue */
 4577		}
 4578		rxqueue += index;
 4579	}
 4580	return rxqueue;
 4581}
 4582
 4583static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 4584				     struct xdp_buff *xdp,
 4585				     struct bpf_prog *xdp_prog)
 4586{
 4587	struct netdev_rx_queue *rxqueue;
 4588	void *orig_data, *orig_data_end;
 4589	u32 metalen, act = XDP_DROP;
 4590	__be16 orig_eth_type;
 4591	struct ethhdr *eth;
 4592	bool orig_bcast;
 4593	int hlen, off;
 4594	u32 mac_len;
 4595
 4596	/* Reinjected packets coming from act_mirred or similar should
 4597	 * not get XDP generic processing.
 4598	 */
 4599	if (skb_is_redirected(skb))
 4600		return XDP_PASS;
 4601
 4602	/* XDP packets must be linear and must have sufficient headroom
 4603	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
 4604	 * native XDP provides, thus we need to do it here as well.
 4605	 */
 4606	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
 4607	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
 4608		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
 4609		int troom = skb->tail + skb->data_len - skb->end;
 4610
 4611		/* In case we have to go down the path and also linearize,
 4612		 * then lets do the pskb_expand_head() work just once here.
 4613		 */
 4614		if (pskb_expand_head(skb,
 4615				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
 4616				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
 4617			goto do_drop;
 4618		if (skb_linearize(skb))
 4619			goto do_drop;
 4620	}
 4621
 4622	/* The XDP program wants to see the packet starting at the MAC
 4623	 * header.
 4624	 */
 4625	mac_len = skb->data - skb_mac_header(skb);
 4626	hlen = skb_headlen(skb) + mac_len;
 4627	xdp->data = skb->data - mac_len;
 4628	xdp->data_meta = xdp->data;
 4629	xdp->data_end = xdp->data + hlen;
 4630	xdp->data_hard_start = skb->data - skb_headroom(skb);
 4631
 4632	/* SKB "head" area always have tailroom for skb_shared_info */
 4633	xdp->frame_sz  = (void *)skb_end_pointer(skb) - xdp->data_hard_start;
 4634	xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 4635
 4636	orig_data_end = xdp->data_end;
 4637	orig_data = xdp->data;
 4638	eth = (struct ethhdr *)xdp->data;
 4639	orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
 4640	orig_eth_type = eth->h_proto;
 4641
 4642	rxqueue = netif_get_rxqueue(skb);
 4643	xdp->rxq = &rxqueue->xdp_rxq;
 4644
 4645	act = bpf_prog_run_xdp(xdp_prog, xdp);
 4646
 4647	/* check if bpf_xdp_adjust_head was used */
 4648	off = xdp->data - orig_data;
 4649	if (off) {
 4650		if (off > 0)
 4651			__skb_pull(skb, off);
 4652		else if (off < 0)
 4653			__skb_push(skb, -off);
 4654
 4655		skb->mac_header += off;
 4656		skb_reset_network_header(skb);
 4657	}
 4658
 4659	/* check if bpf_xdp_adjust_tail was used */
 4660	off = xdp->data_end - orig_data_end;
 4661	if (off != 0) {
 4662		skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
 4663		skb->len += off; /* positive on grow, negative on shrink */
 4664	}
 4665
 4666	/* check if XDP changed eth hdr such SKB needs update */
 4667	eth = (struct ethhdr *)xdp->data;
 4668	if ((orig_eth_type != eth->h_proto) ||
 4669	    (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
 4670		__skb_push(skb, ETH_HLEN);
 4671		skb->protocol = eth_type_trans(skb, skb->dev);
 4672	}
 4673
 4674	switch (act) {
 4675	case XDP_REDIRECT:
 4676	case XDP_TX:
 4677		__skb_push(skb, mac_len);
 4678		break;
 4679	case XDP_PASS:
 4680		metalen = xdp->data - xdp->data_meta;
 4681		if (metalen)
 4682			skb_metadata_set(skb, metalen);
 4683		break;
 4684	default:
 4685		bpf_warn_invalid_xdp_action(act);
 4686		/* fall through */
 4687	case XDP_ABORTED:
 4688		trace_xdp_exception(skb->dev, xdp_prog, act);
 4689		/* fall through */
 4690	case XDP_DROP:
 4691	do_drop:
 4692		kfree_skb(skb);
 4693		break;
 4694	}
 4695
 4696	return act;
 4697}
 4698
 4699/* When doing generic XDP we have to bypass the qdisc layer and the
 4700 * network taps in order to match in-driver-XDP behavior.
 4701 */
 4702void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
 4703{
 4704	struct net_device *dev = skb->dev;
 4705	struct netdev_queue *txq;
 4706	bool free_skb = true;
 4707	int cpu, rc;
 4708
 4709	txq = netdev_core_pick_tx(dev, skb, NULL);
 4710	cpu = smp_processor_id();
 4711	HARD_TX_LOCK(dev, txq, cpu);
 4712	if (!netif_xmit_stopped(txq)) {
 4713		rc = netdev_start_xmit(skb, dev, txq, 0);
 4714		if (dev_xmit_complete(rc))
 4715			free_skb = false;
 4716	}
 4717	HARD_TX_UNLOCK(dev, txq);
 4718	if (free_skb) {
 4719		trace_xdp_exception(dev, xdp_prog, XDP_TX);
 4720		kfree_skb(skb);
 4721	}
 4722}
 4723
/* Static key guarding the do_xdp_generic() call in
 * __netif_receive_skb_core(); off by default.
 */
static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
 4725
 4726int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
 4727{
 4728	if (xdp_prog) {
 4729		struct xdp_buff xdp;
 4730		u32 act;
 4731		int err;
 4732
 4733		act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
 4734		if (act != XDP_PASS) {
 4735			switch (act) {
 4736			case XDP_REDIRECT:
 4737				err = xdp_do_generic_redirect(skb->dev, skb,
 4738							      &xdp, xdp_prog);
 4739				if (err)
 4740					goto out_redir;
 4741				break;
 4742			case XDP_TX:
 4743				generic_xdp_tx(skb, xdp_prog);
 4744				break;
 4745			}
 4746			return XDP_DROP;
 4747		}
 4748	}
 4749	return XDP_PASS;
 4750out_redir:
 4751	kfree_skb(skb);
 4752	return XDP_DROP;
 4753}
 4754EXPORT_SYMBOL_GPL(do_xdp_generic);
 4755
 4756static int netif_rx_internal(struct sk_buff *skb)
 4757{
 4758	int ret;
 4759
 4760	net_timestamp_check(netdev_tstamp_prequeue, skb);
 4761
 4762	trace_netif_rx(skb);
 4763
 4764#ifdef CONFIG_RPS
 4765	if (static_branch_unlikely(&rps_needed)) {
 4766		struct rps_dev_flow voidflow, *rflow = &voidflow;
 4767		int cpu;
 4768
 4769		preempt_disable();
 4770		rcu_read_lock();
 4771
 4772		cpu = get_rps_cpu(skb->dev, skb, &rflow);
 4773		if (cpu < 0)
 4774			cpu = smp_processor_id();
 4775
 4776		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 4777
 4778		rcu_read_unlock();
 4779		preempt_enable();
 4780	} else
 4781#endif
 4782	{
 4783		unsigned int qtail;
 4784
 4785		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
 4786		put_cpu();
 4787	}
 4788	return ret;
 4789}
 4790
 4791/**
 4792 *	netif_rx	-	post buffer to the network code
 4793 *	@skb: buffer to post
 4794 *
 4795 *	This function receives a packet from a device driver and queues it for
 4796 *	the upper (protocol) levels to process.  It always succeeds. The buffer
 4797 *	may be dropped during processing for congestion control or by the
 4798 *	protocol layers.
 4799 *
 4800 *	return values:
 4801 *	NET_RX_SUCCESS	(no congestion)
 4802 *	NET_RX_DROP     (packet was dropped)
 4803 *
 4804 */
 4805
 4806int netif_rx(struct sk_buff *skb)
 4807{
 4808	int ret;
 4809
 4810	trace_netif_rx_entry(skb);
 4811
 4812	ret = netif_rx_internal(skb);
 4813	trace_netif_rx_exit(ret);
 4814
 4815	return ret;
 4816}
 4817EXPORT_SYMBOL(netif_rx);
 4818
 4819int netif_rx_ni(struct sk_buff *skb)
 4820{
 4821	int err;
 4822
 4823	trace_netif_rx_ni_entry(skb);
 4824
 4825	preempt_disable();
 4826	err = netif_rx_internal(skb);
 4827	if (local_softirq_pending())
 4828		do_softirq();
 4829	preempt_enable();
 4830	trace_netif_rx_ni_exit(err);
 4831
 4832	return err;
 4833}
 4834EXPORT_SYMBOL(netif_rx_ni);
 4835
 4836static __latent_entropy void net_tx_action(struct softirq_action *h)
 4837{
 4838	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
 4839
 4840	if (sd->completion_queue) {
 4841		struct sk_buff *clist;
 4842
 4843		local_irq_disable();
 4844		clist = sd->completion_queue;
 4845		sd->completion_queue = NULL;
 4846		local_irq_enable();
 4847
 4848		while (clist) {
 4849			struct sk_buff *skb = clist;
 4850
 4851			clist = clist->next;
 4852
 4853			WARN_ON(refcount_read(&skb->users));
 4854			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
 4855				trace_consume_skb(skb);
 4856			else
 4857				trace_kfree_skb(skb, net_tx_action);
 4858
 4859			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
 4860				__kfree_skb(skb);
 4861			else
 4862				__kfree_skb_defer(skb);
 4863		}
 4864
 4865		__kfree_skb_flush();
 4866	}
 4867
 4868	if (sd->output_queue) {
 4869		struct Qdisc *head;
 4870
 4871		local_irq_disable();
 4872		head = sd->output_queue;
 4873		sd->output_queue = NULL;
 4874		sd->output_queue_tailp = &sd->output_queue;
 4875		local_irq_enable();
 4876
 4877		while (head) {
 4878			struct Qdisc *q = head;
 4879			spinlock_t *root_lock = NULL;
 4880
 4881			head = head->next_sched;
 4882
 4883			if (!(q->flags & TCQ_F_NOLOCK)) {
 4884				root_lock = qdisc_lock(q);
 4885				spin_lock(root_lock);
 4886			}
 4887			/* We need to make sure head->next_sched is read
 4888			 * before clearing __QDISC_STATE_SCHED
 4889			 */
 4890			smp_mb__before_atomic();
 4891			clear_bit(__QDISC_STATE_SCHED, &q->state);
 4892			qdisc_run(q);
 4893			if (root_lock)
 4894				spin_unlock(root_lock);
 4895		}
 4896	}
 4897
 4898	xfrm_dev_backlog(sd);
 4899}
 4900
#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
/* This hook is defined here for ATM LANE. It is only built when both
 * CONFIG_BRIDGE and CONFIG_ATM_LANE are enabled, and nothing in this
 * part of the file assigns it.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
 4907
 4908static inline struct sk_buff *
 4909sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 4910		   struct net_device *orig_dev)
 4911{
 4912#ifdef CONFIG_NET_CLS_ACT
 4913	struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
 4914	struct tcf_result cl_res;
 4915
 4916	/* If there's at least one ingress present somewhere (so
 4917	 * we get here via enabled static key), remaining devices
 4918	 * that are not configured with an ingress qdisc will bail
 4919	 * out here.
 4920	 */
 4921	if (!miniq)
 4922		return skb;
 4923
 4924	if (*pt_prev) {
 4925		*ret = deliver_skb(skb, *pt_prev, orig_dev);
 4926		*pt_prev = NULL;
 4927	}
 4928
 4929	qdisc_skb_cb(skb)->pkt_len = skb->len;
 4930	skb->tc_at_ingress = 1;
 4931	mini_qdisc_bstats_cpu_update(miniq, skb);
 4932
 4933	switch (tcf_classify_ingress(skb, miniq->block, miniq->filter_list,
 4934				     &cl_res, false)) {
 4935	case TC_ACT_OK:
 4936	case TC_ACT_RECLASSIFY:
 4937		skb->tc_index = TC_H_MIN(cl_res.classid);
 4938		break;
 4939	case TC_ACT_SHOT:
 4940		mini_qdisc_qstats_cpu_drop(miniq);
 4941		kfree_skb(skb);
 4942		return NULL;
 4943	case TC_ACT_STOLEN:
 4944	case TC_ACT_QUEUED:
 4945	case TC_ACT_TRAP:
 4946		consume_skb(skb);
 4947		return NULL;
 4948	case TC_ACT_REDIRECT:
 4949		/* skb_mac_header check was done by cls/act_bpf, so
 4950		 * we can safely push the L2 header back before
 4951		 * redirecting to another netdev
 4952		 */
 4953		__skb_push(skb, skb->mac_len);
 4954		skb_do_redirect(skb);
 4955		return NULL;
 4956	case TC_ACT_CONSUMED:
 4957		return NULL;
 4958	default:
 4959		break;
 4960	}
 4961#endif /* CONFIG_NET_CLS_ACT */
 4962	return skb;
 4963}
 4964
 4965/**
 4966 *	netdev_is_rx_handler_busy - check if receive handler is registered
 4967 *	@dev: device to check
 4968 *
 4969 *	Check if a receive handler is already registered for a given device.
 4970 *	Return true if there one.
 4971 *
 4972 *	The caller must hold the rtnl_mutex.
 4973 */
 4974bool netdev_is_rx_handler_busy(struct net_device *dev)
 4975{
 4976	ASSERT_RTNL();
 4977	return dev && rtnl_dereference(dev->rx_handler);
 4978}
 4979EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
 4980
 4981/**
 4982 *	netdev_rx_handler_register - register receive handler
 4983 *	@dev: device to register a handler for
 4984 *	@rx_handler: receive handler to register
 4985 *	@rx_handler_data: data pointer that is used by rx handler
 4986 *
 4987 *	Register a receive handler for a device. This handler will then be
 4988 *	called from __netif_receive_skb. A negative errno code is returned
 4989 *	on a failure.
 4990 *
 4991 *	The caller must hold the rtnl_mutex.
 4992 *
 4993 *	For a general description of rx_handler, see enum rx_handler_result.
 4994 */
 4995int netdev_rx_handler_register(struct net_device *dev,
 4996			       rx_handler_func_t *rx_handler,
 4997			       void *rx_handler_data)
 4998{
 4999	if (netdev_is_rx_handler_busy(dev))
 5000		return -EBUSY;
 5001
 5002	if (dev->priv_flags & IFF_NO_RX_HANDLER)
 5003		return -EINVAL;
 5004
 5005	/* Note: rx_handler_data must be set before rx_handler */
 5006	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
 5007	rcu_assign_pointer(dev->rx_handler, rx_handler);
 5008
 5009	return 0;
 5010}
 5011EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
 5012
 5013/**
 5014 *	netdev_rx_handler_unregister - unregister receive handler
 5015 *	@dev: device to unregister a handler from
 5016 *
 5017 *	Unregister a receive handler from a device.
 5018 *
 5019 *	The caller must hold the rtnl_mutex.
 5020 */
 5021void netdev_rx_handler_unregister(struct net_device *dev)
 5022{
 5023
 5024	ASSERT_RTNL();
 5025	RCU_INIT_POINTER(dev->rx_handler, NULL);
 5026	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
 5027	 * section has a guarantee to see a non NULL rx_handler_data
 5028	 * as well.
 5029	 */
 5030	synchronize_net();
 5031	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
 5032}
 5033EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
 5034
 5035/*
 5036 * Limit the use of PFMEMALLOC reserves to those protocols that implement
 5037 * the special handling of PFMEMALLOC skbs.
 5038 */
 5039static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
 5040{
 5041	switch (skb->protocol) {
 5042	case htons(ETH_P_ARP):
 5043	case htons(ETH_P_IP):
 5044	case htons(ETH_P_IPV6):
 5045	case htons(ETH_P_8021Q):
 5046	case htons(ETH_P_8021AD):
 5047		return true;
 5048	default:
 5049		return false;
 5050	}
 5051}
 5052
 5053static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
 5054			     int *ret, struct net_device *orig_dev)
 5055{
 5056	if (nf_hook_ingress_active(skb)) {
 5057		int ingress_retval;
 5058
 5059		if (*pt_prev) {
 5060			*ret = deliver_skb(skb, *pt_prev, orig_dev);
 5061			*pt_prev = NULL;
 5062		}
 5063
 5064		rcu_read_lock();
 5065		ingress_retval = nf_hook_ingress(skb);
 5066		rcu_read_unlock();
 5067		return ingress_retval;
 5068	}
 5069	return 0;
 5070}
 5071
 5072static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
 5073				    struct packet_type **ppt_prev)
 5074{
 5075	struct packet_type *ptype, *pt_prev;
 5076	rx_handler_func_t *rx_handler;
 5077	struct sk_buff *skb = *pskb;
 5078	struct net_device *orig_dev;
 5079	bool deliver_exact = false;
 5080	int ret = NET_RX_DROP;
 5081	__be16 type;
 5082
 5083	net_timestamp_check(!netdev_tstamp_prequeue, skb);
 5084
 5085	trace_netif_receive_skb(skb);
 5086
 5087	orig_dev = skb->dev;
 5088
 5089	skb_reset_network_header(skb);
 5090	if (!skb_transport_header_was_set(skb))
 5091		skb_reset_transport_header(skb);
 5092	skb_reset_mac_len(skb);
 5093
 5094	pt_prev = NULL;
 5095
 5096another_round:
 5097	skb->skb_iif = skb->dev->ifindex;
 5098
 5099	__this_cpu_inc(softnet_data.processed);
 5100
 5101	if (static_branch_unlikely(&generic_xdp_needed_key)) {
 5102		int ret2;
 5103
 5104		preempt_disable();
 5105		ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
 5106		preempt_enable();
 5107
 5108		if (ret2 != XDP_PASS) {
 5109			ret = NET_RX_DROP;
 5110			goto out;
 5111		}
 5112		skb_reset_mac_len(skb);
 5113	}
 5114
 5115	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
 5116	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
 5117		skb = skb_vlan_untag(skb);
 5118		if (unlikely(!skb))
 5119			goto out;
 5120	}
 5121
 5122	if (skb_skip_tc_classify(skb))
 5123		goto skip_classify;
 5124
 5125	if (pfmemalloc)
 5126		goto skip_taps;
 5127
 5128	list_for_each_entry_rcu(ptype, &ptype_all, list) {
 5129		if (pt_prev)
 5130			ret = deliver_skb(skb, pt_prev, orig_dev);
 5131		pt_prev = ptype;
 5132	}
 5133
 5134	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
 5135		if (pt_prev)
 5136			ret = deliver_skb(skb, pt_prev, orig_dev);
 5137		pt_prev = ptype;
 5138	}
 5139
 5140skip_taps:
 5141#ifdef CONFIG_NET_INGRESS
 5142	if (static_branch_unlikely(&ingress_needed_key)) {
 5143		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
 5144		if (!skb)
 5145			goto out;
 5146
 5147		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
 5148			goto out;
 5149	}
 5150#endif
 5151	skb_reset_redirect(skb);
 5152skip_classify:
 5153	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
 5154		goto drop;
 5155
 5156	if (skb_vlan_tag_present(skb)) {
 5157		if (pt_prev) {
 5158			ret = deliver_skb(skb, pt_prev, orig_dev);
 5159			pt_prev = NULL;
 5160		}
 5161		if (vlan_do_receive(&skb))
 5162			goto another_round;
 5163		else if (unlikely(!skb))
 5164			goto out;
 5165	}
 5166
 5167	rx_handler = rcu_dereference(skb->dev->rx_handler);
 5168	if (rx_handler) {
 5169		if (pt_prev) {
 5170			ret = deliver_skb(skb, pt_prev, orig_dev);
 5171			pt_prev = NULL;
 5172		}
 5173		switch (rx_handler(&skb)) {
 5174		case RX_HANDLER_CONSUMED:
 5175			ret = NET_RX_SUCCESS;
 5176			goto out;
 5177		case RX_HANDLER_ANOTHER:
 5178			goto another_round;
 5179		case RX_HANDLER_EXACT:
 5180			deliver_exact = true;
 5181		case RX_HANDLER_PASS:
 5182			break;
 5183		default:
 5184			BUG();
 5185		}
 5186	}
 5187
 5188	if (unlikely(skb_vlan_tag_present(skb))) {
 5189check_vlan_id:
 5190		if (skb_vlan_tag_get_id(skb)) {
 5191			/* Vlan id is non 0 and vlan_do_receive() above couldn't
 5192			 * find vlan device.
 5193			 */
 5194			skb->pkt_type = PACKET_OTHERHOST;
 5195		} else if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
 5196			   skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
 5197			/* Outer header is 802.1P with vlan 0, inner header is
 5198			 * 802.1Q or 802.1AD and vlan_do_receive() above could
 5199			 * not find vlan dev for vlan id 0.
 5200			 */
 5201			__vlan_hwaccel_clear_tag(skb);
 5202			skb = skb_vlan_untag(skb);
 5203			if (unlikely(!skb))
 5204				goto out;
 5205			if (vlan_do_receive(&skb))
 5206				/* After stripping off 802.1P header with vlan 0
 5207				 * vlan dev is found for inner header.
 5208				 */
 5209				goto another_round;
 5210			else if (unlikely(!skb))
 5211				goto out;
 5212			else
 5213				/* We have stripped outer 802.1P vlan 0 header.
 5214				 * But could not find vlan dev.
 5215				 * check again for vlan id to set OTHERHOST.
 5216				 */
 5217				goto check_vlan_id;
 5218		}
 5219		/* Note: we might in the future use prio bits
 5220		 * and set skb->priority like in vlan_do_receive()
 5221		 * For the time being, just ignore Priority Code Point
 5222		 */
 5223		__vlan_hwaccel_clear_tag(skb);
 5224	}
 5225
 5226	type = skb->protocol;
 5227
 5228	/* deliver only exact match when indicated */
 5229	if (likely(!deliver_exact)) {
 5230		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 5231				       &ptype_base[ntohs(type) &
 5232						   PTYPE_HASH_MASK]);
 5233	}
 5234
 5235	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 5236			       &orig_dev->ptype_specific);
 5237
 5238	if (unlikely(skb->dev != orig_dev)) {
 5239		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 5240				       &skb->dev->ptype_specific);
 5241	}
 5242
 5243	if (pt_prev) {
 5244		if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
 5245			goto drop;
 5246		*ppt_prev = pt_prev;
 5247	} else {
 5248drop:
 5249		if (!deliver_exact)
 5250			atomic_long_inc(&skb->dev->rx_dropped);
 5251		else
 5252			atomic_long_inc(&skb->dev->rx_nohandler);
 5253		kfree_skb(skb);
 5254		/* Jamal, now you will not able to escape explaining
 5255		 * me how you were going to use this. :-)
 5256		 */
 5257		ret = NET_RX_DROP;
 5258	}
 5259
 5260out:
 5261	/* The invariant here is that if *ppt_prev is not NULL
 5262	 * then skb should also be non-NULL.
 5263	 *
 5264	 * Apparently *ppt_prev assignment above holds this invariant due to
 5265	 * skb dereferencing near it.
 5266	 */
 5267	*pskb = skb;
 5268	return ret;
 5269}
 5270
 5271static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
 5272{
 5273	struct net_device *orig_dev = skb->dev;
 5274	struct packet_type *pt_prev = NULL;
 5275	int ret;
 5276
 5277	ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
 5278	if (pt_prev)
 5279		ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
 5280					 skb->dev, pt_prev, orig_dev);
 5281	return ret;
 5282}
 5283
 5284/**
 5285 *	netif_receive_skb_core - special purpose version of netif_receive_skb
 5286 *	@skb: buffer to process
 5287 *
 5288 *	More direct receive version of netif_receive_skb().  It should
 5289 *	only be used by callers that have a need to skip RPS and Generic XDP.
 5290 *	Caller must also take care of handling if ``(page_is_)pfmemalloc``.
 5291 *
 5292 *	This function may only be called from softirq context and interrupts
 5293 *	should be enabled.
 5294 *
 5295 *	Return values (usually ignored):
 5296 *	NET_RX_SUCCESS: no congestion
 5297 *	NET_RX_DROP: packet was dropped
 5298 */
 5299int netif_receive_skb_core(struct sk_buff *skb)
 5300{
 5301	int ret;
 5302
 5303	rcu_read_lock();
 5304	ret = __netif_receive_skb_one_core(skb, false);
 5305	rcu_read_unlock();
 5306
 5307	return ret;
 5308}
 5309EXPORT_SYMBOL(netif_receive_skb_core);
 5310
 5311static inline void __netif_receive_skb_list_ptype(struct list_head *head,
 5312						  struct packet_type *pt_prev,
 5313						  struct net_device *orig_dev)
 5314{
 5315	struct sk_buff *skb, *next;
 5316
 5317	if (!pt_prev)
 5318		return;
 5319	if (list_empty(head))
 5320		return;
 5321	if (pt_prev->list_func != NULL)
 5322		INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
 5323				   ip_list_rcv, head, pt_prev, orig_dev);
 5324	else
 5325		list_for_each_entry_safe(skb, next, head, list) {
 5326			skb_list_del_init(skb);
 5327			pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 5328		}
 5329}
 5330
 5331static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
 5332{
 5333	/* Fast-path assumptions:
 5334	 * - There is no RX handler.
 5335	 * - Only one packet_type matches.
 5336	 * If either of these fails, we will end up doing some per-packet
 5337	 * processing in-line, then handling the 'last ptype' for the whole
 5338	 * sublist.  This can't cause out-of-order delivery to any single ptype,
 5339	 * because the 'last ptype' must be constant across the sublist, and all
 5340	 * other ptypes are handled per-packet.
 5341	 */
 5342	/* Current (common) ptype of sublist */
 5343	struct packet_type *pt_curr = NULL;
 5344	/* Current (common) orig_dev of sublist */
 5345	struct net_device *od_curr = NULL;
 5346	struct list_head sublist;
 5347	struct sk_buff *skb, *next;
 5348
 5349	INIT_LIST_HEAD(&sublist);
 5350	list_for_each_entry_safe(skb, next, head, list) {
 5351		struct net_device *orig_dev = skb->dev;
 5352		struct packet_type *pt_prev = NULL;
 5353
 5354		skb_list_del_init(skb);
 5355		__netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
 5356		if (!pt_prev)
 5357			continue;
 5358		if (pt_curr != pt_prev || od_curr != orig_dev) {
 5359			/* dispatch old sublist */
 5360			__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
 5361			/* start new sublist */
 5362			INIT_LIST_HEAD(&sublist);
 5363			pt_curr = pt_prev;
 5364			od_curr = orig_dev;
 5365		}
 5366		list_add_tail(&skb->list, &sublist);
 5367	}
 5368
 5369	/* dispatch final sublist */
 5370	__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
 5371}
 5372
 5373static int __netif_receive_skb(struct sk_buff *skb)
 5374{
 5375	int ret;
 5376
 5377	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
 5378		unsigned int noreclaim_flag;
 5379
 5380		/*
 5381		 * PFMEMALLOC skbs are special, they should
 5382		 * - be delivered to SOCK_MEMALLOC sockets only
 5383		 * - stay away from userspace
 5384		 * - have bounded memory usage
 5385		 *
 5386		 * Use PF_MEMALLOC as this saves us from propagating the allocation
 5387		 * context down to all allocation sites.
 5388		 */
 5389		noreclaim_flag = memalloc_noreclaim_save();
 5390		ret = __netif_receive_skb_one_core(skb, true);
 5391		memalloc_noreclaim_restore(noreclaim_flag);
 5392	} else
 5393		ret = __netif_receive_skb_one_core(skb, false);
 5394
 5395	return ret;
 5396}
 5397
 5398static void __netif_receive_skb_list(struct list_head *head)
 5399{
 5400	unsigned long noreclaim_flag = 0;
 5401	struct sk_buff *skb, *next;
 5402	bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
 5403
 5404	list_for_each_entry_safe(skb, next, head, list) {
 5405		if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
 5406			struct list_head sublist;
 5407
 5408			/* Handle the previous sublist */
 5409			list_cut_before(&sublist, head, &skb->list);
 5410			if (!list_empty(&sublist))
 5411				__netif_receive_skb_list_core(&sublist, pfmemalloc);
 5412			pfmemalloc = !pfmemalloc;
 5413			/* See comments in __netif_receive_skb */
 5414			if (pfmemalloc)
 5415				noreclaim_flag = memalloc_noreclaim_save();
 5416			else
 5417				memalloc_noreclaim_restore(noreclaim_flag);
 5418		}
 5419	}
 5420	/* Handle the remaining sublist */
 5421	if (!list_empty(head))
 5422		__netif_receive_skb_list_core(head, pfmemalloc);
 5423	/* Restore pflags */
 5424	if (pfmemalloc)
 5425		memalloc_noreclaim_restore(noreclaim_flag);
 5426}
 5427
 5428static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
 5429{
 5430	struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
 5431	struct bpf_prog *new = xdp->prog;
 5432	int ret = 0;
 5433
 5434	if (new) {
 5435		u32 i;
 5436
 5437		/* generic XDP does not work with DEVMAPs that can
 5438		 * have a bpf_prog installed on an entry
 5439		 */
 5440		for (i = 0; i < new->aux->used_map_cnt; i++) {
 5441			if (dev_map_can_have_prog(new->aux->used_maps[i]))
 5442				return -EINVAL;
 5443		}
 5444	}
 5445
 5446	switch (xdp->command) {
 5447	case XDP_SETUP_PROG:
 5448		rcu_assign_pointer(dev->xdp_prog, new);
 5449		if (old)
 5450			bpf_prog_put(old);
 5451
 5452		if (old && !new) {
 5453			static_branch_dec(&generic_xdp_needed_key);
 5454		} else if (new && !old) {
 5455			static_branch_inc(&generic_xdp_needed_key);
 5456			dev_disable_lro(dev);
 5457			dev_disable_gro_hw(dev);
 5458		}
 5459		break;
 5460
 5461	case XDP_QUERY_PROG:
 5462		xdp->prog_id = old ? old->aux->id : 0;
 5463		break;
 5464
 5465	default:
 5466		ret = -EINVAL;
 5467		break;
 5468	}
 5469
 5470	return ret;
 5471}
 5472
 5473static int netif_receive_skb_internal(struct sk_buff *skb)
 5474{
 5475	int ret;
 5476
 5477	net_timestamp_check(netdev_tstamp_prequeue, skb);
 5478
 5479	if (skb_defer_rx_timestamp(skb))
 5480		return NET_RX_SUCCESS;
 5481
 5482	rcu_read_lock();
 5483#ifdef CONFIG_RPS
 5484	if (static_branch_unlikely(&rps_needed)) {
 5485		struct rps_dev_flow voidflow, *rflow = &voidflow;
 5486		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
 5487
 5488		if (cpu >= 0) {
 5489			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 5490			rcu_read_unlock();
 5491			return ret;
 5492		}
 5493	}
 5494#endif
 5495	ret = __netif_receive_skb(skb);
 5496	rcu_read_unlock();
 5497	return ret;
 5498}
 5499
 5500static void netif_receive_skb_list_internal(struct list_head *head)
 5501{
 5502	struct sk_buff *skb, *next;
 5503	struct list_head sublist;
 5504
 5505	INIT_LIST_HEAD(&sublist);
 5506	list_for_each_entry_safe(skb, next, head, list) {
 5507		net_timestamp_check(netdev_tstamp_prequeue, skb);
 5508		skb_list_del_init(skb);
 5509		if (!skb_defer_rx_timestamp(skb))
 5510			list_add_tail(&skb->list, &sublist);
 5511	}
 5512	list_splice_init(&sublist, head);
 5513
 5514	rcu_read_lock();
 5515#ifdef CONFIG_RPS
 5516	if (static_branch_unlikely(&rps_needed)) {
 5517		list_for_each_entry_safe(skb, next, head, list) {
 5518			struct rps_dev_flow voidflow, *rflow = &voidflow;
 5519			int cpu = get_rps_cpu(skb->dev, skb, &rflow);
 5520
 5521			if (cpu >= 0) {
 5522				/* Will be handled, remove from list */
 5523				skb_list_del_init(skb);
 5524				enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 5525			}
 5526		}
 5527	}
 5528#endif
 5529	__netif_receive_skb_list(head);
 5530	rcu_read_unlock();
 5531}
 5532
 5533/**
 5534 *	netif_receive_skb - process receive buffer from network
 5535 *	@skb: buffer to process
 5536 *
 5537 *	netif_receive_skb() is the main receive data processing function.
 5538 *	It always succeeds. The buffer may be dropped during processing
 5539 *	for congestion control or by the protocol layers.
 5540 *
 5541 *	This function may only be called from softirq context and interrupts
 5542 *	should be enabled.
 5543 *
 5544 *	Return values (usually ignored):
 5545 *	NET_RX_SUCCESS: no congestion
 5546 *	NET_RX_DROP: packet was dropped
 5547 */
 5548int netif_receive_skb(struct sk_buff *skb)
 5549{
 5550	int ret;
 5551
 5552	trace_netif_receive_skb_entry(skb);
 5553
 5554	ret = netif_receive_skb_internal(skb);
 5555	trace_netif_receive_skb_exit(ret);
 5556
 5557	return ret;
 5558}
 5559EXPORT_SYMBOL(netif_receive_skb);
 5560
 5561/**
 5562 *	netif_receive_skb_list - process many receive buffers from network
 5563 *	@head: list of skbs to process.
 5564 *
 5565 *	Since return value of netif_receive_skb() is normally ignored, and
 5566 *	wouldn't be meaningful for a list, this function returns void.
 5567 *
 5568 *	This function may only be called from softirq context and interrupts
 5569 *	should be enabled.
 5570 */
 5571void netif_receive_skb_list(struct list_head *head)
 5572{
 5573	struct sk_buff *skb;
 5574
 5575	if (list_empty(head))
 5576		return;
 5577	if (trace_netif_receive_skb_list_entry_enabled()) {
 5578		list_for_each_entry(skb, head, list)
 5579			trace_netif_receive_skb_list_entry(skb);
 5580	}
 5581	netif_receive_skb_list_internal(head);
 5582	trace_netif_receive_skb_list_exit(0);
 5583}
 5584EXPORT_SYMBOL(netif_receive_skb_list);
 5585
 5586DEFINE_PER_CPU(struct work_struct, flush_works);
 5587
 5588/* Network device is going away, flush any packets still pending */
 5589static void flush_backlog(struct work_struct *work)
 5590{
 5591	struct sk_buff *skb, *tmp;
 5592	struct softnet_data *sd;
 5593
 5594	local_bh_disable();
 5595	sd = this_cpu_ptr(&softnet_data);
 5596
 5597	local_irq_disable();
 5598	rps_lock(sd);
 5599	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
 5600		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 5601			__skb_unlink(skb, &sd->input_pkt_queue);
 5602			kfree_skb(skb);
 5603			input_queue_head_incr(sd);
 5604		}
 5605	}
 5606	rps_unlock(sd);
 5607	local_irq_enable();
 5608
 5609	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
 5610		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 5611			__skb_unlink(skb, &sd->process_queue);
 5612			kfree_skb(skb);
 5613			input_queue_head_incr(sd);
 5614		}
 5615	}
 5616	local_bh_enable();
 5617}
 5618
 5619static void flush_all_backlogs(void)
 5620{
 5621	unsigned int cpu;
 5622
 5623	get_online_cpus();
 5624
 5625	for_each_online_cpu(cpu)
 5626		queue_work_on(cpu, system_highpri_wq,
 5627			      per_cpu_ptr(&flush_works, cpu));
 5628
 5629	for_each_online_cpu(cpu)
 5630		flush_work(per_cpu_ptr(&flush_works, cpu));
 5631
 5632	put_online_cpus();
 5633}
 5634
 5635/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
 5636static void gro_normal_list(struct napi_struct *napi)
 5637{
 5638	if (!napi->rx_count)
 5639		return;
 5640	netif_receive_skb_list_internal(&napi->rx_list);
 5641	INIT_LIST_HEAD(&napi->rx_list);
 5642	napi->rx_count = 0;
 5643}
 5644
 5645/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded,
 5646 * pass the whole batch up to the stack.
 5647 */
 5648static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb)
 5649{
 5650	list_add_tail(&skb->list, &napi->rx_list);
 5651	if (++napi->rx_count >= gro_normal_batch)
 5652		gro_normal_list(napi);
 5653}
 5654
 5655INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int));
 5656INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int));
 5657static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
 5658{
 5659	struct packet_offload *ptype;
 5660	__be16 type = skb->protocol;
 5661	struct list_head *head = &offload_base;
 5662	int err = -ENOENT;
 5663
 5664	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
 5665
 5666	if (NAPI_GRO_CB(skb)->count == 1) {
 5667		skb_shinfo(skb)->gso_size = 0;
 5668		goto out;
 5669	}
 5670
 5671	rcu_read_lock();
 5672	list_for_each_entry_rcu(ptype, head, list) {
 5673		if (ptype->type != type || !ptype->callbacks.gro_complete)
 5674			continue;
 5675
 5676		err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
 5677					 ipv6_gro_complete, inet_gro_complete,
 5678					 skb, 0);
 5679		break;
 5680	}
 5681	rcu_read_unlock();
 5682
 5683	if (err) {
 5684		WARN_ON(&ptype->list == head);
 5685		kfree_skb(skb);
 5686		return NET_RX_SUCCESS;
 5687	}
 5688
 5689out:
 5690	gro_normal_one(napi, skb);
 5691	return NET_RX_SUCCESS;
 5692}
 5693
 5694static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
 5695				   bool flush_old)
 5696{
 5697	struct list_head *head = &napi->gro_hash[index].list;
 5698	struct sk_buff *skb, *p;
 5699
 5700	list_for_each_entry_safe_reverse(skb, p, head, list) {
 5701		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
 5702			return;
 5703		skb_list_del_init(skb);
 5704		napi_gro_complete(napi, skb);
 5705		napi->gro_hash[index].count--;
 5706	}
 5707
 5708	if (!napi->gro_hash[index].count)
 5709		__clear_bit(index, &napi->gro_bitmask);
 5710}
 5711
 5712/* napi->gro_hash[].list contains packets ordered by age.
 5713 * youngest packets at the head of it.
 5714 * Complete skbs in reverse order to reduce latencies.
 5715 */
 5716void napi_gro_flush(struct napi_struct *napi, bool flush_old)
 5717{
 5718	unsigned long bitmask = napi->gro_bitmask;
 5719	unsigned int i, base = ~0U;
 5720
 5721	while ((i = ffs(bitmask)) != 0) {
 5722		bitmask >>= i;
 5723		base += i;
 5724		__napi_gro_flush_chain(napi, base, flush_old);
 5725	}
 5726}
 5727EXPORT_SYMBOL(napi_gro_flush);
 5728
 5729static struct list_head *gro_list_prepare(struct napi_struct *napi,
 5730					  struct sk_buff *skb)
 5731{
 5732	unsigned int maclen = skb->dev->hard_header_len;
 5733	u32 hash = skb_get_hash_raw(skb);
 5734	struct list_head *head;
 5735	struct sk_buff *p;
 5736
 5737	head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
 5738	list_for_each_entry(p, head, list) {
 5739		unsigned long diffs;
 5740
 5741		NAPI_GRO_CB(p)->flush = 0;
 5742
 5743		if (hash != skb_get_hash_raw(p)) {
 5744			NAPI_GRO_CB(p)->same_flow = 0;
 5745			continue;
 5746		}
 5747
 5748		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
 5749		diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
 5750		if (skb_vlan_tag_present(p))
 5751			diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb);
 5752		diffs |= skb_metadata_dst_cmp(p, skb);
 5753		diffs |= skb_metadata_differs(p, skb);
 5754		if (maclen == ETH_HLEN)
 5755			diffs |= compare_ether_header(skb_mac_header(p),
 5756						      skb_mac_header(skb));
 5757		else if (!diffs)
 5758			diffs = memcmp(skb_mac_header(p),
 5759				       skb_mac_header(skb),
 5760				       maclen);
 5761		NAPI_GRO_CB(p)->same_flow = !diffs;
 5762	}
 5763
 5764	return head;
 5765}
 5766
 5767static void skb_gro_reset_offset(struct sk_buff *skb)
 5768{
 5769	const struct skb_shared_info *pinfo = skb_shinfo(skb);
 5770	const skb_frag_t *frag0 = &pinfo->frags[0];
 5771
 5772	NAPI_GRO_CB(skb)->data_offset = 0;
 5773	NAPI_GRO_CB(skb)->frag0 = NULL;
 5774	NAPI_GRO_CB(skb)->frag0_len = 0;
 5775
 5776	if (!skb_headlen(skb) && pinfo->nr_frags &&
 5777	    !PageHighMem(skb_frag_page(frag0))) {
 5778		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
 5779		NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
 5780						    skb_frag_size(frag0),
 5781						    skb->end - skb->tail);
 5782	}
 5783}
 5784
 5785static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
 5786{
 5787	struct skb_shared_info *pinfo = skb_shinfo(skb);
 5788
 5789	BUG_ON(skb->end - skb->tail < grow);
 5790
 5791	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
 5792
 5793	skb->data_len -= grow;
 5794	skb->tail += grow;
 5795
 5796	skb_frag_off_add(&pinfo->frags[0], grow);
 5797	skb_frag_size_sub(&pinfo->frags[0], grow);
 5798
 5799	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
 5800		skb_frag_unref(skb, 0);
 5801		memmove(pinfo->frags, pinfo->frags + 1,
 5802			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
 5803	}
 5804}
 5805
 5806static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
 5807{
 5808	struct sk_buff *oldest;
 5809
 5810	oldest = list_last_entry(head, struct sk_buff, list);
 5811
 5812	/* We are called with head length >= MAX_GRO_SKBS, so this is
 5813	 * impossible.
 5814	 */
 5815	if (WARN_ON_ONCE(!oldest))
 5816		return;
 5817
 5818	/* Do not adjust napi->gro_hash[].count, caller is adding a new
 5819	 * SKB to the chain.
 5820	 */
 5821	skb_list_del_init(oldest);
 5822	napi_gro_complete(napi, oldest);
 5823}
 5824
 5825INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *,
 5826							   struct sk_buff *));
 5827INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *,
 5828							   struct sk_buff *));
 5829static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 5830{
 5831	u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
 5832	struct list_head *head = &offload_base;
 5833	struct packet_offload *ptype;
 5834	__be16 type = skb->protocol;
 5835	struct list_head *gro_head;
 5836	struct sk_buff *pp = NULL;
 5837	enum gro_result ret;
 5838	int same_flow;
 5839	int grow;
 5840
 5841	if (netif_elide_gro(skb->dev))
 5842		goto normal;
 5843
 5844	gro_head = gro_list_prepare(napi, skb);
 5845
 5846	rcu_read_lock();
 5847	list_for_each_entry_rcu(ptype, head, list) {
 5848		if (ptype->type != type || !ptype->callbacks.gro_receive)
 5849			continue;
 5850
 5851		skb_set_network_header(skb, skb_gro_offset(skb));
 5852		skb_reset_mac_len(skb);
 5853		NAPI_GRO_CB(skb)->same_flow = 0;
 5854		NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
 5855		NAPI_GRO_CB(skb)->free = 0;
 5856		NAPI_GRO_CB(skb)->encap_mark = 0;
 5857		NAPI_GRO_CB(skb)->recursion_counter = 0;
 5858		NAPI_GRO_CB(skb)->is_fou = 0;
 5859		NAPI_GRO_CB(skb)->is_atomic = 1;
 5860		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
 5861
 5862		/* Setup for GRO checksum validation */
 5863		switch (skb->ip_summed) {
 5864		case CHECKSUM_COMPLETE:
 5865			NAPI_GRO_CB(skb)->csum = skb->csum;
 5866			NAPI_GRO_CB(skb)->csum_valid = 1;
 5867			NAPI_GRO_CB(skb)->csum_cnt = 0;
 5868			break;
 5869		case CHECKSUM_UNNECESSARY:
 5870			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
 5871			NAPI_GRO_CB(skb)->csum_valid = 0;
 5872			break;
 5873		default:
 5874			NAPI_GRO_CB(skb)->csum_cnt = 0;
 5875			NAPI_GRO_CB(skb)->csum_valid = 0;
 5876		}
 5877
 5878		pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
 5879					ipv6_gro_receive, inet_gro_receive,
 5880					gro_head, skb);
 5881		break;
 5882	}
 5883	rcu_read_unlock();
 5884
 5885	if (&ptype->list == head)
 5886		goto normal;
 5887
 5888	if (PTR_ERR(pp) == -EINPROGRESS) {
 5889		ret = GRO_CONSUMED;
 5890		goto ok;
 5891	}
 5892
 5893	same_flow = NAPI_GRO_CB(skb)->same_flow;
 5894	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
 5895
 5896	if (pp) {
 5897		skb_list_del_init(pp);
 5898		napi_gro_complete(napi, pp);
 5899		napi->gro_hash[hash].count--;
 5900	}
 5901
 5902	if (same_flow)
 5903		goto ok;
 5904
 5905	if (NAPI_GRO_CB(skb)->flush)
 5906		goto normal;
 5907
 5908	if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
 5909		gro_flush_oldest(napi, gro_head);
 5910	} else {
 5911		napi->gro_hash[hash].count++;
 5912	}
 5913	NAPI_GRO_CB(skb)->count = 1;
 5914	NAPI_GRO_CB(skb)->age = jiffies;
 5915	NAPI_GRO_CB(skb)->last = skb;
 5916	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
 5917	list_add(&skb->list, gro_head);
 5918	ret = GRO_HELD;
 5919
 5920pull:
 5921	grow = skb_gro_offset(skb) - skb_headlen(skb);
 5922	if (grow > 0)
 5923		gro_pull_from_frag0(skb, grow);
 5924ok:
 5925	if (napi->gro_hash[hash].count) {
 5926		if (!test_bit(hash, &napi->gro_bitmask))
 5927			__set_bit(hash, &napi->gro_bitmask);
 5928	} else if (test_bit(hash, &napi->gro_bitmask)) {
 5929		__clear_bit(hash, &napi->gro_bitmask);
 5930	}
 5931
 5932	return ret;
 5933
 5934normal:
 5935	ret = GRO_NORMAL;
 5936	goto pull;
 5937}
 5938
 5939struct packet_offload *gro_find_receive_by_type(__be16 type)
 5940{
 5941	struct list_head *offload_head = &offload_base;
 5942	struct packet_offload *ptype;
 5943
 5944	list_for_each_entry_rcu(ptype, offload_head, list) {
 5945		if (ptype->type != type || !ptype->callbacks.gro_receive)
 5946			continue;
 5947		return ptype;
 5948	}
 5949	return NULL;
 5950}
 5951EXPORT_SYMBOL(gro_find_receive_by_type);
 5952
 5953struct packet_offload *gro_find_complete_by_type(__be16 type)
 5954{
 5955	struct list_head *offload_head = &offload_base;
 5956	struct packet_offload *ptype;
 5957
 5958	list_for_each_entry_rcu(ptype, offload_head, list) {
 5959		if (ptype->type != type || !ptype->callbacks.gro_complete)
 5960			continue;
 5961		return ptype;
 5962	}
 5963	return NULL;
 5964}
 5965EXPORT_SYMBOL(gro_find_complete_by_type);
 5966
 5967static void napi_skb_free_stolen_head(struct sk_buff *skb)
 5968{
 5969	skb_dst_drop(skb);
 5970	skb_ext_put(skb);
 5971	kmem_cache_free(skbuff_head_cache, skb);
 5972}
 5973
 5974static gro_result_t napi_skb_finish(struct napi_struct *napi,
 5975				    struct sk_buff *skb,
 5976				    gro_result_t ret)
 5977{
 5978	switch (ret) {
 5979	case GRO_NORMAL:
 5980		gro_normal_one(napi, skb);
 5981		break;
 5982
 5983	case GRO_DROP:
 5984		kfree_skb(skb);
 5985		break;
 5986
 5987	case GRO_MERGED_FREE:
 5988		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
 5989			napi_skb_free_stolen_head(skb);
 5990		else
 5991			__kfree_skb(skb);
 5992		break;
 5993
 5994	case GRO_HELD:
 5995	case GRO_MERGED:
 5996	case GRO_CONSUMED:
 5997		break;
 5998	}
 5999
 6000	return ret;
 6001}
 6002
 6003gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 6004{
 6005	gro_result_t ret;
 6006
 6007	skb_mark_napi_id(skb, napi);
 6008	trace_napi_gro_receive_entry(skb);
 6009
 6010	skb_gro_reset_offset(skb);
 6011
 6012	ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
 6013	trace_napi_gro_receive_exit(ret);
 6014
 6015	return ret;
 6016}
 6017EXPORT_SYMBOL(napi_gro_receive);
 6018
 6019static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
 6020{
 6021	if (unlikely(skb->pfmemalloc)) {
 6022		consume_skb(skb);
 6023		return;
 6024	}
 6025	__skb_pull(skb, skb_headlen(skb));
 6026	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
 6027	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
 6028	__vlan_hwaccel_clear_tag(skb);
 6029	skb->dev = napi->dev;
 6030	skb->skb_iif = 0;
 6031
 6032	/* eth_type_trans() assumes pkt_type is PACKET_HOST */
 6033	skb->pkt_type = PACKET_HOST;
 6034
 6035	skb->encapsulation = 0;
 6036	skb_shinfo(skb)->gso_type = 0;
 6037	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
 6038	skb_ext_reset(skb);
 6039
 6040	napi->skb = skb;
 6041}
 6042
 6043struct sk_buff *napi_get_frags(struct napi_struct *napi)
 6044{
 6045	struct sk_buff *skb = napi->skb;
 6046
 6047	if (!skb) {
 6048		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
 6049		if (skb) {
 6050			napi->skb = skb;
 6051			skb_mark_napi_id(skb, napi);
 6052		}
 6053	}
 6054	return skb;
 6055}
 6056EXPORT_SYMBOL(napi_get_frags);
 6057
 6058static gro_result_t napi_frags_finish(struct napi_struct *napi,
 6059				      struct sk_buff *skb,
 6060				      gro_result_t ret)
 6061{
 6062	switch (ret) {
 6063	case GRO_NORMAL:
 6064	case GRO_HELD:
 6065		__skb_push(skb, ETH_HLEN);
 6066		skb->protocol = eth_type_trans(skb, skb->dev);
 6067		if (ret == GRO_NORMAL)
 6068			gro_normal_one(napi, skb);
 6069		break;
 6070
 6071	case GRO_DROP:
 6072		napi_reuse_skb(napi, skb);
 6073		break;
 6074
 6075	case GRO_MERGED_FREE:
 6076		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
 6077			napi_skb_free_stolen_head(skb);
 6078		else
 6079			napi_reuse_skb(napi, skb);
 6080		break;
 6081
 6082	case GRO_MERGED:
 6083	case GRO_CONSUMED:
 6084		break;
 6085	}
 6086
 6087	return ret;
 6088}
 6089
 6090/* Upper GRO stack assumes network header starts at gro_offset=0
 6091 * Drivers could call both napi_gro_frags() and napi_gro_receive()
 6092 * We copy ethernet header into skb->data to have a common layout.
 6093 */
 6094static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
 6095{
 6096	struct sk_buff *skb = napi->skb;
 6097	const struct ethhdr *eth;
 6098	unsigned int hlen = sizeof(*eth);
 6099
 6100	napi->skb = NULL;
 6101
 6102	skb_reset_mac_header(skb);
 6103	skb_gro_reset_offset(skb);
 6104
 6105	if (unlikely(skb_gro_header_hard(skb, hlen))) {
 6106		eth = skb_gro_header_slow(skb, hlen, 0);
 6107		if (unlikely(!eth)) {
 6108			net_warn_ratelimited("%s: dropping impossible skb from %s\n",
 6109					     __func__, napi->dev->name);
 6110			napi_reuse_skb(napi, skb);
 6111			return NULL;
 6112		}
 6113	} else {
 6114		eth = (const struct ethhdr *)skb->data;
 6115		gro_pull_from_frag0(skb, hlen);
 6116		NAPI_GRO_CB(skb)->frag0 += hlen;
 6117		NAPI_GRO_CB(skb)->frag0_len -= hlen;
 6118	}
 6119	__skb_pull(skb, hlen);
 6120
 6121	/*
 6122	 * This works because the only protocols we care about don't require
 6123	 * special handling.
 6124	 * We'll fix it up properly in napi_frags_finish()
 6125	 */
 6126	skb->protocol = eth->h_proto;
 6127
 6128	return skb;
 6129}
 6130
 6131gro_result_t napi_gro_frags(struct napi_struct *napi)
 6132{
 6133	gro_result_t ret;
 6134	struct sk_buff *skb = napi_frags_skb(napi);
 6135
 6136	if (!skb)
 6137		return GRO_DROP;
 6138
 6139	trace_napi_gro_frags_entry(skb);
 6140
 6141	ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
 6142	trace_napi_gro_frags_exit(ret);
 6143
 6144	return ret;
 6145}
 6146EXPORT_SYMBOL(napi_gro_frags);
 6147
 6148/* Compute the checksum from gro_offset and return the folded value
 6149 * after adding in any pseudo checksum.
 6150 */
 6151__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
 6152{
 6153	__wsum wsum;
 6154	__sum16 sum;
 6155
 6156	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
 6157
 6158	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
 6159	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
 6160	/* See comments in __skb_checksum_complete(). */
 6161	if (likely(!sum)) {
 6162		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
 6163		    !skb->csum_complete_sw)
 6164			netdev_rx_csum_fault(skb->dev, skb);
 6165	}
 6166
 6167	NAPI_GRO_CB(skb)->csum = wsum;
 6168	NAPI_GRO_CB(skb)->csum_valid = 1;
 6169
 6170	return sum;
 6171}
 6172EXPORT_SYMBOL(__skb_gro_checksum_complete);
 6173
 6174static void net_rps_send_ipi(struct softnet_data *remsd)
 6175{
 6176#ifdef CONFIG_RPS
 6177	while (remsd) {
 6178		struct softnet_data *next = remsd->rps_ipi_next;
 6179
 6180		if (cpu_online(remsd->cpu))
 6181			smp_call_function_single_async(remsd->cpu, &remsd->csd);
 6182		remsd = next;
 6183	}
 6184#endif
 6185}
 6186
 6187/*
 6188 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 6189 * Note: called with local irq disabled, but exits with local irq enabled.
 6190 */
 6191static void net_rps_action_and_irq_enable(struct softnet_data *sd)
 6192{
 6193#ifdef CONFIG_RPS
 6194	struct softnet_data *remsd = sd->rps_ipi_list;
 6195
 6196	if (remsd) {
 6197		sd->rps_ipi_list = NULL;
 6198
 6199		local_irq_enable();
 6200
 6201		/* Send pending IPI's to kick RPS processing on remote cpus. */
 6202		net_rps_send_ipi(remsd);
 6203	} else
 6204#endif
 6205		local_irq_enable();
 6206}
 6207
 6208static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
 6209{
 6210#ifdef CONFIG_RPS
 6211	return sd->rps_ipi_list != NULL;
 6212#else
 6213	return false;
 6214#endif
 6215}
 6216
 6217static int process_backlog(struct napi_struct *napi, int quota)
 6218{
 6219	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
 6220	bool again = true;
 6221	int work = 0;
 6222
 6223	/* Check if we have pending ipi, its better to send them now,
 6224	 * not waiting net_rx_action() end.
 6225	 */
 6226	if (sd_has_rps_ipi_waiting(sd)) {
 6227		local_irq_disable();
 6228		net_rps_action_and_irq_enable(sd);
 6229	}
 6230
 6231	napi->weight = dev_rx_weight;
 6232	while (again) {
 6233		struct sk_buff *skb;
 6234
 6235		while ((skb = __skb_dequeue(&sd->process_queue))) {
 6236			rcu_read_lock();
 6237			__netif_receive_skb(skb);
 6238			rcu_read_unlock();
 6239			input_queue_head_incr(sd);
 6240			if (++work >= quota)
 6241				return work;
 6242
 6243		}
 6244
 6245		local_irq_disable();
 6246		rps_lock(sd);
 6247		if (skb_queue_empty(&sd->input_pkt_queue)) {
 6248			/*
 6249			 * Inline a custom version of __napi_complete().
 6250			 * only current cpu owns and manipulates this napi,
 6251			 * and NAPI_STATE_SCHED is the only possible flag set
 6252			 * on backlog.
 6253			 * We can use a plain write instead of clear_bit(),
 6254			 * and we dont need an smp_mb() memory barrier.
 6255			 */
 6256			napi->state = 0;
 6257			again = false;
 6258		} else {
 6259			skb_queue_splice_tail_init(&sd->input_pkt_queue,
 6260						   &sd->process_queue);
 6261		}
 6262		rps_unlock(sd);
 6263		local_irq_enable();
 6264	}
 6265
 6266	return work;
 6267}
 6268
 6269/**
 6270 * __napi_schedule - schedule for receive
 6271 * @n: entry to schedule
 6272 *
 6273 * The entry's receive function will be scheduled to run.
 6274 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
 6275 */
 6276void __napi_schedule(struct napi_struct *n)
 6277{
 6278	unsigned long flags;
 6279
 6280	local_irq_save(flags);
 6281	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 6282	local_irq_restore(flags);
 6283}
 6284EXPORT_SYMBOL(__napi_schedule);
 6285
/**
 *	napi_schedule_prep - check if napi can be scheduled
 *	@n: napi context
 *
 * Test if NAPI routine is already running, and if not mark
 * it as running.  This is used as a condition variable to
 * ensure only one NAPI poll instance runs.  We also make
 * sure there is no pending NAPI disable.
 *
 * Return: true if NAPIF_STATE_SCHED was clear and has been set by this
 * call; false if NAPIF_STATE_DISABLE is set, or if NAPIF_STATE_SCHED was
 * already set (in which case NAPIF_STATE_MISSED is set as well).
 */
bool napi_schedule_prep(struct napi_struct *n)
{
	unsigned long val, new;

	/* Lock-free update: retry until n->state did not change between
	 * the read and the cmpxchg().
	 */
	do {
		val = READ_ONCE(n->state);
		if (unlikely(val & NAPIF_STATE_DISABLE))
			return false;
		new = val | NAPIF_STATE_SCHED;

		/* Sets STATE_MISSED bit if STATE_SCHED was already set
		 * This was suggested by Alexander Duyck, as compiler
		 * emits better code than :
		 * if (val & NAPIF_STATE_SCHED)
		 *     new |= NAPIF_STATE_MISSED;
		 */
		new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
						   NAPIF_STATE_MISSED;
	} while (cmpxchg(&n->state, val, new) != val);

	/* Only the caller that flipped SCHED from 0 to 1 gets true. */
	return !(val & NAPIF_STATE_SCHED);
}
EXPORT_SYMBOL(napi_schedule_prep);
 6318
 6319/**
 6320 * __napi_schedule_irqoff - schedule for receive
 6321 * @n: entry to schedule
 6322 *
 6323 * Variant of __napi_schedule() assuming hard irqs are masked
 6324 */
 6325void __napi_schedule_irqoff(struct napi_struct *n)
 6326{
 6327	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 6328}
 6329EXPORT_SYMBOL(__napi_schedule_irqoff);
 6330
/**
 * napi_complete_done - finish a NAPI poll round
 * @n: napi context
 * @work_done: amount of work reported for this round; a non-zero value
 *	reloads the hard-irq deferral counter and, when GRO packets are
 *	pending, the GRO flush timeout
 *
 * Flushes GRO state, unlinks @n from the poll list if it is linked and
 * clears NAPIF_STATE_SCHED, unless NAPIF_STATE_MISSED was set, in which
 * case @n is rescheduled right away.  If a timeout was selected, the
 * napi hrtimer is armed with it.
 *
 * Return: false when nothing was done because NAPIF_STATE_NPSVC or
 * NAPIF_STATE_IN_BUSY_POLL is set, when @n was rescheduled because of
 * NAPIF_STATE_MISSED, or when a deferral was consumed while
 * gro_flush_timeout is non-zero; true otherwise.
 */
bool napi_complete_done(struct napi_struct *n, int work_done)
{
	unsigned long flags, val, new, timeout = 0;
	bool ret = true;

	/*
	 * 1) Don't let napi dequeue from the cpu poll list
	 *    just in case its running on a different cpu.
	 * 2) If we are busy polling, do nothing here, we have
	 *    the guarantee we will be called later.
	 */
	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
				 NAPIF_STATE_IN_BUSY_POLL)))
		return false;

	if (work_done) {
		if (n->gro_bitmask)
			timeout = READ_ONCE(n->dev->gro_flush_timeout);
		n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
	}
	/* Consume one deferral; with a non-zero timeout the caller is told
	 * (via the false return value) that completion did not fully happen.
	 */
	if (n->defer_hard_irqs_count > 0) {
		n->defer_hard_irqs_count--;
		timeout = READ_ONCE(n->dev->gro_flush_timeout);
		if (timeout)
			ret = false;
	}
	if (n->gro_bitmask) {
		/* When the NAPI instance uses a timeout and keeps postponing
		 * it, we need to bound somehow the time packets are kept in
		 * the GRO layer
		 */
		napi_gro_flush(n, !!timeout);
	}

	gro_normal_list(n);

	if (unlikely(!list_empty(&n->poll_list))) {
		/* If n->poll_list is not empty, we need to mask irqs */
		local_irq_save(flags);
		list_del_init(&n->poll_list);
		local_irq_restore(flags);
	}

	/* Lock-free update of n->state: retry until it did not change
	 * between the read and the cmpxchg().
	 */
	do {
		val = READ_ONCE(n->state);

		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));

		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);

		/* If STATE_MISSED was set, leave STATE_SCHED set,
		 * because we will call napi->poll() one more time.
		 * This C code was suggested by Alexander Duyck to help gcc.
		 */
		new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
						    NAPIF_STATE_SCHED;
	} while (cmpxchg(&n->state, val, new) != val);

	if (unlikely(val & NAPIF_STATE_MISSED)) {
		__napi_schedule(n);
		return false;
	}

	if (timeout)
		hrtimer_start(&n->timer, ns_to_ktime(timeout),
			      HRTIMER_MODE_REL_PINNED);
	return ret;
}
EXPORT_SYMBOL(napi_complete_done);
 6400
 6401/* must be called under rcu_read_lock(), as we dont take a reference */
 6402static struct napi_struct *napi_by_id(unsigned int napi_id)
 6403{
 6404	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
 6405	struct napi_struct *napi;
 6406
 6407	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
 6408		if (napi->napi_id == napi_id)
 6409			return napi;
 6410
 6411	return NULL;
 6412}
 6413
 6414#if defined(CONFIG_NET_RX_BUSY_POLL)
 6415
/* Budget handed to napi->poll() on every busy-poll invocation. */
#define BUSY_POLL_BUDGET 8

/* Leave busy-poll mode for @napi.
 *
 * Clears NAPI_STATE_MISSED and NAPI_STATE_IN_BUSY_POLL, runs napi->poll()
 * one more time with bottom halves disabled, then releases the netpoll
 * lock token @have_poll_lock.  If that final poll used the whole
 * BUSY_POLL_BUDGET, the rx_list is flushed and @napi is rescheduled.
 */
static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
{
	int rc;

	/* Busy polling means there is a high chance device driver hard irq
	 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
	 * set in napi_schedule_prep().
	 * Since we are about to call napi->poll() once more, we can safely
	 * clear NAPI_STATE_MISSED.
	 *
	 * Note: x86 could use a single "lock and ..." instruction
	 * to perform these two clear_bit()
	 */
	clear_bit(NAPI_STATE_MISSED, &napi->state);
	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);

	local_bh_disable();

	/* All we really want here is to re-enable device interrupts.
	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
	 */
	rc = napi->poll(napi, BUSY_POLL_BUDGET);
	/* We can't gro_normal_list() here, because napi->poll() might have
	 * rearmed the napi (napi_complete_done()) in which case it could
	 * already be running on another CPU.
	 */
	trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
	netpoll_poll_unlock(have_poll_lock);
	if (rc == BUSY_POLL_BUDGET) {
		/* As the whole budget was spent, we still own the napi so can
		 * safely handle the rx_list.
		 */
		gro_normal_list(napi);
		__napi_schedule(napi);
	}
	local_bh_enable();
}
 6455
/**
 * napi_busy_loop - busy poll the napi instance registered under an id
 * @napi_id: id used to look the instance up via napi_by_id()
 * @loop_end: callback deciding when to stop; it receives @loop_end_arg and
 *	the start time of the loop.  If NULL, a single pass is made.
 * @loop_end_arg: opaque argument forwarded to @loop_end
 *
 * Tries to take ownership of the napi instance by setting
 * NAPIF_STATE_IN_BUSY_POLL and NAPIF_STATE_SCHED in one cmpxchg(), then
 * calls its poll function with BUSY_POLL_BUDGET until @loop_end says stop.
 * While ownership could not be taken, the loop keeps retrying without
 * polling.  When a reschedule is needed, ownership is dropped through
 * busy_poll_stop(), the CPU is yielded and the lookup starts over.
 */
void napi_busy_loop(unsigned int napi_id,
		    bool (*loop_end)(void *, unsigned long),
		    void *loop_end_arg)
{
	unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
	int (*napi_poll)(struct napi_struct *napi, int budget);
	void *have_poll_lock = NULL;
	struct napi_struct *napi;

restart:
	/* napi_poll != NULL doubles as "we own this napi" marker. */
	napi_poll = NULL;

	rcu_read_lock();

	napi = napi_by_id(napi_id);
	if (!napi)
		goto out;

	preempt_disable();
	for (;;) {
		int work = 0;

		local_bh_disable();
		if (!napi_poll) {
			unsigned long val = READ_ONCE(napi->state);

			/* If multiple threads are competing for this napi,
			 * we avoid dirtying napi->state as much as we can.
			 */
			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
				   NAPIF_STATE_IN_BUSY_POLL))
				goto count;
			if (cmpxchg(&napi->state, val,
				    val | NAPIF_STATE_IN_BUSY_POLL |
					  NAPIF_STATE_SCHED) != val)
				goto count;
			have_poll_lock = netpoll_poll_lock(napi);
			napi_poll = napi->poll;
		}
		work = napi_poll(napi, BUSY_POLL_BUDGET);
		trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
		gro_normal_list(napi);
count:
		/* work stays 0 when ownership could not be taken above. */
		if (work > 0)
			__NET_ADD_STATS(dev_net(napi->dev),
					LINUX_MIB_BUSYPOLLRXPACKETS, work);
		local_bh_enable();

		if (!loop_end || loop_end(loop_end_arg, start_time))
			break;

		if (unlikely(need_resched())) {
			if (napi_poll)
				busy_poll_stop(napi, have_poll_lock);
			preempt_enable();
			rcu_read_unlock();
			cond_resched();
			/* loop_end is non-NULL here, otherwise the check
			 * above would have left the loop already.
			 */
			if (loop_end(loop_end_arg, start_time))
				return;
			goto restart;
		}
		cpu_relax();
	}
	if (napi_poll)
		busy_poll_stop(napi, have_poll_lock);
	preempt_enable();
out:
	rcu_read_unlock();
}
EXPORT_SYMBOL(napi_busy_loop);
 6526
 6527#endif /* CONFIG_NET_RX_BUSY_POLL */
 6528
 6529static void napi_hash_add(struct napi_struct *napi)
 6530{
 6531	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
 6532	    test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
 6533		return;
 6534
 6535	spin_lock(&napi_hash_lock);
 6536
 6537	/* 0..NR_CPUS range is reserved for sender_cpu use */
 6538	do {
 6539		if (unlikely(++napi_gen_id < MIN_NAPI_ID))
 6540			napi_gen_id = MIN_NAPI_ID;
 6541	} while (napi_by_id(napi_gen_id));
 6542	napi->napi_id = napi_gen_id;
 6543
 6544	hlist_add_head_rcu(&napi->napi_hash_node,
 6545			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
 6546
 6547	spin_unlock(&napi_hash_lock);
 6548}
 6549
 6550/* Warning : caller is responsible to make sure rcu grace period
 6551 * is respected before freeing memory containing @napi
 6552 */
 6553bool napi_hash_del(struct napi_struct *napi)
 6554{
 6555	bool rcu_sync_needed = false;
 6556
 6557	spin_lock(&napi_hash_lock);
 6558
 6559	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
 6560		rcu_sync_needed = true;
 6561		hlist_del_rcu(&napi->napi_hash_node);
 6562	}
 6563	spin_unlock(&napi_hash_lock);
 6564	return rcu_sync_needed;
 6565}
 6566EXPORT_SYMBOL_GPL(napi_hash_del);
 6567
 6568static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
 6569{
 6570	struct napi_struct *napi;
 6571
 6572	napi = container_of(timer, struct napi_struct, timer);
 6573
 6574	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
 6575	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
 6576	 */
 6577	if (!napi_disable_pending(napi) &&
 6578	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
 6579		__napi_schedule_irqoff(napi);
 6580
 6581	return HRTIMER_NORESTART;
 6582}
 6583
 6584static void init_gro_hash(struct napi_struct *napi)
 6585{
 6586	int i;
 6587
 6588	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
 6589		INIT_LIST_HEAD(&napi->gro_hash[i].list);
 6590		napi->gro_hash[i].count = 0;
 6591	}
 6592	napi->gro_bitmask = 0;
 6593}
 6594
 6595void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 6596		    int (*poll)(struct napi_struct *, int), int weight)
 6597{
 6598	INIT_LIST_HEAD(&napi->poll_list);
 6599	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 6600	napi->timer.function = napi_watchdog;
 6601	init_gro_hash(napi);
 6602	napi->skb = NULL;
 6603	INIT_LIST_HEAD(&napi->rx_list);
 6604	napi->rx_count = 0;
 6605	napi->poll = poll;
 6606	if (weight > NAPI_POLL_WEIGHT)
 6607		netdev_err_once(dev, "%s() called with weight %d\n", __func__,
 6608				weight);
 6609	napi->weight = weight;
 6610	list_add(&napi->dev_list, &dev->napi_list);
 6611	napi->dev = dev;
 6612#ifdef CONFIG_NETPOLL
 6613	napi->poll_owner = -1;
 6614#endif
 6615	set_bit(NAPI_STATE_SCHED, &napi->state);
 6616	napi_hash_add(napi);
 6617}
 6618EXPORT_SYMBOL(netif_napi_add);
 6619
/* Disable @n: take over NAPI_STATE_SCHED and NAPI_STATE_NPSVC, sleeping in
 * 1 ms steps while either bit is still held by someone else, then cancel
 * the napi hrtimer.  May sleep.
 *
 * NAPI_STATE_DISABLE is only set for the duration of this function; on
 * return SCHED and NPSVC remain set.
 */
void napi_disable(struct napi_struct *n)
{
	might_sleep();
	/* Announce the pending disable before waiting for the bits. */
	set_bit(NAPI_STATE_DISABLE, &n->state);

	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
		msleep(1);
	while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
		msleep(1);

	hrtimer_cancel(&n->timer);

	clear_bit(NAPI_STATE_DISABLE, &n->state);
}
EXPORT_SYMBOL(napi_disable);
 6635
 6636static void flush_gro_hash(struct napi_struct *napi)
 6637{
 6638	int i;
 6639
 6640	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
 6641		struct sk_buff *skb, *n;
 6642
 6643		list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
 6644			kfree_skb(skb);
 6645		napi->gro_hash[i].count = 0;
 6646	}
 6647}
 6648
 6649/* Must be called in process context */
 6650void netif_napi_del(struct napi_struct *napi)
 6651{
 6652	might_sleep();
 6653	if (napi_hash_del(napi))
 6654		synchronize_net();
 6655	list_del_init(&napi->dev_list);
 6656	napi_free_frags(napi);
 6657
 6658	flush_gro_hash(napi);
 6659	napi->gro_bitmask = 0;
 6660}
 6661EXPORT_SYMBOL(netif_napi_del);
 6662
/* Run one poll round for @n on behalf of net_rx_action().
 *
 * @n is unlinked from its poll list and, under the netpoll poll lock, its
 * ->poll() callback is invoked with n->weight if NAPI_STATE_SCHED is set.
 * When the whole weight was consumed, GRO is flushed and @n is appended to
 * @repoll, unless a disable is pending (then it is completed instead) or
 * the driver already rescheduled it.
 *
 * Returns the amount of work reported by ->poll(), or 0 if it was not
 * called.
 */
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
	void *have;
	int work, weight;

	list_del_init(&n->poll_list);

	have = netpoll_poll_lock(n);

	weight = n->weight;

	/* This NAPI_STATE_SCHED test is for avoiding a race
	 * with netpoll's poll_napi().  Only the entity which
	 * obtains the lock and sees NAPI_STATE_SCHED set will
	 * actually make the ->poll() call.  Therefore we avoid
	 * accidentally calling ->poll() when NAPI is not scheduled.
	 */
	work = 0;
	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
		work = n->poll(n, weight);
		trace_napi_poll(n, work, weight);
	}

	WARN_ON_ONCE(work > weight);

	/* Less than the full weight used: nothing more to do here. */
	if (likely(work < weight))
		goto out_unlock;

	/* Drivers must not modify the NAPI state if they
	 * consume the entire weight.  In such cases this code
	 * still "owns" the NAPI instance and therefore can
	 * move the instance around on the list at-will.
	 */
	if (unlikely(napi_disable_pending(n))) {
		napi_complete(n);
		goto out_unlock;
	}

	if (n->gro_bitmask) {
		/* flush too old packets
		 * If HZ < 1000, flush all packets.
		 */
		napi_gro_flush(n, HZ >= 1000);
	}

	gro_normal_list(n);

	/* Some drivers may have called napi_schedule
	 * prior to exhausting their budget.
	 */
	if (unlikely(!list_empty(&n->poll_list))) {
		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
			     n->dev ? n->dev->name : "backlog");
		goto out_unlock;
	}

	list_add_tail(&n->poll_list, repoll);

out_unlock:
	netpoll_poll_unlock(have);

	return work;
}
 6726
/* NET_RX softirq handler.
 *
 * Takes over the current CPU's sd->poll_list and polls its entries through
 * napi_poll() until the list is empty, the packet budget (netdev_budget) is
 * used up, or the time limit (netdev_budget_usecs) has passed.  Entries that
 * still need polling are put back on sd->poll_list and the softirq is
 * raised again.
 */
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
	unsigned long time_limit = jiffies +
		usecs_to_jiffies(netdev_budget_usecs);
	int budget = netdev_budget;
	LIST_HEAD(list);
	LIST_HEAD(repoll);

	/* Move the pending instances to a private list with irqs off. */
	local_irq_disable();
	list_splice_init(&sd->poll_list, &list);
	local_irq_enable();

	for (;;) {
		struct napi_struct *n;

		if (list_empty(&list)) {
			/* Nothing to requeue and no RPS IPI to send: skip
			 * the irq-off section below entirely.
			 */
			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
				goto out;
			break;
		}

		n = list_first_entry(&list, struct napi_struct, poll_list);
		budget -= napi_poll(n, &repoll);

		/* If softirq window is exhausted then punt.
		 * The window ends when the packet budget is used up or
		 * when jiffies reaches time_limit.
		 */
		if (unlikely(budget <= 0 ||
			     time_after_eq(jiffies, time_limit))) {
			sd->time_squeeze++;
			break;
		}
	}

	local_irq_disable();

	/* Rebuild sd->poll_list as: unprocessed entries, then entries that
	 * were queued meanwhile, then the ones asking for a repoll.
	 */
	list_splice_tail_init(&sd->poll_list, &list);
	list_splice_tail(&repoll, &list);
	list_splice(&list, &sd->poll_list);
	if (!list_empty(&sd->poll_list))
		__raise_softirq_irqoff(NET_RX_SOFTIRQ);

	net_rps_action_and_irq_enable(sd);
out:
	__kfree_skb_flush();
}
 6775
/* One entry of a device's adjacency list (dev->adj_list.upper or
 * dev->adj_list.lower), describing a link to the neighbouring device @dev.
 */
struct netdev_adjacent {
	/* the adjacent device this entry points to */
	struct net_device *dev;

	/* upper master flag, there can only be one master device per list */
	bool master;

	/* lookup ignore flag */
	bool ignore;

	/* counter for the number of times this device was added to us */
	u16 ref_nr;

	/* private field for the users */
	void *private;

	/* linkage into the owning adjacency list */
	struct list_head list;
	struct rcu_head rcu;
};
 6794
 6795static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
 6796						 struct list_head *adj_list)
 6797{
 6798	struct netdev_adjacent *adj;
 6799
 6800	list_for_each_entry(adj, adj_list, list) {
 6801		if (adj->dev == adj_dev)
 6802			return adj;
 6803	}
 6804	return NULL;
 6805}
 6806
 6807static int ____netdev_has_upper_dev(struct net_device *upper_dev, void *data)
 6808{
 6809	struct net_device *dev = data;
 6810
 6811	return upper_dev == dev;
 6812}
 6813
 6814/**
 6815 * netdev_has_upper_dev - Check if device is linked to an upper device
 6816 * @dev: device
 6817 * @upper_dev: upper device to check
 6818 *
 6819 * Find out if a device is linked to specified upper device and return true
 6820 * in case it is. Note that this checks only immediate upper device,
 6821 * not through a complete stack of devices. The caller must hold the RTNL lock.
 6822 */
 6823bool netdev_has_upper_dev(struct net_device *dev,
 6824			  struct net_device *upper_dev)
 6825{
 6826	ASSERT_RTNL();
 6827
 6828	return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
 6829					     upper_dev);
 6830}
 6831EXPORT_SYMBOL(netdev_has_upper_dev);
 6832
 6833/**
 6834 * netdev_has_upper_dev_all - Check if device is linked to an upper device
 6835 * @dev: device
 6836 * @upper_dev: upper device to check
 6837 *
 6838 * Find out if a device is linked to specified upper device and return true
 6839 * in case it is. Note that this checks the entire upper device chain.
 6840 * The caller must hold rcu lock.
 6841 */
 6842
 6843bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
 6844				  struct net_device *upper_dev)
 6845{
 6846	return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
 6847					       upper_dev);
 6848}
 6849EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
 6850
 6851/**
 6852 * netdev_has_any_upper_dev - Check if device is linked to some device
 6853 * @dev: device
 6854 *
 6855 * Find out if a device is linked to an upper device and return true in case
 6856 * it is. The caller must hold the RTNL lock.
 6857 */
 6858bool netdev_has_any_upper_dev(struct net_device *dev)
 6859{
 6860	ASSERT_RTNL();
 6861
 6862	return !list_empty(&dev->adj_list.upper);
 6863}
 6864EXPORT_SYMBOL(netdev_has_any_upper_dev);
 6865
 6866/**
 6867 * netdev_master_upper_dev_get - Get master upper device
 6868 * @dev: device
 6869 *
 6870 * Find a master upper device and return pointer to it or NULL in case
 6871 * it's not there. The caller must hold the RTNL lock.
 6872 */
 6873struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
 6874{
 6875	struct netdev_adjacent *upper;
 6876
 6877	ASSERT_RTNL();
 6878
 6879	if (list_empty(&dev->adj_list.upper))
 6880		return NULL;
 6881
 6882	upper = list_first_entry(&dev->adj_list.upper,
 6883				 struct netdev_adjacent, list);
 6884	if (likely(upper->master))
 6885		return upper->dev;
 6886	return NULL;
 6887}
 6888EXPORT_SYMBOL(netdev_master_upper_dev_get);
 6889
 6890static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
 6891{
 6892	struct netdev_adjacent *upper;
 6893
 6894	ASSERT_RTNL();
 6895
 6896	if (list_empty(&dev->adj_list.upper))
 6897		return NULL;
 6898
 6899	upper = list_first_entry(&dev->adj_list.upper,
 6900				 struct netdev_adjacent, list);
 6901	if (likely(upper->master) && !upper->ignore)
 6902		return upper->dev;
 6903	return NULL;
 6904}
 6905
 6906/**
 6907 * netdev_has_any_lower_dev - Check if device is linked to some device
 6908 * @dev: device
 6909 *
 6910 * Find out if a device is linked to a lower device and return true in case
 6911 * it is. The caller must hold the RTNL lock.
 6912 */
 6913static bool netdev_has_any_lower_dev(struct net_device *dev)
 6914{
 6915	ASSERT_RTNL();
 6916
 6917	return !list_empty(&dev->adj_list.lower);
 6918}
 6919
 6920void *netdev_adjacent_get_private(struct list_head *adj_list)
 6921{
 6922	struct netdev_adjacent *adj;
 6923
 6924	adj = list_entry(adj_list, struct netdev_adjacent, list);
 6925
 6926	return adj->private;
 6927}
 6928EXPORT_SYMBOL(netdev_adjacent_get_private);
 6929
 6930/**
 6931 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
 6932 * @dev: device
 6933 * @iter: list_head ** of the current position
 6934 *
 6935 * Gets the next device from the dev's upper list, starting from iter
 6936 * position. The caller must hold RCU read lock.
 6937 */
 6938struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
 6939						 struct list_head **iter)
 6940{
 6941	struct netdev_adjacent *upper;
 6942
 6943	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
 6944
 6945	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 6946
 6947	if (&upper->list == &dev->adj_list.upper)
 6948		return NULL;
 6949
 6950	*iter = &upper->list;
 6951
 6952	return upper->dev;
 6953}
 6954EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
 6955
 6956static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
 6957						  struct list_head **iter,
 6958						  bool *ignore)
 6959{
 6960	struct netdev_adjacent *upper;
 6961
 6962	upper = list_entry((*iter)->next, struct netdev_adjacent, list);
 6963
 6964	if (&upper->list == &dev->adj_list.upper)
 6965		return NULL;
 6966
 6967	*iter = &upper->list;
 6968	*ignore = upper->ignore;
 6969
 6970	return upper->dev;
 6971}
 6972
 6973static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
 6974						    struct list_head **iter)
 6975{
 6976	struct netdev_adjacent *upper;
 6977
 6978	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
 6979
 6980	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 6981
 6982	if (&upper->list == &dev->adj_list.upper)
 6983		return NULL;
 6984
 6985	*iter = &upper->list;
 6986
 6987	return upper->dev;
 6988}
 6989
 6990static int __netdev_walk_all_upper_dev(struct net_device *dev,
 6991				       int (*fn)(struct net_device *dev,
 6992						 void *data),
 6993				       void *data)
 6994{
 6995	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 6996	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 6997	int ret, cur = 0;
 6998	bool ignore;
 6999
 7000	now = dev;
 7001	iter = &dev->adj_list.upper;
 7002
 7003	while (1) {
 7004		if (now != dev) {
 7005			ret = fn(now, data);
 7006			if (ret)
 7007				return ret;
 7008		}
 7009
 7010		next = NULL;
 7011		while (1) {
 7012			udev = __netdev_next_upper_dev(now, &iter, &ignore);
 7013			if (!udev)
 7014				break;
 7015			if (ignore)
 7016				continue;
 7017
 7018			next = udev;
 7019			niter = &udev->adj_list.upper;
 7020			dev_stack[cur] = now;
 7021			iter_stack[cur++] = iter;
 7022			break;
 7023		}
 7024
 7025		if (!next) {
 7026			if (!cur)
 7027				return 0;
 7028			next = dev_stack[--cur];
 7029			niter = iter_stack[cur];
 7030		}
 7031
 7032		now = next;
 7033		iter = niter;
 7034	}
 7035
 7036	return 0;
 7037}
 7038
 7039int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
 7040				  int (*fn)(struct net_device *dev,
 7041					    void *data),
 7042				  void *data)
 7043{
 7044	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7045	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7046	int ret, cur = 0;
 7047
 7048	now = dev;
 7049	iter = &dev->adj_list.upper;
 7050
 7051	while (1) {
 7052		if (now != dev) {
 7053			ret = fn(now, data);
 7054			if (ret)
 7055				return ret;
 7056		}
 7057
 7058		next = NULL;
 7059		while (1) {
 7060			udev = netdev_next_upper_dev_rcu(now, &iter);
 7061			if (!udev)
 7062				break;
 7063
 7064			next = udev;
 7065			niter = &udev->adj_list.upper;
 7066			dev_stack[cur] = now;
 7067			iter_stack[cur++] = iter;
 7068			break;
 7069		}
 7070
 7071		if (!next) {
 7072			if (!cur)
 7073				return 0;
 7074			next = dev_stack[--cur];
 7075			niter = iter_stack[cur];
 7076		}
 7077
 7078		now = next;
 7079		iter = niter;
 7080	}
 7081
 7082	return 0;
 7083}
 7084EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
 7085
 7086static bool __netdev_has_upper_dev(struct net_device *dev,
 7087				   struct net_device *upper_dev)
 7088{
 7089	ASSERT_RTNL();
 7090
 7091	return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
 7092					   upper_dev);
 7093}
 7094
 7095/**
 7096 * netdev_lower_get_next_private - Get the next ->private from the
 7097 *				   lower neighbour list
 7098 * @dev: device
 7099 * @iter: list_head ** of the current position
 7100 *
 7101 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 7102 * list, starting from iter position. The caller must hold either hold the
 7103 * RTNL lock or its own locking that guarantees that the neighbour lower
 7104 * list will remain unchanged.
 7105 */
 7106void *netdev_lower_get_next_private(struct net_device *dev,
 7107				    struct list_head **iter)
 7108{
 7109	struct netdev_adjacent *lower;
 7110
 7111	lower = list_entry(*iter, struct netdev_adjacent, list);
 7112
 7113	if (&lower->list == &dev->adj_list.lower)
 7114		return NULL;
 7115
 7116	*iter = lower->list.next;
 7117
 7118	return lower->private;
 7119}
 7120EXPORT_SYMBOL(netdev_lower_get_next_private);
 7121
 7122/**
 7123 * netdev_lower_get_next_private_rcu - Get the next ->private from the
 7124 *				       lower neighbour list, RCU
 7125 *				       variant
 7126 * @dev: device
 7127 * @iter: list_head ** of the current position
 7128 *
 7129 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 7130 * list, starting from iter position. The caller must hold RCU read lock.
 7131 */
 7132void *netdev_lower_get_next_private_rcu(struct net_device *dev,
 7133					struct list_head **iter)
 7134{
 7135	struct netdev_adjacent *lower;
 7136
 7137	WARN_ON_ONCE(!rcu_read_lock_held());
 7138
 7139	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 7140
 7141	if (&lower->list == &dev->adj_list.lower)
 7142		return NULL;
 7143
 7144	*iter = &lower->list;
 7145
 7146	return lower->private;
 7147}
 7148EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
 7149
 7150/**
 7151 * netdev_lower_get_next - Get the next device from the lower neighbour
 7152 *                         list
 7153 * @dev: device
 7154 * @iter: list_head ** of the current position
 7155 *
 7156 * Gets the next netdev_adjacent from the dev's lower neighbour
 7157 * list, starting from iter position. The caller must hold RTNL lock or
 7158 * its own locking that guarantees that the neighbour lower
 7159 * list will remain unchanged.
 7160 */
 7161void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
 7162{
 7163	struct netdev_adjacent *lower;
 7164
 7165	lower = list_entry(*iter, struct netdev_adjacent, list);
 7166
 7167	if (&lower->list == &dev->adj_list.lower)
 7168		return NULL;
 7169
 7170	*iter = lower->list.next;
 7171
 7172	return lower->dev;
 7173}
 7174EXPORT_SYMBOL(netdev_lower_get_next);
 7175
 7176static struct net_device *netdev_next_lower_dev(struct net_device *dev,
 7177						struct list_head **iter)
 7178{
 7179	struct netdev_adjacent *lower;
 7180
 7181	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
 7182
 7183	if (&lower->list == &dev->adj_list.lower)
 7184		return NULL;
 7185
 7186	*iter = &lower->list;
 7187
 7188	return lower->dev;
 7189}
 7190
 7191static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
 7192						  struct list_head **iter,
 7193						  bool *ignore)
 7194{
 7195	struct netdev_adjacent *lower;
 7196
 7197	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
 7198
 7199	if (&lower->list == &dev->adj_list.lower)
 7200		return NULL;
 7201
 7202	*iter = &lower->list;
 7203	*ignore = lower->ignore;
 7204
 7205	return lower->dev;
 7206}
 7207
 7208int netdev_walk_all_lower_dev(struct net_device *dev,
 7209			      int (*fn)(struct net_device *dev,
 7210					void *data),
 7211			      void *data)
 7212{
 7213	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7214	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7215	int ret, cur = 0;
 7216
 7217	now = dev;
 7218	iter = &dev->adj_list.lower;
 7219
 7220	while (1) {
 7221		if (now != dev) {
 7222			ret = fn(now, data);
 7223			if (ret)
 7224				return ret;
 7225		}
 7226
 7227		next = NULL;
 7228		while (1) {
 7229			ldev = netdev_next_lower_dev(now, &iter);
 7230			if (!ldev)
 7231				break;
 7232
 7233			next = ldev;
 7234			niter = &ldev->adj_list.lower;
 7235			dev_stack[cur] = now;
 7236			iter_stack[cur++] = iter;
 7237			break;
 7238		}
 7239
 7240		if (!next) {
 7241			if (!cur)
 7242				return 0;
 7243			next = dev_stack[--cur];
 7244			niter = iter_stack[cur];
 7245		}
 7246
 7247		now = next;
 7248		iter = niter;
 7249	}
 7250
 7251	return 0;
 7252}
 7253EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
 7254
 7255static int __netdev_walk_all_lower_dev(struct net_device *dev,
 7256				       int (*fn)(struct net_device *dev,
 7257						 void *data),
 7258				       void *data)
 7259{
 7260	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7261	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7262	int ret, cur = 0;
 7263	bool ignore;
 7264
 7265	now = dev;
 7266	iter = &dev->adj_list.lower;
 7267
 7268	while (1) {
 7269		if (now != dev) {
 7270			ret = fn(now, data);
 7271			if (ret)
 7272				return ret;
 7273		}
 7274
 7275		next = NULL;
 7276		while (1) {
 7277			ldev = __netdev_next_lower_dev(now, &iter, &ignore);
 7278			if (!ldev)
 7279				break;
 7280			if (ignore)
 7281				continue;
 7282
 7283			next = ldev;
 7284			niter = &ldev->adj_list.lower;
 7285			dev_stack[cur] = now;
 7286			iter_stack[cur++] = iter;
 7287			break;
 7288		}
 7289
 7290		if (!next) {
 7291			if (!cur)
 7292				return 0;
 7293			next = dev_stack[--cur];
 7294			niter = iter_stack[cur];
 7295		}
 7296
 7297		now = next;
 7298		iter = niter;
 7299	}
 7300
 7301	return 0;
 7302}
 7303
 7304struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
 7305					     struct list_head **iter)
 7306{
 7307	struct netdev_adjacent *lower;
 7308
 7309	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 7310	if (&lower->list == &dev->adj_list.lower)
 7311		return NULL;
 7312
 7313	*iter = &lower->list;
 7314
 7315	return lower->dev;
 7316}
 7317EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
 7318
 7319static u8 __netdev_upper_depth(struct net_device *dev)
 7320{
 7321	struct net_device *udev;
 7322	struct list_head *iter;
 7323	u8 max_depth = 0;
 7324	bool ignore;
 7325
 7326	for (iter = &dev->adj_list.upper,
 7327	     udev = __netdev_next_upper_dev(dev, &iter, &ignore);
 7328	     udev;
 7329	     udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
 7330		if (ignore)
 7331			continue;
 7332		if (max_depth < udev->upper_level)
 7333			max_depth = udev->upper_level;
 7334	}
 7335
 7336	return max_depth;
 7337}
 7338
 7339static u8 __netdev_lower_depth(struct net_device *dev)
 7340{
 7341	struct net_device *ldev;
 7342	struct list_head *iter;
 7343	u8 max_depth = 0;
 7344	bool ignore;
 7345
 7346	for (iter = &dev->adj_list.lower,
 7347	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
 7348	     ldev;
 7349	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
 7350		if (ignore)
 7351			continue;
 7352		if (max_depth < ldev->lower_level)
 7353			max_depth = ldev->lower_level;
 7354	}
 7355
 7356	return max_depth;
 7357}
 7358
 7359static int __netdev_update_upper_level(struct net_device *dev, void *data)
 7360{
 7361	dev->upper_level = __netdev_upper_depth(dev) + 1;
 7362	return 0;
 7363}
 7364
 7365static int __netdev_update_lower_level(struct net_device *dev, void *data)
 7366{
 7367	dev->lower_level = __netdev_lower_depth(dev) + 1;
 7368	return 0;
 7369}
 7370
 7371int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
 7372				  int (*fn)(struct net_device *dev,
 7373					    void *data),
 7374				  void *data)
 7375{
 7376	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7377	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7378	int ret, cur = 0;
 7379
 7380	now = dev;
 7381	iter = &dev->adj_list.lower;
 7382
 7383	while (1) {
 7384		if (now != dev) {
 7385			ret = fn(now, data);
 7386			if (ret)
 7387				return ret;
 7388		}
 7389
 7390		next = NULL;
 7391		while (1) {
 7392			ldev = netdev_next_lower_dev_rcu(now, &iter);
 7393			if (!ldev)
 7394				break;
 7395
 7396			next = ldev;
 7397			niter = &ldev->adj_list.lower;
 7398			dev_stack[cur] = now;
 7399			iter_stack[cur++] = iter;
 7400			break;
 7401		}
 7402
 7403		if (!next) {
 7404			if (!cur)
 7405				return 0;
 7406			next = dev_stack[--cur];
 7407			niter = iter_stack[cur];
 7408		}
 7409
 7410		now = next;
 7411		iter = niter;
 7412	}
 7413
 7414	return 0;
 7415}
 7416EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
 7417
 7418/**
 7419 * netdev_lower_get_first_private_rcu - Get the first ->private from the
 7420 *				       lower neighbour list, RCU
 7421 *				       variant
 7422 * @dev: device
 7423 *
 7424 * Gets the first netdev_adjacent->private from the dev's lower neighbour
 7425 * list. The caller must hold RCU read lock.
 7426 */
 7427void *netdev_lower_get_first_private_rcu(struct net_device *dev)
 7428{
 7429	struct netdev_adjacent *lower;
 7430
 7431	lower = list_first_or_null_rcu(&dev->adj_list.lower,
 7432			struct netdev_adjacent, list);
 7433	if (lower)
 7434		return lower->private;
 7435	return NULL;
 7436}
 7437EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
 7438
 7439/**
 7440 * netdev_master_upper_dev_get_rcu - Get master upper device
 7441 * @dev: device
 7442 *
 7443 * Find a master upper device and return pointer to it or NULL in case
 7444 * it's not there. The caller must hold the RCU read lock.
 7445 */
 7446struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
 7447{
 7448	struct netdev_adjacent *upper;
 7449
 7450	upper = list_first_or_null_rcu(&dev->adj_list.upper,
 7451				       struct netdev_adjacent, list);
 7452	if (upper && likely(upper->master))
 7453		return upper->dev;
 7454	return NULL;
 7455}
 7456EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
 7457
 7458static int netdev_adjacent_sysfs_add(struct net_device *dev,
 7459			      struct net_device *adj_dev,
 7460			      struct list_head *dev_list)
 7461{
 7462	char linkname[IFNAMSIZ+7];
 7463
 7464	sprintf(linkname, dev_list == &dev->adj_list.upper ?
 7465		"upper_%s" : "lower_%s", adj_dev->name);
 7466	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
 7467				 linkname);
 7468}
 7469static void netdev_adjacent_sysfs_del(struct net_device *dev,
 7470			       char *name,
 7471			       struct list_head *dev_list)
 7472{
 7473	char linkname[IFNAMSIZ+7];
 7474
 7475	sprintf(linkname, dev_list == &dev->adj_list.upper ?
 7476		"upper_%s" : "lower_%s", name);
 7477	sysfs_remove_link(&(dev->dev.kobj), linkname);
 7478}
 7479
 7480static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
 7481						 struct net_device *adj_dev,
 7482						 struct list_head *dev_list)
 7483{
 7484	return (dev_list == &dev->adj_list.upper ||
 7485		dev_list == &dev->adj_list.lower) &&
 7486		net_eq(dev_net(dev), dev_net(adj_dev));
 7487}
 7488
 7489static int __netdev_adjacent_dev_insert(struct net_device *dev,
 7490					struct net_device *adj_dev,
 7491					struct list_head *dev_list,
 7492					void *private, bool master)
 7493{
 7494	struct netdev_adjacent *adj;
 7495	int ret;
 7496
 7497	adj = __netdev_find_adj(adj_dev, dev_list);
 7498
 7499	if (adj) {
 7500		adj->ref_nr += 1;
 7501		pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
 7502			 dev->name, adj_dev->name, adj->ref_nr);
 7503
 7504		return 0;
 7505	}
 7506
 7507	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
 7508	if (!adj)
 7509		return -ENOMEM;
 7510
 7511	adj->dev = adj_dev;
 7512	adj->master = master;
 7513	adj->ref_nr = 1;
 7514	adj->private = private;
 7515	adj->ignore = false;
 7516	dev_hold(adj_dev);
 7517
 7518	pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
 7519		 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
 7520
 7521	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
 7522		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
 7523		if (ret)
 7524			goto free_adj;
 7525	}
 7526
 7527	/* Ensure that master link is always the first item in list. */
 7528	if (master) {
 7529		ret = sysfs_create_link(&(dev->dev.kobj),
 7530					&(adj_dev->dev.kobj), "master");
 7531		if (ret)
 7532			goto remove_symlinks;
 7533
 7534		list_add_rcu(&adj->list, dev_list);
 7535	} else {
 7536		list_add_tail_rcu(&adj->list, dev_list);
 7537	}
 7538
 7539	return 0;
 7540
 7541remove_symlinks:
 7542	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
 7543		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
 7544free_adj:
 7545	kfree(adj);
 7546	dev_put(adj_dev);
 7547
 7548	return ret;
 7549}
 7550
 7551static void __netdev_adjacent_dev_remove(struct net_device *dev,
 7552					 struct net_device *adj_dev,
 7553					 u16 ref_nr,
 7554					 struct list_head *dev_list)
 7555{
 7556	struct netdev_adjacent *adj;
 7557
 7558	pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
 7559		 dev->name, adj_dev->name, ref_nr);
 7560
 7561	adj = __netdev_find_adj(adj_dev, dev_list);
 7562
 7563	if (!adj) {
 7564		pr_err("Adjacency does not exist for device %s from %s\n",
 7565		       dev->name, adj_dev->name);
 7566		WARN_ON(1);
 7567		return;
 7568	}
 7569
 7570	if (adj->ref_nr > ref_nr) {
 7571		pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
 7572			 dev->name, adj_dev->name, ref_nr,
 7573			 adj->ref_nr - ref_nr);
 7574		adj->ref_nr -= ref_nr;
 7575		return;
 7576	}
 7577
 7578	if (adj->master)
 7579		sysfs_remove_link(&(dev->dev.kobj), "master");
 7580
 7581	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
 7582		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
 7583
 7584	list_del_rcu(&adj->list);
 7585	pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
 7586		 adj_dev->name, dev->name, adj_dev->name);
 7587	dev_put(adj_dev);
 7588	kfree_rcu(adj, rcu);
 7589}
 7590
 7591static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
 7592					    struct net_device *upper_dev,
 7593					    struct list_head *up_list,
 7594					    struct list_head *down_list,
 7595					    void *private, bool master)
 7596{
 7597	int ret;
 7598
 7599	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
 7600					   private, master);
 7601	if (ret)
 7602		return ret;
 7603
 7604	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
 7605					   private, false);
 7606	if (ret) {
 7607		__netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
 7608		return ret;
 7609	}
 7610
 7611	return 0;
 7612}
 7613
 7614static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
 7615					       struct net_device *upper_dev,
 7616					       u16 ref_nr,
 7617					       struct list_head *up_list,
 7618					       struct list_head *down_list)
 7619{
 7620	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
 7621	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
 7622}
 7623
 7624static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
 7625						struct net_device *upper_dev,
 7626						void *private, bool master)
 7627{
 7628	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
 7629						&dev->adj_list.upper,
 7630						&upper_dev->adj_list.lower,
 7631						private, master);
 7632}
 7633
 7634static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
 7635						   struct net_device *upper_dev)
 7636{
 7637	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
 7638					   &dev->adj_list.upper,
 7639					   &upper_dev->adj_list.lower);
 7640}
 7641
 7642static int __netdev_upper_dev_link(struct net_device *dev,
 7643				   struct net_device *upper_dev, bool master,
 7644				   void *upper_priv, void *upper_info,
 7645				   struct netlink_ext_ack *extack)
 7646{
 7647	struct netdev_notifier_changeupper_info changeupper_info = {
 7648		.info = {
 7649			.dev = dev,
 7650			.extack = extack,
 7651		},
 7652		.upper_dev = upper_dev,
 7653		.master = master,
 7654		.linking = true,
 7655		.upper_info = upper_info,
 7656	};
 7657	struct net_device *master_dev;
 7658	int ret = 0;
 7659
 7660	ASSERT_RTNL();
 7661
 7662	if (dev == upper_dev)
 7663		return -EBUSY;
 7664
 7665	/* To prevent loops, check if dev is not upper device to upper_dev. */
 7666	if (__netdev_has_upper_dev(upper_dev, dev))
 7667		return -EBUSY;
 7668
 7669	if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
 7670		return -EMLINK;
 7671
 7672	if (!master) {
 7673		if (__netdev_has_upper_dev(dev, upper_dev))
 7674			return -EEXIST;
 7675	} else {
 7676		master_dev = __netdev_master_upper_dev_get(dev);
 7677		if (master_dev)
 7678			return master_dev == upper_dev ? -EEXIST : -EBUSY;
 7679	}
 7680
 7681	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
 7682					    &changeupper_info.info);
 7683	ret = notifier_to_errno(ret);
 7684	if (ret)
 7685		return ret;
 7686
 7687	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
 7688						   master);
 7689	if (ret)
 7690		return ret;
 7691
 7692	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
 7693					    &changeupper_info.info);
 7694	ret = notifier_to_errno(ret);
 7695	if (ret)
 7696		goto rollback;
 7697
 7698	__netdev_update_upper_level(dev, NULL);
 7699	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
 7700
 7701	__netdev_update_lower_level(upper_dev, NULL);
 7702	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
 7703				    NULL);
 7704
 7705	return 0;
 7706
 7707rollback:
 7708	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 7709
 7710	return ret;
 7711}
 7712
 7713/**
 7714 * netdev_upper_dev_link - Add a link to the upper device
 7715 * @dev: device
 7716 * @upper_dev: new upper device
 7717 * @extack: netlink extended ack
 7718 *
 7719 * Adds a link to device which is upper to this one. The caller must hold
 7720 * the RTNL lock. On a failure a negative errno code is returned.
 7721 * On success the reference counts are adjusted and the function
 7722 * returns zero.
 7723 */
 7724int netdev_upper_dev_link(struct net_device *dev,
 7725			  struct net_device *upper_dev,
 7726			  struct netlink_ext_ack *extack)
 7727{
 7728	return __netdev_upper_dev_link(dev, upper_dev, false,
 7729				       NULL, NULL, extack);
 7730}
 7731EXPORT_SYMBOL(netdev_upper_dev_link);
 7732
 7733/**
 7734 * netdev_master_upper_dev_link - Add a master link to the upper device
 7735 * @dev: device
 7736 * @upper_dev: new upper device
 7737 * @upper_priv: upper device private
 7738 * @upper_info: upper info to be passed down via notifier
 7739 * @extack: netlink extended ack
 7740 *
 7741 * Adds a link to device which is upper to this one. In this case, only
 7742 * one master upper device can be linked, although other non-master devices
 7743 * might be linked as well. The caller must hold the RTNL lock.
 7744 * On a failure a negative errno code is returned. On success the reference
 7745 * counts are adjusted and the function returns zero.
 7746 */
 7747int netdev_master_upper_dev_link(struct net_device *dev,
 7748				 struct net_device *upper_dev,
 7749				 void *upper_priv, void *upper_info,
 7750				 struct netlink_ext_ack *extack)
 7751{
 7752	return __netdev_upper_dev_link(dev, upper_dev, true,
 7753				       upper_priv, upper_info, extack);
 7754}
 7755EXPORT_SYMBOL(netdev_master_upper_dev_link);
 7756
 7757/**
 7758 * netdev_upper_dev_unlink - Removes a link to upper device
 7759 * @dev: device
 7760 * @upper_dev: new upper device
 7761 *
 7762 * Removes a link to device which is upper to this one. The caller must hold
 7763 * the RTNL lock.
 7764 */
 7765void netdev_upper_dev_unlink(struct net_device *dev,
 7766			     struct net_device *upper_dev)
 7767{
 7768	struct netdev_notifier_changeupper_info changeupper_info = {
 7769		.info = {
 7770			.dev = dev,
 7771		},
 7772		.upper_dev = upper_dev,
 7773		.linking = false,
 7774	};
 7775
 7776	ASSERT_RTNL();
 7777
 7778	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
 7779
 7780	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
 7781				      &changeupper_info.info);
 7782
 7783	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 7784
 7785	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
 7786				      &changeupper_info.info);
 7787
 7788	__netdev_update_upper_level(dev, NULL);
 7789	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
 7790
 7791	__netdev_update_lower_level(upper_dev, NULL);
 7792	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
 7793				    NULL);
 7794}
 7795EXPORT_SYMBOL(netdev_upper_dev_unlink);
 7796
 7797static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
 7798				      struct net_device *lower_dev,
 7799				      bool val)
 7800{
 7801	struct netdev_adjacent *adj;
 7802
 7803	adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
 7804	if (adj)
 7805		adj->ignore = val;
 7806
 7807	adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
 7808	if (adj)
 7809		adj->ignore = val;
 7810}
 7811
 7812static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
 7813					struct net_device *lower_dev)
 7814{
 7815	__netdev_adjacent_dev_set(upper_dev, lower_dev, true);
 7816}
 7817
 7818static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
 7819				       struct net_device *lower_dev)
 7820{
 7821	__netdev_adjacent_dev_set(upper_dev, lower_dev, false);
 7822}
 7823
 7824int netdev_adjacent_change_prepare(struct net_device *old_dev,
 7825				   struct net_device *new_dev,
 7826				   struct net_device *dev,
 7827				   struct netlink_ext_ack *extack)
 7828{
 7829	int err;
 7830
 7831	if (!new_dev)
 7832		return 0;
 7833
 7834	if (old_dev && new_dev != old_dev)
 7835		netdev_adjacent_dev_disable(dev, old_dev);
 7836
 7837	err = netdev_upper_dev_link(new_dev, dev, extack);
 7838	if (err) {
 7839		if (old_dev && new_dev != old_dev)
 7840			netdev_adjacent_dev_enable(dev, old_dev);
 7841		return err;
 7842	}
 7843
 7844	return 0;
 7845}
 7846EXPORT_SYMBOL(netdev_adjacent_change_prepare);
 7847
 7848void netdev_adjacent_change_commit(struct net_device *old_dev,
 7849				   struct net_device *new_dev,
 7850				   struct net_device *dev)
 7851{
 7852	if (!new_dev || !old_dev)
 7853		return;
 7854
 7855	if (new_dev == old_dev)
 7856		return;
 7857
 7858	netdev_adjacent_dev_enable(dev, old_dev);
 7859	netdev_upper_dev_unlink(old_dev, dev);
 7860}
 7861EXPORT_SYMBOL(netdev_adjacent_change_commit);
 7862
 7863void netdev_adjacent_change_abort(struct net_device *old_dev,
 7864				  struct net_device *new_dev,
 7865				  struct net_device *dev)
 7866{
 7867	if (!new_dev)
 7868		return;
 7869
 7870	if (old_dev && new_dev != old_dev)
 7871		netdev_adjacent_dev_enable(dev, old_dev);
 7872
 7873	netdev_upper_dev_unlink(new_dev, dev);
 7874}
 7875EXPORT_SYMBOL(netdev_adjacent_change_abort);
 7876
 7877/**
 7878 * netdev_bonding_info_change - Dispatch event about slave change
 7879 * @dev: device
 7880 * @bonding_info: info to dispatch
 7881 *
 7882 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
 7883 * The caller must hold the RTNL lock.
 7884 */
 7885void netdev_bonding_info_change(struct net_device *dev,
 7886				struct netdev_bonding_info *bonding_info)
 7887{
 7888	struct netdev_notifier_bonding_info info = {
 7889		.info.dev = dev,
 7890	};
 7891
 7892	memcpy(&info.bonding_info, bonding_info,
 7893	       sizeof(struct netdev_bonding_info));
 7894	call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
 7895				      &info.info);
 7896}
 7897EXPORT_SYMBOL(netdev_bonding_info_change);
 7898
 7899/**
 7900 * netdev_get_xmit_slave - Get the xmit slave of master device
 7901 * @skb: The packet
 7902 * @all_slaves: assume all the slaves are active
 7903 *
 7904 * The reference counters are not incremented so the caller must be
 7905 * careful with locks. The caller must hold RCU lock.
 7906 * %NULL is returned if no slave is found.
 7907 */
 7908
 7909struct net_device *netdev_get_xmit_slave(struct net_device *dev,
 7910					 struct sk_buff *skb,
 7911					 bool all_slaves)
 7912{
 7913	const struct net_device_ops *ops = dev->netdev_ops;
 7914
 7915	if (!ops->ndo_get_xmit_slave)
 7916		return NULL;
 7917	return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
 7918}
 7919EXPORT_SYMBOL(netdev_get_xmit_slave);
 7920
 7921static void netdev_adjacent_add_links(struct net_device *dev)
 7922{
 7923	struct netdev_adjacent *iter;
 7924
 7925	struct net *net = dev_net(dev);
 7926
 7927	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 7928		if (!net_eq(net, dev_net(iter->dev)))
 7929			continue;
 7930		netdev_adjacent_sysfs_add(iter->dev, dev,
 7931					  &iter->dev->adj_list.lower);
 7932		netdev_adjacent_sysfs_add(dev, iter->dev,
 7933					  &dev->adj_list.upper);
 7934	}
 7935
 7936	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 7937		if (!net_eq(net, dev_net(iter->dev)))
 7938			continue;
 7939		netdev_adjacent_sysfs_add(iter->dev, dev,
 7940					  &iter->dev->adj_list.upper);
 7941		netdev_adjacent_sysfs_add(dev, iter->dev,
 7942					  &dev->adj_list.lower);
 7943	}
 7944}
 7945
 7946static void netdev_adjacent_del_links(struct net_device *dev)
 7947{
 7948	struct netdev_adjacent *iter;
 7949
 7950	struct net *net = dev_net(dev);
 7951
 7952	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 7953		if (!net_eq(net, dev_net(iter->dev)))
 7954			continue;
 7955		netdev_adjacent_sysfs_del(iter->dev, dev->name,
 7956					  &iter->dev->adj_list.lower);
 7957		netdev_adjacent_sysfs_del(dev, iter->dev->name,
 7958					  &dev->adj_list.upper);
 7959	}
 7960
 7961	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 7962		if (!net_eq(net, dev_net(iter->dev)))
 7963			continue;
 7964		netdev_adjacent_sysfs_del(iter->dev, dev->name,
 7965					  &iter->dev->adj_list.upper);
 7966		netdev_adjacent_sysfs_del(dev, iter->dev->name,
 7967					  &dev->adj_list.lower);
 7968	}
 7969}
 7970
 7971void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
 7972{
 7973	struct netdev_adjacent *iter;
 7974
 7975	struct net *net = dev_net(dev);
 7976
 7977	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 7978		if (!net_eq(net, dev_net(iter->dev)))
 7979			continue;
 7980		netdev_adjacent_sysfs_del(iter->dev, oldname,
 7981					  &iter->dev->adj_list.lower);
 7982		netdev_adjacent_sysfs_add(iter->dev, dev,
 7983					  &iter->dev->adj_list.lower);
 7984	}
 7985
 7986	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 7987		if (!net_eq(net, dev_net(iter->dev)))
 7988			continue;
 7989		netdev_adjacent_sysfs_del(iter->dev, oldname,
 7990					  &iter->dev->adj_list.upper);
 7991		netdev_adjacent_sysfs_add(iter->dev, dev,
 7992					  &iter->dev->adj_list.upper);
 7993	}
 7994}
 7995
 7996void *netdev_lower_dev_get_private(struct net_device *dev,
 7997				   struct net_device *lower_dev)
 7998{
 7999	struct netdev_adjacent *lower;
 8000
 8001	if (!lower_dev)
 8002		return NULL;
 8003	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
 8004	if (!lower)
 8005		return NULL;
 8006
 8007	return lower->private;
 8008}
 8009EXPORT_SYMBOL(netdev_lower_dev_get_private);
 8010
 8011
 8012/**
 8013 * netdev_lower_change - Dispatch event about lower device state change
 8014 * @lower_dev: device
 8015 * @lower_state_info: state to dispatch
 8016 *
 8017 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
 8018 * The caller must hold the RTNL lock.
 8019 */
 8020void netdev_lower_state_changed(struct net_device *lower_dev,
 8021				void *lower_state_info)
 8022{
 8023	struct netdev_notifier_changelowerstate_info changelowerstate_info = {
 8024		.info.dev = lower_dev,
 8025	};
 8026
 8027	ASSERT_RTNL();
 8028	changelowerstate_info.lower_state_info = lower_state_info;
 8029	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
 8030				      &changelowerstate_info.info);
 8031}
 8032EXPORT_SYMBOL(netdev_lower_state_changed);
 8033
 8034static void dev_change_rx_flags(struct net_device *dev, int flags)
 8035{
 8036	const struct net_device_ops *ops = dev->netdev_ops;
 8037
 8038	if (ops->ndo_change_rx_flags)
 8039		ops->ndo_change_rx_flags(dev, flags);
 8040}
 8041
 8042static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
 8043{
 8044	unsigned int old_flags = dev->flags;
 8045	kuid_t uid;
 8046	kgid_t gid;
 8047
 8048	ASSERT_RTNL();
 8049
 8050	dev->flags |= IFF_PROMISC;
 8051	dev->promiscuity += inc;
 8052	if (dev->promiscuity == 0) {
 8053		/*
 8054		 * Avoid overflow.
 8055		 * If inc causes overflow, untouch promisc and return error.
 8056		 */
 8057		if (inc < 0)
 8058			dev->flags &= ~IFF_PROMISC;
 8059		else {
 8060			dev->promiscuity -= inc;
 8061			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
 8062				dev->name);
 8063			return -EOVERFLOW;
 8064		}
 8065	}
 8066	if (dev->flags != old_flags) {
 8067		pr_info("device %s %s promiscuous mode\n",
 8068			dev->name,
 8069			dev->flags & IFF_PROMISC ? "entered" : "left");
 8070		if (audit_enabled) {
 8071			current_uid_gid(&uid, &gid);
 8072			audit_log(audit_context(), GFP_ATOMIC,
 8073				  AUDIT_ANOM_PROMISCUOUS,
 8074				  "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
 8075				  dev->name, (dev->flags & IFF_PROMISC),
 8076				  (old_flags & IFF_PROMISC),
 8077				  from_kuid(&init_user_ns, audit_get_loginuid(current)),
 8078				  from_kuid(&init_user_ns, uid),
 8079				  from_kgid(&init_user_ns, gid),
 8080				  audit_get_sessionid(current));
 8081		}
 8082
 8083		dev_change_rx_flags(dev, IFF_PROMISC);
 8084	}
 8085	if (notify)
 8086		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
 8087	return 0;
 8088}
 8089
 8090/**
 8091 *	dev_set_promiscuity	- update promiscuity count on a device
 8092 *	@dev: device
 8093 *	@inc: modifier
 8094 *
 8095 *	Add or remove promiscuity from a device. While the count in the device
 8096 *	remains above zero the interface remains promiscuous. Once it hits zero
 8097 *	the device reverts back to normal filtering operation. A negative inc
 8098 *	value is used to drop promiscuity on the device.
 8099 *	Return 0 if successful or a negative errno code on error.
 8100 */
 8101int dev_set_promiscuity(struct net_device *dev, int inc)
 8102{
 8103	unsigned int old_flags = dev->flags;
 8104	int err;
 8105
 8106	err = __dev_set_promiscuity(dev, inc, true);
 8107	if (err < 0)
 8108		return err;
 8109	if (dev->flags != old_flags)
 8110		dev_set_rx_mode(dev);
 8111	return err;
 8112}
 8113EXPORT_SYMBOL(dev_set_promiscuity);
 8114
 8115static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
 8116{
 8117	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
 8118
 8119	ASSERT_RTNL();
 8120
 8121	dev->flags |= IFF_ALLMULTI;
 8122	dev->allmulti += inc;
 8123	if (dev->allmulti == 0) {
 8124		/*
 8125		 * Avoid overflow.
 8126		 * If inc causes overflow, untouch allmulti and return error.
 8127		 */
 8128		if (inc < 0)
 8129			dev->flags &= ~IFF_ALLMULTI;
 8130		else {
 8131			dev->allmulti -= inc;
 8132			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
 8133				dev->name);
 8134			return -EOVERFLOW;
 8135		}
 8136	}
 8137	if (dev->flags ^ old_flags) {
 8138		dev_change_rx_flags(dev, IFF_ALLMULTI);
 8139		dev_set_rx_mode(dev);
 8140		if (notify)
 8141			__dev_notify_flags(dev, old_flags,
 8142					   dev->gflags ^ old_gflags);
 8143	}
 8144	return 0;
 8145}
 8146
 8147/**
 8148 *	dev_set_allmulti	- update allmulti count on a device
 8149 *	@dev: device
 8150 *	@inc: modifier
 8151 *
 8152 *	Add or remove reception of all multicast frames to a device. While the
 8153 *	count in the device remains above zero the interface remains listening
 8154 *	to all interfaces. Once it hits zero the device reverts back to normal
 8155 *	filtering operation. A negative @inc value is used to drop the counter
 8156 *	when releasing a resource needing all multicasts.
 8157 *	Return 0 if successful or a negative errno code on error.
 8158 */
 8159
 8160int dev_set_allmulti(struct net_device *dev, int inc)
 8161{
 8162	return __dev_set_allmulti(dev, inc, true);
 8163}
 8164EXPORT_SYMBOL(dev_set_allmulti);
 8165
 8166/*
 8167 *	Upload unicast and multicast address lists to device and
 8168 *	configure RX filtering. When the device doesn't support unicast
 8169 *	filtering it is put in promiscuous mode while unicast addresses
 8170 *	are present.
 8171 */
 8172void __dev_set_rx_mode(struct net_device *dev)
 8173{
 8174	const struct net_device_ops *ops = dev->netdev_ops;
 8175
 8176	/* dev_open will call this function so the list will stay sane. */
 8177	if (!(dev->flags&IFF_UP))
 8178		return;
 8179
 8180	if (!netif_device_present(dev))
 8181		return;
 8182
 8183	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
 8184		/* Unicast addresses changes may only happen under the rtnl,
 8185		 * therefore calling __dev_set_promiscuity here is safe.
 8186		 */
 8187		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
 8188			__dev_set_promiscuity(dev, 1, false);
 8189			dev->uc_promisc = true;
 8190		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
 8191			__dev_set_promiscuity(dev, -1, false);
 8192			dev->uc_promisc = false;
 8193		}
 8194	}
 8195
 8196	if (ops->ndo_set_rx_mode)
 8197		ops->ndo_set_rx_mode(dev);
 8198}
 8199
 8200void dev_set_rx_mode(struct net_device *dev)
 8201{
 8202	netif_addr_lock_bh(dev);
 8203	__dev_set_rx_mode(dev);
 8204	netif_addr_unlock_bh(dev);
 8205}
 8206
 8207/**
 8208 *	dev_get_flags - get flags reported to userspace
 8209 *	@dev: device
 8210 *
 8211 *	Get the combination of flag bits exported through APIs to userspace.
 8212 */
 8213unsigned int dev_get_flags(const struct net_device *dev)
 8214{
 8215	unsigned int flags;
 8216
 8217	flags = (dev->flags & ~(IFF_PROMISC |
 8218				IFF_ALLMULTI |
 8219				IFF_RUNNING |
 8220				IFF_LOWER_UP |
 8221				IFF_DORMANT)) |
 8222		(dev->gflags & (IFF_PROMISC |
 8223				IFF_ALLMULTI));
 8224
 8225	if (netif_running(dev)) {
 8226		if (netif_oper_up(dev))
 8227			flags |= IFF_RUNNING;
 8228		if (netif_carrier_ok(dev))
 8229			flags |= IFF_LOWER_UP;
 8230		if (netif_dormant(dev))
 8231			flags |= IFF_DORMANT;
 8232	}
 8233
 8234	return flags;
 8235}
 8236EXPORT_SYMBOL(dev_get_flags);
 8237
 8238int __dev_change_flags(struct net_device *dev, unsigned int flags,
 8239		       struct netlink_ext_ack *extack)
 8240{
 8241	unsigned int old_flags = dev->flags;
 8242	int ret;
 8243
 8244	ASSERT_RTNL();
 8245
 8246	/*
 8247	 *	Set the flags on our device.
 8248	 */
 8249
 8250	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
 8251			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
 8252			       IFF_AUTOMEDIA)) |
 8253		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
 8254				    IFF_ALLMULTI));
 8255
 8256	/*
 8257	 *	Load in the correct multicast list now the flags have changed.
 8258	 */
 8259
 8260	if ((old_flags ^ flags) & IFF_MULTICAST)
 8261		dev_change_rx_flags(dev, IFF_MULTICAST);
 8262
 8263	dev_set_rx_mode(dev);
 8264
 8265	/*
 8266	 *	Have we downed the interface. We handle IFF_UP ourselves
 8267	 *	according to user attempts to set it, rather than blindly
 8268	 *	setting it.
 8269	 */
 8270
 8271	ret = 0;
 8272	if ((old_flags ^ flags) & IFF_UP) {
 8273		if (old_flags & IFF_UP)
 8274			__dev_close(dev);
 8275		else
 8276			ret = __dev_open(dev, extack);
 8277	}
 8278
 8279	if ((flags ^ dev->gflags) & IFF_PROMISC) {
 8280		int inc = (flags & IFF_PROMISC) ? 1 : -1;
 8281		unsigned int old_flags = dev->flags;
 8282
 8283		dev->gflags ^= IFF_PROMISC;
 8284
 8285		if (__dev_set_promiscuity(dev, inc, false) >= 0)
 8286			if (dev->flags != old_flags)
 8287				dev_set_rx_mode(dev);
 8288	}
 8289
 8290	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
 8291	 * is important. Some (broken) drivers set IFF_PROMISC, when
 8292	 * IFF_ALLMULTI is requested not asking us and not reporting.
 8293	 */
 8294	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
 8295		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
 8296
 8297		dev->gflags ^= IFF_ALLMULTI;
 8298		__dev_set_allmulti(dev, inc, false);
 8299	}
 8300
 8301	return ret;
 8302}
 8303
 8304void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
 8305			unsigned int gchanges)
 8306{
 8307	unsigned int changes = dev->flags ^ old_flags;
 8308
 8309	if (gchanges)
 8310		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
 8311
 8312	if (changes & IFF_UP) {
 8313		if (dev->flags & IFF_UP)
 8314			call_netdevice_notifiers(NETDEV_UP, dev);
 8315		else
 8316			call_netdevice_notifiers(NETDEV_DOWN, dev);
 8317	}
 8318
 8319	if (dev->flags & IFF_UP &&
 8320	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
 8321		struct netdev_notifier_change_info change_info = {
 8322			.info = {
 8323				.dev = dev,
 8324			},
 8325			.flags_changed = changes,
 8326		};
 8327
 8328		call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
 8329	}
 8330}
 8331
 8332/**
 8333 *	dev_change_flags - change device settings
 8334 *	@dev: device
 8335 *	@flags: device state flags
 8336 *	@extack: netlink extended ack
 8337 *
 8338 *	Change settings on device based state flags. The flags are
 8339 *	in the userspace exported format.
 8340 */
 8341int dev_change_flags(struct net_device *dev, unsigned int flags,
 8342		     struct netlink_ext_ack *extack)
 8343{
 8344	int ret;
 8345	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
 8346
 8347	ret = __dev_change_flags(dev, flags, extack);
 8348	if (ret < 0)
 8349		return ret;
 8350
 8351	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
 8352	__dev_notify_flags(dev, old_flags, changes);
 8353	return ret;
 8354}
 8355EXPORT_SYMBOL(dev_change_flags);
 8356
 8357int __dev_set_mtu(struct net_device *dev, int new_mtu)
 8358{
 8359	const struct net_device_ops *ops = dev->netdev_ops;
 8360
 8361	if (ops->ndo_change_mtu)
 8362		return ops->ndo_change_mtu(dev, new_mtu);
 8363
 8364	/* Pairs with all the lockless reads of dev->mtu in the stack */
 8365	WRITE_ONCE(dev->mtu, new_mtu);
 8366	return 0;
 8367}
 8368EXPORT_SYMBOL(__dev_set_mtu);
 8369
 8370int dev_validate_mtu(struct net_device *dev, int new_mtu,
 8371		     struct netlink_ext_ack *extack)
 8372{
 8373	/* MTU must be positive, and in range */
 8374	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
 8375		NL_SET_ERR_MSG(extack, "mtu less than device minimum");
 8376		return -EINVAL;
 8377	}
 8378
 8379	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
 8380		NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
 8381		return -EINVAL;
 8382	}
 8383	return 0;
 8384}
 8385
 8386/**
 8387 *	dev_set_mtu_ext - Change maximum transfer unit
 8388 *	@dev: device
 8389 *	@new_mtu: new transfer unit
 8390 *	@extack: netlink extended ack
 8391 *
 8392 *	Change the maximum transfer size of the network device.
 8393 */
 8394int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
 8395		    struct netlink_ext_ack *extack)
 8396{
 8397	int err, orig_mtu;
 8398
 8399	if (new_mtu == dev->mtu)
 8400		return 0;
 8401
 8402	err = dev_validate_mtu(dev, new_mtu, extack);
 8403	if (err)
 8404		return err;
 8405
 8406	if (!netif_device_present(dev))
 8407		return -ENODEV;
 8408
 8409	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
 8410	err = notifier_to_errno(err);
 8411	if (err)
 8412		return err;
 8413
 8414	orig_mtu = dev->mtu;
 8415	err = __dev_set_mtu(dev, new_mtu);
 8416
 8417	if (!err) {
 8418		err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
 8419						   orig_mtu);
 8420		err = notifier_to_errno(err);
 8421		if (err) {
 8422			/* setting mtu back and notifying everyone again,
 8423			 * so that they have a chance to revert changes.
 8424			 */
 8425			__dev_set_mtu(dev, orig_mtu);
 8426			call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
 8427						     new_mtu);
 8428		}
 8429	}
 8430	return err;
 8431}
 8432
 8433int dev_set_mtu(struct net_device *dev, int new_mtu)
 8434{
 8435	struct netlink_ext_ack extack;
 8436	int err;
 8437
 8438	memset(&extack, 0, sizeof(extack));
 8439	err = dev_set_mtu_ext(dev, new_mtu, &extack);
 8440	if (err && extack._msg)
 8441		net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
 8442	return err;
 8443}
 8444EXPORT_SYMBOL(dev_set_mtu);
 8445
 8446/**
 8447 *	dev_change_tx_queue_len - Change TX queue length of a netdevice
 8448 *	@dev: device
 8449 *	@new_len: new tx queue length
 8450 */
 8451int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
 8452{
 8453	unsigned int orig_len = dev->tx_queue_len;
 8454	int res;
 8455
 8456	if (new_len != (unsigned int)new_len)
 8457		return -ERANGE;
 8458
 8459	if (new_len != orig_len) {
 8460		dev->tx_queue_len = new_len;
 8461		res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
 8462		res = notifier_to_errno(res);
 8463		if (res)
 8464			goto err_rollback;
 8465		res = dev_qdisc_change_tx_queue_len(dev);
 8466		if (res)
 8467			goto err_rollback;
 8468	}
 8469
 8470	return 0;
 8471
 8472err_rollback:
 8473	netdev_err(dev, "refused to change device tx_queue_len\n");
 8474	dev->tx_queue_len = orig_len;
 8475	return res;
 8476}
 8477
 8478/**
 8479 *	dev_set_group - Change group this device belongs to
 8480 *	@dev: device
 8481 *	@new_group: group this device should belong to
 8482 */
 8483void dev_set_group(struct net_device *dev, int new_group)
 8484{
 8485	dev->group = new_group;
 8486}
 8487EXPORT_SYMBOL(dev_set_group);
 8488
 8489/**
 8490 *	dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
 8491 *	@dev: device
 8492 *	@addr: new address
 8493 *	@extack: netlink extended ack
 8494 */
 8495int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
 8496			      struct netlink_ext_ack *extack)
 8497{
 8498	struct netdev_notifier_pre_changeaddr_info info = {
 8499		.info.dev = dev,
 8500		.info.extack = extack,
 8501		.dev_addr = addr,
 8502	};
 8503	int rc;
 8504
 8505	rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
 8506	return notifier_to_errno(rc);
 8507}
 8508EXPORT_SYMBOL(dev_pre_changeaddr_notify);
 8509
 8510/**
 8511 *	dev_set_mac_address - Change Media Access Control Address
 8512 *	@dev: device
 8513 *	@sa: new address
 8514 *	@extack: netlink extended ack
 8515 *
 8516 *	Change the hardware (MAC) address of the device
 8517 */
 8518int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
 8519			struct netlink_ext_ack *extack)
 8520{
 8521	const struct net_device_ops *ops = dev->netdev_ops;
 8522	int err;
 8523
 8524	if (!ops->ndo_set_mac_address)
 8525		return -EOPNOTSUPP;
 8526	if (sa->sa_family != dev->type)
 8527		return -EINVAL;
 8528	if (!netif_device_present(dev))
 8529		return -ENODEV;
 8530	err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
 8531	if (err)
 8532		return err;
 8533	err = ops->ndo_set_mac_address(dev, sa);
 8534	if (err)
 8535		return err;
 8536	dev->addr_assign_type = NET_ADDR_SET;
 8537	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
 8538	add_device_randomness(dev->dev_addr, dev->addr_len);
 8539	return 0;
 8540}
 8541EXPORT_SYMBOL(dev_set_mac_address);
 8542
 8543/**
 8544 *	dev_change_carrier - Change device carrier
 8545 *	@dev: device
 8546 *	@new_carrier: new value
 8547 *
 8548 *	Change device carrier
 8549 */
 8550int dev_change_carrier(struct net_device *dev, bool new_carrier)
 8551{
 8552	const struct net_device_ops *ops = dev->netdev_ops;
 8553
 8554	if (!ops->ndo_change_carrier)
 8555		return -EOPNOTSUPP;
 8556	if (!netif_device_present(dev))
 8557		return -ENODEV;
 8558	return ops->ndo_change_carrier(dev, new_carrier);
 8559}
 8560EXPORT_SYMBOL(dev_change_carrier);
 8561
 8562/**
 8563 *	dev_get_phys_port_id - Get device physical port ID
 8564 *	@dev: device
 8565 *	@ppid: port ID
 8566 *
 8567 *	Get device physical port ID
 8568 */
 8569int dev_get_phys_port_id(struct net_device *dev,
 8570			 struct netdev_phys_item_id *ppid)
 8571{
 8572	const struct net_device_ops *ops = dev->netdev_ops;
 8573
 8574	if (!ops->ndo_get_phys_port_id)
 8575		return -EOPNOTSUPP;
 8576	return ops->ndo_get_phys_port_id(dev, ppid);
 8577}
 8578EXPORT_SYMBOL(dev_get_phys_port_id);
 8579
 8580/**
 8581 *	dev_get_phys_port_name - Get device physical port name
 8582 *	@dev: device
 8583 *	@name: port name
 8584 *	@len: limit of bytes to copy to name
 8585 *
 8586 *	Get device physical port name
 8587 */
 8588int dev_get_phys_port_name(struct net_device *dev,
 8589			   char *name, size_t len)
 8590{
 8591	const struct net_device_ops *ops = dev->netdev_ops;
 8592	int err;
 8593
 8594	if (ops->ndo_get_phys_port_name) {
 8595		err = ops->ndo_get_phys_port_name(dev, name, len);
 8596		if (err != -EOPNOTSUPP)
 8597			return err;
 8598	}
 8599	return devlink_compat_phys_port_name_get(dev, name, len);
 8600}
 8601EXPORT_SYMBOL(dev_get_phys_port_name);
 8602
 8603/**
 8604 *	dev_get_port_parent_id - Get the device's port parent identifier
 8605 *	@dev: network device
 8606 *	@ppid: pointer to a storage for the port's parent identifier
 8607 *	@recurse: allow/disallow recursion to lower devices
 8608 *
 8609 *	Get the devices's port parent identifier
 8610 */
 8611int dev_get_port_parent_id(struct net_device *dev,
 8612			   struct netdev_phys_item_id *ppid,
 8613			   bool recurse)
 8614{
 8615	const struct net_device_ops *ops = dev->netdev_ops;
 8616	struct netdev_phys_item_id first = { };
 8617	struct net_device *lower_dev;
 8618	struct list_head *iter;
 8619	int err;
 8620
 8621	if (ops->ndo_get_port_parent_id) {
 8622		err = ops->ndo_get_port_parent_id(dev, ppid);
 8623		if (err != -EOPNOTSUPP)
 8624			return err;
 8625	}
 8626
 8627	err = devlink_compat_switch_id_get(dev, ppid);
 8628	if (!err || err != -EOPNOTSUPP)
 8629		return err;
 8630
 8631	if (!recurse)
 8632		return -EOPNOTSUPP;
 8633
 8634	netdev_for_each_lower_dev(dev, lower_dev, iter) {
 8635		err = dev_get_port_parent_id(lower_dev, ppid, recurse);
 8636		if (err)
 8637			break;
 8638		if (!first.id_len)
 8639			first = *ppid;
 8640		else if (memcmp(&first, ppid, sizeof(*ppid)))
 8641			return -ENODATA;
 8642	}
 8643
 8644	return err;
 8645}
 8646EXPORT_SYMBOL(dev_get_port_parent_id);
 8647
 8648/**
 8649 *	netdev_port_same_parent_id - Indicate if two network devices have
 8650 *	the same port parent identifier
 8651 *	@a: first network device
 8652 *	@b: second network device
 8653 */
 8654bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
 8655{
 8656	struct netdev_phys_item_id a_id = { };
 8657	struct netdev_phys_item_id b_id = { };
 8658
 8659	if (dev_get_port_parent_id(a, &a_id, true) ||
 8660	    dev_get_port_parent_id(b, &b_id, true))
 8661		return false;
 8662
 8663	return netdev_phys_item_id_same(&a_id, &b_id);
 8664}
 8665EXPORT_SYMBOL(netdev_port_same_parent_id);
 8666
 8667/**
 8668 *	dev_change_proto_down - update protocol port state information
 8669 *	@dev: device
 8670 *	@proto_down: new value
 8671 *
 8672 *	This info can be used by switch drivers to set the phys state of the
 8673 *	port.
 8674 */
 8675int dev_change_proto_down(struct net_device *dev, bool proto_down)
 8676{
 8677	const struct net_device_ops *ops = dev->netdev_ops;
 8678
 8679	if (!ops->ndo_change_proto_down)
 8680		return -EOPNOTSUPP;
 8681	if (!netif_device_present(dev))
 8682		return -ENODEV;
 8683	return ops->ndo_change_proto_down(dev, proto_down);
 8684}
 8685EXPORT_SYMBOL(dev_change_proto_down);
 8686
 8687/**
 8688 *	dev_change_proto_down_generic - generic implementation for
 8689 * 	ndo_change_proto_down that sets carrier according to
 8690 * 	proto_down.
 8691 *
 8692 *	@dev: device
 8693 *	@proto_down: new value
 8694 */
 8695int dev_change_proto_down_generic(struct net_device *dev, bool proto_down)
 8696{
 8697	if (proto_down)
 8698		netif_carrier_off(dev);
 8699	else
 8700		netif_carrier_on(dev);
 8701	dev->proto_down = proto_down;
 8702	return 0;
 8703}
 8704EXPORT_SYMBOL(dev_change_proto_down_generic);
 8705
 8706u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
 8707		    enum bpf_netdev_command cmd)
 8708{
 8709	struct netdev_bpf xdp;
 8710
 8711	if (!bpf_op)
 8712		return 0;
 8713
 8714	memset(&xdp, 0, sizeof(xdp));
 8715	xdp.command = cmd;
 8716
 8717	/* Query must always succeed. */
 8718	WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG);
 8719
 8720	return xdp.prog_id;
 8721}
 8722
 8723static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
 8724			   struct netlink_ext_ack *extack, u32 flags,
 8725			   struct bpf_prog *prog)
 8726{
 8727	bool non_hw = !(flags & XDP_FLAGS_HW_MODE);
 8728	struct bpf_prog *prev_prog = NULL;
 8729	struct netdev_bpf xdp;
 8730	int err;
 8731
 8732	if (non_hw) {
 8733		prev_prog = bpf_prog_by_id(__dev_xdp_query(dev, bpf_op,
 8734							   XDP_QUERY_PROG));
 8735		if (IS_ERR(prev_prog))
 8736			prev_prog = NULL;
 8737	}
 8738
 8739	memset(&xdp, 0, sizeof(xdp));
 8740	if (flags & XDP_FLAGS_HW_MODE)
 8741		xdp.command = XDP_SETUP_PROG_HW;
 8742	else
 8743		xdp.command = XDP_SETUP_PROG;
 8744	xdp.extack = extack;
 8745	xdp.flags = flags;
 8746	xdp.prog = prog;
 8747
 8748	err = bpf_op(dev, &xdp);
 8749	if (!err && non_hw)
 8750		bpf_prog_change_xdp(prev_prog, prog);
 8751
 8752	if (prev_prog)
 8753		bpf_prog_put(prev_prog);
 8754
 8755	return err;
 8756}
 8757
 8758static void dev_xdp_uninstall(struct net_device *dev)
 8759{
 8760	struct netdev_bpf xdp;
 8761	bpf_op_t ndo_bpf;
 8762
 8763	/* Remove generic XDP */
 8764	WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL));
 8765
 8766	/* Remove from the driver */
 8767	ndo_bpf = dev->netdev_ops->ndo_bpf;
 8768	if (!ndo_bpf)
 8769		return;
 8770
 8771	memset(&xdp, 0, sizeof(xdp));
 8772	xdp.command = XDP_QUERY_PROG;
 8773	WARN_ON(ndo_bpf(dev, &xdp));
 8774	if (xdp.prog_id)
 8775		WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
 8776					NULL));
 8777
 8778	/* Remove HW offload */
 8779	memset(&xdp, 0, sizeof(xdp));
 8780	xdp.command = XDP_QUERY_PROG_HW;
 8781	if (!ndo_bpf(dev, &xdp) && xdp.prog_id)
 8782		WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
 8783					NULL));
 8784}
 8785
 8786/**
 8787 *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
 8788 *	@dev: device
 8789 *	@extack: netlink extended ack
 8790 *	@fd: new program fd or negative value to clear
 8791 *	@expected_fd: old program fd that userspace expects to replace or clear
 8792 *	@flags: xdp-related flags
 8793 *
 8794 *	Set or clear a bpf program for a device
 8795 */
 8796int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 8797		      int fd, int expected_fd, u32 flags)
 8798{
 8799	const struct net_device_ops *ops = dev->netdev_ops;
 8800	enum bpf_netdev_command query;
 8801	u32 prog_id, expected_id = 0;
 8802	bpf_op_t bpf_op, bpf_chk;
 8803	struct bpf_prog *prog;
 8804	bool offload;
 8805	int err;
 8806
 8807	ASSERT_RTNL();
 8808
 8809	offload = flags & XDP_FLAGS_HW_MODE;
 8810	query = offload ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG;
 8811
 8812	bpf_op = bpf_chk = ops->ndo_bpf;
 8813	if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) {
 8814		NL_SET_ERR_MSG(extack, "underlying driver does not support XDP in native mode");
 8815		return -EOPNOTSUPP;
 8816	}
 8817	if (!bpf_op || (flags & XDP_FLAGS_SKB_MODE))
 8818		bpf_op = generic_xdp_install;
 8819	if (bpf_op == bpf_chk)
 8820		bpf_chk = generic_xdp_install;
 8821
 8822	prog_id = __dev_xdp_query(dev, bpf_op, query);
 8823	if (flags & XDP_FLAGS_REPLACE) {
 8824		if (expected_fd >= 0) {
 8825			prog = bpf_prog_get_type_dev(expected_fd,
 8826						     BPF_PROG_TYPE_XDP,
 8827						     bpf_op == ops->ndo_bpf);
 8828			if (IS_ERR(prog))
 8829				return PTR_ERR(prog);
 8830			expected_id = prog->aux->id;
 8831			bpf_prog_put(prog);
 8832		}
 8833
 8834		if (prog_id != expected_id) {
 8835			NL_SET_ERR_MSG(extack, "Active program does not match expected");
 8836			return -EEXIST;
 8837		}
 8838	}
 8839	if (fd >= 0) {
 8840		if (!offload && __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG)) {
 8841			NL_SET_ERR_MSG(extack, "native and generic XDP can't be active at the same time");
 8842			return -EEXIST;
 8843		}
 8844
 8845		if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && prog_id) {
 8846			NL_SET_ERR_MSG(extack, "XDP program already attached");
 8847			return -EBUSY;
 8848		}
 8849
 8850		prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
 8851					     bpf_op == ops->ndo_bpf);
 8852		if (IS_ERR(prog))
 8853			return PTR_ERR(prog);
 8854
 8855		if (!offload && bpf_prog_is_dev_bound(prog->aux)) {
 8856			NL_SET_ERR_MSG(extack, "using device-bound program without HW_MODE flag is not supported");
 8857			bpf_prog_put(prog);
 8858			return -EINVAL;
 8859		}
 8860
 8861		if (prog->expected_attach_type == BPF_XDP_DEVMAP) {
 8862			NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
 8863			bpf_prog_put(prog);
 8864			return -EINVAL;
 8865		}
 8866
 8867		/* prog->aux->id may be 0 for orphaned device-bound progs */
 8868		if (prog->aux->id && prog->aux->id == prog_id) {
 8869			bpf_prog_put(prog);
 8870			return 0;
 8871		}
 8872	} else {
 8873		if (!prog_id)
 8874			return 0;
 8875		prog = NULL;
 8876	}
 8877
 8878	err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
 8879	if (err < 0 && prog)
 8880		bpf_prog_put(prog);
 8881
 8882	return err;
 8883}
 8884
 8885/**
 8886 *	dev_new_index	-	allocate an ifindex
 8887 *	@net: the applicable net namespace
 8888 *
 8889 *	Returns a suitable unique value for a new device interface
 8890 *	number.  The caller must hold the rtnl semaphore or the
 8891 *	dev_base_lock to be sure it remains unique.
 8892 */
 8893static int dev_new_index(struct net *net)
 8894{
 8895	int ifindex = net->ifindex;
 8896
 8897	for (;;) {
 8898		if (++ifindex <= 0)
 8899			ifindex = 1;
 8900		if (!__dev_get_by_index(net, ifindex))
 8901			return net->ifindex = ifindex;
 8902	}
 8903}
 8904
 8905/* Delayed registration/unregisteration */
 8906static LIST_HEAD(net_todo_list);
 8907DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
 8908
 8909static void net_set_todo(struct net_device *dev)
 8910{
 8911	list_add_tail(&dev->todo_list, &net_todo_list);
 8912	dev_net(dev)->dev_unreg_count++;
 8913}
 8914
 8915static void rollback_registered_many(struct list_head *head)
 8916{
 8917	struct net_device *dev, *tmp;
 8918	LIST_HEAD(close_head);
 8919
 8920	BUG_ON(dev_boot_phase);
 8921	ASSERT_RTNL();
 8922
 8923	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
 8924		/* Some devices call without registering
 8925		 * for initialization unwind. Remove those
 8926		 * devices and proceed with the remaining.
 8927		 */
 8928		if (dev->reg_state == NETREG_UNINITIALIZED) {
 8929			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
 8930				 dev->name, dev);
 8931
 8932			WARN_ON(1);
 8933			list_del(&dev->unreg_list);
 8934			continue;
 8935		}
 8936		dev->dismantle = true;
 8937		BUG_ON(dev->reg_state != NETREG_REGISTERED);
 8938	}
 8939
 8940	/* If device is running, close it first. */
 8941	list_for_each_entry(dev, head, unreg_list)
 8942		list_add_tail(&dev->close_list, &close_head);
 8943	dev_close_many(&close_head, true);
 8944
 8945	list_for_each_entry(dev, head, unreg_list) {
 8946		/* And unlink it from device chain. */
 8947		unlist_netdevice(dev);
 8948
 8949		dev->reg_state = NETREG_UNREGISTERING;
 8950	}
 8951	flush_all_backlogs();
 8952
 8953	synchronize_net();
 8954
 8955	list_for_each_entry(dev, head, unreg_list) {
 8956		struct sk_buff *skb = NULL;
 8957
 8958		/* Shutdown queueing discipline. */
 8959		dev_shutdown(dev);
 8960
 8961		dev_xdp_uninstall(dev);
 8962
 8963		/* Notify protocols, that we are about to destroy
 8964		 * this device. They should clean all the things.
 8965		 */
 8966		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 8967
 8968		if (!dev->rtnl_link_ops ||
 8969		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
 8970			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
 8971						     GFP_KERNEL, NULL, 0);
 8972
 8973		/*
 8974		 *	Flush the unicast and multicast chains
 8975		 */
 8976		dev_uc_flush(dev);
 8977		dev_mc_flush(dev);
 8978
 8979		netdev_name_node_alt_flush(dev);
 8980		netdev_name_node_free(dev->name_node);
 8981
 8982		if (dev->netdev_ops->ndo_uninit)
 8983			dev->netdev_ops->ndo_uninit(dev);
 8984
 8985		if (skb)
 8986			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
 8987
 8988		/* Notifier chain MUST detach us all upper devices. */
 8989		WARN_ON(netdev_has_any_upper_dev(dev));
 8990		WARN_ON(netdev_has_any_lower_dev(dev));
 8991
 8992		/* Remove entries from kobject tree */
 8993		netdev_unregister_kobject(dev);
 8994#ifdef CONFIG_XPS
 8995		/* Remove XPS queueing entries */
 8996		netif_reset_xps_queues_gt(dev, 0);
 8997#endif
 8998	}
 8999
 9000	synchronize_net();
 9001
 9002	list_for_each_entry(dev, head, unreg_list)
 9003		dev_put(dev);
 9004}
 9005
 9006static void rollback_registered(struct net_device *dev)
 9007{
 9008	LIST_HEAD(single);
 9009
 9010	list_add(&dev->unreg_list, &single);
 9011	rollback_registered_many(&single);
 9012	list_del(&single);
 9013}
 9014
 9015static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
 9016	struct net_device *upper, netdev_features_t features)
 9017{
 9018	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
 9019	netdev_features_t feature;
 9020	int feature_bit;
 9021
 9022	for_each_netdev_feature(upper_disables, feature_bit) {
 9023		feature = __NETIF_F_BIT(feature_bit);
 9024		if (!(upper->wanted_features & feature)
 9025		    && (features & feature)) {
 9026			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
 9027				   &feature, upper->name);
 9028			features &= ~feature;
 9029		}
 9030	}
 9031
 9032	return features;
 9033}
 9034
 9035static void netdev_sync_lower_features(struct net_device *upper,
 9036	struct net_device *lower, netdev_features_t features)
 9037{
 9038	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
 9039	netdev_features_t feature;
 9040	int feature_bit;
 9041
 9042	for_each_netdev_feature(upper_disables, feature_bit) {
 9043		feature = __NETIF_F_BIT(feature_bit);
 9044		if (!(features & feature) && (lower->features & feature)) {
 9045			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
 9046				   &feature, lower->name);
 9047			lower->wanted_features &= ~feature;
 9048			__netdev_update_features(lower);
 9049
 9050			if (unlikely(lower->features & feature))
 9051				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
 9052					    &feature, lower->name);
 9053			else
 9054				netdev_features_change(lower);
 9055		}
 9056	}
 9057}
 9058
 9059static netdev_features_t netdev_fix_features(struct net_device *dev,
 9060	netdev_features_t features)
 9061{
 9062	/* Fix illegal checksum combinations */
 9063	if ((features & NETIF_F_HW_CSUM) &&
 9064	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
 9065		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
 9066		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
 9067	}
 9068
 9069	/* TSO requires that SG is present as well. */
 9070	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
 9071		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
 9072		features &= ~NETIF_F_ALL_TSO;
 9073	}
 9074
 9075	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
 9076					!(features & NETIF_F_IP_CSUM)) {
 9077		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
 9078		features &= ~NETIF_F_TSO;
 9079		features &= ~NETIF_F_TSO_ECN;
 9080	}
 9081
 9082	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
 9083					 !(features & NETIF_F_IPV6_CSUM)) {
 9084		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
 9085		features &= ~NETIF_F_TSO6;
 9086	}
 9087
 9088	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
 9089	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
 9090		features &= ~NETIF_F_TSO_MANGLEID;
 9091
 9092	/* TSO ECN requires that TSO is present as well. */
 9093	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
 9094		features &= ~NETIF_F_TSO_ECN;
 9095
 9096	/* Software GSO depends on SG. */
 9097	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
 9098		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
 9099		features &= ~NETIF_F_GSO;
 9100	}
 9101
 9102	/* GSO partial features require GSO partial be set */
 9103	if ((features & dev->gso_partial_features) &&
 9104	    !(features & NETIF_F_GSO_PARTIAL)) {
 9105		netdev_dbg(dev,
 9106			   "Dropping partially supported GSO features since no GSO partial.\n");
 9107		features &= ~dev->gso_partial_features;
 9108	}
 9109
 9110	if (!(features & NETIF_F_RXCSUM)) {
 9111		/* NETIF_F_GRO_HW implies doing RXCSUM since every packet
 9112		 * successfully merged by hardware must also have the
 9113		 * checksum verified by hardware.  If the user does not
 9114		 * want to enable RXCSUM, logically, we should disable GRO_HW.
 9115		 */
 9116		if (features & NETIF_F_GRO_HW) {
 9117			netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
 9118			features &= ~NETIF_F_GRO_HW;
 9119		}
 9120	}
 9121
 9122	/* LRO/HW-GRO features cannot be combined with RX-FCS */
 9123	if (features & NETIF_F_RXFCS) {
 9124		if (features & NETIF_F_LRO) {
 9125			netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
 9126			features &= ~NETIF_F_LRO;
 9127		}
 9128
 9129		if (features & NETIF_F_GRO_HW) {
 9130			netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
 9131			features &= ~NETIF_F_GRO_HW;
 9132		}
 9133	}
 9134
 9135	return features;
 9136}
 9137
 9138int __netdev_update_features(struct net_device *dev)
 9139{
 9140	struct net_device *upper, *lower;
 9141	netdev_features_t features;
 9142	struct list_head *iter;
 9143	int err = -1;
 9144
 9145	ASSERT_RTNL();
 9146
 9147	features = netdev_get_wanted_features(dev);
 9148
 9149	if (dev->netdev_ops->ndo_fix_features)
 9150		features = dev->netdev_ops->ndo_fix_features(dev, features);
 9151
 9152	/* driver might be less strict about feature dependencies */
 9153	features = netdev_fix_features(dev, features);
 9154
 9155	/* some features can't be enabled if they're off an an upper device */
 9156	netdev_for_each_upper_dev_rcu(dev, upper, iter)
 9157		features = netdev_sync_upper_features(dev, upper, features);
 9158
 9159	if (dev->features == features)
 9160		goto sync_lower;
 9161
 9162	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
 9163		&dev->features, &features);
 9164
 9165	if (dev->netdev_ops->ndo_set_features)
 9166		err = dev->netdev_ops->ndo_set_features(dev, features);
 9167	else
 9168		err = 0;
 9169
 9170	if (unlikely(err < 0)) {
 9171		netdev_err(dev,
 9172			"set_features() failed (%d); wanted %pNF, left %pNF\n",
 9173			err, &features, &dev->features);
 9174		/* return non-0 since some features might have changed and
 9175		 * it's better to fire a spurious notification than miss it
 9176		 */
 9177		return -1;
 9178	}
 9179
 9180sync_lower:
 9181	/* some features must be disabled on lower devices when disabled
 9182	 * on an upper device (think: bonding master or bridge)
 9183	 */
 9184	netdev_for_each_lower_dev(dev, lower, iter)
 9185		netdev_sync_lower_features(dev, lower, features);
 9186
 9187	if (!err) {
 9188		netdev_features_t diff = features ^ dev->features;
 9189
 9190		if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
 9191			/* udp_tunnel_{get,drop}_rx_info both need
 9192			 * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
 9193			 * device, or they won't do anything.
 9194			 * Thus we need to update dev->features
 9195			 * *before* calling udp_tunnel_get_rx_info,
 9196			 * but *after* calling udp_tunnel_drop_rx_info.
 9197			 */
 9198			if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
 9199				dev->features = features;
 9200				udp_tunnel_get_rx_info(dev);
 9201			} else {
 9202				udp_tunnel_drop_rx_info(dev);
 9203			}
 9204		}
 9205
 9206		if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
 9207			if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
 9208				dev->features = features;
 9209				err |= vlan_get_rx_ctag_filter_info(dev);
 9210			} else {
 9211				vlan_drop_rx_ctag_filter_info(dev);
 9212			}
 9213		}
 9214
 9215		if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
 9216			if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
 9217				dev->features = features;
 9218				err |= vlan_get_rx_stag_filter_info(dev);
 9219			} else {
 9220				vlan_drop_rx_stag_filter_info(dev);
 9221			}
 9222		}
 9223
 9224		dev->features = features;
 9225	}
 9226
 9227	return err < 0 ? 0 : 1;
 9228}
 9229
 /**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications if it
 *	has changed. Should be called after driver or hardware dependent
 *	conditions might have changed that influence the features.
 */
void netdev_update_features(struct net_device *dev)
{
	/* A zero result means there is nothing to announce. */
	if (!__netdev_update_features(dev))
		return;

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
 9244
 /**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications even
 *	if they have not changed. Should be called instead of
 *	netdev_update_features() if also dev->vlan_features might
 *	have changed to allow the changes to be propagated to stacked
 *	VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* The result is deliberately ignored: notify unconditionally. */
	__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
 9261
 9262/**
 9263 *	netif_stacked_transfer_operstate -	transfer operstate
 9264 *	@rootdev: the root or lower level device to transfer state from
 9265 *	@dev: the device to transfer operstate to
 9266 *
 9267 *	Transfer operational state from root to device. This is normally
 9268 *	called when a stacking relationship exists between the root
 9269 *	device and the device(a leaf device).
 9270 */
 9271void netif_stacked_transfer_operstate(const struct net_device *rootdev,
 9272					struct net_device *dev)
 9273{
 9274	if (rootdev->operstate == IF_OPER_DORMANT)
 9275		netif_dormant_on(dev);
 9276	else
 9277		netif_dormant_off(dev);
 9278
 9279	if (rootdev->operstate == IF_OPER_TESTING)
 9280		netif_testing_on(dev);
 9281	else
 9282		netif_testing_off(dev);
 9283
 9284	if (netif_carrier_ok(rootdev))
 9285		netif_carrier_on(dev);
 9286	else
 9287		netif_carrier_off(dev);
 9288}
 9289EXPORT_SYMBOL(netif_stacked_transfer_operstate);
 9290
 9291static int netif_alloc_rx_queues(struct net_device *dev)
 9292{
 9293	unsigned int i, count = dev->num_rx_queues;
 9294	struct netdev_rx_queue *rx;
 9295	size_t sz = count * sizeof(*rx);
 9296	int err = 0;
 9297
 9298	BUG_ON(count < 1);
 9299
 9300	rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 9301	if (!rx)
 9302		return -ENOMEM;
 9303
 9304	dev->_rx = rx;
 9305
 9306	for (i = 0; i < count; i++) {
 9307		rx[i].dev = dev;
 9308
 9309		/* XDP RX-queue setup */
 9310		err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
 9311		if (err < 0)
 9312			goto err_rxq_info;
 9313	}
 9314	return 0;
 9315
 9316err_rxq_info:
 9317	/* Rollback successful reg's and free other resources */
 9318	while (i--)
 9319		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
 9320	kvfree(dev->_rx);
 9321	dev->_rx = NULL;
 9322	return err;
 9323}
 9324
 9325static void netif_free_rx_queues(struct net_device *dev)
 9326{
 9327	unsigned int i, count = dev->num_rx_queues;
 9328
 9329	/* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
 9330	if (!dev->_rx)
 9331		return;
 9332
 9333	for (i = 0; i < count; i++)
 9334		xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
 9335
 9336	kvfree(dev->_rx);
 9337}
 9338
 9339static void netdev_init_one_queue(struct net_device *dev,
 9340				  struct netdev_queue *queue, void *_unused)
 9341{
 9342	/* Initialize queue lock */
 9343	spin_lock_init(&queue->_xmit_lock);
 9344	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
 9345	queue->xmit_lock_owner = -1;
 9346	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
 9347	queue->dev = dev;
 9348#ifdef CONFIG_BQL
 9349	dql_init(&queue->dql, HZ);
 9350#endif
 9351}
 9352
 9353static void netif_free_tx_queues(struct net_device *dev)
 9354{
 9355	kvfree(dev->_tx);
 9356}
 9357
 9358static int netif_alloc_netdev_queues(struct net_device *dev)
 9359{
 9360	unsigned int count = dev->num_tx_queues;
 9361	struct netdev_queue *tx;
 9362	size_t sz = count * sizeof(*tx);
 9363
 9364	if (count < 1 || count > 0xffff)
 9365		return -EINVAL;
 9366
 9367	tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 9368	if (!tx)
 9369		return -ENOMEM;
 9370
 9371	dev->_tx = tx;
 9372
 9373	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
 9374	spin_lock_init(&dev->tx_global_lock);
 9375
 9376	return 0;
 9377}
 9378
 9379void netif_tx_stop_all_queues(struct net_device *dev)
 9380{
 9381	unsigned int i;
 9382
 9383	for (i = 0; i < dev->num_tx_queues; i++) {
 9384		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
 9385
 9386		netif_tx_stop_queue(txq);
 9387	}
 9388}
 9389EXPORT_SYMBOL(netif_tx_stop_all_queues);
 9390
 9391/**
 9392 *	register_netdevice	- register a network device
 9393 *	@dev: device to register
 9394 *
 9395 *	Take a completed network device structure and add it to the kernel
 9396 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 9397 *	chain. 0 is returned on success. A negative errno code is returned
 9398 *	on a failure to set up the device, or if the name is a duplicate.
 9399 *
 9400 *	Callers must hold the rtnl semaphore. You may want
 9401 *	register_netdev() instead of this.
 9402 *
 9403 *	BUGS:
 9404 *	The locking appears insufficient to guarantee two parallel registers
 9405 *	will not get the same name.
 9406 */
 9407
 9408int register_netdevice(struct net_device *dev)
 9409{
 9410	int ret;
 9411	struct net *net = dev_net(dev);
 9412
 9413	BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
 9414		     NETDEV_FEATURE_COUNT);
 9415	BUG_ON(dev_boot_phase);
 9416	ASSERT_RTNL();
 9417
 9418	might_sleep();
 9419
 9420	/* When net_device's are persistent, this will be fatal. */
 9421	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
 9422	BUG_ON(!net);
 9423
 9424	ret = ethtool_check_ops(dev->ethtool_ops);
 9425	if (ret)
 9426		return ret;
 9427
 9428	spin_lock_init(&dev->addr_list_lock);
 9429	netdev_set_addr_lockdep_class(dev);
 9430
 9431	ret = dev_get_valid_name(net, dev, dev->name);
 9432	if (ret < 0)
 9433		goto out;
 9434
 9435	ret = -ENOMEM;
 9436	dev->name_node = netdev_name_node_head_alloc(dev);
 9437	if (!dev->name_node)
 9438		goto out;
 9439
 9440	/* Init, if this function is available */
 9441	if (dev->netdev_ops->ndo_init) {
 9442		ret = dev->netdev_ops->ndo_init(dev);
 9443		if (ret) {
 9444			if (ret > 0)
 9445				ret = -EIO;
 9446			goto err_free_name;
 9447		}
 9448	}
 9449
 9450	if (((dev->hw_features | dev->features) &
 9451	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
 9452	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
 9453	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
 9454		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
 9455		ret = -EINVAL;
 9456		goto err_uninit;
 9457	}
 9458
 9459	ret = -EBUSY;
 9460	if (!dev->ifindex)
 9461		dev->ifindex = dev_new_index(net);
 9462	else if (__dev_get_by_index(net, dev->ifindex))
 9463		goto err_uninit;
 9464
 9465	/* Transfer changeable features to wanted_features and enable
 9466	 * software offloads (GSO and GRO).
 9467	 */
 9468	dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
 9469	dev->features |= NETIF_F_SOFT_FEATURES;
 9470
 9471	if (dev->netdev_ops->ndo_udp_tunnel_add) {
 9472		dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
 9473		dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
 9474	}
 9475
 9476	dev->wanted_features = dev->features & dev->hw_features;
 9477
 9478	if (!(dev->flags & IFF_LOOPBACK))
 9479		dev->hw_features |= NETIF_F_NOCACHE_COPY;
 9480
 9481	/* If IPv4 TCP segmentation offload is supported we should also
 9482	 * allow the device to enable segmenting the frame with the option
 9483	 * of ignoring a static IP ID value.  This doesn't enable the
 9484	 * feature itself but allows the user to enable it later.
 9485	 */
 9486	if (dev->hw_features & NETIF_F_TSO)
 9487		dev->hw_features |= NETIF_F_TSO_MANGLEID;
 9488	if (dev->vlan_features & NETIF_F_TSO)
 9489		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
 9490	if (dev->mpls_features & NETIF_F_TSO)
 9491		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
 9492	if (dev->hw_enc_features & NETIF_F_TSO)
 9493		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
 9494
 9495	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
 9496	 */
 9497	dev->vlan_features |= NETIF_F_HIGHDMA;
 9498
 9499	/* Make NETIF_F_SG inheritable to tunnel devices.
 9500	 */
 9501	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
 9502
 9503	/* Make NETIF_F_SG inheritable to MPLS.
 9504	 */
 9505	dev->mpls_features |= NETIF_F_SG;
 9506
 9507	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
 9508	ret = notifier_to_errno(ret);
 9509	if (ret)
 9510		goto err_uninit;
 9511
 9512	ret = netdev_register_kobject(dev);
 9513	if (ret) {
 9514		dev->reg_state = NETREG_UNREGISTERED;
 9515		goto err_uninit;
 9516	}
 9517	dev->reg_state = NETREG_REGISTERED;
 9518
 9519	__netdev_update_features(dev);
 9520
 9521	/*
 9522	 *	Default initial state at registry is that the
 9523	 *	device is present.
 9524	 */
 9525
 9526	set_bit(__LINK_STATE_PRESENT, &dev->state);
 9527
 9528	linkwatch_init_dev(dev);
 9529
 9530	dev_init_scheduler(dev);
 9531	dev_hold(dev);
 9532	list_netdevice(dev);
 9533	add_device_randomness(dev->dev_addr, dev->addr_len);
 9534
 9535	/* If the device has permanent device address, driver should
 9536	 * set dev_addr and also addr_assign_type should be set to
 9537	 * NET_ADDR_PERM (default value).
 9538	 */
 9539	if (dev->addr_assign_type == NET_ADDR_PERM)
 9540		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
 9541
 9542	/* Notify protocols, that a new device appeared. */
 9543	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
 9544	ret = notifier_to_errno(ret);
 9545	if (ret) {
 9546		rollback_registered(dev);
 9547		rcu_barrier();
 9548
 9549		dev->reg_state = NETREG_UNREGISTERED;
 9550	}
 9551	/*
 9552	 *	Prevent userspace races by waiting until the network
 9553	 *	device is fully setup before sending notifications.
 9554	 */
 9555	if (!dev->rtnl_link_ops ||
 9556	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
 9557		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
 9558
 9559out:
 9560	return ret;
 9561
 9562err_uninit:
 9563	if (dev->netdev_ops->ndo_uninit)
 9564		dev->netdev_ops->ndo_uninit(dev);
 9565	if (dev->priv_destructor)
 9566		dev->priv_destructor(dev);
 9567err_free_name:
 9568	netdev_name_node_free(dev->name_node);
 9569	goto out;
 9570}
 9571EXPORT_SYMBOL(register_netdevice);
 9572
 9573/**
 9574 *	init_dummy_netdev	- init a dummy network device for NAPI
 9575 *	@dev: device to init
 9576 *
 9577 *	This takes a network device structure and initialize the minimum
 9578 *	amount of fields so it can be used to schedule NAPI polls without
 9579 *	registering a full blown interface. This is to be used by drivers
 9580 *	that need to tie several hardware interfaces to a single NAPI
 9581 *	poll scheduler due to HW limitations.
 9582 */
 9583int init_dummy_netdev(struct net_device *dev)
 9584{
 9585	/* Clear everything. Note we don't initialize spinlocks
 9586	 * are they aren't supposed to be taken by any of the
 9587	 * NAPI code and this dummy netdev is supposed to be
 9588	 * only ever used for NAPI polls
 9589	 */
 9590	memset(dev, 0, sizeof(struct net_device));
 9591
 9592	/* make sure we BUG if trying to hit standard
 9593	 * register/unregister code path
 9594	 */
 9595	dev->reg_state = NETREG_DUMMY;
 9596
 9597	/* NAPI wants this */
 9598	INIT_LIST_HEAD(&dev->napi_list);
 9599
 9600	/* a dummy interface is started by default */
 9601	set_bit(__LINK_STATE_PRESENT, &dev->state);
 9602	set_bit(__LINK_STATE_START, &dev->state);
 9603
 9604	/* napi_busy_loop stats accounting wants this */
 9605	dev_net_set(dev, &init_net);
 9606
 9607	/* Note : We dont allocate pcpu_refcnt for dummy devices,
 9608	 * because users of this 'device' dont need to change
 9609	 * its refcount.
 9610	 */
 9611
 9612	return 0;
 9613}
 9614EXPORT_SYMBOL_GPL(init_dummy_netdev);
 9615
 9616
 9617/**
 9618 *	register_netdev	- register a network device
 9619 *	@dev: device to register
 9620 *
 9621 *	Take a completed network device structure and add it to the kernel
 9622 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 9623 *	chain. 0 is returned on success. A negative errno code is returned
 9624 *	on a failure to set up the device, or if the name is a duplicate.
 9625 *
 9626 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
 9627 *	and expands the device name if you passed a format string to
 9628 *	alloc_netdev.
 9629 */
 9630int register_netdev(struct net_device *dev)
 9631{
 9632	int err;
 9633
 9634	if (rtnl_lock_killable())
 9635		return -EINTR;
 9636	err = register_netdevice(dev);
 9637	rtnl_unlock();
 9638	return err;
 9639}
 9640EXPORT_SYMBOL(register_netdev);
 9641
 9642int netdev_refcnt_read(const struct net_device *dev)
 9643{
 9644	int i, refcnt = 0;
 9645
 9646	for_each_possible_cpu(i)
 9647		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
 9648	return refcnt;
 9649}
 9650EXPORT_SYMBOL(netdev_refcnt_read);
 9651
 9652/**
 9653 * netdev_wait_allrefs - wait until all references are gone.
 9654 * @dev: target net_device
 9655 *
 9656 * This is called when unregistering network devices.
 9657 *
 9658 * Any protocol or device that holds a reference should register
 9659 * for netdevice notification, and cleanup and put back the
 9660 * reference if they receive an UNREGISTER event.
 9661 * We can get stuck here if buggy protocols don't correctly
 9662 * call dev_put.
 9663 */
 9664static void netdev_wait_allrefs(struct net_device *dev)
 9665{
 9666	unsigned long rebroadcast_time, warning_time;
 9667	int refcnt;
 9668
 9669	linkwatch_forget_dev(dev);
 9670
 9671	rebroadcast_time = warning_time = jiffies;
 9672	refcnt = netdev_refcnt_read(dev);
 9673
 9674	while (refcnt != 0) {
 9675		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
 9676			rtnl_lock();
 9677
 9678			/* Rebroadcast unregister notification */
 9679			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 9680
 9681			__rtnl_unlock();
 9682			rcu_barrier();
 9683			rtnl_lock();
 9684
 9685			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
 9686				     &dev->state)) {
 9687				/* We must not have linkwatch events
 9688				 * pending on unregister. If this
 9689				 * happens, we simply run the queue
 9690				 * unscheduled, resulting in a noop
 9691				 * for this device.
 9692				 */
 9693				linkwatch_run_queue();
 9694			}
 9695
 9696			__rtnl_unlock();
 9697
 9698			rebroadcast_time = jiffies;
 9699		}
 9700
 9701		msleep(250);
 9702
 9703		refcnt = netdev_refcnt_read(dev);
 9704
 9705		if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) {
 9706			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
 9707				 dev->name, refcnt);
 9708			warning_time = jiffies;
 9709		}
 9710	}
 9711}
 9712
 9713/* The sequence is:
 9714 *
 9715 *	rtnl_lock();
 9716 *	...
 9717 *	register_netdevice(x1);
 9718 *	register_netdevice(x2);
 9719 *	...
 9720 *	unregister_netdevice(y1);
 9721 *	unregister_netdevice(y2);
 9722 *      ...
 9723 *	rtnl_unlock();
 9724 *	free_netdev(y1);
 9725 *	free_netdev(y2);
 9726 *
 9727 * We are invoked by rtnl_unlock().
 9728 * This allows us to deal with problems:
 9729 * 1) We can delete sysfs objects which invoke hotplug
 9730 *    without deadlocking with linkwatch via keventd.
 9731 * 2) Since we run with the RTNL semaphore not held, we can sleep
 9732 *    safely in order to wait for the netdev refcnt to drop to zero.
 9733 *
 9734 * We must not return until all unregister events added during
 9735 * the interval the lock was held have been completed.
 9736 */
 9737void netdev_run_todo(void)
 9738{
 9739	struct list_head list;
 9740
 9741	/* Snapshot list, allow later requests */
 9742	list_replace_init(&net_todo_list, &list);
 9743
 9744	__rtnl_unlock();
 9745
 9746
 9747	/* Wait for rcu callbacks to finish before next phase */
 9748	if (!list_empty(&list))
 9749		rcu_barrier();
 9750
 9751	while (!list_empty(&list)) {
 9752		struct net_device *dev
 9753			= list_first_entry(&list, struct net_device, todo_list);
 9754		list_del(&dev->todo_list);
 9755
 9756		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
 9757			pr_err("network todo '%s' but state %d\n",
 9758			       dev->name, dev->reg_state);
 9759			dump_stack();
 9760			continue;
 9761		}
 9762
 9763		dev->reg_state = NETREG_UNREGISTERED;
 9764
 9765		netdev_wait_allrefs(dev);
 9766
 9767		/* paranoia */
 9768		BUG_ON(netdev_refcnt_read(dev));
 9769		BUG_ON(!list_empty(&dev->ptype_all));
 9770		BUG_ON(!list_empty(&dev->ptype_specific));
 9771		WARN_ON(rcu_access_pointer(dev->ip_ptr));
 9772		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
 9773#if IS_ENABLED(CONFIG_DECNET)
 9774		WARN_ON(dev->dn_ptr);
 9775#endif
 9776		if (dev->priv_destructor)
 9777			dev->priv_destructor(dev);
 9778		if (dev->needs_free_netdev)
 9779			free_netdev(dev);
 9780
 9781		/* Report a network device has been unregistered */
 9782		rtnl_lock();
 9783		dev_net(dev)->dev_unreg_count--;
 9784		__rtnl_unlock();
 9785		wake_up(&netdev_unregistering_wq);
 9786
 9787		/* Free network device */
 9788		kobject_put(&dev->dev.kobj);
 9789	}
 9790}
 9791
 9792/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
 9793 * all the same fields in the same order as net_device_stats, with only
 9794 * the type differing, but rtnl_link_stats64 may have additional fields
 9795 * at the end for newer counters.
 9796 */
 9797void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
 9798			     const struct net_device_stats *netdev_stats)
 9799{
 9800#if BITS_PER_LONG == 64
 9801	BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
 9802	memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
 9803	/* zero out counters that only exist in rtnl_link_stats64 */
 9804	memset((char *)stats64 + sizeof(*netdev_stats), 0,
 9805	       sizeof(*stats64) - sizeof(*netdev_stats));
 9806#else
 9807	size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
 9808	const unsigned long *src = (const unsigned long *)netdev_stats;
 9809	u64 *dst = (u64 *)stats64;
 9810
 9811	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
 9812	for (i = 0; i < n; i++)
 9813		dst[i] = src[i];
 9814	/* zero out counters that only exist in rtnl_link_stats64 */
 9815	memset((char *)stats64 + n * sizeof(u64), 0,
 9816	       sizeof(*stats64) - n * sizeof(u64));
 9817#endif
 9818}
 9819EXPORT_SYMBOL(netdev_stats_to_stats64);
 9820
 9821/**
 9822 *	dev_get_stats	- get network device statistics
 9823 *	@dev: device to get statistics from
 9824 *	@storage: place to store stats
 9825 *
 9826 *	Get network statistics from device. Return @storage.
 9827 *	The device driver may provide its own method by setting
 9828 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
 9829 *	otherwise the internal statistics structure is used.
 9830 */
 9831struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
 9832					struct rtnl_link_stats64 *storage)
 9833{
 9834	const struct net_device_ops *ops = dev->netdev_ops;
 9835
 9836	if (ops->ndo_get_stats64) {
 9837		memset(storage, 0, sizeof(*storage));
 9838		ops->ndo_get_stats64(dev, storage);
 9839	} else if (ops->ndo_get_stats) {
 9840		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
 9841	} else {
 9842		netdev_stats_to_stats64(storage, &dev->stats);
 9843	}
 9844	storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
 9845	storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
 9846	storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
 9847	return storage;
 9848}
 9849EXPORT_SYMBOL(dev_get_stats);
 9850
 9851struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
 9852{
 9853	struct netdev_queue *queue = dev_ingress_queue(dev);
 9854
 9855#ifdef CONFIG_NET_CLS_ACT
 9856	if (queue)
 9857		return queue;
 9858	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
 9859	if (!queue)
 9860		return NULL;
 9861	netdev_init_one_queue(dev, queue, NULL);
 9862	RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
 9863	queue->qdisc_sleeping = &noop_qdisc;
 9864	rcu_assign_pointer(dev->ingress_queue, queue);
 9865#endif
 9866	return queue;
 9867}
 9868
 9869static const struct ethtool_ops default_ethtool_ops;
 9870
 9871void netdev_set_default_ethtool_ops(struct net_device *dev,
 9872				    const struct ethtool_ops *ops)
 9873{
 9874	if (dev->ethtool_ops == &default_ethtool_ops)
 9875		dev->ethtool_ops = ops;
 9876}
 9877EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
 9878
 9879void netdev_freemem(struct net_device *dev)
 9880{
 9881	char *addr = (char *)dev - dev->padded;
 9882
 9883	kvfree(addr);
 9884}
 9885
 9886/**
 9887 * alloc_netdev_mqs - allocate network device
 9888 * @sizeof_priv: size of private data to allocate space for
 9889 * @name: device name format string
 9890 * @name_assign_type: origin of device name
 9891 * @setup: callback to initialize device
 9892 * @txqs: the number of TX subqueues to allocate
 9893 * @rxqs: the number of RX subqueues to allocate
 9894 *
 9895 * Allocates a struct net_device with private data area for driver use
 9896 * and performs basic initialization.  Also allocates subqueue structs
 9897 * for each queue on the device.
 9898 */
 9899struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 9900		unsigned char name_assign_type,
 9901		void (*setup)(struct net_device *),
 9902		unsigned int txqs, unsigned int rxqs)
 9903{
 9904	struct net_device *dev;
 9905	unsigned int alloc_size;
 9906	struct net_device *p;
 9907
 9908	BUG_ON(strlen(name) >= sizeof(dev->name));
 9909
 9910	if (txqs < 1) {
 9911		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
 9912		return NULL;
 9913	}
 9914
 9915	if (rxqs < 1) {
 9916		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
 9917		return NULL;
 9918	}
 9919
 9920	alloc_size = sizeof(struct net_device);
 9921	if (sizeof_priv) {
 9922		/* ensure 32-byte alignment of private area */
 9923		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
 9924		alloc_size += sizeof_priv;
 9925	}
 9926	/* ensure 32-byte alignment of whole construct */
 9927	alloc_size += NETDEV_ALIGN - 1;
 9928
 9929	p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 9930	if (!p)
 9931		return NULL;
 9932
 9933	dev = PTR_ALIGN(p, NETDEV_ALIGN);
 9934	dev->padded = (char *)dev - (char *)p;
 9935
 9936	dev->pcpu_refcnt = alloc_percpu(int);
 9937	if (!dev->pcpu_refcnt)
 9938		goto free_dev;
 9939
 9940	if (dev_addr_init(dev))
 9941		goto free_pcpu;
 9942
 9943	dev_mc_init(dev);
 9944	dev_uc_init(dev);
 9945
 9946	dev_net_set(dev, &init_net);
 9947
 9948	dev->gso_max_size = GSO_MAX_SIZE;
 9949	dev->gso_max_segs = GSO_MAX_SEGS;
 9950	dev->upper_level = 1;
 9951	dev->lower_level = 1;
 9952
 9953	INIT_LIST_HEAD(&dev->napi_list);
 9954	INIT_LIST_HEAD(&dev->unreg_list);
 9955	INIT_LIST_HEAD(&dev->close_list);
 9956	INIT_LIST_HEAD(&dev->link_watch_list);
 9957	INIT_LIST_HEAD(&dev->adj_list.upper);
 9958	INIT_LIST_HEAD(&dev->adj_list.lower);
 9959	INIT_LIST_HEAD(&dev->ptype_all);
 9960	INIT_LIST_HEAD(&dev->ptype_specific);
 9961	INIT_LIST_HEAD(&dev->net_notifier_list);
 9962#ifdef CONFIG_NET_SCHED
 9963	hash_init(dev->qdisc_hash);
 9964#endif
 9965	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
 9966	setup(dev);
 9967
 9968	if (!dev->tx_queue_len) {
 9969		dev->priv_flags |= IFF_NO_QUEUE;
 9970		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
 9971	}
 9972
 9973	dev->num_tx_queues = txqs;
 9974	dev->real_num_tx_queues = txqs;
 9975	if (netif_alloc_netdev_queues(dev))
 9976		goto free_all;
 9977
 9978	dev->num_rx_queues = rxqs;
 9979	dev->real_num_rx_queues = rxqs;
 9980	if (netif_alloc_rx_queues(dev))
 9981		goto free_all;
 9982
 9983	strcpy(dev->name, name);
 9984	dev->name_assign_type = name_assign_type;
 9985	dev->group = INIT_NETDEV_GROUP;
 9986	if (!dev->ethtool_ops)
 9987		dev->ethtool_ops = &default_ethtool_ops;
 9988
 9989	nf_hook_ingress_init(dev);
 9990
 9991	return dev;
 9992
 9993free_all:
 9994	free_netdev(dev);
 9995	return NULL;
 9996
 9997free_pcpu:
 9998	free_percpu(dev->pcpu_refcnt);
 9999free_dev:
10000	netdev_freemem(dev);
10001	return NULL;
10002}
10003EXPORT_SYMBOL(alloc_netdev_mqs);
10004
10005/**
10006 * free_netdev - free network device
10007 * @dev: device
10008 *
10009 * This function does the last stage of destroying an allocated device
10010 * interface. The reference to the device object is released. If this
10011 * is the last reference then it will be freed.Must be called in process
10012 * context.
10013 */
10014void free_netdev(struct net_device *dev)
10015{
10016	struct napi_struct *p, *n;
10017
10018	might_sleep();
10019	netif_free_tx_queues(dev);
10020	netif_free_rx_queues(dev);
10021
10022	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
10023
10024	/* Flush device addresses */
10025	dev_addr_flush(dev);
10026
10027	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
10028		netif_napi_del(p);
10029
10030	free_percpu(dev->pcpu_refcnt);
10031	dev->pcpu_refcnt = NULL;
10032	free_percpu(dev->xdp_bulkq);
10033	dev->xdp_bulkq = NULL;
10034
10035	/*  Compatibility with error handling in drivers */
10036	if (dev->reg_state == NETREG_UNINITIALIZED) {
10037		netdev_freemem(dev);
10038		return;
10039	}
10040
10041	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
10042	dev->reg_state = NETREG_RELEASED;
10043
10044	/* will free via device release */
10045	put_device(&dev->dev);
10046}
10047EXPORT_SYMBOL(free_netdev);
10048
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	The expedited RCU variant is used while the rtnl lock is held,
 *	the regular one otherwise.  May sleep.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
10064
10065/**
10066 *	unregister_netdevice_queue - remove device from the kernel
10067 *	@dev: device
10068 *	@head: list
10069 *
10070 *	This function shuts down a device interface and removes it
10071 *	from the kernel tables.
10072 *	If head not NULL, device is queued to be unregistered later.
10073 *
10074 *	Callers must hold the rtnl semaphore.  You may want
10075 *	unregister_netdev() instead of this.
10076 */
10077
10078void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
10079{
10080	ASSERT_RTNL();
10081
10082	if (head) {
10083		list_move_tail(&dev->unreg_list, head);
10084	} else {
10085		rollback_registered(dev);
10086		/* Finish processing unregister after unlock */
10087		net_set_todo(dev);
10088	}
10089}
10090EXPORT_SYMBOL(unregister_netdevice_queue);
10091
10092/**
10093 *	unregister_netdevice_many - unregister many devices
10094 *	@head: list of devices
10095 *
10096 *  Note: As most callers use a stack allocated list_head,
10097 *  we force a list_del() to make sure stack wont be corrupted later.
10098 */
10099void unregister_netdevice_many(struct list_head *head)
10100{
10101	struct net_device *dev;
10102
10103	if (!list_empty(head)) {
10104		rollback_registered_many(head);
10105		list_for_each_entry(dev, head, unreg_list)
10106			net_set_todo(dev);
10107		list_del(head);
10108	}
10109}
10110EXPORT_SYMBOL(unregister_netdevice_many);
10111
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	It merely takes the rtnl semaphore around a call to
 *	unregister_netdevice(), so it is meant for callers that do not
 *	hold the semaphore already.  In general you want to use this
 *	and not unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
10130
10131/**
10132 *	dev_change_net_namespace - move device to different nethost namespace
10133 *	@dev: device
10134 *	@net: network namespace
10135 *	@pat: If not NULL name pattern to try if the current device name
10136 *	      is already taken in the destination network namespace.
10137 *
10138 *	This function shuts down a device interface and moves it
10139 *	to a new network namespace. On success 0 is returned, on
10140 *	a failure a netagive errno code is returned.
10141 *
10142 *	Callers must hold the rtnl semaphore.
10143 */
10144
10145int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
10146{
10147	struct net *net_old = dev_net(dev);
10148	int err, new_nsid, new_ifindex;
10149
10150	ASSERT_RTNL();
10151
10152	/* Don't allow namespace local devices to be moved. */
10153	err = -EINVAL;
10154	if (dev->features & NETIF_F_NETNS_LOCAL)
10155		goto out;
10156
10157	/* Ensure the device has been registrered */
10158	if (dev->reg_state != NETREG_REGISTERED)
10159		goto out;
10160
10161	/* Get out if there is nothing todo */
10162	err = 0;
10163	if (net_eq(net_old, net))
10164		goto out;
10165
10166	/* Pick the destination device name, and ensure
10167	 * we can use it in the destination network namespace.
10168	 */
10169	err = -EEXIST;
10170	if (__dev_get_by_name(net, dev->name)) {
10171		/* We get here if we can't use the current device name */
10172		if (!pat)
10173			goto out;
10174		err = dev_get_valid_name(net, dev, pat);
10175		if (err < 0)
10176			goto out;
10177	}
10178
10179	/*
10180	 * And now a mini version of register_netdevice unregister_netdevice.
10181	 */
10182
10183	/* If device is running close it first. */
10184	dev_close(dev);
10185
10186	/* And unlink it from device chain */
10187	unlist_netdevice(dev);
10188
10189	synchronize_net();
10190
10191	/* Shutdown queueing discipline. */
10192	dev_shutdown(dev);
10193
10194	/* Notify protocols, that we are about to destroy
10195	 * this device. They should clean all the things.
10196	 *
10197	 * Note that dev->reg_state stays at NETREG_REGISTERED.
10198	 * This is wanted because this way 8021q and macvlan know
10199	 * the device is just moving and can keep their slaves up.
10200	 */
10201	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10202	rcu_barrier();
10203
10204	new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
10205	/* If there is an ifindex conflict assign a new one */
10206	if (__dev_get_by_index(net, dev->ifindex))
10207		new_ifindex = dev_new_index(net);
10208	else
10209		new_ifindex = dev->ifindex;
10210
10211	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
10212			    new_ifindex);
10213
10214	/*
10215	 *	Flush the unicast and multicast chains
10216	 */
10217	dev_uc_flush(dev);
10218	dev_mc_flush(dev);
10219
10220	/* Send a netdev-removed uevent to the old namespace */
10221	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
10222	netdev_adjacent_del_links(dev);
10223
10224	/* Move per-net netdevice notifiers that are following the netdevice */
10225	move_netdevice_notifiers_dev_net(dev, net);
10226
10227	/* Actually switch the network namespace */
10228	dev_net_set(dev, net);
10229	dev->ifindex = new_ifindex;
10230
10231	/* Send a netdev-add uevent to the new namespace */
10232	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
10233	netdev_adjacent_add_links(dev);
10234
10235	/* Fixup kobjects */
10236	err = device_rename(&dev->dev, dev->name);
10237	WARN_ON(err);
10238
10239	/* Adapt owner in case owning user namespace of target network
10240	 * namespace is different from the original one.
10241	 */
10242	err = netdev_change_owner(dev, net_old, net);
10243	WARN_ON(err);
10244
10245	/* Add the device back in the hashes */
10246	list_netdevice(dev);
10247
10248	/* Notify protocols, that a new device appeared. */
10249	call_netdevice_notifiers(NETDEV_REGISTER, dev);
10250
10251	/*
10252	 *	Prevent userspace races by waiting until the network
10253	 *	device is fully setup before sending notifications.
10254	 */
10255	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
10256
10257	synchronize_net();
10258	err = 0;
10259out:
10260	return err;
10261}
10262EXPORT_SYMBOL_GPL(dev_change_net_namespace);
10263
10264static int dev_cpu_dead(unsigned int oldcpu)
10265{
10266	struct sk_buff **list_skb;
10267	struct sk_buff *skb;
10268	unsigned int cpu;
10269	struct softnet_data *sd, *oldsd, *remsd = NULL;
10270
10271	local_irq_disable();
10272	cpu = smp_processor_id();
10273	sd = &per_cpu(softnet_data, cpu);
10274	oldsd = &per_cpu(softnet_data, oldcpu);
10275
10276	/* Find end of our completion_queue. */
10277	list_skb = &sd->completion_queue;
10278	while (*list_skb)
10279		list_skb = &(*list_skb)->next;
10280	/* Append completion queue from offline CPU. */
10281	*list_skb = oldsd->completion_queue;
10282	oldsd->completion_queue = NULL;
10283
10284	/* Append output queue from offline CPU. */
10285	if (oldsd->output_queue) {
10286		*sd->output_queue_tailp = oldsd->output_queue;
10287		sd->output_queue_tailp = oldsd->output_queue_tailp;
10288		oldsd->output_queue = NULL;
10289		oldsd->output_queue_tailp = &oldsd->output_queue;
10290	}
10291	/* Append NAPI poll list from offline CPU, with one exception :
10292	 * process_backlog() must be called by cpu owning percpu backlog.
10293	 * We properly handle process_queue & input_pkt_queue later.
10294	 */
10295	while (!list_empty(&oldsd->poll_list)) {
10296		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
10297							    struct napi_struct,
10298							    poll_list);
10299
10300		list_del_init(&napi->poll_list);
10301		if (napi->poll == process_backlog)
10302			napi->state = 0;
10303		else
10304			____napi_schedule(sd, napi);
10305	}
10306
10307	raise_softirq_irqoff(NET_TX_SOFTIRQ);
10308	local_irq_enable();
10309
10310#ifdef CONFIG_RPS
10311	remsd = oldsd->rps_ipi_list;
10312	oldsd->rps_ipi_list = NULL;
10313#endif
10314	/* send out pending IPI's on offline CPU */
10315	net_rps_send_ipi(remsd);
10316
10317	/* Process offline CPU's input_pkt_queue */
10318	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
10319		netif_rx_ni(skb);
10320		input_queue_head_incr(oldsd);
10321	}
10322	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
10323		netif_rx_ni(skb);
10324		input_queue_head_incr(oldsd);
10325	}
10326
10327	return 0;
10328}
10329
10330/**
10331 *	netdev_increment_features - increment feature set by one
10332 *	@all: current feature set
10333 *	@one: new feature set
10334 *	@mask: mask feature set
10335 *
10336 *	Computes a new feature set after adding a device with feature set
10337 *	@one to the master device with current feature set @all.  Will not
10338 *	enable anything that is off in @mask. Returns the new feature set.
10339 */
10340netdev_features_t netdev_increment_features(netdev_features_t all,
10341	netdev_features_t one, netdev_features_t mask)
10342{
10343	if (mask & NETIF_F_HW_CSUM)
10344		mask |= NETIF_F_CSUM_MASK;
10345	mask |= NETIF_F_VLAN_CHALLENGED;
10346
10347	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
10348	all &= one | ~NETIF_F_ALL_FOR_ALL;
10349
10350	/* If one device supports hw checksumming, set for all. */
10351	if (all & NETIF_F_HW_CSUM)
10352		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
10353
10354	return all;
10355}
10356EXPORT_SYMBOL(netdev_increment_features);
10357
10358static struct hlist_head * __net_init netdev_create_hash(void)
10359{
10360	int i;
10361	struct hlist_head *hash;
10362
10363	hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
10364	if (hash != NULL)
10365		for (i = 0; i < NETDEV_HASHENTRIES; i++)
10366			INIT_HLIST_HEAD(&hash[i]);
10367
10368	return hash;
10369}
10370
10371/* Initialize per network namespace state */
10372static int __net_init netdev_init(struct net *net)
10373{
10374	BUILD_BUG_ON(GRO_HASH_BUCKETS >
10375		     8 * sizeof_field(struct napi_struct, gro_bitmask));
10376
10377	if (net != &init_net)
10378		INIT_LIST_HEAD(&net->dev_base_head);
10379
10380	net->dev_name_head = netdev_create_hash();
10381	if (net->dev_name_head == NULL)
10382		goto err_name;
10383
10384	net->dev_index_head = netdev_create_hash();
10385	if (net->dev_index_head == NULL)
10386		goto err_idx;
10387
10388	RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
10389
10390	return 0;
10391
10392err_idx:
10393	kfree(net->dev_name_head);
10394err_name:
10395	return -ENOMEM;
10396}
10397
10398/**
10399 *	netdev_drivername - network driver for the device
10400 *	@dev: network device
10401 *
10402 *	Determine network driver for device.
10403 */
10404const char *netdev_drivername(const struct net_device *dev)
10405{
10406	const struct device_driver *driver;
10407	const struct device *parent;
10408	const char *empty = "";
10409
10410	parent = dev->dev.parent;
10411	if (!parent)
10412		return empty;
10413
10414	driver = parent->driver;
10415	if (driver && driver->name)
10416		return driver->name;
10417	return empty;
10418}
10419
10420static void __netdev_printk(const char *level, const struct net_device *dev,
10421			    struct va_format *vaf)
10422{
10423	if (dev && dev->dev.parent) {
10424		dev_printk_emit(level[1] - '0',
10425				dev->dev.parent,
10426				"%s %s %s%s: %pV",
10427				dev_driver_string(dev->dev.parent),
10428				dev_name(dev->dev.parent),
10429				netdev_name(dev), netdev_reg_state(dev),
10430				vaf);
10431	} else if (dev) {
10432		printk("%s%s%s: %pV",
10433		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
10434	} else {
10435		printk("%s(NULL net_device): %pV", level, vaf);
10436	}
10437}
10438
10439void netdev_printk(const char *level, const struct net_device *dev,
10440		   const char *format, ...)
10441{
10442	struct va_format vaf;
10443	va_list args;
10444
10445	va_start(args, format);
10446
10447	vaf.fmt = format;
10448	vaf.va = &args;
10449
10450	__netdev_printk(level, dev, &vaf);
10451
10452	va_end(args);
10453}
10454EXPORT_SYMBOL(netdev_printk);
10455
10456#define define_netdev_printk_level(func, level)			\
10457void func(const struct net_device *dev, const char *fmt, ...)	\
10458{								\
10459	struct va_format vaf;					\
10460	va_list args;						\
10461								\
10462	va_start(args, fmt);					\
10463								\
10464	vaf.fmt = fmt;						\
10465	vaf.va = &args;						\
10466								\
10467	__netdev_printk(level, dev, &vaf);			\
10468								\
10469	va_end(args);						\
10470}								\
10471EXPORT_SYMBOL(func);
10472
10473define_netdev_printk_level(netdev_emerg, KERN_EMERG);
10474define_netdev_printk_level(netdev_alert, KERN_ALERT);
10475define_netdev_printk_level(netdev_crit, KERN_CRIT);
10476define_netdev_printk_level(netdev_err, KERN_ERR);
10477define_netdev_printk_level(netdev_warn, KERN_WARNING);
10478define_netdev_printk_level(netdev_notice, KERN_NOTICE);
10479define_netdev_printk_level(netdev_info, KERN_INFO);
10480
10481static void __net_exit netdev_exit(struct net *net)
10482{
10483	kfree(net->dev_name_head);
10484	kfree(net->dev_index_head);
10485	if (net != &init_net)
10486		WARN_ON_ONCE(!list_empty(&net->dev_base_head));
10487}
10488
10489static struct pernet_operations __net_initdata netdev_net_ops = {
10490	.init = netdev_init,
10491	.exit = netdev_exit,
10492};
10493
10494static void __net_exit default_device_exit(struct net *net)
10495{
10496	struct net_device *dev, *aux;
10497	/*
10498	 * Push all migratable network devices back to the
10499	 * initial network namespace
10500	 */
10501	rtnl_lock();
10502	for_each_netdev_safe(net, dev, aux) {
10503		int err;
10504		char fb_name[IFNAMSIZ];
10505
10506		/* Ignore unmoveable devices (i.e. loopback) */
10507		if (dev->features & NETIF_F_NETNS_LOCAL)
10508			continue;
10509
10510		/* Leave virtual devices for the generic cleanup */
10511		if (dev->rtnl_link_ops)
10512			continue;
10513
10514		/* Push remaining network devices to init_net */
10515		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
10516		if (__dev_get_by_name(&init_net, fb_name))
10517			snprintf(fb_name, IFNAMSIZ, "dev%%d");
10518		err = dev_change_net_namespace(dev, &init_net, fb_name);
10519		if (err) {
10520			pr_emerg("%s: failed to move %s to init_net: %d\n",
10521				 __func__, dev->name, err);
10522			BUG();
10523		}
10524	}
10525	rtnl_unlock();
10526}
10527
10528static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
10529{
10530	/* Return with the rtnl_lock held when there are no network
10531	 * devices unregistering in any network namespace in net_list.
10532	 */
10533	struct net *net;
10534	bool unregistering;
10535	DEFINE_WAIT_FUNC(wait, woken_wake_function);
10536
10537	add_wait_queue(&netdev_unregistering_wq, &wait);
10538	for (;;) {
10539		unregistering = false;
10540		rtnl_lock();
10541		list_for_each_entry(net, net_list, exit_list) {
10542			if (net->dev_unreg_count > 0) {
10543				unregistering = true;
10544				break;
10545			}
10546		}
10547		if (!unregistering)
10548			break;
10549		__rtnl_unlock();
10550
10551		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
10552	}
10553	remove_wait_queue(&netdev_unregistering_wq, &wait);
10554}
10555
10556static void __net_exit default_device_exit_batch(struct list_head *net_list)
10557{
10558	/* At exit all network devices most be removed from a network
10559	 * namespace.  Do this in the reverse order of registration.
10560	 * Do this across as many network namespaces as possible to
10561	 * improve batching efficiency.
10562	 */
10563	struct net_device *dev;
10564	struct net *net;
10565	LIST_HEAD(dev_kill_list);
10566
10567	/* To prevent network device cleanup code from dereferencing
10568	 * loopback devices or network devices that have been freed
10569	 * wait here for all pending unregistrations to complete,
10570	 * before unregistring the loopback device and allowing the
10571	 * network namespace be freed.
10572	 *
10573	 * The netdev todo list containing all network devices
10574	 * unregistrations that happen in default_device_exit_batch
10575	 * will run in the rtnl_unlock() at the end of
10576	 * default_device_exit_batch.
10577	 */
10578	rtnl_lock_unregistering(net_list);
10579	list_for_each_entry(net, net_list, exit_list) {
10580		for_each_netdev_reverse(net, dev) {
10581			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
10582				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
10583			else
10584				unregister_netdevice_queue(dev, &dev_kill_list);
10585		}
10586	}
10587	unregister_netdevice_many(&dev_kill_list);
10588	rtnl_unlock();
10589}
10590
10591static struct pernet_operations __net_initdata default_device_ops = {
10592	.exit = default_device_exit,
10593	.exit_batch = default_device_exit_batch,
10594};
10595
10596/*
10597 *	Initialize the DEV module. At boot time this walks the device list and
10598 *	unhooks any devices that fail to initialise (normally hardware not
10599 *	present) and leaves us with a valid list of present and active devices.
10600 *
10601 */
10602
10603/*
10604 *       This is called single threaded during boot, so no need
10605 *       to take the rtnl semaphore.
10606 */
10607static int __init net_dev_init(void)
10608{
10609	int i, rc = -ENOMEM;
10610
10611	BUG_ON(!dev_boot_phase);
10612
10613	if (dev_proc_init())
10614		goto out;
10615
10616	if (netdev_kobject_init())
10617		goto out;
10618
10619	INIT_LIST_HEAD(&ptype_all);
10620	for (i = 0; i < PTYPE_HASH_SIZE; i++)
10621		INIT_LIST_HEAD(&ptype_base[i]);
10622
10623	INIT_LIST_HEAD(&offload_base);
10624
10625	if (register_pernet_subsys(&netdev_net_ops))
10626		goto out;
10627
10628	/*
10629	 *	Initialise the packet receive queues.
10630	 */
10631
10632	for_each_possible_cpu(i) {
10633		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
10634		struct softnet_data *sd = &per_cpu(softnet_data, i);
10635
10636		INIT_WORK(flush, flush_backlog);
10637
10638		skb_queue_head_init(&sd->input_pkt_queue);
10639		skb_queue_head_init(&sd->process_queue);
10640#ifdef CONFIG_XFRM_OFFLOAD
10641		skb_queue_head_init(&sd->xfrm_backlog);
10642#endif
10643		INIT_LIST_HEAD(&sd->poll_list);
10644		sd->output_queue_tailp = &sd->output_queue;
10645#ifdef CONFIG_RPS
10646		sd->csd.func = rps_trigger_softirq;
10647		sd->csd.info = sd;
10648		sd->cpu = i;
10649#endif
10650
10651		init_gro_hash(&sd->backlog);
10652		sd->backlog.poll = process_backlog;
10653		sd->backlog.weight = weight_p;
10654	}
10655
10656	dev_boot_phase = 0;
10657
10658	/* The loopback device is special if any other network devices
10659	 * is present in a network namespace the loopback device must
10660	 * be present. Since we now dynamically allocate and free the
10661	 * loopback device ensure this invariant is maintained by
10662	 * keeping the loopback device as the first device on the
10663	 * list of network devices.  Ensuring the loopback devices
10664	 * is the first device that appears and the last network device
10665	 * that disappears.
10666	 */
10667	if (register_pernet_device(&loopback_net_ops))
10668		goto out;
10669
10670	if (register_pernet_device(&default_device_ops))
10671		goto out;
10672
10673	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
10674	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
10675
10676	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
10677				       NULL, dev_cpu_dead);
10678	WARN_ON(rc < 0);
10679	rc = 0;
10680out:
10681	return rc;
10682}
10683
10684subsys_initcall(net_dev_init);