net/core/dev.c at v5.11-rc6 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / core / dev.c
at v5.11-rc6 289 kB view raw
    1// SPDX-License-Identifier: GPL-2.0-or-later
    2/*
    3 *      NET3    Protocol independent device support routines.
    4 *
    5 *	Derived from the non IP parts of dev.c 1.0.19
    6 *              Authors:	Ross Biro
    7 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
    8 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
    9 *
   10 *	Additional Authors:
   11 *		Florian la Roche <rzsfl@rz.uni-sb.de>
   12 *		Alan Cox <gw4pts@gw4pts.ampr.org>
   13 *		David Hinds <dahinds@users.sourceforge.net>
   14 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
   15 *		Adam Sulmicki <adam@cfar.umd.edu>
   16 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
   17 *
   18 *	Changes:
   19 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
   20 *                                      to 2 if register_netdev gets called
   21 *                                      before net_dev_init & also removed a
   22 *                                      few lines of code in the process.
   23 *		Alan Cox	:	device private ioctl copies fields back.
   24 *		Alan Cox	:	Transmit queue code does relevant
   25 *					stunts to keep the queue safe.
   26 *		Alan Cox	:	Fixed double lock.
   27 *		Alan Cox	:	Fixed promisc NULL pointer trap
   28 *		????????	:	Support the full private ioctl range
   29 *		Alan Cox	:	Moved ioctl permission check into
   30 *					drivers
   31 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
   32 *		Alan Cox	:	100 backlog just doesn't cut it when
   33 *					you start doing multicast video 8)
   34 *		Alan Cox	:	Rewrote net_bh and list manager.
   35 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
   36 *		Alan Cox	:	Took out transmit every packet pass
   37 *					Saved a few bytes in the ioctl handler
   38 *		Alan Cox	:	Network driver sets packet type before
   39 *					calling netif_rx. Saves a function
   40 *					call a packet.
   41 *		Alan Cox	:	Hashed net_bh()
   42 *		Richard Kooijman:	Timestamp fixes.
   43 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
   44 *		Alan Cox	:	Device lock protection.
   45 *              Alan Cox        :       Fixed nasty side effect of device close
   46 *					changes.
   47 *		Rudi Cilibrasi	:	Pass the right thing to
   48 *					set_mac_address()
   49 *		Dave Miller	:	32bit quantity for the device lock to
   50 *					make it work out on a Sparc.
   51 *		Bjorn Ekwall	:	Added KERNELD hack.
   52 *		Alan Cox	:	Cleaned up the backlog initialise.
   53 *		Craig Metz	:	SIOCGIFCONF fix if space for under
   54 *					1 device.
   55 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
   56 *					is no device open function.
   57 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
   58 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
   59 *		Cyrus Durgin	:	Cleaned for KMOD
   60 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
   61 *					A network device unload needs to purge
   62 *					the backlog queue.
   63 *	Paul Rusty Russell	:	SIOCSIFNAME
   64 *              Pekka Riikonen  :	Netdev boot-time settings code
   65 *              Andrew Morton   :       Make unregister_netdevice wait
   66 *                                      indefinitely on dev->refcnt
   67 *              J Hadi Salim    :       - Backlog queue sampling
   68 *				        - netif_rx() feedback
   69 */
   70
   71#include <linux/uaccess.h>
   72#include <linux/bitops.h>
   73#include <linux/capability.h>
   74#include <linux/cpu.h>
   75#include <linux/types.h>
   76#include <linux/kernel.h>
   77#include <linux/hash.h>
   78#include <linux/slab.h>
   79#include <linux/sched.h>
   80#include <linux/sched/mm.h>
   81#include <linux/mutex.h>
   82#include <linux/rwsem.h>
   83#include <linux/string.h>
   84#include <linux/mm.h>
   85#include <linux/socket.h>
   86#include <linux/sockios.h>
   87#include <linux/errno.h>
   88#include <linux/interrupt.h>
   89#include <linux/if_ether.h>
   90#include <linux/netdevice.h>
   91#include <linux/etherdevice.h>
   92#include <linux/ethtool.h>
   93#include <linux/skbuff.h>
   94#include <linux/bpf.h>
   95#include <linux/bpf_trace.h>
   96#include <net/net_namespace.h>
   97#include <net/sock.h>
   98#include <net/busy_poll.h>
   99#include <linux/rtnetlink.h>
  100#include <linux/stat.h>
  101#include <net/dsa.h>
  102#include <net/dst.h>
  103#include <net/dst_metadata.h>
  104#include <net/pkt_sched.h>
  105#include <net/pkt_cls.h>
  106#include <net/checksum.h>
  107#include <net/xfrm.h>
  108#include <linux/highmem.h>
  109#include <linux/init.h>
  110#include <linux/module.h>
  111#include <linux/netpoll.h>
  112#include <linux/rcupdate.h>
  113#include <linux/delay.h>
  114#include <net/iw_handler.h>
  115#include <asm/current.h>
  116#include <linux/audit.h>
  117#include <linux/dmaengine.h>
  118#include <linux/err.h>
  119#include <linux/ctype.h>
  120#include <linux/if_arp.h>
  121#include <linux/if_vlan.h>
  122#include <linux/ip.h>
  123#include <net/ip.h>
  124#include <net/mpls.h>
  125#include <linux/ipv6.h>
  126#include <linux/in.h>
  127#include <linux/jhash.h>
  128#include <linux/random.h>
  129#include <trace/events/napi.h>
  130#include <trace/events/net.h>
  131#include <trace/events/skb.h>
  132#include <linux/inetdevice.h>
  133#include <linux/cpu_rmap.h>
  134#include <linux/static_key.h>
  135#include <linux/hashtable.h>
  136#include <linux/vmalloc.h>
  137#include <linux/if_macvlan.h>
  138#include <linux/errqueue.h>
  139#include <linux/hrtimer.h>
  140#include <linux/netfilter_ingress.h>
  141#include <linux/crash_dump.h>
  142#include <linux/sctp.h>
  143#include <net/udp_tunnel.h>
  144#include <linux/net_namespace.h>
  145#include <linux/indirect_call_wrapper.h>
  146#include <net/devlink.h>
  147#include <linux/pm_runtime.h>
  148#include <linux/prandom.h>
  149
  150#include "net-sysfs.h"
  151
  152#define MAX_GRO_SKBS 8
  153
  154/* This should be increased if a protocol with a bigger head is added. */
  155#define GRO_MAX_HEAD (MAX_HEADER + 128)
  156
  157static DEFINE_SPINLOCK(ptype_lock);
  158static DEFINE_SPINLOCK(offload_lock);
  159struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
  160struct list_head ptype_all __read_mostly;	/* Taps */
  161static struct list_head offload_base __read_mostly;
  162
  163static int netif_rx_internal(struct sk_buff *skb);
  164static int call_netdevice_notifiers_info(unsigned long val,
  165					 struct netdev_notifier_info *info);
  166static int call_netdevice_notifiers_extack(unsigned long val,
  167					   struct net_device *dev,
  168					   struct netlink_ext_ack *extack);
  169static struct napi_struct *napi_by_id(unsigned int napi_id);
  170
  171/*
  172 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
  173 * semaphore.
  174 *
  175 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
  176 *
  177 * Writers must hold the rtnl semaphore while they loop through the
  178 * dev_base_head list, and hold dev_base_lock for writing when they do the
  179 * actual updates.  This allows pure readers to access the list even
  180 * while a writer is preparing to update it.
  181 *
  182 * To put it another way, dev_base_lock is held for writing only to
  183 * protect against pure readers; the rtnl semaphore provides the
  184 * protection against other writers.
  185 *
  186 * For example usages, see register_netdevice() and
  187 * unregister_netdevice(), which must be called with the rtnl
  188 * semaphore held.
  189 */
  190DEFINE_RWLOCK(dev_base_lock);
  191EXPORT_SYMBOL(dev_base_lock);
  192
  193static DEFINE_MUTEX(ifalias_mutex);
  194
  195/* protects napi_hash addition/deletion and napi_gen_id */
  196static DEFINE_SPINLOCK(napi_hash_lock);
  197
  198static unsigned int napi_gen_id = NR_CPUS;
  199static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
  200
  201static DECLARE_RWSEM(devnet_rename_sem);
  202
  203static inline void dev_base_seq_inc(struct net *net)
  204{
  205	while (++net->dev_base_seq == 0)
  206		;
  207}
  208
  209static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
  210{
  211	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
  212
  213	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
  214}
  215
  216static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
  217{
  218	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
  219}
  220
  221static inline void rps_lock(struct softnet_data *sd)
  222{
  223#ifdef CONFIG_RPS
  224	spin_lock(&sd->input_pkt_queue.lock);
  225#endif
  226}
  227
  228static inline void rps_unlock(struct softnet_data *sd)
  229{
  230#ifdef CONFIG_RPS
  231	spin_unlock(&sd->input_pkt_queue.lock);
  232#endif
  233}
  234
  235static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
  236						       const char *name)
  237{
  238	struct netdev_name_node *name_node;
  239
  240	name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
  241	if (!name_node)
  242		return NULL;
  243	INIT_HLIST_NODE(&name_node->hlist);
  244	name_node->dev = dev;
  245	name_node->name = name;
  246	return name_node;
  247}
  248
  249static struct netdev_name_node *
  250netdev_name_node_head_alloc(struct net_device *dev)
  251{
  252	struct netdev_name_node *name_node;
  253
  254	name_node = netdev_name_node_alloc(dev, dev->name);
  255	if (!name_node)
  256		return NULL;
  257	INIT_LIST_HEAD(&name_node->list);
  258	return name_node;
  259}
  260
  261static void netdev_name_node_free(struct netdev_name_node *name_node)
  262{
  263	kfree(name_node);
  264}
  265
  266static void netdev_name_node_add(struct net *net,
  267				 struct netdev_name_node *name_node)
  268{
  269	hlist_add_head_rcu(&name_node->hlist,
  270			   dev_name_hash(net, name_node->name));
  271}
  272
  273static void netdev_name_node_del(struct netdev_name_node *name_node)
  274{
  275	hlist_del_rcu(&name_node->hlist);
  276}
  277
  278static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
  279							const char *name)
  280{
  281	struct hlist_head *head = dev_name_hash(net, name);
  282	struct netdev_name_node *name_node;
  283
  284	hlist_for_each_entry(name_node, head, hlist)
  285		if (!strcmp(name_node->name, name))
  286			return name_node;
  287	return NULL;
  288}
  289
  290static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
  291							    const char *name)
  292{
  293	struct hlist_head *head = dev_name_hash(net, name);
  294	struct netdev_name_node *name_node;
  295
  296	hlist_for_each_entry_rcu(name_node, head, hlist)
  297		if (!strcmp(name_node->name, name))
  298			return name_node;
  299	return NULL;
  300}
  301
  302int netdev_name_node_alt_create(struct net_device *dev, const char *name)
  303{
  304	struct netdev_name_node *name_node;
  305	struct net *net = dev_net(dev);
  306
  307	name_node = netdev_name_node_lookup(net, name);
  308	if (name_node)
  309		return -EEXIST;
  310	name_node = netdev_name_node_alloc(dev, name);
  311	if (!name_node)
  312		return -ENOMEM;
  313	netdev_name_node_add(net, name_node);
  314	/* The node that holds dev->name acts as a head of per-device list. */
  315	list_add_tail(&name_node->list, &dev->name_node->list);
  316
  317	return 0;
  318}
  319EXPORT_SYMBOL(netdev_name_node_alt_create);
  320
  321static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
  322{
  323	list_del(&name_node->list);
  324	netdev_name_node_del(name_node);
  325	kfree(name_node->name);
  326	netdev_name_node_free(name_node);
  327}
  328
  329int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
  330{
  331	struct netdev_name_node *name_node;
  332	struct net *net = dev_net(dev);
  333
  334	name_node = netdev_name_node_lookup(net, name);
  335	if (!name_node)
  336		return -ENOENT;
  337	/* lookup might have found our primary name or a name belonging
  338	 * to another device.
  339	 */
  340	if (name_node == dev->name_node || name_node->dev != dev)
  341		return -EINVAL;
  342
  343	__netdev_name_node_alt_destroy(name_node);
  344
  345	return 0;
  346}
  347EXPORT_SYMBOL(netdev_name_node_alt_destroy);
  348
  349static void netdev_name_node_alt_flush(struct net_device *dev)
  350{
  351	struct netdev_name_node *name_node, *tmp;
  352
  353	list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
  354		__netdev_name_node_alt_destroy(name_node);
  355}
  356
  357/* Device list insertion */
  358static void list_netdevice(struct net_device *dev)
  359{
  360	struct net *net = dev_net(dev);
  361
  362	ASSERT_RTNL();
  363
  364	write_lock_bh(&dev_base_lock);
  365	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
  366	netdev_name_node_add(net, dev->name_node);
  367	hlist_add_head_rcu(&dev->index_hlist,
  368			   dev_index_hash(net, dev->ifindex));
  369	write_unlock_bh(&dev_base_lock);
  370
  371	dev_base_seq_inc(net);
  372}
  373
  374/* Device list removal
  375 * caller must respect a RCU grace period before freeing/reusing dev
  376 */
  377static void unlist_netdevice(struct net_device *dev)
  378{
  379	ASSERT_RTNL();
  380
  381	/* Unlink dev from the device chain */
  382	write_lock_bh(&dev_base_lock);
  383	list_del_rcu(&dev->dev_list);
  384	netdev_name_node_del(dev->name_node);
  385	hlist_del_rcu(&dev->index_hlist);
  386	write_unlock_bh(&dev_base_lock);
  387
  388	dev_base_seq_inc(dev_net(dev));
  389}
  390
  391/*
  392 *	Our notifier list
  393 */
  394
  395static RAW_NOTIFIER_HEAD(netdev_chain);
  396
  397/*
  398 *	Device drivers call our routines to queue packets here. We empty the
  399 *	queue in the local softnet handler.
  400 */
  401
  402DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
  403EXPORT_PER_CPU_SYMBOL(softnet_data);
  404
  405#ifdef CONFIG_LOCKDEP
  406/*
  407 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
  408 * according to dev->type
  409 */
  410static const unsigned short netdev_lock_type[] = {
  411	 ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
  412	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
  413	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
  414	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
  415	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
  416	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
  417	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
  418	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
  419	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
  420	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
  421	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
  422	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
  423	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
  424	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
  425	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
  426
  427static const char *const netdev_lock_name[] = {
  428	"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
  429	"_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
  430	"_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
  431	"_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
  432	"_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
  433	"_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
  434	"_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
  435	"_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
  436	"_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
  437	"_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
  438	"_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
  439	"_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
  440	"_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
  441	"_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
  442	"_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
  443
  444static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
  445static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
  446
  447static inline unsigned short netdev_lock_pos(unsigned short dev_type)
  448{
  449	int i;
  450
  451	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
  452		if (netdev_lock_type[i] == dev_type)
  453			return i;
  454	/* the last key is used by default */
  455	return ARRAY_SIZE(netdev_lock_type) - 1;
  456}
  457
  458static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
  459						 unsigned short dev_type)
  460{
  461	int i;
  462
  463	i = netdev_lock_pos(dev_type);
  464	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
  465				   netdev_lock_name[i]);
  466}
  467
  468static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
  469{
  470	int i;
  471
  472	i = netdev_lock_pos(dev->type);
  473	lockdep_set_class_and_name(&dev->addr_list_lock,
  474				   &netdev_addr_lock_key[i],
  475				   netdev_lock_name[i]);
  476}
  477#else
  478static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
  479						 unsigned short dev_type)
  480{
  481}
  482
  483static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
  484{
  485}
  486#endif
  487
  488/*******************************************************************************
  489 *
  490 *		Protocol management and registration routines
  491 *
  492 *******************************************************************************/
  493
  494
  495/*
  496 *	Add a protocol ID to the list. Now that the input handler is
  497 *	smarter we can dispense with all the messy stuff that used to be
  498 *	here.
  499 *
  500 *	BEWARE!!! Protocol handlers, mangling input packets,
  501 *	MUST BE last in hash buckets and checking protocol handlers
  502 *	MUST start from promiscuous ptype_all chain in net_bh.
  503 *	It is true now, do not change it.
  504 *	Explanation follows: if protocol handler, mangling packet, will
  505 *	be the first on list, it is not able to sense, that packet
  506 *	is cloned and should be copied-on-write, so that it will
  507 *	change it and subsequent readers will get broken packet.
  508 *							--ANK (980803)
  509 */
  510
  511static inline struct list_head *ptype_head(const struct packet_type *pt)
  512{
  513	if (pt->type == htons(ETH_P_ALL))
  514		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
  515	else
  516		return pt->dev ? &pt->dev->ptype_specific :
  517				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
  518}
  519
  520/**
  521 *	dev_add_pack - add packet handler
  522 *	@pt: packet type declaration
  523 *
  524 *	Add a protocol handler to the networking stack. The passed &packet_type
  525 *	is linked into kernel lists and may not be freed until it has been
  526 *	removed from the kernel lists.
  527 *
  528 *	This call does not sleep therefore it can not
  529 *	guarantee all CPU's that are in middle of receiving packets
  530 *	will see the new packet type (until the next received packet).
  531 */
  532
  533void dev_add_pack(struct packet_type *pt)
  534{
  535	struct list_head *head = ptype_head(pt);
  536
  537	spin_lock(&ptype_lock);
  538	list_add_rcu(&pt->list, head);
  539	spin_unlock(&ptype_lock);
  540}
  541EXPORT_SYMBOL(dev_add_pack);
  542
  543/**
  544 *	__dev_remove_pack	 - remove packet handler
  545 *	@pt: packet type declaration
  546 *
  547 *	Remove a protocol handler that was previously added to the kernel
  548 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
  549 *	from the kernel lists and can be freed or reused once this function
  550 *	returns.
  551 *
  552 *      The packet type might still be in use by receivers
  553 *	and must not be freed until after all the CPU's have gone
  554 *	through a quiescent state.
  555 */
  556void __dev_remove_pack(struct packet_type *pt)
  557{
  558	struct list_head *head = ptype_head(pt);
  559	struct packet_type *pt1;
  560
  561	spin_lock(&ptype_lock);
  562
  563	list_for_each_entry(pt1, head, list) {
  564		if (pt == pt1) {
  565			list_del_rcu(&pt->list);
  566			goto out;
  567		}
  568	}
  569
  570	pr_warn("dev_remove_pack: %p not found\n", pt);
  571out:
  572	spin_unlock(&ptype_lock);
  573}
  574EXPORT_SYMBOL(__dev_remove_pack);
  575
  576/**
  577 *	dev_remove_pack	 - remove packet handler
  578 *	@pt: packet type declaration
  579 *
  580 *	Remove a protocol handler that was previously added to the kernel
  581 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
  582 *	from the kernel lists and can be freed or reused once this function
  583 *	returns.
  584 *
  585 *	This call sleeps to guarantee that no CPU is looking at the packet
  586 *	type after return.
  587 */
  588void dev_remove_pack(struct packet_type *pt)
  589{
  590	__dev_remove_pack(pt);
  591
  592	synchronize_net();
  593}
  594EXPORT_SYMBOL(dev_remove_pack);
  595
  596
  597/**
  598 *	dev_add_offload - register offload handlers
  599 *	@po: protocol offload declaration
  600 *
  601 *	Add protocol offload handlers to the networking stack. The passed
  602 *	&proto_offload is linked into kernel lists and may not be freed until
  603 *	it has been removed from the kernel lists.
  604 *
  605 *	This call does not sleep therefore it can not
  606 *	guarantee all CPU's that are in middle of receiving packets
  607 *	will see the new offload handlers (until the next received packet).
  608 */
  609void dev_add_offload(struct packet_offload *po)
  610{
  611	struct packet_offload *elem;
  612
  613	spin_lock(&offload_lock);
  614	list_for_each_entry(elem, &offload_base, list) {
  615		if (po->priority < elem->priority)
  616			break;
  617	}
  618	list_add_rcu(&po->list, elem->list.prev);
  619	spin_unlock(&offload_lock);
  620}
  621EXPORT_SYMBOL(dev_add_offload);
  622
  623/**
  624 *	__dev_remove_offload	 - remove offload handler
  625 *	@po: packet offload declaration
  626 *
  627 *	Remove a protocol offload handler that was previously added to the
  628 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
  629 *	is removed from the kernel lists and can be freed or reused once this
  630 *	function returns.
  631 *
  632 *      The packet type might still be in use by receivers
  633 *	and must not be freed until after all the CPU's have gone
  634 *	through a quiescent state.
  635 */
  636static void __dev_remove_offload(struct packet_offload *po)
  637{
  638	struct list_head *head = &offload_base;
  639	struct packet_offload *po1;
  640
  641	spin_lock(&offload_lock);
  642
  643	list_for_each_entry(po1, head, list) {
  644		if (po == po1) {
  645			list_del_rcu(&po->list);
  646			goto out;
  647		}
  648	}
  649
  650	pr_warn("dev_remove_offload: %p not found\n", po);
  651out:
  652	spin_unlock(&offload_lock);
  653}
  654
  655/**
  656 *	dev_remove_offload	 - remove packet offload handler
  657 *	@po: packet offload declaration
  658 *
  659 *	Remove a packet offload handler that was previously added to the kernel
  660 *	offload handlers by dev_add_offload(). The passed &offload_type is
  661 *	removed from the kernel lists and can be freed or reused once this
  662 *	function returns.
  663 *
  664 *	This call sleeps to guarantee that no CPU is looking at the packet
  665 *	type after return.
  666 */
  667void dev_remove_offload(struct packet_offload *po)
  668{
  669	__dev_remove_offload(po);
  670
  671	synchronize_net();
  672}
  673EXPORT_SYMBOL(dev_remove_offload);
  674
  675/******************************************************************************
  676 *
  677 *		      Device Boot-time Settings Routines
  678 *
  679 ******************************************************************************/
  680
  681/* Boot time configuration table */
  682static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
  683
  684/**
  685 *	netdev_boot_setup_add	- add new setup entry
  686 *	@name: name of the device
  687 *	@map: configured settings for the device
  688 *
  689 *	Adds new setup entry to the dev_boot_setup list.  The function
  690 *	returns 0 on error and 1 on success.  This is a generic routine to
  691 *	all netdevices.
  692 */
  693static int netdev_boot_setup_add(char *name, struct ifmap *map)
  694{
  695	struct netdev_boot_setup *s;
  696	int i;
  697
  698	s = dev_boot_setup;
  699	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
  700		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
  701			memset(s[i].name, 0, sizeof(s[i].name));
  702			strlcpy(s[i].name, name, IFNAMSIZ);
  703			memcpy(&s[i].map, map, sizeof(s[i].map));
  704			break;
  705		}
  706	}
  707
  708	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
  709}
  710
  711/**
  712 * netdev_boot_setup_check	- check boot time settings
  713 * @dev: the netdevice
  714 *
  715 * Check boot time settings for the device.
  716 * The found settings are set for the device to be used
  717 * later in the device probing.
  718 * Returns 0 if no settings found, 1 if they are.
  719 */
  720int netdev_boot_setup_check(struct net_device *dev)
  721{
  722	struct netdev_boot_setup *s = dev_boot_setup;
  723	int i;
  724
  725	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
  726		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
  727		    !strcmp(dev->name, s[i].name)) {
  728			dev->irq = s[i].map.irq;
  729			dev->base_addr = s[i].map.base_addr;
  730			dev->mem_start = s[i].map.mem_start;
  731			dev->mem_end = s[i].map.mem_end;
  732			return 1;
  733		}
  734	}
  735	return 0;
  736}
  737EXPORT_SYMBOL(netdev_boot_setup_check);
  738
  739
  740/**
  741 * netdev_boot_base	- get address from boot time settings
  742 * @prefix: prefix for network device
  743 * @unit: id for network device
  744 *
  745 * Check boot time settings for the base address of device.
  746 * The found settings are set for the device to be used
  747 * later in the device probing.
  748 * Returns 0 if no settings found.
  749 */
  750unsigned long netdev_boot_base(const char *prefix, int unit)
  751{
  752	const struct netdev_boot_setup *s = dev_boot_setup;
  753	char name[IFNAMSIZ];
  754	int i;
  755
  756	sprintf(name, "%s%d", prefix, unit);
  757
  758	/*
  759	 * If device already registered then return base of 1
  760	 * to indicate not to probe for this interface
  761	 */
  762	if (__dev_get_by_name(&init_net, name))
  763		return 1;
  764
  765	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
  766		if (!strcmp(name, s[i].name))
  767			return s[i].map.base_addr;
  768	return 0;
  769}
  770
  771/*
  772 * Saves at boot time configured settings for any netdevice.
  773 */
  774int __init netdev_boot_setup(char *str)
  775{
  776	int ints[5];
  777	struct ifmap map;
  778
  779	str = get_options(str, ARRAY_SIZE(ints), ints);
  780	if (!str || !*str)
  781		return 0;
  782
  783	/* Save settings */
  784	memset(&map, 0, sizeof(map));
  785	if (ints[0] > 0)
  786		map.irq = ints[1];
  787	if (ints[0] > 1)
  788		map.base_addr = ints[2];
  789	if (ints[0] > 2)
  790		map.mem_start = ints[3];
  791	if (ints[0] > 3)
  792		map.mem_end = ints[4];
  793
  794	/* Add new entry to the list */
  795	return netdev_boot_setup_add(str, &map);
  796}
  797
  798__setup("netdev=", netdev_boot_setup);
  799
  800/*******************************************************************************
  801 *
  802 *			    Device Interface Subroutines
  803 *
  804 *******************************************************************************/
  805
  806/**
  807 *	dev_get_iflink	- get 'iflink' value of a interface
  808 *	@dev: targeted interface
  809 *
  810 *	Indicates the ifindex the interface is linked to.
  811 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
  812 */
  813
  814int dev_get_iflink(const struct net_device *dev)
  815{
  816	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
  817		return dev->netdev_ops->ndo_get_iflink(dev);
  818
  819	return dev->ifindex;
  820}
  821EXPORT_SYMBOL(dev_get_iflink);
  822
  823/**
  824 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
  825 *	@dev: targeted interface
  826 *	@skb: The packet.
  827 *
  828 *	For better visibility of tunnel traffic OVS needs to retrieve
  829 *	egress tunnel information for a packet. Following API allows
  830 *	user to get this info.
  831 */
  832int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
  833{
  834	struct ip_tunnel_info *info;
  835
  836	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
  837		return -EINVAL;
  838
  839	info = skb_tunnel_info_unclone(skb);
  840	if (!info)
  841		return -ENOMEM;
  842	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
  843		return -EINVAL;
  844
  845	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
  846}
  847EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
  848
  849/**
  850 *	__dev_get_by_name	- find a device by its name
  851 *	@net: the applicable net namespace
  852 *	@name: name to find
  853 *
  854 *	Find an interface by name. Must be called under RTNL semaphore
  855 *	or @dev_base_lock. If the name is found a pointer to the device
  856 *	is returned. If the name is not found then %NULL is returned. The
  857 *	reference counters are not incremented so the caller must be
  858 *	careful with locks.
  859 */
  860
  861struct net_device *__dev_get_by_name(struct net *net, const char *name)
  862{
  863	struct netdev_name_node *node_name;
  864
  865	node_name = netdev_name_node_lookup(net, name);
  866	return node_name ? node_name->dev : NULL;
  867}
  868EXPORT_SYMBOL(__dev_get_by_name);
  869
  870/**
  871 * dev_get_by_name_rcu	- find a device by its name
  872 * @net: the applicable net namespace
  873 * @name: name to find
  874 *
  875 * Find an interface by name.
  876 * If the name is found a pointer to the device is returned.
  877 * If the name is not found then %NULL is returned.
  878 * The reference counters are not incremented so the caller must be
  879 * careful with locks. The caller must hold RCU lock.
  880 */
  881
  882struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
  883{
  884	struct netdev_name_node *node_name;
  885
  886	node_name = netdev_name_node_lookup_rcu(net, name);
  887	return node_name ? node_name->dev : NULL;
  888}
  889EXPORT_SYMBOL(dev_get_by_name_rcu);
  890
  891/**
  892 *	dev_get_by_name		- find a device by its name
  893 *	@net: the applicable net namespace
  894 *	@name: name to find
  895 *
  896 *	Find an interface by name. This can be called from any
  897 *	context and does its own locking. The returned handle has
  898 *	the usage count incremented and the caller must use dev_put() to
  899 *	release it when it is no longer needed. %NULL is returned if no
  900 *	matching device is found.
  901 */
  902
  903struct net_device *dev_get_by_name(struct net *net, const char *name)
  904{
  905	struct net_device *dev;
  906
  907	rcu_read_lock();
  908	dev = dev_get_by_name_rcu(net, name);
  909	if (dev)
  910		dev_hold(dev);
  911	rcu_read_unlock();
  912	return dev;
  913}
  914EXPORT_SYMBOL(dev_get_by_name);
  915
  916/**
  917 *	__dev_get_by_index - find a device by its ifindex
  918 *	@net: the applicable net namespace
  919 *	@ifindex: index of device
  920 *
  921 *	Search for an interface by index. Returns %NULL if the device
  922 *	is not found or a pointer to the device. The device has not
  923 *	had its reference counter increased so the caller must be careful
  924 *	about locking. The caller must hold either the RTNL semaphore
  925 *	or @dev_base_lock.
  926 */
  927
  928struct net_device *__dev_get_by_index(struct net *net, int ifindex)
  929{
  930	struct net_device *dev;
  931	struct hlist_head *head = dev_index_hash(net, ifindex);
  932
  933	hlist_for_each_entry(dev, head, index_hlist)
  934		if (dev->ifindex == ifindex)
  935			return dev;
  936
  937	return NULL;
  938}
  939EXPORT_SYMBOL(__dev_get_by_index);
  940
  941/**
  942 *	dev_get_by_index_rcu - find a device by its ifindex
  943 *	@net: the applicable net namespace
  944 *	@ifindex: index of device
  945 *
  946 *	Search for an interface by index. Returns %NULL if the device
  947 *	is not found or a pointer to the device. The device has not
  948 *	had its reference counter increased so the caller must be careful
  949 *	about locking. The caller must hold RCU lock.
  950 */
  951
  952struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
  953{
  954	struct net_device *dev;
  955	struct hlist_head *head = dev_index_hash(net, ifindex);
  956
  957	hlist_for_each_entry_rcu(dev, head, index_hlist)
  958		if (dev->ifindex == ifindex)
  959			return dev;
  960
  961	return NULL;
  962}
  963EXPORT_SYMBOL(dev_get_by_index_rcu);
  964
  965
  966/**
  967 *	dev_get_by_index - find a device by its ifindex
  968 *	@net: the applicable net namespace
  969 *	@ifindex: index of device
  970 *
  971 *	Search for an interface by index. Returns NULL if the device
  972 *	is not found or a pointer to the device. The device returned has
  973 *	had a reference added and the pointer is safe until the user calls
  974 *	dev_put to indicate they have finished with it.
  975 */
  976
  977struct net_device *dev_get_by_index(struct net *net, int ifindex)
  978{
  979	struct net_device *dev;
  980
  981	rcu_read_lock();
  982	dev = dev_get_by_index_rcu(net, ifindex);
  983	if (dev)
  984		dev_hold(dev);
  985	rcu_read_unlock();
  986	return dev;
  987}
  988EXPORT_SYMBOL(dev_get_by_index);
  989
  990/**
  991 *	dev_get_by_napi_id - find a device by napi_id
  992 *	@napi_id: ID of the NAPI struct
  993 *
  994 *	Search for an interface by NAPI ID. Returns %NULL if the device
  995 *	is not found or a pointer to the device. The device has not had
  996 *	its reference counter increased so the caller must be careful
  997 *	about locking. The caller must hold RCU lock.
  998 */
  999
 1000struct net_device *dev_get_by_napi_id(unsigned int napi_id)
 1001{
 1002	struct napi_struct *napi;
 1003
 1004	WARN_ON_ONCE(!rcu_read_lock_held());
 1005
 1006	if (napi_id < MIN_NAPI_ID)
 1007		return NULL;
 1008
 1009	napi = napi_by_id(napi_id);
 1010
 1011	return napi ? napi->dev : NULL;
 1012}
 1013EXPORT_SYMBOL(dev_get_by_napi_id);
 1014
 1015/**
 1016 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 1017 *	@net: network namespace
 1018 *	@name: a pointer to the buffer where the name will be stored.
 1019 *	@ifindex: the ifindex of the interface to get the name from.
 1020 */
 1021int netdev_get_name(struct net *net, char *name, int ifindex)
 1022{
 1023	struct net_device *dev;
 1024	int ret;
 1025
 1026	down_read(&devnet_rename_sem);
 1027	rcu_read_lock();
 1028
 1029	dev = dev_get_by_index_rcu(net, ifindex);
 1030	if (!dev) {
 1031		ret = -ENODEV;
 1032		goto out;
 1033	}
 1034
 1035	strcpy(name, dev->name);
 1036
 1037	ret = 0;
 1038out:
 1039	rcu_read_unlock();
 1040	up_read(&devnet_rename_sem);
 1041	return ret;
 1042}
 1043
 1044/**
 1045 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 1046 *	@net: the applicable net namespace
 1047 *	@type: media type of device
 1048 *	@ha: hardware address
 1049 *
 1050 *	Search for an interface by MAC address. Returns NULL if the device
 1051 *	is not found or a pointer to the device.
 1052 *	The caller must hold RCU or RTNL.
 1053 *	The returned device has not had its ref count increased
 1054 *	and the caller must therefore be careful about locking
 1055 *
 1056 */
 1057
 1058struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 1059				       const char *ha)
 1060{
 1061	struct net_device *dev;
 1062
 1063	for_each_netdev_rcu(net, dev)
 1064		if (dev->type == type &&
 1065		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 1066			return dev;
 1067
 1068	return NULL;
 1069}
 1070EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 1071
 1072struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 1073{
 1074	struct net_device *dev, *ret = NULL;
 1075
 1076	rcu_read_lock();
 1077	for_each_netdev_rcu(net, dev)
 1078		if (dev->type == type) {
 1079			dev_hold(dev);
 1080			ret = dev;
 1081			break;
 1082		}
 1083	rcu_read_unlock();
 1084	return ret;
 1085}
 1086EXPORT_SYMBOL(dev_getfirstbyhwtype);
 1087
 1088/**
 1089 *	__dev_get_by_flags - find any device with given flags
 1090 *	@net: the applicable net namespace
 1091 *	@if_flags: IFF_* values
 1092 *	@mask: bitmask of bits in if_flags to check
 1093 *
 1094 *	Search for any interface with the given flags. Returns NULL if a device
 1095 *	is not found or a pointer to the device. Must be called inside
 1096 *	rtnl_lock(), and result refcount is unchanged.
 1097 */
 1098
 1099struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 1100				      unsigned short mask)
 1101{
 1102	struct net_device *dev, *ret;
 1103
 1104	ASSERT_RTNL();
 1105
 1106	ret = NULL;
 1107	for_each_netdev(net, dev) {
 1108		if (((dev->flags ^ if_flags) & mask) == 0) {
 1109			ret = dev;
 1110			break;
 1111		}
 1112	}
 1113	return ret;
 1114}
 1115EXPORT_SYMBOL(__dev_get_by_flags);
 1116
 1117/**
 1118 *	dev_valid_name - check if name is okay for network device
 1119 *	@name: name string
 1120 *
 1121 *	Network device names need to be valid file names to
 1122 *	allow sysfs to work.  We also disallow any kind of
 1123 *	whitespace.
 1124 */
 1125bool dev_valid_name(const char *name)
 1126{
 1127	if (*name == '\0')
 1128		return false;
 1129	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
 1130		return false;
 1131	if (!strcmp(name, ".") || !strcmp(name, ".."))
 1132		return false;
 1133
 1134	while (*name) {
 1135		if (*name == '/' || *name == ':' || isspace(*name))
 1136			return false;
 1137		name++;
 1138	}
 1139	return true;
 1140}
 1141EXPORT_SYMBOL(dev_valid_name);
 1142
 1143/**
 1144 *	__dev_alloc_name - allocate a name for a device
 1145 *	@net: network namespace to allocate the device name in
 1146 *	@name: name format string
 1147 *	@buf:  scratch buffer and result name string
 1148 *
 1149 *	Passed a format string - eg "lt%d" it will try and find a suitable
 1150 *	id. It scans list of devices to build up a free map, then chooses
 1151 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 1152 *	while allocating the name and adding the device in order to avoid
 1153 *	duplicates.
 1154 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 1155 *	Returns the number of the unit assigned or a negative errno code.
 1156 */
 1157
 1158static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 1159{
 1160	int i = 0;
 1161	const char *p;
 1162	const int max_netdevices = 8*PAGE_SIZE;
 1163	unsigned long *inuse;
 1164	struct net_device *d;
 1165
 1166	if (!dev_valid_name(name))
 1167		return -EINVAL;
 1168
 1169	p = strchr(name, '%');
 1170	if (p) {
 1171		/*
 1172		 * Verify the string as this thing may have come from
 1173		 * the user.  There must be either one "%d" and no other "%"
 1174		 * characters.
 1175		 */
 1176		if (p[1] != 'd' || strchr(p + 2, '%'))
 1177			return -EINVAL;
 1178
 1179		/* Use one page as a bit array of possible slots */
 1180		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 1181		if (!inuse)
 1182			return -ENOMEM;
 1183
 1184		for_each_netdev(net, d) {
 1185			if (!sscanf(d->name, name, &i))
 1186				continue;
 1187			if (i < 0 || i >= max_netdevices)
 1188				continue;
 1189
 1190			/*  avoid cases where sscanf is not exact inverse of printf */
 1191			snprintf(buf, IFNAMSIZ, name, i);
 1192			if (!strncmp(buf, d->name, IFNAMSIZ))
 1193				set_bit(i, inuse);
 1194		}
 1195
 1196		i = find_first_zero_bit(inuse, max_netdevices);
 1197		free_page((unsigned long) inuse);
 1198	}
 1199
 1200	snprintf(buf, IFNAMSIZ, name, i);
 1201	if (!__dev_get_by_name(net, buf))
 1202		return i;
 1203
 1204	/* It is possible to run out of possible slots
 1205	 * when the name is long and there isn't enough space left
 1206	 * for the digits, or if all bits are used.
 1207	 */
 1208	return -ENFILE;
 1209}
 1210
 1211static int dev_alloc_name_ns(struct net *net,
 1212			     struct net_device *dev,
 1213			     const char *name)
 1214{
 1215	char buf[IFNAMSIZ];
 1216	int ret;
 1217
 1218	BUG_ON(!net);
 1219	ret = __dev_alloc_name(net, name, buf);
 1220	if (ret >= 0)
 1221		strlcpy(dev->name, buf, IFNAMSIZ);
 1222	return ret;
 1223}
 1224
 1225/**
 1226 *	dev_alloc_name - allocate a name for a device
 1227 *	@dev: device
 1228 *	@name: name format string
 1229 *
 1230 *	Passed a format string - eg "lt%d" it will try and find a suitable
 1231 *	id. It scans list of devices to build up a free map, then chooses
 1232 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 1233 *	while allocating the name and adding the device in order to avoid
 1234 *	duplicates.
 1235 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 1236 *	Returns the number of the unit assigned or a negative errno code.
 1237 */
 1238
 1239int dev_alloc_name(struct net_device *dev, const char *name)
 1240{
 1241	return dev_alloc_name_ns(dev_net(dev), dev, name);
 1242}
 1243EXPORT_SYMBOL(dev_alloc_name);
 1244
 1245static int dev_get_valid_name(struct net *net, struct net_device *dev,
 1246			      const char *name)
 1247{
 1248	BUG_ON(!net);
 1249
 1250	if (!dev_valid_name(name))
 1251		return -EINVAL;
 1252
 1253	if (strchr(name, '%'))
 1254		return dev_alloc_name_ns(net, dev, name);
 1255	else if (__dev_get_by_name(net, name))
 1256		return -EEXIST;
 1257	else if (dev->name != name)
 1258		strlcpy(dev->name, name, IFNAMSIZ);
 1259
 1260	return 0;
 1261}
 1262
/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string); the buffer must be at least
 *		  IFNAMSIZ bytes, since IFNAMSIZ bytes are read from it
 *		  on the rollback path
 *
 *	Change the name of a device. A format string such as "eth%d"
 *	may be passed for wildcarding. Must be called under RTNL.
 *
 *	Returns -EBUSY when the device is up and does not carry
 *	IFF_LIVE_RENAME_OK, and 0 when @newname equals the current name.
 *	Otherwise returns the non-negative result of dev_get_valid_name()
 *	on success (0, or the allocated unit number for a format string),
 *	or a negative errno. When a NETDEV_CHANGENAME notifier vetoes the
 *	change, the old name is restored and the notifier's error is
 *	returned.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	unsigned char old_assign_type;
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);

	/* Some auto-enslaved devices e.g. failover slaves are
	 * special, as userspace might rename the device after
	 * the interface had been brought up and running since
	 * the point kernel initiated auto-enslavement. Allow
	 * live name change even when these slave devices are
	 * up and running.
	 *
	 * Typically, users of these auto-enslaving devices
	 * don't actually care about slave name change, as
	 * they are supposed to operate on master interface
	 * directly.
	 */
	if (dev->flags & IFF_UP &&
	    likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
		return -EBUSY;

	/* Held for writing while dev->name is modified. */
	down_write(&devnet_rename_sem);

	/* Nothing to do when the name does not actually change. */
	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		up_write(&devnet_rename_sem);
		return 0;
	}

	/* Keep the current name so it can be restored on failure. */
	memcpy(oldname, dev->name, IFNAMSIZ);

	/* On success this has already written the new name to dev->name. */
	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		up_write(&devnet_rename_sem);
		return err;
	}

	/* Skip the log line when the old name was empty or still a template. */
	if (oldname[0] && !strchr(oldname, '%'))
		netdev_info(dev, "renamed from %s\n", oldname);

	old_assign_type = dev->name_assign_type;
	dev->name_assign_type = NET_NAME_RENAMED;

	/*
	 * Entered a second time (with devnet_rename_sem re-taken and
	 * dev->name/oldname swapped) when a notifier rejects the rename.
	 */
rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		/* Rename failed: put the previous name back. */
		memcpy(dev->name, oldname, IFNAMSIZ);
		dev->name_assign_type = old_assign_type;
		up_write(&devnet_rename_sem);
		return ret;
	}

	up_write(&devnet_rename_sem);

	netdev_adjacent_rename_links(dev, oldname);

	/* Remove the name node, wait for RCU readers, then re-add it. */
	write_lock_bh(&dev_base_lock);
	netdev_name_node_del(dev->name_node);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	netdev_name_node_add(net, dev->name_node);
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			/*
			 * First veto: record the error, then swap names so
			 * that the rollback pass renames back to the original.
			 */
			err = ret;
			down_write(&devnet_rename_sem);
			memcpy(dev->name, oldname, IFNAMSIZ);
			/*
			 * NOTE(review): this copies the caller's string; if
			 * @newname was a format string, oldname now holds the
			 * template rather than the name that was allocated -
			 * confirm this is what the rollback pass expects.
			 */
			memcpy(oldname, newname, IFNAMSIZ);
			dev->name_assign_type = old_assign_type;
			old_assign_type = NET_NAME_RENAMED;
			goto rollback;
		} else {
			/* The rollback pass was vetoed too; only report it. */
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}
 1365
 1366/**
 1367 *	dev_set_alias - change ifalias of a device
 1368 *	@dev: device
 1369 *	@alias: name up to IFALIASZ
 1370 *	@len: limit of bytes to copy from info
 1371 *
 1372 *	Set ifalias for a device,
 1373 */
 1374int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
 1375{
 1376	struct dev_ifalias *new_alias = NULL;
 1377
 1378	if (len >= IFALIASZ)
 1379		return -EINVAL;
 1380
 1381	if (len) {
 1382		new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
 1383		if (!new_alias)
 1384			return -ENOMEM;
 1385
 1386		memcpy(new_alias->ifalias, alias, len);
 1387		new_alias->ifalias[len] = 0;
 1388	}
 1389
 1390	mutex_lock(&ifalias_mutex);
 1391	new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
 1392					mutex_is_locked(&ifalias_mutex));
 1393	mutex_unlock(&ifalias_mutex);
 1394
 1395	if (new_alias)
 1396		kfree_rcu(new_alias, rcuhead);
 1397
 1398	return len;
 1399}
 1400EXPORT_SYMBOL(dev_set_alias);
 1401
 1402/**
 1403 *	dev_get_alias - get ifalias of a device
 1404 *	@dev: device
 1405 *	@name: buffer to store name of ifalias
 1406 *	@len: size of buffer
 1407 *
 1408 *	get ifalias for a device.  Caller must make sure dev cannot go
 1409 *	away,  e.g. rcu read lock or own a reference count to device.
 1410 */
 1411int dev_get_alias(const struct net_device *dev, char *name, size_t len)
 1412{
 1413	const struct dev_ifalias *alias;
 1414	int ret = 0;
 1415
 1416	rcu_read_lock();
 1417	alias = rcu_dereference(dev->ifalias);
 1418	if (alias)
 1419		ret = snprintf(name, len, "%s", alias->ifalias);
 1420	rcu_read_unlock();
 1421
 1422	return ret;
 1423}
 1424
 1425/**
 1426 *	netdev_features_change - device changes features
 1427 *	@dev: device to cause notification
 1428 *
 1429 *	Called to indicate a device has changed features.
 1430 */
 1431void netdev_features_change(struct net_device *dev)
 1432{
 1433	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
 1434}
 1435EXPORT_SYMBOL(netdev_features_change);
 1436
 1437/**
 1438 *	netdev_state_change - device changes state
 1439 *	@dev: device to cause notification
 1440 *
 1441 *	Called to indicate a device has changed state. This function calls
 1442 *	the notifier chains for netdev_chain and sends a NEWLINK message
 1443 *	to the routing socket.
 1444 */
 1445void netdev_state_change(struct net_device *dev)
 1446{
 1447	if (dev->flags & IFF_UP) {
 1448		struct netdev_notifier_change_info change_info = {
 1449			.info.dev = dev,
 1450		};
 1451
 1452		call_netdevice_notifiers_info(NETDEV_CHANGE,
 1453					      &change_info.info);
 1454		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
 1455	}
 1456}
 1457EXPORT_SYMBOL(netdev_state_change);
 1458
 1459/**
 1460 * __netdev_notify_peers - notify network peers about existence of @dev,
 1461 * to be called when rtnl lock is already held.
 1462 * @dev: network device
 1463 *
 1464 * Generate traffic such that interested network peers are aware of
 1465 * @dev, such as by generating a gratuitous ARP. This may be used when
 1466 * a device wants to inform the rest of the network about some sort of
 1467 * reconfiguration such as a failover event or virtual machine
 1468 * migration.
 1469 */
 1470void __netdev_notify_peers(struct net_device *dev)
 1471{
 1472	ASSERT_RTNL();
 1473	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
 1474	call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
 1475}
 1476EXPORT_SYMBOL(__netdev_notify_peers);
 1477
 1478/**
 1479 * netdev_notify_peers - notify network peers about existence of @dev
 1480 * @dev: network device
 1481 *
 1482 * Generate traffic such that interested network peers are aware of
 1483 * @dev, such as by generating a gratuitous ARP. This may be used when
 1484 * a device wants to inform the rest of the network about some sort of
 1485 * reconfiguration such as a failover event or virtual machine
 1486 * migration.
 1487 */
 1488void netdev_notify_peers(struct net_device *dev)
 1489{
 1490	rtnl_lock();
 1491	__netdev_notify_peers(dev);
 1492	rtnl_unlock();
 1493}
 1494EXPORT_SYMBOL(netdev_notify_peers);
 1495
 1496static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
 1497{
 1498	const struct net_device_ops *ops = dev->netdev_ops;
 1499	int ret;
 1500
 1501	ASSERT_RTNL();
 1502
 1503	if (!netif_device_present(dev)) {
 1504		/* may be detached because parent is runtime-suspended */
 1505		if (dev->dev.parent)
 1506			pm_runtime_resume(dev->dev.parent);
 1507		if (!netif_device_present(dev))
 1508			return -ENODEV;
 1509	}
 1510
 1511	/* Block netpoll from trying to do any rx path servicing.
 1512	 * If we don't do this there is a chance ndo_poll_controller
 1513	 * or ndo_poll may be running while we open the device
 1514	 */
 1515	netpoll_poll_disable(dev);
 1516
 1517	ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
 1518	ret = notifier_to_errno(ret);
 1519	if (ret)
 1520		return ret;
 1521
 1522	set_bit(__LINK_STATE_START, &dev->state);
 1523
 1524	if (ops->ndo_validate_addr)
 1525		ret = ops->ndo_validate_addr(dev);
 1526
 1527	if (!ret && ops->ndo_open)
 1528		ret = ops->ndo_open(dev);
 1529
 1530	netpoll_poll_enable(dev);
 1531
 1532	if (ret)
 1533		clear_bit(__LINK_STATE_START, &dev->state);
 1534	else {
 1535		dev->flags |= IFF_UP;
 1536		dev_set_rx_mode(dev);
 1537		dev_activate(dev);
 1538		add_device_randomness(dev->dev_addr, dev->addr_len);
 1539	}
 1540
 1541	return ret;
 1542}
 1543
 1544/**
 1545 *	dev_open	- prepare an interface for use.
 1546 *	@dev: device to open
 1547 *	@extack: netlink extended ack
 1548 *
 1549 *	Takes a device from down to up state. The device's private open
 1550 *	function is invoked and then the multicast lists are loaded. Finally
 1551 *	the device is moved into the up state and a %NETDEV_UP message is
 1552 *	sent to the netdev notifier chain.
 1553 *
 1554 *	Calling this function on an active interface is a nop. On a failure
 1555 *	a negative errno code is returned.
 1556 */
 1557int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
 1558{
 1559	int ret;
 1560
 1561	if (dev->flags & IFF_UP)
 1562		return 0;
 1563
 1564	ret = __dev_open(dev, extack);
 1565	if (ret < 0)
 1566		return ret;
 1567
 1568	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
 1569	call_netdevice_notifiers(NETDEV_UP, dev);
 1570
 1571	return ret;
 1572}
 1573EXPORT_SYMBOL(dev_open);
 1574
 1575static void __dev_close_many(struct list_head *head)
 1576{
 1577	struct net_device *dev;
 1578
 1579	ASSERT_RTNL();
 1580	might_sleep();
 1581
 1582	list_for_each_entry(dev, head, close_list) {
 1583		/* Temporarily disable netpoll until the interface is down */
 1584		netpoll_poll_disable(dev);
 1585
 1586		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
 1587
 1588		clear_bit(__LINK_STATE_START, &dev->state);
 1589
 1590		/* Synchronize to scheduled poll. We cannot touch poll list, it
 1591		 * can be even on different cpu. So just clear netif_running().
 1592		 *
 1593		 * dev->stop() will invoke napi_disable() on all of it's
 1594		 * napi_struct instances on this device.
 1595		 */
 1596		smp_mb__after_atomic(); /* Commit netif_running(). */
 1597	}
 1598
 1599	dev_deactivate_many(head);
 1600
 1601	list_for_each_entry(dev, head, close_list) {
 1602		const struct net_device_ops *ops = dev->netdev_ops;
 1603
 1604		/*
 1605		 *	Call the device specific close. This cannot fail.
 1606		 *	Only if device is UP
 1607		 *
 1608		 *	We allow it to be called even after a DETACH hot-plug
 1609		 *	event.
 1610		 */
 1611		if (ops->ndo_stop)
 1612			ops->ndo_stop(dev);
 1613
 1614		dev->flags &= ~IFF_UP;
 1615		netpoll_poll_enable(dev);
 1616	}
 1617}
 1618
 1619static void __dev_close(struct net_device *dev)
 1620{
 1621	LIST_HEAD(single);
 1622
 1623	list_add(&dev->close_list, &single);
 1624	__dev_close_many(&single);
 1625	list_del(&single);
 1626}
 1627
 1628void dev_close_many(struct list_head *head, bool unlink)
 1629{
 1630	struct net_device *dev, *tmp;
 1631
 1632	/* Remove the devices that don't need to be closed */
 1633	list_for_each_entry_safe(dev, tmp, head, close_list)
 1634		if (!(dev->flags & IFF_UP))
 1635			list_del_init(&dev->close_list);
 1636
 1637	__dev_close_many(head);
 1638
 1639	list_for_each_entry_safe(dev, tmp, head, close_list) {
 1640		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
 1641		call_netdevice_notifiers(NETDEV_DOWN, dev);
 1642		if (unlink)
 1643			list_del_init(&dev->close_list);
 1644	}
 1645}
 1646EXPORT_SYMBOL(dev_close_many);
 1647
 1648/**
 1649 *	dev_close - shutdown an interface.
 1650 *	@dev: device to shutdown
 1651 *
 1652 *	This function moves an active device into down state. A
 1653 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 1654 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 1655 *	chain.
 1656 */
 1657void dev_close(struct net_device *dev)
 1658{
 1659	if (dev->flags & IFF_UP) {
 1660		LIST_HEAD(single);
 1661
 1662		list_add(&dev->close_list, &single);
 1663		dev_close_many(&single, true);
 1664		list_del(&single);
 1665	}
 1666}
 1667EXPORT_SYMBOL(dev_close);
 1668
 1669
 1670/**
 1671 *	dev_disable_lro - disable Large Receive Offload on a device
 1672 *	@dev: device
 1673 *
 1674 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 1675 *	called under RTNL.  This is needed if received packets may be
 1676 *	forwarded to another interface.
 1677 */
 1678void dev_disable_lro(struct net_device *dev)
 1679{
 1680	struct net_device *lower_dev;
 1681	struct list_head *iter;
 1682
 1683	dev->wanted_features &= ~NETIF_F_LRO;
 1684	netdev_update_features(dev);
 1685
 1686	if (unlikely(dev->features & NETIF_F_LRO))
 1687		netdev_WARN(dev, "failed to disable LRO!\n");
 1688
 1689	netdev_for_each_lower_dev(dev, lower_dev, iter)
 1690		dev_disable_lro(lower_dev);
 1691}
 1692EXPORT_SYMBOL(dev_disable_lro);
 1693
 1694/**
 1695 *	dev_disable_gro_hw - disable HW Generic Receive Offload on a device
 1696 *	@dev: device
 1697 *
 1698 *	Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
 1699 *	called under RTNL.  This is needed if Generic XDP is installed on
 1700 *	the device.
 1701 */
 1702static void dev_disable_gro_hw(struct net_device *dev)
 1703{
 1704	dev->wanted_features &= ~NETIF_F_GRO_HW;
 1705	netdev_update_features(dev);
 1706
 1707	if (unlikely(dev->features & NETIF_F_GRO_HW))
 1708		netdev_WARN(dev, "failed to disable GRO_HW!\n");
 1709}
 1710
 1711const char *netdev_cmd_to_name(enum netdev_cmd cmd)
 1712{
 1713#define N(val) 						\
 1714	case NETDEV_##val:				\
 1715		return "NETDEV_" __stringify(val);
 1716	switch (cmd) {
 1717	N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
 1718	N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
 1719	N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
 1720	N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
 1721	N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
 1722	N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
 1723	N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
 1724	N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
 1725	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
 1726	N(PRE_CHANGEADDR)
 1727	}
 1728#undef N
 1729	return "UNKNOWN_NETDEV_EVENT";
 1730}
 1731EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
 1732
 1733static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
 1734				   struct net_device *dev)
 1735{
 1736	struct netdev_notifier_info info = {
 1737		.dev = dev,
 1738	};
 1739
 1740	return nb->notifier_call(nb, val, &info);
 1741}
 1742
 1743static int call_netdevice_register_notifiers(struct notifier_block *nb,
 1744					     struct net_device *dev)
 1745{
 1746	int err;
 1747
 1748	err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
 1749	err = notifier_to_errno(err);
 1750	if (err)
 1751		return err;
 1752
 1753	if (!(dev->flags & IFF_UP))
 1754		return 0;
 1755
 1756	call_netdevice_notifier(nb, NETDEV_UP, dev);
 1757	return 0;
 1758}
 1759
 1760static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
 1761						struct net_device *dev)
 1762{
 1763	if (dev->flags & IFF_UP) {
 1764		call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
 1765					dev);
 1766		call_netdevice_notifier(nb, NETDEV_DOWN, dev);
 1767	}
 1768	call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
 1769}
 1770
 1771static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
 1772						 struct net *net)
 1773{
 1774	struct net_device *dev;
 1775	int err;
 1776
 1777	for_each_netdev(net, dev) {
 1778		err = call_netdevice_register_notifiers(nb, dev);
 1779		if (err)
 1780			goto rollback;
 1781	}
 1782	return 0;
 1783
 1784rollback:
 1785	for_each_netdev_continue_reverse(net, dev)
 1786		call_netdevice_unregister_notifiers(nb, dev);
 1787	return err;
 1788}
 1789
 1790static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
 1791						    struct net *net)
 1792{
 1793	struct net_device *dev;
 1794
 1795	for_each_netdev(net, dev)
 1796		call_netdevice_unregister_notifiers(nb, dev);
 1797}
 1798
/*
 * Non-zero from start-up. While it is set, register_netdevice_notifier()
 * and __register_netdevice_notifier_net() link the notifier but skip
 * replaying events for existing devices.
 * NOTE(review): the place where this is cleared is outside this excerpt -
 * presumably at the end of net_dev_init(); confirm.
 */
static int dev_boot_phase = 1;
 1800
 1801/**
 1802 * register_netdevice_notifier - register a network notifier block
 1803 * @nb: notifier
 1804 *
 1805 * Register a notifier to be called when network device events occur.
 1806 * The notifier passed is linked into the kernel structures and must
 1807 * not be reused until it has been unregistered. A negative errno code
 1808 * is returned on a failure.
 1809 *
 1810 * When registered all registration and up events are replayed
 1811 * to the new notifier to allow device to have a race free
 1812 * view of the network device list.
 1813 */
 1814
 1815int register_netdevice_notifier(struct notifier_block *nb)
 1816{
 1817	struct net *net;
 1818	int err;
 1819
 1820	/* Close race with setup_net() and cleanup_net() */
 1821	down_write(&pernet_ops_rwsem);
 1822	rtnl_lock();
 1823	err = raw_notifier_chain_register(&netdev_chain, nb);
 1824	if (err)
 1825		goto unlock;
 1826	if (dev_boot_phase)
 1827		goto unlock;
 1828	for_each_net(net) {
 1829		err = call_netdevice_register_net_notifiers(nb, net);
 1830		if (err)
 1831			goto rollback;
 1832	}
 1833
 1834unlock:
 1835	rtnl_unlock();
 1836	up_write(&pernet_ops_rwsem);
 1837	return err;
 1838
 1839rollback:
 1840	for_each_net_continue_reverse(net)
 1841		call_netdevice_unregister_net_notifiers(nb, net);
 1842
 1843	raw_notifier_chain_unregister(&netdev_chain, nb);
 1844	goto unlock;
 1845}
 1846EXPORT_SYMBOL(register_netdevice_notifier);
 1847
 1848/**
 1849 * unregister_netdevice_notifier - unregister a network notifier block
 1850 * @nb: notifier
 1851 *
 1852 * Unregister a notifier previously registered by
 1853 * register_netdevice_notifier(). The notifier is unlinked into the
 1854 * kernel structures and may then be reused. A negative errno code
 1855 * is returned on a failure.
 1856 *
 1857 * After unregistering unregister and down device events are synthesized
 1858 * for all devices on the device list to the removed notifier to remove
 1859 * the need for special case cleanup code.
 1860 */
 1861
 1862int unregister_netdevice_notifier(struct notifier_block *nb)
 1863{
 1864	struct net *net;
 1865	int err;
 1866
 1867	/* Close race with setup_net() and cleanup_net() */
 1868	down_write(&pernet_ops_rwsem);
 1869	rtnl_lock();
 1870	err = raw_notifier_chain_unregister(&netdev_chain, nb);
 1871	if (err)
 1872		goto unlock;
 1873
 1874	for_each_net(net)
 1875		call_netdevice_unregister_net_notifiers(nb, net);
 1876
 1877unlock:
 1878	rtnl_unlock();
 1879	up_write(&pernet_ops_rwsem);
 1880	return err;
 1881}
 1882EXPORT_SYMBOL(unregister_netdevice_notifier);
 1883
 1884static int __register_netdevice_notifier_net(struct net *net,
 1885					     struct notifier_block *nb,
 1886					     bool ignore_call_fail)
 1887{
 1888	int err;
 1889
 1890	err = raw_notifier_chain_register(&net->netdev_chain, nb);
 1891	if (err)
 1892		return err;
 1893	if (dev_boot_phase)
 1894		return 0;
 1895
 1896	err = call_netdevice_register_net_notifiers(nb, net);
 1897	if (err && !ignore_call_fail)
 1898		goto chain_unregister;
 1899
 1900	return 0;
 1901
 1902chain_unregister:
 1903	raw_notifier_chain_unregister(&net->netdev_chain, nb);
 1904	return err;
 1905}
 1906
 1907static int __unregister_netdevice_notifier_net(struct net *net,
 1908					       struct notifier_block *nb)
 1909{
 1910	int err;
 1911
 1912	err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
 1913	if (err)
 1914		return err;
 1915
 1916	call_netdevice_unregister_net_notifiers(nb, net);
 1917	return 0;
 1918}
 1919
 1920/**
 1921 * register_netdevice_notifier_net - register a per-netns network notifier block
 1922 * @net: network namespace
 1923 * @nb: notifier
 1924 *
 1925 * Register a notifier to be called when network device events occur.
 1926 * The notifier passed is linked into the kernel structures and must
 1927 * not be reused until it has been unregistered. A negative errno code
 1928 * is returned on a failure.
 1929 *
 1930 * When registered all registration and up events are replayed
 1931 * to the new notifier to allow device to have a race free
 1932 * view of the network device list.
 1933 */
 1934
 1935int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
 1936{
 1937	int err;
 1938
 1939	rtnl_lock();
 1940	err = __register_netdevice_notifier_net(net, nb, false);
 1941	rtnl_unlock();
 1942	return err;
 1943}
 1944EXPORT_SYMBOL(register_netdevice_notifier_net);
 1945
 1946/**
 1947 * unregister_netdevice_notifier_net - unregister a per-netns
 1948 *                                     network notifier block
 1949 * @net: network namespace
 1950 * @nb: notifier
 1951 *
 1952 * Unregister a notifier previously registered by
 1953 * register_netdevice_notifier(). The notifier is unlinked into the
 1954 * kernel structures and may then be reused. A negative errno code
 1955 * is returned on a failure.
 1956 *
 1957 * After unregistering unregister and down device events are synthesized
 1958 * for all devices on the device list to the removed notifier to remove
 1959 * the need for special case cleanup code.
 1960 */
 1961
 1962int unregister_netdevice_notifier_net(struct net *net,
 1963				      struct notifier_block *nb)
 1964{
 1965	int err;
 1966
 1967	rtnl_lock();
 1968	err = __unregister_netdevice_notifier_net(net, nb);
 1969	rtnl_unlock();
 1970	return err;
 1971}
 1972EXPORT_SYMBOL(unregister_netdevice_notifier_net);
 1973
 1974int register_netdevice_notifier_dev_net(struct net_device *dev,
 1975					struct notifier_block *nb,
 1976					struct netdev_net_notifier *nn)
 1977{
 1978	int err;
 1979
 1980	rtnl_lock();
 1981	err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
 1982	if (!err) {
 1983		nn->nb = nb;
 1984		list_add(&nn->list, &dev->net_notifier_list);
 1985	}
 1986	rtnl_unlock();
 1987	return err;
 1988}
 1989EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
 1990
 1991int unregister_netdevice_notifier_dev_net(struct net_device *dev,
 1992					  struct notifier_block *nb,
 1993					  struct netdev_net_notifier *nn)
 1994{
 1995	int err;
 1996
 1997	rtnl_lock();
 1998	list_del(&nn->list);
 1999	err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
 2000	rtnl_unlock();
 2001	return err;
 2002}
 2003EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
 2004
 2005static void move_netdevice_notifiers_dev_net(struct net_device *dev,
 2006					     struct net *net)
 2007{
 2008	struct netdev_net_notifier *nn;
 2009
 2010	list_for_each_entry(nn, &dev->net_notifier_list, list) {
 2011		__unregister_netdevice_notifier_net(dev_net(dev), nn->nb);
 2012		__register_netdevice_notifier_net(net, nn->nb, true);
 2013	}
 2014}
 2015
 2016/**
 2017 *	call_netdevice_notifiers_info - call all network notifier blocks
 2018 *	@val: value passed unmodified to notifier function
 2019 *	@info: notifier information data
 2020 *
 2021 *	Call all network notifier blocks.  Parameters and return value
 2022 *	are as for raw_notifier_call_chain().
 2023 */
 2024
 2025static int call_netdevice_notifiers_info(unsigned long val,
 2026					 struct netdev_notifier_info *info)
 2027{
 2028	struct net *net = dev_net(info->dev);
 2029	int ret;
 2030
 2031	ASSERT_RTNL();
 2032
 2033	/* Run per-netns notifier block chain first, then run the global one.
 2034	 * Hopefully, one day, the global one is going to be removed after
 2035	 * all notifier block registrators get converted to be per-netns.
 2036	 */
 2037	ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
 2038	if (ret & NOTIFY_STOP_MASK)
 2039		return ret;
 2040	return raw_notifier_call_chain(&netdev_chain, val, info);
 2041}
 2042
 2043static int call_netdevice_notifiers_extack(unsigned long val,
 2044					   struct net_device *dev,
 2045					   struct netlink_ext_ack *extack)
 2046{
 2047	struct netdev_notifier_info info = {
 2048		.dev = dev,
 2049		.extack = extack,
 2050	};
 2051
 2052	return call_netdevice_notifiers_info(val, &info);
 2053}
 2054
/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().  No extended ack is supplied
 *	to the notifiers.
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	return call_netdevice_notifiers_extack(val, dev, NULL);
}
 2068EXPORT_SYMBOL(call_netdevice_notifiers);
 2069
 2070/**
 2071 *	call_netdevice_notifiers_mtu - call all network notifier blocks
 2072 *	@val: value passed unmodified to notifier function
 2073 *	@dev: net_device pointer passed unmodified to notifier function
 2074 *	@arg: additional u32 argument passed to the notifier function
 2075 *
 2076 *	Call all network notifier blocks.  Parameters and return value
 2077 *	are as for raw_notifier_call_chain().
 2078 */
 2079static int call_netdevice_notifiers_mtu(unsigned long val,
 2080					struct net_device *dev, u32 arg)
 2081{
 2082	struct netdev_notifier_info_ext info = {
 2083		.info.dev = dev,
 2084		.ext.mtu = arg,
 2085	};
 2086
 2087	BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
 2088
 2089	return call_netdevice_notifiers_info(val, &info.info);
 2090}
 2091
#ifdef CONFIG_NET_INGRESS
/* Static key, initially false, adjusted only by the two helpers below. */
static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);

/* Take one reference on ingress_needed_key. */
void net_inc_ingress_queue(void)
{
	static_branch_inc(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

/* Drop one reference on ingress_needed_key. */
void net_dec_ingress_queue(void)
{
	static_branch_dec(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif
 2107
#ifdef CONFIG_NET_EGRESS
/* Static key, initially false, adjusted only by the two helpers below. */
static DEFINE_STATIC_KEY_FALSE(egress_needed_key);

/* Take one reference on egress_needed_key. */
void net_inc_egress_queue(void)
{
	static_branch_inc(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

/* Drop one reference on egress_needed_key. */
void net_dec_egress_queue(void)
{
	static_branch_dec(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
 2123
 2124static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
 2125#ifdef CONFIG_JUMP_LABEL
 2126static atomic_t netstamp_needed_deferred;
 2127static atomic_t netstamp_wanted;
 2128static void netstamp_clear(struct work_struct *work)
 2129{
 2130	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
 2131	int wanted;
 2132
 2133	wanted = atomic_add_return(deferred, &netstamp_wanted);
 2134	if (wanted > 0)
 2135		static_branch_enable(&netstamp_needed_key);
 2136	else
 2137		static_branch_disable(&netstamp_needed_key);
 2138}
 2139static DECLARE_WORK(netstamp_work, netstamp_clear);
 2140#endif
 2141
 2142void net_enable_timestamp(void)
 2143{
 2144#ifdef CONFIG_JUMP_LABEL
 2145	int wanted;
 2146
 2147	while (1) {
 2148		wanted = atomic_read(&netstamp_wanted);
 2149		if (wanted <= 0)
 2150			break;
 2151		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
 2152			return;
 2153	}
 2154	atomic_inc(&netstamp_needed_deferred);
 2155	schedule_work(&netstamp_work);
 2156#else
 2157	static_branch_inc(&netstamp_needed_key);
 2158#endif
 2159}
 2160EXPORT_SYMBOL(net_enable_timestamp);
 2161
 2162void net_disable_timestamp(void)
 2163{
 2164#ifdef CONFIG_JUMP_LABEL
 2165	int wanted;
 2166
 2167	while (1) {
 2168		wanted = atomic_read(&netstamp_wanted);
 2169		if (wanted <= 1)
 2170			break;
 2171		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
 2172			return;
 2173	}
 2174	atomic_dec(&netstamp_needed_deferred);
 2175	schedule_work(&netstamp_work);
 2176#else
 2177	static_branch_dec(&netstamp_needed_key);
 2178#endif
 2179}
 2180EXPORT_SYMBOL(net_disable_timestamp);
 2181
/* Clear skb->tstamp, then call __net_timestamp() on the skb if
 * netstamp_needed_key is enabled.
 */
static inline void net_timestamp_set(struct sk_buff *skb)
{
	skb->tstamp = 0;
	if (static_branch_unlikely(&netstamp_needed_key))
		__net_timestamp(skb);
}
 2188
 2189#define net_timestamp_check(COND, SKB)				\
 2190	if (static_branch_unlikely(&netstamp_needed_key)) {	\
 2191		if ((COND) && !(SKB)->tstamp)			\
 2192			__net_timestamp(SKB);			\
 2193	}							\
 2194
 2195bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
 2196{
 2197	unsigned int len;
 2198
 2199	if (!(dev->flags & IFF_UP))
 2200		return false;
 2201
 2202	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
 2203	if (skb->len <= len)
 2204		return true;
 2205
 2206	/* if TSO is enabled, we don't care about the length as the packet
 2207	 * could be forwarded without being segmented before
 2208	 */
 2209	if (skb_is_gso(skb))
 2210		return true;
 2211
 2212	return false;
 2213}
 2214EXPORT_SYMBOL_GPL(is_skb_forwardable);
 2215
 2216int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 2217{
 2218	int ret = ____dev_forward_skb(dev, skb);
 2219
 2220	if (likely(!ret)) {
 2221		skb->protocol = eth_type_trans(skb, dev);
 2222		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 2223	}
 2224
 2225	return ret;
 2226}
 2227EXPORT_SYMBOL_GPL(__dev_forward_skb);
 2228
 2229/**
 2230 * dev_forward_skb - loopback an skb to another netif
 2231 *
 2232 * @dev: destination network device
 2233 * @skb: buffer to forward
 2234 *
 2235 * return values:
 2236 *	NET_RX_SUCCESS	(no congestion)
 2237 *	NET_RX_DROP     (packet was dropped, but freed)
 2238 *
 2239 * dev_forward_skb can be used for injecting an skb from the
 2240 * start_xmit function of one device into the receive queue
 2241 * of another device.
 2242 *
 2243 * The receiving device may be in another namespace, so
 2244 * we have to clear all information in the skb that could
 2245 * impact namespace isolation.
 2246 */
 2247int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 2248{
 2249	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
 2250}
 2251EXPORT_SYMBOL_GPL(dev_forward_skb);
 2252
 2253static inline int deliver_skb(struct sk_buff *skb,
 2254			      struct packet_type *pt_prev,
 2255			      struct net_device *orig_dev)
 2256{
 2257	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
 2258		return -ENOMEM;
 2259	refcount_inc(&skb->users);
 2260	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 2261}
 2262
 2263static inline void deliver_ptype_list_skb(struct sk_buff *skb,
 2264					  struct packet_type **pt,
 2265					  struct net_device *orig_dev,
 2266					  __be16 type,
 2267					  struct list_head *ptype_list)
 2268{
 2269	struct packet_type *ptype, *pt_prev = *pt;
 2270
 2271	list_for_each_entry_rcu(ptype, ptype_list, list) {
 2272		if (ptype->type != type)
 2273			continue;
 2274		if (pt_prev)
 2275			deliver_skb(skb, pt_prev, orig_dev);
 2276		pt_prev = ptype;
 2277	}
 2278	*pt = pt_prev;
 2279}
 2280
 2281static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
 2282{
 2283	if (!ptype->af_packet_priv || !skb->sk)
 2284		return false;
 2285
 2286	if (ptype->id_match)
 2287		return ptype->id_match(ptype, skb->sk);
 2288	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
 2289		return true;
 2290
 2291	return false;
 2292}
 2293
 2294/**
 2295 * dev_nit_active - return true if any network interface taps are in use
 2296 *
 2297 * @dev: network device to check for the presence of taps
 2298 */
 2299bool dev_nit_active(struct net_device *dev)
 2300{
 2301	return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
 2302}
 2303EXPORT_SYMBOL_GPL(dev_nit_active);
 2304
 2305/*
 2306 *	Support routine. Sends outgoing frames to any network
 2307 *	taps currently in use.
 2308 */
 2309
void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;
	struct sk_buff *skb2 = NULL;
	struct packet_type *pt_prev = NULL;
	struct list_head *ptype_list = &ptype_all;

	rcu_read_lock();
	/* The loop runs twice: first over the global ptype_all list, then
	 * (via the goto below) over the per-device dev->ptype_all list.
	 */
again:
	list_for_each_entry_rcu(ptype, ptype_list, list) {
		if (ptype->ignore_outgoing)
			continue;

		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if (skb_loop_sk(ptype, skb))
			continue;

		/* Delivery lags one handler behind: a new match hands the
		 * clone to the previously matched handler and becomes the
		 * pending one itself.
		 */
		if (pt_prev) {
			deliver_skb(skb2, pt_prev, skb->dev);
			pt_prev = ptype;
			continue;
		}

		/* need to clone skb, done only once */
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (!skb2)
			goto out_unlock;

		net_timestamp_set(skb2);

		/* skb->nh should be correctly
		 * set by sender, so that the second statement is
		 * just protection against buggy protocols.
		 */
		skb_reset_mac_header(skb2);

		if (skb_network_header(skb2) < skb2->data ||
		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
					     ntohs(skb2->protocol),
					     dev->name);
			skb_reset_network_header(skb2);
		}

		skb2->transport_header = skb2->network_header;
		skb2->pkt_type = PACKET_OUTGOING;
		pt_prev = ptype;
	}

	if (ptype_list == &ptype_all) {
		ptype_list = &dev->ptype_all;
		goto again;
	}
out_unlock:
	/* The last pending handler gets the clone through a direct ->func()
	 * call, without the extra reference deliver_skb() takes; if orphaning
	 * the frags fails the clone is freed instead.
	 */
	if (pt_prev) {
		if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
			pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
		else
			kfree_skb(skb2);
	}
	rcu_read_unlock();
}
 2374EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
 2375
 2376/**
 2377 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 2378 * @dev: Network device
 2379 * @txq: number of queues available
 2380 *
 2381 * If real_num_tx_queues is changed the tc mappings may no longer be
 2382 * valid. To resolve this verify the tc mapping remains valid and if
 2383 * not NULL the mapping. With no priorities mapping to this
 2384 * offset/count pair it will no longer be used. In the worst case TC0
 2385 * is invalid nothing can be done so disable priority mappings. If is
 2386 * expected that drivers will fix this mapping if they can before
 2387 * calling netif_set_real_num_tx_queues.
 2388 */
 2389static void netif_setup_tc(struct net_device *dev, unsigned int txq)
 2390{
 2391	int i;
 2392	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
 2393
 2394	/* If TC0 is invalidated disable TC mapping */
 2395	if (tc->offset + tc->count > txq) {
 2396		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
 2397		dev->num_tc = 0;
 2398		return;
 2399	}
 2400
 2401	/* Invalidated prio to tc mappings set to TC0 */
 2402	for (i = 1; i < TC_BITMASK + 1; i++) {
 2403		int q = netdev_get_prio_tc_map(dev, i);
 2404
 2405		tc = &dev->tc_to_txq[q];
 2406		if (tc->offset + tc->count > txq) {
 2407			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
 2408				i, q);
 2409			netdev_set_prio_tc_map(dev, i, 0);
 2410		}
 2411	}
 2412}
 2413
 2414int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
 2415{
 2416	if (dev->num_tc) {
 2417		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
 2418		int i;
 2419
 2420		/* walk through the TCs and see if it falls into any of them */
 2421		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
 2422			if ((txq - tc->offset) < tc->count)
 2423				return i;
 2424		}
 2425
 2426		/* didn't find it, just return -1 to indicate no match */
 2427		return -1;
 2428	}
 2429
 2430	return 0;
 2431}
 2432EXPORT_SYMBOL(netdev_txq_to_tc);
 2433
 2434#ifdef CONFIG_XPS
 2435struct static_key xps_needed __read_mostly;
 2436EXPORT_SYMBOL(xps_needed);
 2437struct static_key xps_rxqs_needed __read_mostly;
 2438EXPORT_SYMBOL(xps_rxqs_needed);
 2439static DEFINE_MUTEX(xps_map_mutex);
 2440#define xmap_dereference(P)		\
 2441	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
 2442
/* Remove queue @index from the map stored at dev_maps->attr_map[@tci].
 *
 * Returns false when @dev_maps is NULL, when the slot holds no map, or when
 * @index was the only entry left: in that last case the slot is cleared and
 * the map is released with kfree_rcu(). Returns true otherwise, i.e. the
 * map still exists afterwards, whether or not @index was found in it.
 */
static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
			     int tci, u16 index)
{
	struct xps_map *map = NULL;
	int pos;

	if (dev_maps)
		map = xmap_dereference(dev_maps->attr_map[tci]);
	if (!map)
		return false;

	for (pos = map->len; pos--;) {
		if (map->queues[pos] != index)
			continue;

		/* Overwrite the hit with the last entry and shrink the map. */
		if (map->len > 1) {
			map->queues[pos] = map->queues[--map->len];
			break;
		}

		RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
		kfree_rcu(map, rcu);
		return false;
	}

	return true;
}
 2470
/* Remove queues @offset .. @offset + @count - 1 from every traffic-class
 * slot belonging to id @cpu (slots cpu * num_tc onwards, with num_tc taken
 * as 1 when the device has none configured).
 *
 * Returns true if, for at least one slot, every removal reported that the
 * map still exists.
 */
static bool remove_xps_queue_cpu(struct net_device *dev,
				 struct xps_dev_maps *dev_maps,
				 int cpu, u16 offset, u16 count)
{
	int num_tc = dev->num_tc ? : 1;
	bool active = false;
	int tci;

	for (tci = cpu * num_tc; num_tc--; tci++) {
		int i, j;

		/* Stop on the first false: the map for this slot is gone. */
		for (i = count, j = offset; i--; j++) {
			if (!remove_xps_queue(dev_maps, tci, j))
				break;
		}

		/* i only goes negative when the loop above ran to completion */
		active |= i < 0;
	}

	return active;
}
 2492
/* Detach @dev_maps from @dev and release it.
 *
 * Clears dev->xps_rxqs_map or dev->xps_cpus_map depending on @is_rxqs_map,
 * drops the matching static key references (xps_rxqs_needed only for an
 * rxqs map, xps_needed always) and frees @dev_maps with kfree_rcu().
 */
static void reset_xps_maps(struct net_device *dev,
			   struct xps_dev_maps *dev_maps,
			   bool is_rxqs_map)
{
	if (is_rxqs_map) {
		static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
		RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
	} else {
		RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
	}
	static_key_slow_dec_cpuslocked(&xps_needed);
	kfree_rcu(dev_maps, rcu);
}
 2506
/* Remove queues @offset .. @offset + @count - 1 from @dev_maps for every id
 * selected by @mask below @nr_ids. If no map is reported as still active
 * the whole @dev_maps is reset. For the CPU map (@is_rxqs_map false) the
 * NUMA node of each affected TX queue is set to NUMA_NO_NODE as well.
 */
static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
			   struct xps_dev_maps *dev_maps, unsigned int nr_ids,
			   u16 offset, u16 count, bool is_rxqs_map)
{
	bool active = false;
	int i, j;

	for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
	     j < nr_ids;)
		active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
					       count);
	if (!active)
		reset_xps_maps(dev, dev_maps, is_rxqs_map);

	if (!is_rxqs_map) {
		/* Walks the queue range from its last index down to @offset. */
		for (i = offset + (count - 1); count--; i--) {
			netdev_queue_numa_node_write(
				netdev_get_tx_queue(dev, i),
				NUMA_NO_NODE);
		}
	}
}
 2529
/* Drop TX queues @offset .. @offset + @count - 1 from the XPS maps of @dev.
 *
 * Does nothing unless the xps_needed static key is enabled. Otherwise takes
 * cpus_read_lock() and xps_map_mutex, cleans the rxqs map (only when
 * xps_rxqs_needed is enabled and a map exists) and then the CPU map.
 */
static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
				   u16 count)
{
	const unsigned long *possible_mask = NULL;
	struct xps_dev_maps *dev_maps;
	unsigned int nr_ids;

	if (!static_key_false(&xps_needed))
		return;

	cpus_read_lock();
	mutex_lock(&xps_map_mutex);

	if (static_key_false(&xps_rxqs_needed)) {
		dev_maps = xmap_dereference(dev->xps_rxqs_map);
		if (dev_maps) {
			/* possible_mask is still NULL here */
			nr_ids = dev->num_rx_queues;
			clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
				       offset, count, true);
		}
	}

	dev_maps = xmap_dereference(dev->xps_cpus_map);
	if (!dev_maps)
		goto out_no_maps;

	/* The possible-CPU mask is only used when there is more than one
	 * possible CPU; otherwise a NULL mask is passed down.
	 */
	if (num_possible_cpus() > 1)
		possible_mask = cpumask_bits(cpu_possible_mask);
	nr_ids = nr_cpu_ids;
	clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
		       false);

out_no_maps:
	mutex_unlock(&xps_map_mutex);
	cpus_read_unlock();
}
 2566
/* Reset the XPS maps for all TX queues of @dev from @index up to
 * dev->num_tx_queues - 1.
 */
static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
{
	netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
}
 2571
/* Make sure there is a map with room for queue @index.
 *
 * Returns @map itself when it already contains @index or still has a free
 * slot. Otherwise allocates a new map (XPS_MIN_MAP_ALLOC entries when @map
 * is NULL, twice the old allocation otherwise), copies the existing entries
 * into it and returns it. @index itself is not added here, and the old map
 * is not freed. Returns NULL if the allocation fails.
 *
 * For the CPU map the allocation is made on the node of CPU @attr_index.
 */
static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
				      u16 index, bool is_rxqs_map)
{
	struct xps_map *new_map;
	int alloc_len = XPS_MIN_MAP_ALLOC;
	int i, pos;

	for (pos = 0; map && pos < map->len; pos++) {
		if (map->queues[pos] != index)
			continue;
		return map;
	}

	/* Need to add tx-queue to this CPU's/rx-queue's existing map */
	if (map) {
		/* pos == map->len here: @index was not found */
		if (pos < map->alloc_len)
			return map;

		alloc_len = map->alloc_len * 2;
	}

	/* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
	 *  map
	 */
	if (is_rxqs_map)
		new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
	else
		new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
				       cpu_to_node(attr_index));
	if (!new_map)
		return NULL;

	for (i = 0; i < pos; i++)
		new_map->queues[i] = map->queues[i];
	new_map->alloc_len = alloc_len;
	new_map->len = pos;

	return new_map;
}
 2611
/* Associate TX queue @index with the ids set in @mask.
 *
 * @dev: device owning the queue; when the queue is bound to a subordinate
 *       device, that device's maps are used instead
 * @mask: bitmap of ids: CPUs when @is_rxqs_map is false, RX queues otherwise
 * @index: TX queue index
 * @is_rxqs_map: operate on dev->xps_rxqs_map rather than dev->xps_cpus_map
 *
 * A new xps_dev_maps is built under xps_map_mutex, published with
 * rcu_assign_pointer() and the previous one is released with kfree_rcu().
 * The queue is then removed from the ids that are not selected.
 *
 * Returns 0 on success, -EINVAL when @dev is a subordinate device
 * (negative num_tc) or the queue belongs to no traffic class, and -ENOMEM
 * on allocation failure.
 *
 * Must be called under cpus_read_lock
 */
int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
			  u16 index, bool is_rxqs_map)
{
	const unsigned long *online_mask = NULL, *possible_mask = NULL;
	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
	/* numa_node_id: -2 = no CPU seen yet, -1 = CPUs on different nodes */
	int i, j, tci, numa_node_id = -2;
	int maps_sz, num_tc = 1, tc = 0;
	struct xps_map *map, *new_map;
	bool active = false;
	unsigned int nr_ids;

	if (dev->num_tc) {
		/* Do not allow XPS on subordinate device directly */
		num_tc = dev->num_tc;
		if (num_tc < 0)
			return -EINVAL;

		/* If queue belongs to subordinate dev use its map */
		dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;

		tc = netdev_txq_to_tc(dev, index);
		if (tc < 0)
			return -EINVAL;
	}

	mutex_lock(&xps_map_mutex);
	if (is_rxqs_map) {
		maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
		dev_maps = xmap_dereference(dev->xps_rxqs_map);
		nr_ids = dev->num_rx_queues;
	} else {
		maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
		if (num_possible_cpus() > 1) {
			online_mask = cpumask_bits(cpu_online_mask);
			possible_mask = cpumask_bits(cpu_possible_mask);
		}
		dev_maps = xmap_dereference(dev->xps_cpus_map);
		nr_ids = nr_cpu_ids;
	}

	if (maps_sz < L1_CACHE_BYTES)
		maps_sz = L1_CACHE_BYTES;

	/* allocate memory for queue storage */
	for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
	     j < nr_ids;) {
		/* new_dev_maps is allocated lazily, on the first selected id */
		if (!new_dev_maps)
			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
		if (!new_dev_maps) {
			mutex_unlock(&xps_map_mutex);
			return -ENOMEM;
		}

		tci = j * num_tc + tc;
		map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
				 NULL;

		map = expand_xps_map(map, j, index, is_rxqs_map);
		if (!map)
			goto error;

		RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
	}

	/* No id was selected: nothing to add, only removals remain */
	if (!new_dev_maps)
		goto out_no_new_maps;

	if (!dev_maps) {
		/* Increment static keys at most once per type */
		static_key_slow_inc_cpuslocked(&xps_needed);
		if (is_rxqs_map)
			static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
	}

	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
	     j < nr_ids;) {
		/* copy maps belonging to foreign traffic classes */
		for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->attr_map[tci]);
			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
		}

		/* We need to explicitly update tci as previous loop
		 * could break out early if dev_maps is NULL.
		 */
		tci = j * num_tc + tc;

		if (netif_attr_test_mask(j, mask, nr_ids) &&
		    netif_attr_test_online(j, online_mask, nr_ids)) {
			/* add tx-queue to CPU/rx-queue maps */
			int pos = 0;

			map = xmap_dereference(new_dev_maps->attr_map[tci]);
			while ((pos < map->len) && (map->queues[pos] != index))
				pos++;

			if (pos == map->len)
				map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
			if (!is_rxqs_map) {
				if (numa_node_id == -2)
					numa_node_id = cpu_to_node(j);
				else if (numa_node_id != cpu_to_node(j))
					numa_node_id = -1;
			}
#endif
		} else if (dev_maps) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->attr_map[tci]);
			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
		}

		/* copy maps belonging to foreign traffic classes */
		for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->attr_map[tci]);
			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
		}
	}

	/* Publish the new maps */
	if (is_rxqs_map)
		rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
	else
		rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);

	/* Cleanup old maps */
	if (!dev_maps)
		goto out_no_old_maps;

	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
	     j < nr_ids;) {
		for (i = num_tc, tci = j * num_tc; i--; tci++) {
			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
			map = xmap_dereference(dev_maps->attr_map[tci]);
			/* Only free old maps that were not carried over */
			if (map && map != new_map)
				kfree_rcu(map, rcu);
		}
	}

	kfree_rcu(dev_maps, rcu);

out_no_old_maps:
	dev_maps = new_dev_maps;
	active = true;

out_no_new_maps:
	if (!is_rxqs_map) {
		/* update Tx queue numa node */
		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
					     (numa_node_id >= 0) ?
					     numa_node_id : NUMA_NO_NODE);
	}

	if (!dev_maps)
		goto out_no_maps;

	/* removes tx-queue from unused CPUs/rx-queues */
	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
	     j < nr_ids;) {
		for (i = tc, tci = j * num_tc; i--; tci++)
			active |= remove_xps_queue(dev_maps, tci, index);
		if (!netif_attr_test_mask(j, mask, nr_ids) ||
		    !netif_attr_test_online(j, online_mask, nr_ids))
			active |= remove_xps_queue(dev_maps, tci, index);
		for (i = num_tc - tc, tci++; --i; tci++)
			active |= remove_xps_queue(dev_maps, tci, index);
	}

	/* free map if not active */
	if (!active)
		reset_xps_maps(dev, dev_maps, is_rxqs_map);

out_no_maps:
	mutex_unlock(&xps_map_mutex);

	return 0;
error:
	/* remove any maps that we added */
	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
	     j < nr_ids;) {
		for (i = num_tc, tci = j * num_tc; i--; tci++) {
			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
			map = dev_maps ?
			      xmap_dereference(dev_maps->attr_map[tci]) :
			      NULL;
			/* Maps shared with the old dev_maps must survive */
			if (new_map && new_map != map)
				kfree(new_map);
		}
	}

	mutex_unlock(&xps_map_mutex);

	kfree(new_dev_maps);
	return -ENOMEM;
}
 2809EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
 2810
 2811int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
 2812			u16 index)
 2813{
 2814	int ret;
 2815
 2816	cpus_read_lock();
 2817	ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
 2818	cpus_read_unlock();
 2819
 2820	return ret;
 2821}
 2822EXPORT_SYMBOL(netif_set_xps_queue);
 2823
 2824#endif
 2825static void netdev_unbind_all_sb_channels(struct net_device *dev)
 2826{
 2827	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
 2828
 2829	/* Unbind any subordinate channels */
 2830	while (txq-- != &dev->_tx[0]) {
 2831		if (txq->sb_dev)
 2832			netdev_unbind_sb_channel(dev, txq->sb_dev);
 2833	}
 2834}
 2835
/* Drop the whole traffic-class configuration of @dev: reset the XPS maps
 * of all TX queues (CONFIG_XPS only), unbind all subordinate channels,
 * then zero num_tc, tc_to_txq and prio_tc_map.
 */
void netdev_reset_tc(struct net_device *dev)
{
#ifdef CONFIG_XPS
	netif_reset_xps_queues_gt(dev, 0);
#endif
	netdev_unbind_all_sb_channels(dev);

	/* Reset TC configuration of device */
	dev->num_tc = 0;
	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
}
 2848EXPORT_SYMBOL(netdev_reset_tc);
 2849
 2850int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
 2851{
 2852	if (tc >= dev->num_tc)
 2853		return -EINVAL;
 2854
 2855#ifdef CONFIG_XPS
 2856	netif_reset_xps_queues(dev, offset, count);
 2857#endif
 2858	dev->tc_to_txq[tc].count = count;
 2859	dev->tc_to_txq[tc].offset = offset;
 2860	return 0;
 2861}
 2862EXPORT_SYMBOL(netdev_set_tc_queue);
 2863
/* Set the number of traffic classes of @dev to @num_tc.
 *
 * Returns -EINVAL if @num_tc exceeds TC_MAX_QUEUE. Otherwise resets the XPS
 * maps of all TX queues (CONFIG_XPS only), unbinds all subordinate
 * channels, stores the new value and returns 0.
 */
int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
{
	if (num_tc > TC_MAX_QUEUE)
		return -EINVAL;

#ifdef CONFIG_XPS
	netif_reset_xps_queues_gt(dev, 0);
#endif
	netdev_unbind_all_sb_channels(dev);

	dev->num_tc = num_tc;
	return 0;
}
 2877EXPORT_SYMBOL(netdev_set_num_tc);
 2878
 2879void netdev_unbind_sb_channel(struct net_device *dev,
 2880			      struct net_device *sb_dev)
 2881{
 2882	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
 2883
 2884#ifdef CONFIG_XPS
 2885	netif_reset_xps_queues_gt(sb_dev, 0);
 2886#endif
 2887	memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
 2888	memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
 2889
 2890	while (txq-- != &dev->_tx[0]) {
 2891		if (txq->sb_dev == sb_dev)
 2892			txq->sb_dev = NULL;
 2893	}
 2894}
 2895EXPORT_SYMBOL(netdev_unbind_sb_channel);
 2896
 2897int netdev_bind_sb_channel_queue(struct net_device *dev,
 2898				 struct net_device *sb_dev,
 2899				 u8 tc, u16 count, u16 offset)
 2900{
 2901	/* Make certain the sb_dev and dev are already configured */
 2902	if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
 2903		return -EINVAL;
 2904
 2905	/* We cannot hand out queues we don't have */
 2906	if ((offset + count) > dev->real_num_tx_queues)
 2907		return -EINVAL;
 2908
 2909	/* Record the mapping */
 2910	sb_dev->tc_to_txq[tc].count = count;
 2911	sb_dev->tc_to_txq[tc].offset = offset;
 2912
 2913	/* Provide a way for Tx queue to find the tc_to_txq map or
 2914	 * XPS map for itself.
 2915	 */
 2916	while (count--)
 2917		netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
 2918
 2919	return 0;
 2920}
 2921EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
 2922
/* Mark @dev as subordinate channel @channel, or reset it to normal mode
 * with @channel 0.
 *
 * Returns -ENODEV for a multiqueue device, -EINVAL if @channel exceeds
 * S16_MAX, 0 otherwise.
 */
int netdev_set_sb_channel(struct net_device *dev, u16 channel)
{
	/* Do not use a multiqueue device to represent a subordinate channel */
	if (netif_is_multiqueue(dev))
		return -ENODEV;

	/* We allow channels 1 - 32767 to be used for subordinate channels.
	 * Channel 0 is meant to be "native" mode and used only to represent
	 * the main root device. We allow writing 0 to reset the device back
	 * to normal mode after being used as a subordinate channel.
	 */
	if (channel > S16_MAX)
		return -EINVAL;

	/* The channel is stored negated in num_tc */
	dev->num_tc = -channel;

	return 0;
}
 2941EXPORT_SYMBOL(netdev_set_sb_channel);
 2942
 2943/*
 2944 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 2945 * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
 2946 */
 2947int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
 2948{
 2949	bool disabling;
 2950	int rc;
 2951
 2952	disabling = txq < dev->real_num_tx_queues;
 2953
 2954	if (txq < 1 || txq > dev->num_tx_queues)
 2955		return -EINVAL;
 2956
 2957	if (dev->reg_state == NETREG_REGISTERED ||
 2958	    dev->reg_state == NETREG_UNREGISTERING) {
 2959		ASSERT_RTNL();
 2960
 2961		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
 2962						  txq);
 2963		if (rc)
 2964			return rc;
 2965
 2966		if (dev->num_tc)
 2967			netif_setup_tc(dev, txq);
 2968
 2969		dev->real_num_tx_queues = txq;
 2970
 2971		if (disabling) {
 2972			synchronize_net();
 2973			qdisc_reset_all_tx_gt(dev, txq);
 2974#ifdef CONFIG_XPS
 2975			netif_reset_xps_queues_gt(dev, txq);
 2976#endif
 2977		}
 2978	} else {
 2979		dev->real_num_tx_queues = txq;
 2980	}
 2981
 2982	return 0;
 2983}
 2984EXPORT_SYMBOL(netif_set_real_num_tx_queues);
 2985
 2986#ifdef CONFIG_SYSFS
 2987/**
 2988 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 2989 *	@dev: Network device
 2990 *	@rxq: Actual number of RX queues
 2991 *
 2992 *	This must be called either with the rtnl_lock held or before
 2993 *	registration of the net device.  Returns 0 on success, or a
 2994 *	negative error code.  If called before registration, it always
 2995 *	succeeds.
 2996 */
 2997int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
 2998{
 2999	int rc;
 3000
 3001	if (rxq < 1 || rxq > dev->num_rx_queues)
 3002		return -EINVAL;
 3003
 3004	if (dev->reg_state == NETREG_REGISTERED) {
 3005		ASSERT_RTNL();
 3006
 3007		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
 3008						  rxq);
 3009		if (rc)
 3010			return rc;
 3011	}
 3012
 3013	dev->real_num_rx_queues = rxq;
 3014	return 0;
 3015}
 3016EXPORT_SYMBOL(netif_set_real_num_rx_queues);
 3017#endif
 3018
 3019/**
 3020 * netif_get_num_default_rss_queues - default number of RSS queues
 3021 *
 3022 * This routine should set an upper limit on the number of RSS queues
 3023 * used by default by multiqueue devices.
 3024 */
 3025int netif_get_num_default_rss_queues(void)
 3026{
 3027	return is_kdump_kernel() ?
 3028		1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
 3029}
 3030EXPORT_SYMBOL(netif_get_num_default_rss_queues);
 3031
/* Append @q to the tail of this CPU's softnet_data output queue and raise
 * NET_TX_SOFTIRQ. Runs with local interrupts disabled for the duration of
 * the list update.
 */
static void __netif_reschedule(struct Qdisc *q)
{
	struct softnet_data *sd;
	unsigned long flags;

	local_irq_save(flags);
	sd = this_cpu_ptr(&softnet_data);
	q->next_sched = NULL;
	/* output_queue_tailp points at the link to fill; advance it to @q's */
	*sd->output_queue_tailp = q;
	sd->output_queue_tailp = &q->next_sched;
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}
 3045
 3046void __netif_schedule(struct Qdisc *q)
 3047{
 3048	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
 3049		__netif_reschedule(q);
 3050}
 3051EXPORT_SYMBOL(__netif_schedule);
 3052
/* Overlay for skb->cb (see get_kfree_skb_cb()) used to remember why an
 * skb queued for deferred freeing was released.
 */
struct dev_kfree_skb_cb {
	enum skb_free_reason reason;	/* free reason recorded by __dev_kfree_skb_irq() */
};
 3056
 3057static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
 3058{
 3059	return (struct dev_kfree_skb_cb *)skb->cb;
 3060}
 3061
 3062void netif_schedule_queue(struct netdev_queue *txq)
 3063{
 3064	rcu_read_lock();
 3065	if (!netif_xmit_stopped(txq)) {
 3066		struct Qdisc *q = rcu_dereference(txq->qdisc);
 3067
 3068		__netif_schedule(q);
 3069	}
 3070	rcu_read_unlock();
 3071}
 3072EXPORT_SYMBOL(netif_schedule_queue);
 3073
 3074void netif_tx_wake_queue(struct netdev_queue *dev_queue)
 3075{
 3076	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
 3077		struct Qdisc *q;
 3078
 3079		rcu_read_lock();
 3080		q = rcu_dereference(dev_queue->qdisc);
 3081		__netif_schedule(q);
 3082		rcu_read_unlock();
 3083	}
 3084}
 3085EXPORT_SYMBOL(netif_tx_wake_queue);
 3086
 3087void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
 3088{
 3089	unsigned long flags;
 3090
 3091	if (unlikely(!skb))
 3092		return;
 3093
 3094	if (likely(refcount_read(&skb->users) == 1)) {
 3095		smp_rmb();
 3096		refcount_set(&skb->users, 0);
 3097	} else if (likely(!refcount_dec_and_test(&skb->users))) {
 3098		return;
 3099	}
 3100	get_kfree_skb_cb(skb)->reason = reason;
 3101	local_irq_save(flags);
 3102	skb->next = __this_cpu_read(softnet_data.completion_queue);
 3103	__this_cpu_write(softnet_data.completion_queue, skb);
 3104	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 3105	local_irq_restore(flags);
 3106}
 3107EXPORT_SYMBOL(__dev_kfree_skb_irq);
 3108
 3109void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
 3110{
 3111	if (in_irq() || irqs_disabled())
 3112		__dev_kfree_skb_irq(skb, reason);
 3113	else
 3114		dev_kfree_skb(skb);
 3115}
 3116EXPORT_SYMBOL(__dev_kfree_skb_any);
 3117
 3118
 3119/**
 3120 * netif_device_detach - mark device as removed
 3121 * @dev: network device
 3122 *
 3123 * Mark device as removed from system and therefore no longer available.
 3124 */
 3125void netif_device_detach(struct net_device *dev)
 3126{
 3127	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
 3128	    netif_running(dev)) {
 3129		netif_tx_stop_all_queues(dev);
 3130	}
 3131}
 3132EXPORT_SYMBOL(netif_device_detach);
 3133
 3134/**
 3135 * netif_device_attach - mark device as attached
 3136 * @dev: network device
 3137 *
 3138 * Mark device as attached from system and restart if needed.
 3139 */
 3140void netif_device_attach(struct net_device *dev)
 3141{
 3142	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
 3143	    netif_running(dev)) {
 3144		netif_tx_wake_all_queues(dev);
 3145		__netdev_watchdog_up(dev);
 3146	}
 3147}
 3148EXPORT_SYMBOL(netif_device_attach);
 3149
 3150/*
 3151 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
 3152 * to be used as a distribution range.
 3153 */
 3154static u16 skb_tx_hash(const struct net_device *dev,
 3155		       const struct net_device *sb_dev,
 3156		       struct sk_buff *skb)
 3157{
 3158	u32 hash;
 3159	u16 qoffset = 0;
 3160	u16 qcount = dev->real_num_tx_queues;
 3161
 3162	if (dev->num_tc) {
 3163		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
 3164
 3165		qoffset = sb_dev->tc_to_txq[tc].offset;
 3166		qcount = sb_dev->tc_to_txq[tc].count;
 3167	}
 3168
 3169	if (skb_rx_queue_recorded(skb)) {
 3170		hash = skb_get_rx_queue(skb);
 3171		if (hash >= qoffset)
 3172			hash -= qoffset;
 3173		while (unlikely(hash >= qcount))
 3174			hash -= qcount;
 3175		return hash + qoffset;
 3176	}
 3177
 3178	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
 3179}
 3180
 3181static void skb_warn_bad_offload(const struct sk_buff *skb)
 3182{
 3183	static const netdev_features_t null_features;
 3184	struct net_device *dev = skb->dev;
 3185	const char *name = "";
 3186
 3187	if (!net_ratelimit())
 3188		return;
 3189
 3190	if (dev) {
 3191		if (dev->dev.parent)
 3192			name = dev_driver_string(dev->dev.parent);
 3193		else
 3194			name = netdev_name(dev);
 3195	}
 3196	skb_dump(KERN_WARNING, skb, false);
 3197	WARN(1, "%s: caps=(%pNF, %pNF)\n",
 3198	     name, dev ? &dev->features : &null_features,
 3199	     skb->sk ? &skb->sk->sk_route_caps : &null_features);
 3200}
 3201
 3202/*
 3203 * Invalidate hardware checksum when packet is to be mangled, and
 3204 * complete checksum manually on outgoing path.
 3205 */
 3206int skb_checksum_help(struct sk_buff *skb)
 3207{
 3208	__wsum csum;
 3209	int ret = 0, offset;
 3210
 3211	if (skb->ip_summed == CHECKSUM_COMPLETE)
 3212		goto out_set_summed;
 3213
 3214	if (unlikely(skb_is_gso(skb))) {
 3215		skb_warn_bad_offload(skb);
 3216		return -EINVAL;
 3217	}
 3218
 3219	/* Before computing a checksum, we should make sure no frag could
 3220	 * be modified by an external entity : checksum could be wrong.
 3221	 */
 3222	if (skb_has_shared_frag(skb)) {
 3223		ret = __skb_linearize(skb);
 3224		if (ret)
 3225			goto out;
 3226	}
 3227
 3228	offset = skb_checksum_start_offset(skb);
 3229	BUG_ON(offset >= skb_headlen(skb));
 3230	csum = skb_checksum(skb, offset, skb->len - offset, 0);
 3231
 3232	offset += skb->csum_offset;
 3233	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
 3234
 3235	ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
 3236	if (ret)
 3237		goto out;
 3238
 3239	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
 3240out_set_summed:
 3241	skb->ip_summed = CHECKSUM_NONE;
 3242out:
 3243	return ret;
 3244}
 3245EXPORT_SYMBOL(skb_checksum_help);
 3246
 3247int skb_crc32c_csum_help(struct sk_buff *skb)
 3248{
 3249	__le32 crc32c_csum;
 3250	int ret = 0, offset, start;
 3251
 3252	if (skb->ip_summed != CHECKSUM_PARTIAL)
 3253		goto out;
 3254
 3255	if (unlikely(skb_is_gso(skb)))
 3256		goto out;
 3257
 3258	/* Before computing a checksum, we should make sure no frag could
 3259	 * be modified by an external entity : checksum could be wrong.
 3260	 */
 3261	if (unlikely(skb_has_shared_frag(skb))) {
 3262		ret = __skb_linearize(skb);
 3263		if (ret)
 3264			goto out;
 3265	}
 3266	start = skb_checksum_start_offset(skb);
 3267	offset = start + offsetof(struct sctphdr, checksum);
 3268	if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
 3269		ret = -EINVAL;
 3270		goto out;
 3271	}
 3272
 3273	ret = skb_ensure_writable(skb, offset + sizeof(__le32));
 3274	if (ret)
 3275		goto out;
 3276
 3277	crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
 3278						  skb->len - start, ~(__u32)0,
 3279						  crc32c_csum_stub));
 3280	*(__le32 *)(skb->data + offset) = crc32c_csum;
 3281	skb->ip_summed = CHECKSUM_NONE;
 3282	skb->csum_not_inet = 0;
 3283out:
 3284	return ret;
 3285}
 3286
 3287__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
 3288{
 3289	__be16 type = skb->protocol;
 3290
 3291	/* Tunnel gso handlers can set protocol to ethernet. */
 3292	if (type == htons(ETH_P_TEB)) {
 3293		struct ethhdr *eth;
 3294
 3295		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
 3296			return 0;
 3297
 3298		eth = (struct ethhdr *)skb->data;
 3299		type = eth->h_proto;
 3300	}
 3301
 3302	return __vlan_get_protocol(skb, type, depth);
 3303}
 3304
 3305/**
 3306 *	skb_mac_gso_segment - mac layer segmentation handler.
 3307 *	@skb: buffer to segment
 3308 *	@features: features for the output path (see dev->features)
 3309 */
 3310struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
 3311				    netdev_features_t features)
 3312{
 3313	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
 3314	struct packet_offload *ptype;
 3315	int vlan_depth = skb->mac_len;
 3316	__be16 type = skb_network_protocol(skb, &vlan_depth);
 3317
 3318	if (unlikely(!type))
 3319		return ERR_PTR(-EINVAL);
 3320
 3321	__skb_pull(skb, vlan_depth);
 3322
 3323	rcu_read_lock();
 3324	list_for_each_entry_rcu(ptype, &offload_base, list) {
 3325		if (ptype->type == type && ptype->callbacks.gso_segment) {
 3326			segs = ptype->callbacks.gso_segment(skb, features);
 3327			break;
 3328		}
 3329	}
 3330	rcu_read_unlock();
 3331
 3332	__skb_push(skb, skb->data - skb_mac_header(skb));
 3333
 3334	return segs;
 3335}
 3336EXPORT_SYMBOL(skb_mac_gso_segment);
 3337
 3338
 3339/* openvswitch calls this on rx path, so we need a different check.
 3340 */
 3341static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
 3342{
 3343	if (tx_path)
 3344		return skb->ip_summed != CHECKSUM_PARTIAL &&
 3345		       skb->ip_summed != CHECKSUM_UNNECESSARY;
 3346
 3347	return skb->ip_summed == CHECKSUM_NONE;
 3348}
 3349
 3350/**
 3351 *	__skb_gso_segment - Perform segmentation on skb.
 3352 *	@skb: buffer to segment
 3353 *	@features: features for the output path (see dev->features)
 3354 *	@tx_path: whether it is called in TX path
 3355 *
 3356 *	This function segments the given skb and returns a list of segments.
 3357 *
 3358 *	It may return NULL if the skb requires no segmentation.  This is
 3359 *	only possible when GSO is used for verifying header integrity.
 3360 *
 3361 *	Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
 3362 */
 3363struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
 3364				  netdev_features_t features, bool tx_path)
 3365{
 3366	struct sk_buff *segs;
 3367
 3368	if (unlikely(skb_needs_check(skb, tx_path))) {
 3369		int err;
 3370
 3371		/* We're going to init ->check field in TCP or UDP header */
 3372		err = skb_cow_head(skb, 0);
 3373		if (err < 0)
 3374			return ERR_PTR(err);
 3375	}
 3376
 3377	/* Only report GSO partial support if it will enable us to
 3378	 * support segmentation on this frame without needing additional
 3379	 * work.
 3380	 */
 3381	if (features & NETIF_F_GSO_PARTIAL) {
 3382		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
 3383		struct net_device *dev = skb->dev;
 3384
 3385		partial_features |= dev->features & dev->gso_partial_features;
 3386		if (!skb_gso_ok(skb, features | partial_features))
 3387			features &= ~NETIF_F_GSO_PARTIAL;
 3388	}
 3389
 3390	BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
 3391		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
 3392
 3393	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
 3394	SKB_GSO_CB(skb)->encap_level = 0;
 3395
 3396	skb_reset_mac_header(skb);
 3397	skb_reset_mac_len(skb);
 3398
 3399	segs = skb_mac_gso_segment(skb, features);
 3400
 3401	if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
 3402		skb_warn_bad_offload(skb);
 3403
 3404	return segs;
 3405}
 3406EXPORT_SYMBOL(__skb_gso_segment);
 3407
 3408/* Take action when hardware reception checksum errors are detected. */
 3409#ifdef CONFIG_BUG
 3410void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
 3411{
 3412	if (net_ratelimit()) {
 3413		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
 3414		skb_dump(KERN_ERR, skb, true);
 3415		dump_stack();
 3416	}
 3417}
 3418EXPORT_SYMBOL(netdev_rx_csum_fault);
 3419#endif
 3420
 3421/* XXX: check that highmem exists at all on the given machine. */
 3422static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 3423{
 3424#ifdef CONFIG_HIGHMEM
 3425	int i;
 3426
 3427	if (!(dev->features & NETIF_F_HIGHDMA)) {
 3428		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 3429			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 3430
 3431			if (PageHighMem(skb_frag_page(frag)))
 3432				return 1;
 3433		}
 3434	}
 3435#endif
 3436	return 0;
 3437}
 3438
 3439/* If MPLS offload request, verify we are testing hardware MPLS features
 3440 * instead of standard features for the netdev.
 3441 */
 3442#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
 3443static netdev_features_t net_mpls_features(struct sk_buff *skb,
 3444					   netdev_features_t features,
 3445					   __be16 type)
 3446{
 3447	if (eth_p_mpls(type))
 3448		features &= skb->dev->mpls_features;
 3449
 3450	return features;
 3451}
 3452#else
 3453static netdev_features_t net_mpls_features(struct sk_buff *skb,
 3454					   netdev_features_t features,
 3455					   __be16 type)
 3456{
 3457	return features;
 3458}
 3459#endif
 3460
 3461static netdev_features_t harmonize_features(struct sk_buff *skb,
 3462	netdev_features_t features)
 3463{
 3464	__be16 type;
 3465
 3466	type = skb_network_protocol(skb, NULL);
 3467	features = net_mpls_features(skb, features, type);
 3468
 3469	if (skb->ip_summed != CHECKSUM_NONE &&
 3470	    !can_checksum_protocol(features, type)) {
 3471		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
 3472	}
 3473	if (illegal_highdma(skb->dev, skb))
 3474		features &= ~NETIF_F_SG;
 3475
 3476	return features;
 3477}
 3478
 3479netdev_features_t passthru_features_check(struct sk_buff *skb,
 3480					  struct net_device *dev,
 3481					  netdev_features_t features)
 3482{
 3483	return features;
 3484}
 3485EXPORT_SYMBOL(passthru_features_check);
 3486
 3487static netdev_features_t dflt_features_check(struct sk_buff *skb,
 3488					     struct net_device *dev,
 3489					     netdev_features_t features)
 3490{
 3491	return vlan_features_check(skb, features);
 3492}
 3493
 3494static netdev_features_t gso_features_check(const struct sk_buff *skb,
 3495					    struct net_device *dev,
 3496					    netdev_features_t features)
 3497{
 3498	u16 gso_segs = skb_shinfo(skb)->gso_segs;
 3499
 3500	if (gso_segs > dev->gso_max_segs)
 3501		return features & ~NETIF_F_GSO_MASK;
 3502
 3503	if (!skb_shinfo(skb)->gso_type) {
 3504		skb_warn_bad_offload(skb);
 3505		return features & ~NETIF_F_GSO_MASK;
 3506	}
 3507
 3508	/* Support for GSO partial features requires software
 3509	 * intervention before we can actually process the packets
 3510	 * so we need to strip support for any partial features now
 3511	 * and we can pull them back in after we have partially
 3512	 * segmented the frame.
 3513	 */
 3514	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
 3515		features &= ~dev->gso_partial_features;
 3516
 3517	/* Make sure to clear the IPv4 ID mangling feature if the
 3518	 * IPv4 header has the potential to be fragmented.
 3519	 */
 3520	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
 3521		struct iphdr *iph = skb->encapsulation ?
 3522				    inner_ip_hdr(skb) : ip_hdr(skb);
 3523
 3524		if (!(iph->frag_off & htons(IP_DF)))
 3525			features &= ~NETIF_F_TSO_MANGLEID;
 3526	}
 3527
 3528	return features;
 3529}
 3530
 3531netdev_features_t netif_skb_features(struct sk_buff *skb)
 3532{
 3533	struct net_device *dev = skb->dev;
 3534	netdev_features_t features = dev->features;
 3535
 3536	if (skb_is_gso(skb))
 3537		features = gso_features_check(skb, dev, features);
 3538
 3539	/* If encapsulation offload request, verify we are testing
 3540	 * hardware encapsulation features instead of standard
 3541	 * features for the netdev
 3542	 */
 3543	if (skb->encapsulation)
 3544		features &= dev->hw_enc_features;
 3545
 3546	if (skb_vlan_tagged(skb))
 3547		features = netdev_intersect_features(features,
 3548						     dev->vlan_features |
 3549						     NETIF_F_HW_VLAN_CTAG_TX |
 3550						     NETIF_F_HW_VLAN_STAG_TX);
 3551
 3552	if (dev->netdev_ops->ndo_features_check)
 3553		features &= dev->netdev_ops->ndo_features_check(skb, dev,
 3554								features);
 3555	else
 3556		features &= dflt_features_check(skb, dev, features);
 3557
 3558	return harmonize_features(skb, features);
 3559}
 3560EXPORT_SYMBOL(netif_skb_features);
 3561
 3562static int xmit_one(struct sk_buff *skb, struct net_device *dev,
 3563		    struct netdev_queue *txq, bool more)
 3564{
 3565	unsigned int len;
 3566	int rc;
 3567
 3568	if (dev_nit_active(dev))
 3569		dev_queue_xmit_nit(skb, dev);
 3570
 3571	len = skb->len;
 3572	PRANDOM_ADD_NOISE(skb, dev, txq, len + jiffies);
 3573	trace_net_dev_start_xmit(skb, dev);
 3574	rc = netdev_start_xmit(skb, dev, txq, more);
 3575	trace_net_dev_xmit(skb, rc, dev, len);
 3576
 3577	return rc;
 3578}
 3579
 3580struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
 3581				    struct netdev_queue *txq, int *ret)
 3582{
 3583	struct sk_buff *skb = first;
 3584	int rc = NETDEV_TX_OK;
 3585
 3586	while (skb) {
 3587		struct sk_buff *next = skb->next;
 3588
 3589		skb_mark_not_on_list(skb);
 3590		rc = xmit_one(skb, dev, txq, next != NULL);
 3591		if (unlikely(!dev_xmit_complete(rc))) {
 3592			skb->next = next;
 3593			goto out;
 3594		}
 3595
 3596		skb = next;
 3597		if (netif_tx_queue_stopped(txq) && skb) {
 3598			rc = NETDEV_TX_BUSY;
 3599			break;
 3600		}
 3601	}
 3602
 3603out:
 3604	*ret = rc;
 3605	return skb;
 3606}
 3607
 3608static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
 3609					  netdev_features_t features)
 3610{
 3611	if (skb_vlan_tag_present(skb) &&
 3612	    !vlan_hw_offload_capable(features, skb->vlan_proto))
 3613		skb = __vlan_hwaccel_push_inside(skb);
 3614	return skb;
 3615}
 3616
 3617int skb_csum_hwoffload_help(struct sk_buff *skb,
 3618			    const netdev_features_t features)
 3619{
 3620	if (unlikely(skb->csum_not_inet))
 3621		return !!(features & NETIF_F_SCTP_CRC) ? 0 :
 3622			skb_crc32c_csum_help(skb);
 3623
 3624	return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
 3625}
 3626EXPORT_SYMBOL(skb_csum_hwoffload_help);
 3627
 3628static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
 3629{
 3630	netdev_features_t features;
 3631
 3632	features = netif_skb_features(skb);
 3633	skb = validate_xmit_vlan(skb, features);
 3634	if (unlikely(!skb))
 3635		goto out_null;
 3636
 3637	skb = sk_validate_xmit_skb(skb, dev);
 3638	if (unlikely(!skb))
 3639		goto out_null;
 3640
 3641	if (netif_needs_gso(skb, features)) {
 3642		struct sk_buff *segs;
 3643
 3644		segs = skb_gso_segment(skb, features);
 3645		if (IS_ERR(segs)) {
 3646			goto out_kfree_skb;
 3647		} else if (segs) {
 3648			consume_skb(skb);
 3649			skb = segs;
 3650		}
 3651	} else {
 3652		if (skb_needs_linearize(skb, features) &&
 3653		    __skb_linearize(skb))
 3654			goto out_kfree_skb;
 3655
 3656		/* If packet is not checksummed and device does not
 3657		 * support checksumming for this protocol, complete
 3658		 * checksumming here.
 3659		 */
 3660		if (skb->ip_summed == CHECKSUM_PARTIAL) {
 3661			if (skb->encapsulation)
 3662				skb_set_inner_transport_header(skb,
 3663							       skb_checksum_start_offset(skb));
 3664			else
 3665				skb_set_transport_header(skb,
 3666							 skb_checksum_start_offset(skb));
 3667			if (skb_csum_hwoffload_help(skb, features))
 3668				goto out_kfree_skb;
 3669		}
 3670	}
 3671
 3672	skb = validate_xmit_xfrm(skb, features, again);
 3673
 3674	return skb;
 3675
 3676out_kfree_skb:
 3677	kfree_skb(skb);
 3678out_null:
 3679	atomic_long_inc(&dev->tx_dropped);
 3680	return NULL;
 3681}
 3682
 3683struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
 3684{
 3685	struct sk_buff *next, *head = NULL, *tail;
 3686
 3687	for (; skb != NULL; skb = next) {
 3688		next = skb->next;
 3689		skb_mark_not_on_list(skb);
 3690
 3691		/* in case skb wont be segmented, point to itself */
 3692		skb->prev = skb;
 3693
 3694		skb = validate_xmit_skb(skb, dev, again);
 3695		if (!skb)
 3696			continue;
 3697
 3698		if (!head)
 3699			head = skb;
 3700		else
 3701			tail->next = skb;
 3702		/* If skb was segmented, skb->prev points to
 3703		 * the last segment. If not, it still contains skb.
 3704		 */
 3705		tail = skb->prev;
 3706	}
 3707	return head;
 3708}
 3709EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
 3710
 3711static void qdisc_pkt_len_init(struct sk_buff *skb)
 3712{
 3713	const struct skb_shared_info *shinfo = skb_shinfo(skb);
 3714
 3715	qdisc_skb_cb(skb)->pkt_len = skb->len;
 3716
 3717	/* To get more precise estimation of bytes sent on wire,
 3718	 * we add to pkt_len the headers size of all segments
 3719	 */
 3720	if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
 3721		unsigned int hdr_len;
 3722		u16 gso_segs = shinfo->gso_segs;
 3723
 3724		/* mac layer + network layer */
 3725		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
 3726
 3727		/* + transport layer */
 3728		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
 3729			const struct tcphdr *th;
 3730			struct tcphdr _tcphdr;
 3731
 3732			th = skb_header_pointer(skb, skb_transport_offset(skb),
 3733						sizeof(_tcphdr), &_tcphdr);
 3734			if (likely(th))
 3735				hdr_len += __tcp_hdrlen(th);
 3736		} else {
 3737			struct udphdr _udphdr;
 3738
 3739			if (skb_header_pointer(skb, skb_transport_offset(skb),
 3740					       sizeof(_udphdr), &_udphdr))
 3741				hdr_len += sizeof(struct udphdr);
 3742		}
 3743
 3744		if (shinfo->gso_type & SKB_GSO_DODGY)
 3745			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
 3746						shinfo->gso_size);
 3747
 3748		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
 3749	}
 3750}
 3751
/* Hand @skb to qdisc @q of @txq on @dev and return the resulting
 * NET_XMIT_* code.
 *
 * Three paths are visible here:
 *  - TCQ_F_NOLOCK qdiscs: enqueue and run without taking the root lock.
 *  - Otherwise, under the root lock: drop if the qdisc is deactivated,
 *    transmit directly when bypass is allowed and the qdisc is empty and
 *    idle, or enqueue and run the qdisc if nobody else is running it.
 *
 * skbs collected on the local to_free list are released only after the
 * root lock has been dropped.
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	struct sk_buff *to_free = NULL;
	bool contended;
	int rc;

	qdisc_calculate_pkt_len(skb, q);

	if (q->flags & TCQ_F_NOLOCK) {
		/* Lockless qdisc: root_lock and busylock are not taken. */
		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
		qdisc_run(q);

		if (unlikely(to_free))
			kfree_skb_list(to_free);
		return rc;
	}

	/*
	 * Heuristic to force contended enqueues to serialize on a
	 * separate lock before trying to get qdisc main lock.
	 * This permits qdisc->running owner to get the lock more
	 * often and dequeue packets faster.
	 */
	contended = qdisc_is_running(q);
	if (unlikely(contended))
		spin_lock(&q->busylock);

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		/* Qdisc is deactivated: queue the skb on to_free. */
		__qdisc_drop(skb, &to_free);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   qdisc_run_begin(q)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */

		qdisc_bstats_update(q, skb);

		/* NOTE(review): a non-zero return presumably means more work
		 * is pending on the qdisc - confirm against sch_direct_xmit().
		 */
		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
			/* Release busylock before the (possibly long) run. */
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		}

		qdisc_run_end(q);
		rc = NET_XMIT_SUCCESS;
	} else {
		/* Regular path: enqueue, then run the qdisc ourselves if
		 * qdisc_run_begin() lets us.
		 */
		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
		if (qdisc_run_begin(q)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
			qdisc_run_end(q);
		}
	}
	spin_unlock(root_lock);
	/* Free dropped skbs outside of the root lock. */
	if (unlikely(to_free))
		kfree_skb_list(to_free);
	/* busylock is still held if neither run path above released it. */
	if (unlikely(contended))
		spin_unlock(&q->busylock);
	return rc;
}
 3824
 3825#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 3826static void skb_update_prio(struct sk_buff *skb)
 3827{
 3828	const struct netprio_map *map;
 3829	const struct sock *sk;
 3830	unsigned int prioidx;
 3831
 3832	if (skb->priority)
 3833		return;
 3834	map = rcu_dereference_bh(skb->dev->priomap);
 3835	if (!map)
 3836		return;
 3837	sk = skb_to_full_sk(skb);
 3838	if (!sk)
 3839		return;
 3840
 3841	prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
 3842
 3843	if (prioidx < map->priomap_len)
 3844		skb->priority = map->priomap[prioidx];
 3845}
 3846#else
 3847#define skb_update_prio(skb)
 3848#endif
 3849
 3850/**
 3851 *	dev_loopback_xmit - loop back @skb
 3852 *	@net: network namespace this loopback is happening in
 3853 *	@sk:  sk needed to be a netfilter okfn
 3854 *	@skb: buffer to transmit
 3855 */
 3856int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
 3857{
 3858	skb_reset_mac_header(skb);
 3859	__skb_pull(skb, skb_network_offset(skb));
 3860	skb->pkt_type = PACKET_LOOPBACK;
 3861	skb->ip_summed = CHECKSUM_UNNECESSARY;
 3862	WARN_ON(!skb_dst(skb));
 3863	skb_dst_force(skb);
 3864	netif_rx_ni(skb);
 3865	return 0;
 3866}
 3867EXPORT_SYMBOL(dev_loopback_xmit);
 3868
 3869#ifdef CONFIG_NET_EGRESS
 3870static struct sk_buff *
 3871sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 3872{
 3873	struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
 3874	struct tcf_result cl_res;
 3875
 3876	if (!miniq)
 3877		return skb;
 3878
 3879	/* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
 3880	qdisc_skb_cb(skb)->mru = 0;
 3881	mini_qdisc_bstats_cpu_update(miniq, skb);
 3882
 3883	switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
 3884	case TC_ACT_OK:
 3885	case TC_ACT_RECLASSIFY:
 3886		skb->tc_index = TC_H_MIN(cl_res.classid);
 3887		break;
 3888	case TC_ACT_SHOT:
 3889		mini_qdisc_qstats_cpu_drop(miniq);
 3890		*ret = NET_XMIT_DROP;
 3891		kfree_skb(skb);
 3892		return NULL;
 3893	case TC_ACT_STOLEN:
 3894	case TC_ACT_QUEUED:
 3895	case TC_ACT_TRAP:
 3896		*ret = NET_XMIT_SUCCESS;
 3897		consume_skb(skb);
 3898		return NULL;
 3899	case TC_ACT_REDIRECT:
 3900		/* No need to push/pop skb's mac_header here on egress! */
 3901		skb_do_redirect(skb);
 3902		*ret = NET_XMIT_SUCCESS;
 3903		return NULL;
 3904	default:
 3905		break;
 3906	}
 3907
 3908	return skb;
 3909}
 3910#endif /* CONFIG_NET_EGRESS */
 3911
 3912#ifdef CONFIG_XPS
 3913static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
 3914			       struct xps_dev_maps *dev_maps, unsigned int tci)
 3915{
 3916	struct xps_map *map;
 3917	int queue_index = -1;
 3918
 3919	if (dev->num_tc) {
 3920		tci *= dev->num_tc;
 3921		tci += netdev_get_prio_tc_map(dev, skb->priority);
 3922	}
 3923
 3924	map = rcu_dereference(dev_maps->attr_map[tci]);
 3925	if (map) {
 3926		if (map->len == 1)
 3927			queue_index = map->queues[0];
 3928		else
 3929			queue_index = map->queues[reciprocal_scale(
 3930						skb_get_hash(skb), map->len)];
 3931		if (unlikely(queue_index >= dev->real_num_tx_queues))
 3932			queue_index = -1;
 3933	}
 3934	return queue_index;
 3935}
 3936#endif
 3937
 3938static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
 3939			 struct sk_buff *skb)
 3940{
 3941#ifdef CONFIG_XPS
 3942	struct xps_dev_maps *dev_maps;
 3943	struct sock *sk = skb->sk;
 3944	int queue_index = -1;
 3945
 3946	if (!static_key_false(&xps_needed))
 3947		return -1;
 3948
 3949	rcu_read_lock();
 3950	if (!static_key_false(&xps_rxqs_needed))
 3951		goto get_cpus_map;
 3952
 3953	dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
 3954	if (dev_maps) {
 3955		int tci = sk_rx_queue_get(sk);
 3956
 3957		if (tci >= 0 && tci < dev->num_rx_queues)
 3958			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
 3959							  tci);
 3960	}
 3961
 3962get_cpus_map:
 3963	if (queue_index < 0) {
 3964		dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
 3965		if (dev_maps) {
 3966			unsigned int tci = skb->sender_cpu - 1;
 3967
 3968			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
 3969							  tci);
 3970		}
 3971	}
 3972	rcu_read_unlock();
 3973
 3974	return queue_index;
 3975#else
 3976	return -1;
 3977#endif
 3978}
 3979
 3980u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
 3981		     struct net_device *sb_dev)
 3982{
 3983	return 0;
 3984}
 3985EXPORT_SYMBOL(dev_pick_tx_zero);
 3986
 3987u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
 3988		       struct net_device *sb_dev)
 3989{
 3990	return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
 3991}
 3992EXPORT_SYMBOL(dev_pick_tx_cpu_id);
 3993
 3994u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
 3995		     struct net_device *sb_dev)
 3996{
 3997	struct sock *sk = skb->sk;
 3998	int queue_index = sk_tx_queue_get(sk);
 3999
 4000	sb_dev = sb_dev ? : dev;
 4001
 4002	if (queue_index < 0 || skb->ooo_okay ||
 4003	    queue_index >= dev->real_num_tx_queues) {
 4004		int new_index = get_xps_queue(dev, sb_dev, skb);
 4005
 4006		if (new_index < 0)
 4007			new_index = skb_tx_hash(dev, sb_dev, skb);
 4008
 4009		if (queue_index != new_index && sk &&
 4010		    sk_fullsock(sk) &&
 4011		    rcu_access_pointer(sk->sk_dst_cache))
 4012			sk_tx_queue_set(sk, new_index);
 4013
 4014		queue_index = new_index;
 4015	}
 4016
 4017	return queue_index;
 4018}
 4019EXPORT_SYMBOL(netdev_pick_tx);
 4020
 4021struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
 4022					 struct sk_buff *skb,
 4023					 struct net_device *sb_dev)
 4024{
 4025	int queue_index = 0;
 4026
 4027#ifdef CONFIG_XPS
 4028	u32 sender_cpu = skb->sender_cpu - 1;
 4029
 4030	if (sender_cpu >= (u32)NR_CPUS)
 4031		skb->sender_cpu = raw_smp_processor_id() + 1;
 4032#endif
 4033
 4034	if (dev->real_num_tx_queues != 1) {
 4035		const struct net_device_ops *ops = dev->netdev_ops;
 4036
 4037		if (ops->ndo_select_queue)
 4038			queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
 4039		else
 4040			queue_index = netdev_pick_tx(dev, skb, sb_dev);
 4041
 4042		queue_index = netdev_cap_txqueue(dev, queue_index);
 4043	}
 4044
 4045	skb_set_queue_mapping(skb, queue_index);
 4046	return netdev_get_tx_queue(dev, queue_index);
 4047}
 4048
/**
 *	__dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *	@sb_dev: subordinate device used for L2 forwarding offload
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 */
static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	/* Returned as-is when validate_xmit_skb() below yields no skb. */
	int rc = -ENOMEM;
	bool again = false;

	skb_reset_mac_header(skb);

	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	skb_update_prio(skb);

	qdisc_pkt_len_init(skb);
#ifdef CONFIG_NET_CLS_ACT
	skb->tc_at_ingress = 0;
# ifdef CONFIG_NET_EGRESS
	if (static_branch_unlikely(&egress_needed_key)) {
		/* A NULL result means the egress hook took care of the skb
		 * and stored the code to return in rc.
		 */
		skb = sch_handle_egress(skb, &rc, dev);
		if (!skb)
			goto out;
	}
# endif
#endif
	/* If device/qdisc don't need skb->dst, release it right now while
	 * it's hot in this cpu cache.
	 */
	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
		skb_dst_drop(skb);
	else
		skb_dst_force(skb);

	txq = netdev_core_pick_tx(dev, skb, sb_dev);
	q = rcu_dereference_bh(txq->qdisc);

	trace_net_dev_queue(skb);
	/* A qdisc with an enqueue method: go through the qdisc path. */
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	 * loopback, all the sorts of tunnels...
	 *
	 * Really, it is unlikely that netif_tx_lock protection is necessary
	 * here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	 * counters.)
	 * However, it is possible that they rely on the protection
	 * provided by us here.
	 *
	 * Check this and take the lock. It is not prone to deadlocks.
	 * Alternatively, use the noqueue qdisc; it is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		/* The lock owner being this CPU means we are re-entering
		 * the transmit path of the same queue.
		 */
		if (txq->xmit_lock_owner != cpu) {
			if (dev_xmit_recursion())
				goto recursion_alert;

			skb = validate_xmit_skb(skb, dev, &again);
			if (!skb)
				goto out;

			PRANDOM_ADD_NOISE(skb, dev, txq, jiffies);
			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_xmit_stopped(txq)) {
				dev_xmit_recursion_inc();
				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
				dev_xmit_recursion_dec();
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
					     dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
recursion_alert:
			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
					     dev->name);
		}
	}

	/* Failure path: device down, queue stopped, transmit not completed,
	 * or recursion.  Account the drop and free what is left of the skb
	 * list.
	 */
	rc = -ENETDOWN;
	rcu_read_unlock_bh();

	atomic_long_inc(&dev->tx_dropped);
	kfree_skb_list(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
 4181
 4182int dev_queue_xmit(struct sk_buff *skb)
 4183{
 4184	return __dev_queue_xmit(skb, NULL);
 4185}
 4186EXPORT_SYMBOL(dev_queue_xmit);
 4187
 4188int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
 4189{
 4190	return __dev_queue_xmit(skb, sb_dev);
 4191}
 4192EXPORT_SYMBOL(dev_queue_xmit_accel);
 4193
 4194int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
 4195{
 4196	struct net_device *dev = skb->dev;
 4197	struct sk_buff *orig_skb = skb;
 4198	struct netdev_queue *txq;
 4199	int ret = NETDEV_TX_BUSY;
 4200	bool again = false;
 4201
 4202	if (unlikely(!netif_running(dev) ||
 4203		     !netif_carrier_ok(dev)))
 4204		goto drop;
 4205
 4206	skb = validate_xmit_skb_list(skb, dev, &again);
 4207	if (skb != orig_skb)
 4208		goto drop;
 4209
 4210	skb_set_queue_mapping(skb, queue_id);
 4211	txq = skb_get_tx_queue(dev, skb);
 4212	PRANDOM_ADD_NOISE(skb, dev, txq, jiffies);
 4213
 4214	local_bh_disable();
 4215
 4216	dev_xmit_recursion_inc();
 4217	HARD_TX_LOCK(dev, txq, smp_processor_id());
 4218	if (!netif_xmit_frozen_or_drv_stopped(txq))
 4219		ret = netdev_start_xmit(skb, dev, txq, false);
 4220	HARD_TX_UNLOCK(dev, txq);
 4221	dev_xmit_recursion_dec();
 4222
 4223	local_bh_enable();
 4224	return ret;
 4225drop:
 4226	atomic_long_inc(&dev->tx_dropped);
 4227	kfree_skb_list(skb);
 4228	return NET_XMIT_DROP;
 4229}
 4230EXPORT_SYMBOL(__dev_direct_xmit);
 4231
 4232/*************************************************************************
 4233 *			Receiver routines
 4234 *************************************************************************/
 4235
 4236int netdev_max_backlog __read_mostly = 1000;
 4237EXPORT_SYMBOL(netdev_max_backlog);
 4238
 4239int netdev_tstamp_prequeue __read_mostly = 1;
 4240int netdev_budget __read_mostly = 300;
 4241/* Must be at least 2 jiffes to guarantee 1 jiffy timeout */
 4242unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
 4243int weight_p __read_mostly = 64;           /* old backlog weight */
 4244int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
 4245int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
 4246int dev_rx_weight __read_mostly = 64;
 4247int dev_tx_weight __read_mostly = 64;
 4248/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
 4249int gro_normal_batch __read_mostly = 8;
 4250
 4251/* Called with irq disabled */
 4252static inline void ____napi_schedule(struct softnet_data *sd,
 4253				     struct napi_struct *napi)
 4254{
 4255	list_add_tail(&napi->poll_list, &sd->poll_list);
 4256	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 4257}
 4258
 4259#ifdef CONFIG_RPS
 4260
/* One global table that all flow-based protocols share.
 * get_rps_cpu() reads it under RCU to find the CPU wanted for a flow hash.
 */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
/* Mask used by get_rps_cpu() on socket flow table entries: the masked bits
 * give the wanted CPU, the remaining bits are compared against the hash.
 */
u32 rps_cpu_mask __read_mostly;
EXPORT_SYMBOL(rps_cpu_mask);

/* Static branch tested by netif_rx_internal() before doing RPS steering. */
struct static_key_false rps_needed __read_mostly;
EXPORT_SYMBOL(rps_needed);
/* NOTE(review): presumably the matching static branch for RFS; it is not
 * referenced in this part of the file - confirm against its users.
 */
struct static_key_false rfs_needed __read_mostly;
EXPORT_SYMBOL(rfs_needed);
 4271
 4272static struct rps_dev_flow *
 4273set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 4274	    struct rps_dev_flow *rflow, u16 next_cpu)
 4275{
 4276	if (next_cpu < nr_cpu_ids) {
 4277#ifdef CONFIG_RFS_ACCEL
 4278		struct netdev_rx_queue *rxqueue;
 4279		struct rps_dev_flow_table *flow_table;
 4280		struct rps_dev_flow *old_rflow;
 4281		u32 flow_id;
 4282		u16 rxq_index;
 4283		int rc;
 4284
 4285		/* Should we steer this flow to a different hardware queue? */
 4286		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
 4287		    !(dev->features & NETIF_F_NTUPLE))
 4288			goto out;
 4289		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
 4290		if (rxq_index == skb_get_rx_queue(skb))
 4291			goto out;
 4292
 4293		rxqueue = dev->_rx + rxq_index;
 4294		flow_table = rcu_dereference(rxqueue->rps_flow_table);
 4295		if (!flow_table)
 4296			goto out;
 4297		flow_id = skb_get_hash(skb) & flow_table->mask;
 4298		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
 4299							rxq_index, flow_id);
 4300		if (rc < 0)
 4301			goto out;
 4302		old_rflow = rflow;
 4303		rflow = &flow_table->flows[flow_id];
 4304		rflow->filter = rc;
 4305		if (old_rflow->filter == rflow->filter)
 4306			old_rflow->filter = RPS_NO_FILTER;
 4307	out:
 4308#endif
 4309		rflow->last_qtail =
 4310			per_cpu(softnet_data, next_cpu).input_queue_head;
 4311	}
 4312
 4313	rflow->cpu = next_cpu;
 4314	return rflow;
 4315}
 4316
 4317/*
 4318 * get_rps_cpu is called from netif_receive_skb and returns the target
 4319 * CPU from the RPS map of the receiving queue for a given skb.
 4320 * rcu_read_lock must be held on entry.
 4321 */
 4322static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 4323		       struct rps_dev_flow **rflowp)
 4324{
 4325	const struct rps_sock_flow_table *sock_flow_table;
 4326	struct netdev_rx_queue *rxqueue = dev->_rx;
 4327	struct rps_dev_flow_table *flow_table;
 4328	struct rps_map *map;
 4329	int cpu = -1;
 4330	u32 tcpu;
 4331	u32 hash;
 4332
 4333	if (skb_rx_queue_recorded(skb)) {
 4334		u16 index = skb_get_rx_queue(skb);
 4335
 4336		if (unlikely(index >= dev->real_num_rx_queues)) {
 4337			WARN_ONCE(dev->real_num_rx_queues > 1,
 4338				  "%s received packet on queue %u, but number "
 4339				  "of RX queues is %u\n",
 4340				  dev->name, index, dev->real_num_rx_queues);
 4341			goto done;
 4342		}
 4343		rxqueue += index;
 4344	}
 4345
 4346	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
 4347
 4348	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 4349	map = rcu_dereference(rxqueue->rps_map);
 4350	if (!flow_table && !map)
 4351		goto done;
 4352
 4353	skb_reset_network_header(skb);
 4354	hash = skb_get_hash(skb);
 4355	if (!hash)
 4356		goto done;
 4357
 4358	sock_flow_table = rcu_dereference(rps_sock_flow_table);
 4359	if (flow_table && sock_flow_table) {
 4360		struct rps_dev_flow *rflow;
 4361		u32 next_cpu;
 4362		u32 ident;
 4363
 4364		/* First check into global flow table if there is a match */
 4365		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
 4366		if ((ident ^ hash) & ~rps_cpu_mask)
 4367			goto try_rps;
 4368
 4369		next_cpu = ident & rps_cpu_mask;
 4370
 4371		/* OK, now we know there is a match,
 4372		 * we can look at the local (per receive queue) flow table
 4373		 */
 4374		rflow = &flow_table->flows[hash & flow_table->mask];
 4375		tcpu = rflow->cpu;
 4376
 4377		/*
 4378		 * If the desired CPU (where last recvmsg was done) is
 4379		 * different from current CPU (one in the rx-queue flow
 4380		 * table entry), switch if one of the following holds:
 4381		 *   - Current CPU is unset (>= nr_cpu_ids).
 4382		 *   - Current CPU is offline.
 4383		 *   - The current CPU's queue tail has advanced beyond the
 4384		 *     last packet that was enqueued using this table entry.
 4385		 *     This guarantees that all previous packets for the flow
 4386		 *     have been dequeued, thus preserving in order delivery.
 4387		 */
 4388		if (unlikely(tcpu != next_cpu) &&
 4389		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
 4390		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
 4391		      rflow->last_qtail)) >= 0)) {
 4392			tcpu = next_cpu;
 4393			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
 4394		}
 4395
 4396		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
 4397			*rflowp = rflow;
 4398			cpu = tcpu;
 4399			goto done;
 4400		}
 4401	}
 4402
 4403try_rps:
 4404
 4405	if (map) {
 4406		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
 4407		if (cpu_online(tcpu)) {
 4408			cpu = tcpu;
 4409			goto done;
 4410		}
 4411	}
 4412
 4413done:
 4414	return cpu;
 4415}
 4416
 4417#ifdef CONFIG_RFS_ACCEL
 4418
 4419/**
 4420 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
 4421 * @dev: Device on which the filter was set
 4422 * @rxq_index: RX queue index
 4423 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
 4424 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
 4425 *
 4426 * Drivers that implement ndo_rx_flow_steer() should periodically call
 4427 * this function for each installed filter and remove the filters for
 4428 * which it returns %true.
 4429 */
 4430bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
 4431			 u32 flow_id, u16 filter_id)
 4432{
 4433	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
 4434	struct rps_dev_flow_table *flow_table;
 4435	struct rps_dev_flow *rflow;
 4436	bool expire = true;
 4437	unsigned int cpu;
 4438
 4439	rcu_read_lock();
 4440	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 4441	if (flow_table && flow_id <= flow_table->mask) {
 4442		rflow = &flow_table->flows[flow_id];
 4443		cpu = READ_ONCE(rflow->cpu);
 4444		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
 4445		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
 4446			   rflow->last_qtail) <
 4447		     (int)(10 * flow_table->mask)))
 4448			expire = false;
 4449	}
 4450	rcu_read_unlock();
 4451	return expire;
 4452}
 4453EXPORT_SYMBOL(rps_may_expire_flow);
 4454
 4455#endif /* CONFIG_RFS_ACCEL */
 4456
 4457/* Called from hardirq (IPI) context */
 4458static void rps_trigger_softirq(void *data)
 4459{
 4460	struct softnet_data *sd = data;
 4461
 4462	____napi_schedule(sd, &sd->backlog);
 4463	sd->received_rps++;
 4464}
 4465
 4466#endif /* CONFIG_RPS */
 4467
 4468/*
 4469 * Check if this softnet_data structure is another cpu one
 4470 * If yes, queue it to our IPI list and return 1
 4471 * If no, return 0
 4472 */
 4473static int rps_ipi_queued(struct softnet_data *sd)
 4474{
 4475#ifdef CONFIG_RPS
 4476	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
 4477
 4478	if (sd != mysd) {
 4479		sd->rps_ipi_next = mysd->rps_ipi_list;
 4480		mysd->rps_ipi_list = sd;
 4481
 4482		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 4483		return 1;
 4484	}
 4485#endif /* CONFIG_RPS */
 4486	return 0;
 4487}
 4488
#ifdef CONFIG_NET_FLOW_LIMIT
/* Defaults to 4096 (1 << 12).
 * NOTE(review): presumably the bucket count used when a flow limit table
 * is allocated; the allocation is not visible here - confirm.
 */
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif
 4492
 4493static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
 4494{
 4495#ifdef CONFIG_NET_FLOW_LIMIT
 4496	struct sd_flow_limit *fl;
 4497	struct softnet_data *sd;
 4498	unsigned int old_flow, new_flow;
 4499
 4500	if (qlen < (netdev_max_backlog >> 1))
 4501		return false;
 4502
 4503	sd = this_cpu_ptr(&softnet_data);
 4504
 4505	rcu_read_lock();
 4506	fl = rcu_dereference(sd->flow_limit);
 4507	if (fl) {
 4508		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
 4509		old_flow = fl->history[fl->history_head];
 4510		fl->history[fl->history_head] = new_flow;
 4511
 4512		fl->history_head++;
 4513		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
 4514
 4515		if (likely(fl->buckets[old_flow]))
 4516			fl->buckets[old_flow]--;
 4517
 4518		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
 4519			fl->count++;
 4520			rcu_read_unlock();
 4521			return true;
 4522		}
 4523	}
 4524	rcu_read_unlock();
 4525#endif
 4526	return false;
 4527}
 4528
 4529/*
 4530 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
 4531 * queue (may be a remote CPU queue).
 4532 */
 4533static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 4534			      unsigned int *qtail)
 4535{
 4536	struct softnet_data *sd;
 4537	unsigned long flags;
 4538	unsigned int qlen;
 4539
 4540	sd = &per_cpu(softnet_data, cpu);
 4541
 4542	local_irq_save(flags);
 4543
 4544	rps_lock(sd);
 4545	if (!netif_running(skb->dev))
 4546		goto drop;
 4547	qlen = skb_queue_len(&sd->input_pkt_queue);
 4548	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
 4549		if (qlen) {
 4550enqueue:
 4551			__skb_queue_tail(&sd->input_pkt_queue, skb);
 4552			input_queue_tail_incr_save(sd, qtail);
 4553			rps_unlock(sd);
 4554			local_irq_restore(flags);
 4555			return NET_RX_SUCCESS;
 4556		}
 4557
 4558		/* Schedule NAPI for backlog device
 4559		 * We can use non atomic operation since we own the queue lock
 4560		 */
 4561		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
 4562			if (!rps_ipi_queued(sd))
 4563				____napi_schedule(sd, &sd->backlog);
 4564		}
 4565		goto enqueue;
 4566	}
 4567
 4568drop:
 4569	sd->dropped++;
 4570	rps_unlock(sd);
 4571
 4572	local_irq_restore(flags);
 4573
 4574	atomic_long_inc(&skb->dev->rx_dropped);
 4575	kfree_skb(skb);
 4576	return NET_RX_DROP;
 4577}
 4578
 4579static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
 4580{
 4581	struct net_device *dev = skb->dev;
 4582	struct netdev_rx_queue *rxqueue;
 4583
 4584	rxqueue = dev->_rx;
 4585
 4586	if (skb_rx_queue_recorded(skb)) {
 4587		u16 index = skb_get_rx_queue(skb);
 4588
 4589		if (unlikely(index >= dev->real_num_rx_queues)) {
 4590			WARN_ONCE(dev->real_num_rx_queues > 1,
 4591				  "%s received packet on queue %u, but number "
 4592				  "of RX queues is %u\n",
 4593				  dev->name, index, dev->real_num_rx_queues);
 4594
 4595			return rxqueue; /* Return first rxqueue */
 4596		}
 4597		rxqueue += index;
 4598	}
 4599	return rxqueue;
 4600}
 4601
 4602static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 4603				     struct xdp_buff *xdp,
 4604				     struct bpf_prog *xdp_prog)
 4605{
 4606	struct netdev_rx_queue *rxqueue;
 4607	void *orig_data, *orig_data_end;
 4608	u32 metalen, act = XDP_DROP;
 4609	__be16 orig_eth_type;
 4610	struct ethhdr *eth;
 4611	bool orig_bcast;
 4612	int hlen, off;
 4613	u32 mac_len;
 4614
 4615	/* Reinjected packets coming from act_mirred or similar should
 4616	 * not get XDP generic processing.
 4617	 */
 4618	if (skb_is_redirected(skb))
 4619		return XDP_PASS;
 4620
 4621	/* XDP packets must be linear and must have sufficient headroom
 4622	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
 4623	 * native XDP provides, thus we need to do it here as well.
 4624	 */
 4625	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
 4626	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
 4627		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
 4628		int troom = skb->tail + skb->data_len - skb->end;
 4629
 4630		/* In case we have to go down the path and also linearize,
 4631		 * then lets do the pskb_expand_head() work just once here.
 4632		 */
 4633		if (pskb_expand_head(skb,
 4634				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
 4635				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
 4636			goto do_drop;
 4637		if (skb_linearize(skb))
 4638			goto do_drop;
 4639	}
 4640
 4641	/* The XDP program wants to see the packet starting at the MAC
 4642	 * header.
 4643	 */
 4644	mac_len = skb->data - skb_mac_header(skb);
 4645	hlen = skb_headlen(skb) + mac_len;
 4646	xdp->data = skb->data - mac_len;
 4647	xdp->data_meta = xdp->data;
 4648	xdp->data_end = xdp->data + hlen;
 4649	xdp->data_hard_start = skb->data - skb_headroom(skb);
 4650
 4651	/* SKB "head" area always have tailroom for skb_shared_info */
 4652	xdp->frame_sz  = (void *)skb_end_pointer(skb) - xdp->data_hard_start;
 4653	xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 4654
 4655	orig_data_end = xdp->data_end;
 4656	orig_data = xdp->data;
 4657	eth = (struct ethhdr *)xdp->data;
 4658	orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
 4659	orig_eth_type = eth->h_proto;
 4660
 4661	rxqueue = netif_get_rxqueue(skb);
 4662	xdp->rxq = &rxqueue->xdp_rxq;
 4663
 4664	act = bpf_prog_run_xdp(xdp_prog, xdp);
 4665
 4666	/* check if bpf_xdp_adjust_head was used */
 4667	off = xdp->data - orig_data;
 4668	if (off) {
 4669		if (off > 0)
 4670			__skb_pull(skb, off);
 4671		else if (off < 0)
 4672			__skb_push(skb, -off);
 4673
 4674		skb->mac_header += off;
 4675		skb_reset_network_header(skb);
 4676	}
 4677
 4678	/* check if bpf_xdp_adjust_tail was used */
 4679	off = xdp->data_end - orig_data_end;
 4680	if (off != 0) {
 4681		skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
 4682		skb->len += off; /* positive on grow, negative on shrink */
 4683	}
 4684
 4685	/* check if XDP changed eth hdr such SKB needs update */
 4686	eth = (struct ethhdr *)xdp->data;
 4687	if ((orig_eth_type != eth->h_proto) ||
 4688	    (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
 4689		__skb_push(skb, ETH_HLEN);
 4690		skb->protocol = eth_type_trans(skb, skb->dev);
 4691	}
 4692
 4693	switch (act) {
 4694	case XDP_REDIRECT:
 4695	case XDP_TX:
 4696		__skb_push(skb, mac_len);
 4697		break;
 4698	case XDP_PASS:
 4699		metalen = xdp->data - xdp->data_meta;
 4700		if (metalen)
 4701			skb_metadata_set(skb, metalen);
 4702		break;
 4703	default:
 4704		bpf_warn_invalid_xdp_action(act);
 4705		fallthrough;
 4706	case XDP_ABORTED:
 4707		trace_xdp_exception(skb->dev, xdp_prog, act);
 4708		fallthrough;
 4709	case XDP_DROP:
 4710	do_drop:
 4711		kfree_skb(skb);
 4712		break;
 4713	}
 4714
 4715	return act;
 4716}
 4717
 4718/* When doing generic XDP we have to bypass the qdisc layer and the
 4719 * network taps in order to match in-driver-XDP behavior.
 4720 */
 4721void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
 4722{
 4723	struct net_device *dev = skb->dev;
 4724	struct netdev_queue *txq;
 4725	bool free_skb = true;
 4726	int cpu, rc;
 4727
 4728	txq = netdev_core_pick_tx(dev, skb, NULL);
 4729	cpu = smp_processor_id();
 4730	HARD_TX_LOCK(dev, txq, cpu);
 4731	if (!netif_xmit_stopped(txq)) {
 4732		rc = netdev_start_xmit(skb, dev, txq, 0);
 4733		if (dev_xmit_complete(rc))
 4734			free_skb = false;
 4735	}
 4736	HARD_TX_UNLOCK(dev, txq);
 4737	if (free_skb) {
 4738		trace_xdp_exception(dev, xdp_prog, XDP_TX);
 4739		kfree_skb(skb);
 4740	}
 4741}
 4742
/* Static branch (default off) tested in __netif_receive_skb_core() before
 * running generic XDP on a received skb.
 */
static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
 4744
 4745int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
 4746{
 4747	if (xdp_prog) {
 4748		struct xdp_buff xdp;
 4749		u32 act;
 4750		int err;
 4751
 4752		act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
 4753		if (act != XDP_PASS) {
 4754			switch (act) {
 4755			case XDP_REDIRECT:
 4756				err = xdp_do_generic_redirect(skb->dev, skb,
 4757							      &xdp, xdp_prog);
 4758				if (err)
 4759					goto out_redir;
 4760				break;
 4761			case XDP_TX:
 4762				generic_xdp_tx(skb, xdp_prog);
 4763				break;
 4764			}
 4765			return XDP_DROP;
 4766		}
 4767	}
 4768	return XDP_PASS;
 4769out_redir:
 4770	kfree_skb(skb);
 4771	return XDP_DROP;
 4772}
 4773EXPORT_SYMBOL_GPL(do_xdp_generic);
 4774
 4775static int netif_rx_internal(struct sk_buff *skb)
 4776{
 4777	int ret;
 4778
 4779	net_timestamp_check(netdev_tstamp_prequeue, skb);
 4780
 4781	trace_netif_rx(skb);
 4782
 4783#ifdef CONFIG_RPS
 4784	if (static_branch_unlikely(&rps_needed)) {
 4785		struct rps_dev_flow voidflow, *rflow = &voidflow;
 4786		int cpu;
 4787
 4788		preempt_disable();
 4789		rcu_read_lock();
 4790
 4791		cpu = get_rps_cpu(skb->dev, skb, &rflow);
 4792		if (cpu < 0)
 4793			cpu = smp_processor_id();
 4794
 4795		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 4796
 4797		rcu_read_unlock();
 4798		preempt_enable();
 4799	} else
 4800#endif
 4801	{
 4802		unsigned int qtail;
 4803
 4804		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
 4805		put_cpu();
 4806	}
 4807	return ret;
 4808}
 4809
 4810/**
 4811 *	netif_rx	-	post buffer to the network code
 4812 *	@skb: buffer to post
 4813 *
 4814 *	This function receives a packet from a device driver and queues it for
 4815 *	the upper (protocol) levels to process.  It always succeeds. The buffer
 4816 *	may be dropped during processing for congestion control or by the
 4817 *	protocol layers.
 4818 *
 4819 *	return values:
 4820 *	NET_RX_SUCCESS	(no congestion)
 4821 *	NET_RX_DROP     (packet was dropped)
 4822 *
 4823 */
 4824
 4825int netif_rx(struct sk_buff *skb)
 4826{
 4827	int ret;
 4828
 4829	trace_netif_rx_entry(skb);
 4830
 4831	ret = netif_rx_internal(skb);
 4832	trace_netif_rx_exit(ret);
 4833
 4834	return ret;
 4835}
 4836EXPORT_SYMBOL(netif_rx);
 4837
 4838int netif_rx_ni(struct sk_buff *skb)
 4839{
 4840	int err;
 4841
 4842	trace_netif_rx_ni_entry(skb);
 4843
 4844	preempt_disable();
 4845	err = netif_rx_internal(skb);
 4846	if (local_softirq_pending())
 4847		do_softirq();
 4848	preempt_enable();
 4849	trace_netif_rx_ni_exit(err);
 4850
 4851	return err;
 4852}
 4853EXPORT_SYMBOL(netif_rx_ni);
 4854
 4855int netif_rx_any_context(struct sk_buff *skb)
 4856{
 4857	/*
 4858	 * If invoked from contexts which do not invoke bottom half
 4859	 * processing either at return from interrupt or when softrqs are
 4860	 * reenabled, use netif_rx_ni() which invokes bottomhalf processing
 4861	 * directly.
 4862	 */
 4863	if (in_interrupt())
 4864		return netif_rx(skb);
 4865	else
 4866		return netif_rx_ni(skb);
 4867}
 4868EXPORT_SYMBOL(netif_rx_any_context);
 4869
 4870static __latent_entropy void net_tx_action(struct softirq_action *h)
 4871{
 4872	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
 4873
 4874	if (sd->completion_queue) {
 4875		struct sk_buff *clist;
 4876
 4877		local_irq_disable();
 4878		clist = sd->completion_queue;
 4879		sd->completion_queue = NULL;
 4880		local_irq_enable();
 4881
 4882		while (clist) {
 4883			struct sk_buff *skb = clist;
 4884
 4885			clist = clist->next;
 4886
 4887			WARN_ON(refcount_read(&skb->users));
 4888			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
 4889				trace_consume_skb(skb);
 4890			else
 4891				trace_kfree_skb(skb, net_tx_action);
 4892
 4893			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
 4894				__kfree_skb(skb);
 4895			else
 4896				__kfree_skb_defer(skb);
 4897		}
 4898
 4899		__kfree_skb_flush();
 4900	}
 4901
 4902	if (sd->output_queue) {
 4903		struct Qdisc *head;
 4904
 4905		local_irq_disable();
 4906		head = sd->output_queue;
 4907		sd->output_queue = NULL;
 4908		sd->output_queue_tailp = &sd->output_queue;
 4909		local_irq_enable();
 4910
 4911		while (head) {
 4912			struct Qdisc *q = head;
 4913			spinlock_t *root_lock = NULL;
 4914
 4915			head = head->next_sched;
 4916
 4917			if (!(q->flags & TCQ_F_NOLOCK)) {
 4918				root_lock = qdisc_lock(q);
 4919				spin_lock(root_lock);
 4920			}
 4921			/* We need to make sure head->next_sched is read
 4922			 * before clearing __QDISC_STATE_SCHED
 4923			 */
 4924			smp_mb__before_atomic();
 4925			clear_bit(__QDISC_STATE_SCHED, &q->state);
 4926			qdisc_run(q);
 4927			if (root_lock)
 4928				spin_unlock(root_lock);
 4929		}
 4930	}
 4931
 4932	xfrm_dev_backlog(sd);
 4933}
 4934
#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
/* This hook is defined here for ATM LANE.
 * It is a function pointer taking a device and an address; nothing in
 * this part of the file assigns or calls it.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
 4941
 4942static inline struct sk_buff *
 4943sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 4944		   struct net_device *orig_dev, bool *another)
 4945{
 4946#ifdef CONFIG_NET_CLS_ACT
 4947	struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
 4948	struct tcf_result cl_res;
 4949
 4950	/* If there's at least one ingress present somewhere (so
 4951	 * we get here via enabled static key), remaining devices
 4952	 * that are not configured with an ingress qdisc will bail
 4953	 * out here.
 4954	 */
 4955	if (!miniq)
 4956		return skb;
 4957
 4958	if (*pt_prev) {
 4959		*ret = deliver_skb(skb, *pt_prev, orig_dev);
 4960		*pt_prev = NULL;
 4961	}
 4962
 4963	qdisc_skb_cb(skb)->pkt_len = skb->len;
 4964	qdisc_skb_cb(skb)->mru = 0;
 4965	skb->tc_at_ingress = 1;
 4966	mini_qdisc_bstats_cpu_update(miniq, skb);
 4967
 4968	switch (tcf_classify_ingress(skb, miniq->block, miniq->filter_list,
 4969				     &cl_res, false)) {
 4970	case TC_ACT_OK:
 4971	case TC_ACT_RECLASSIFY:
 4972		skb->tc_index = TC_H_MIN(cl_res.classid);
 4973		break;
 4974	case TC_ACT_SHOT:
 4975		mini_qdisc_qstats_cpu_drop(miniq);
 4976		kfree_skb(skb);
 4977		return NULL;
 4978	case TC_ACT_STOLEN:
 4979	case TC_ACT_QUEUED:
 4980	case TC_ACT_TRAP:
 4981		consume_skb(skb);
 4982		return NULL;
 4983	case TC_ACT_REDIRECT:
 4984		/* skb_mac_header check was done by cls/act_bpf, so
 4985		 * we can safely push the L2 header back before
 4986		 * redirecting to another netdev
 4987		 */
 4988		__skb_push(skb, skb->mac_len);
 4989		if (skb_do_redirect(skb) == -EAGAIN) {
 4990			__skb_pull(skb, skb->mac_len);
 4991			*another = true;
 4992			break;
 4993		}
 4994		return NULL;
 4995	case TC_ACT_CONSUMED:
 4996		return NULL;
 4997	default:
 4998		break;
 4999	}
 5000#endif /* CONFIG_NET_CLS_ACT */
 5001	return skb;
 5002}
 5003
 5004/**
 5005 *	netdev_is_rx_handler_busy - check if receive handler is registered
 5006 *	@dev: device to check
 5007 *
 5008 *	Check if a receive handler is already registered for a given device.
 5009 *	Return true if there one.
 5010 *
 5011 *	The caller must hold the rtnl_mutex.
 5012 */
 5013bool netdev_is_rx_handler_busy(struct net_device *dev)
 5014{
 5015	ASSERT_RTNL();
 5016	return dev && rtnl_dereference(dev->rx_handler);
 5017}
 5018EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
 5019
 5020/**
 5021 *	netdev_rx_handler_register - register receive handler
 5022 *	@dev: device to register a handler for
 5023 *	@rx_handler: receive handler to register
 5024 *	@rx_handler_data: data pointer that is used by rx handler
 5025 *
 5026 *	Register a receive handler for a device. This handler will then be
 5027 *	called from __netif_receive_skb. A negative errno code is returned
 5028 *	on a failure.
 5029 *
 5030 *	The caller must hold the rtnl_mutex.
 5031 *
 5032 *	For a general description of rx_handler, see enum rx_handler_result.
 5033 */
 5034int netdev_rx_handler_register(struct net_device *dev,
 5035			       rx_handler_func_t *rx_handler,
 5036			       void *rx_handler_data)
 5037{
 5038	if (netdev_is_rx_handler_busy(dev))
 5039		return -EBUSY;
 5040
 5041	if (dev->priv_flags & IFF_NO_RX_HANDLER)
 5042		return -EINVAL;
 5043
 5044	/* Note: rx_handler_data must be set before rx_handler */
 5045	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
 5046	rcu_assign_pointer(dev->rx_handler, rx_handler);
 5047
 5048	return 0;
 5049}
 5050EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
 5051
 5052/**
 5053 *	netdev_rx_handler_unregister - unregister receive handler
 5054 *	@dev: device to unregister a handler from
 5055 *
 5056 *	Unregister a receive handler from a device.
 5057 *
 5058 *	The caller must hold the rtnl_mutex.
 5059 */
 5060void netdev_rx_handler_unregister(struct net_device *dev)
 5061{
 5062
 5063	ASSERT_RTNL();
 5064	RCU_INIT_POINTER(dev->rx_handler, NULL);
 5065	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
 5066	 * section has a guarantee to see a non NULL rx_handler_data
 5067	 * as well.
 5068	 */
 5069	synchronize_net();
 5070	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
 5071}
 5072EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
 5073
 5074/*
 5075 * Limit the use of PFMEMALLOC reserves to those protocols that implement
 5076 * the special handling of PFMEMALLOC skbs.
 5077 */
 5078static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
 5079{
 5080	switch (skb->protocol) {
 5081	case htons(ETH_P_ARP):
 5082	case htons(ETH_P_IP):
 5083	case htons(ETH_P_IPV6):
 5084	case htons(ETH_P_8021Q):
 5085	case htons(ETH_P_8021AD):
 5086		return true;
 5087	default:
 5088		return false;
 5089	}
 5090}
 5091
 5092static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
 5093			     int *ret, struct net_device *orig_dev)
 5094{
 5095	if (nf_hook_ingress_active(skb)) {
 5096		int ingress_retval;
 5097
 5098		if (*pt_prev) {
 5099			*ret = deliver_skb(skb, *pt_prev, orig_dev);
 5100			*pt_prev = NULL;
 5101		}
 5102
 5103		rcu_read_lock();
 5104		ingress_retval = nf_hook_ingress(skb);
 5105		rcu_read_unlock();
 5106		return ingress_retval;
 5107	}
 5108	return 0;
 5109}
 5110
 5111static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
 5112				    struct packet_type **ppt_prev)
 5113{
 5114	struct packet_type *ptype, *pt_prev;
 5115	rx_handler_func_t *rx_handler;
 5116	struct sk_buff *skb = *pskb;
 5117	struct net_device *orig_dev;
 5118	bool deliver_exact = false;
 5119	int ret = NET_RX_DROP;
 5120	__be16 type;
 5121
 5122	net_timestamp_check(!netdev_tstamp_prequeue, skb);
 5123
 5124	trace_netif_receive_skb(skb);
 5125
 5126	orig_dev = skb->dev;
 5127
 5128	skb_reset_network_header(skb);
 5129	if (!skb_transport_header_was_set(skb))
 5130		skb_reset_transport_header(skb);
 5131	skb_reset_mac_len(skb);
 5132
 5133	pt_prev = NULL;
 5134
 5135another_round:
 5136	skb->skb_iif = skb->dev->ifindex;
 5137
 5138	__this_cpu_inc(softnet_data.processed);
 5139
 5140	if (static_branch_unlikely(&generic_xdp_needed_key)) {
 5141		int ret2;
 5142
 5143		preempt_disable();
 5144		ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
 5145		preempt_enable();
 5146
 5147		if (ret2 != XDP_PASS) {
 5148			ret = NET_RX_DROP;
 5149			goto out;
 5150		}
 5151		skb_reset_mac_len(skb);
 5152	}
 5153
 5154	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
 5155	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
 5156		skb = skb_vlan_untag(skb);
 5157		if (unlikely(!skb))
 5158			goto out;
 5159	}
 5160
 5161	if (skb_skip_tc_classify(skb))
 5162		goto skip_classify;
 5163
 5164	if (pfmemalloc)
 5165		goto skip_taps;
 5166
 5167	list_for_each_entry_rcu(ptype, &ptype_all, list) {
 5168		if (pt_prev)
 5169			ret = deliver_skb(skb, pt_prev, orig_dev);
 5170		pt_prev = ptype;
 5171	}
 5172
 5173	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
 5174		if (pt_prev)
 5175			ret = deliver_skb(skb, pt_prev, orig_dev);
 5176		pt_prev = ptype;
 5177	}
 5178
 5179skip_taps:
 5180#ifdef CONFIG_NET_INGRESS
 5181	if (static_branch_unlikely(&ingress_needed_key)) {
 5182		bool another = false;
 5183
 5184		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
 5185					 &another);
 5186		if (another)
 5187			goto another_round;
 5188		if (!skb)
 5189			goto out;
 5190
 5191		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
 5192			goto out;
 5193	}
 5194#endif
 5195	skb_reset_redirect(skb);
 5196skip_classify:
 5197	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
 5198		goto drop;
 5199
 5200	if (skb_vlan_tag_present(skb)) {
 5201		if (pt_prev) {
 5202			ret = deliver_skb(skb, pt_prev, orig_dev);
 5203			pt_prev = NULL;
 5204		}
 5205		if (vlan_do_receive(&skb))
 5206			goto another_round;
 5207		else if (unlikely(!skb))
 5208			goto out;
 5209	}
 5210
 5211	rx_handler = rcu_dereference(skb->dev->rx_handler);
 5212	if (rx_handler) {
 5213		if (pt_prev) {
 5214			ret = deliver_skb(skb, pt_prev, orig_dev);
 5215			pt_prev = NULL;
 5216		}
 5217		switch (rx_handler(&skb)) {
 5218		case RX_HANDLER_CONSUMED:
 5219			ret = NET_RX_SUCCESS;
 5220			goto out;
 5221		case RX_HANDLER_ANOTHER:
 5222			goto another_round;
 5223		case RX_HANDLER_EXACT:
 5224			deliver_exact = true;
 5225		case RX_HANDLER_PASS:
 5226			break;
 5227		default:
 5228			BUG();
 5229		}
 5230	}
 5231
 5232	if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
 5233check_vlan_id:
 5234		if (skb_vlan_tag_get_id(skb)) {
 5235			/* Vlan id is non 0 and vlan_do_receive() above couldn't
 5236			 * find vlan device.
 5237			 */
 5238			skb->pkt_type = PACKET_OTHERHOST;
 5239		} else if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
 5240			   skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
 5241			/* Outer header is 802.1P with vlan 0, inner header is
 5242			 * 802.1Q or 802.1AD and vlan_do_receive() above could
 5243			 * not find vlan dev for vlan id 0.
 5244			 */
 5245			__vlan_hwaccel_clear_tag(skb);
 5246			skb = skb_vlan_untag(skb);
 5247			if (unlikely(!skb))
 5248				goto out;
 5249			if (vlan_do_receive(&skb))
 5250				/* After stripping off 802.1P header with vlan 0
 5251				 * vlan dev is found for inner header.
 5252				 */
 5253				goto another_round;
 5254			else if (unlikely(!skb))
 5255				goto out;
 5256			else
 5257				/* We have stripped outer 802.1P vlan 0 header.
 5258				 * But could not find vlan dev.
 5259				 * check again for vlan id to set OTHERHOST.
 5260				 */
 5261				goto check_vlan_id;
 5262		}
 5263		/* Note: we might in the future use prio bits
 5264		 * and set skb->priority like in vlan_do_receive()
 5265		 * For the time being, just ignore Priority Code Point
 5266		 */
 5267		__vlan_hwaccel_clear_tag(skb);
 5268	}
 5269
 5270	type = skb->protocol;
 5271
 5272	/* deliver only exact match when indicated */
 5273	if (likely(!deliver_exact)) {
 5274		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 5275				       &ptype_base[ntohs(type) &
 5276						   PTYPE_HASH_MASK]);
 5277	}
 5278
 5279	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 5280			       &orig_dev->ptype_specific);
 5281
 5282	if (unlikely(skb->dev != orig_dev)) {
 5283		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 5284				       &skb->dev->ptype_specific);
 5285	}
 5286
 5287	if (pt_prev) {
 5288		if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
 5289			goto drop;
 5290		*ppt_prev = pt_prev;
 5291	} else {
 5292drop:
 5293		if (!deliver_exact)
 5294			atomic_long_inc(&skb->dev->rx_dropped);
 5295		else
 5296			atomic_long_inc(&skb->dev->rx_nohandler);
 5297		kfree_skb(skb);
 5298		/* Jamal, now you will not able to escape explaining
 5299		 * me how you were going to use this. :-)
 5300		 */
 5301		ret = NET_RX_DROP;
 5302	}
 5303
 5304out:
 5305	/* The invariant here is that if *ppt_prev is not NULL
 5306	 * then skb should also be non-NULL.
 5307	 *
 5308	 * Apparently *ppt_prev assignment above holds this invariant due to
 5309	 * skb dereferencing near it.
 5310	 */
 5311	*pskb = skb;
 5312	return ret;
 5313}
 5314
 5315static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
 5316{
 5317	struct net_device *orig_dev = skb->dev;
 5318	struct packet_type *pt_prev = NULL;
 5319	int ret;
 5320
 5321	ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
 5322	if (pt_prev)
 5323		ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
 5324					 skb->dev, pt_prev, orig_dev);
 5325	return ret;
 5326}
 5327
 5328/**
 5329 *	netif_receive_skb_core - special purpose version of netif_receive_skb
 5330 *	@skb: buffer to process
 5331 *
 5332 *	More direct receive version of netif_receive_skb().  It should
 5333 *	only be used by callers that have a need to skip RPS and Generic XDP.
 5334 *	Caller must also take care of handling if ``(page_is_)pfmemalloc``.
 5335 *
 5336 *	This function may only be called from softirq context and interrupts
 5337 *	should be enabled.
 5338 *
 5339 *	Return values (usually ignored):
 5340 *	NET_RX_SUCCESS: no congestion
 5341 *	NET_RX_DROP: packet was dropped
 5342 */
 5343int netif_receive_skb_core(struct sk_buff *skb)
 5344{
 5345	int ret;
 5346
 5347	rcu_read_lock();
 5348	ret = __netif_receive_skb_one_core(skb, false);
 5349	rcu_read_unlock();
 5350
 5351	return ret;
 5352}
 5353EXPORT_SYMBOL(netif_receive_skb_core);
 5354
 5355static inline void __netif_receive_skb_list_ptype(struct list_head *head,
 5356						  struct packet_type *pt_prev,
 5357						  struct net_device *orig_dev)
 5358{
 5359	struct sk_buff *skb, *next;
 5360
 5361	if (!pt_prev)
 5362		return;
 5363	if (list_empty(head))
 5364		return;
 5365	if (pt_prev->list_func != NULL)
 5366		INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
 5367				   ip_list_rcv, head, pt_prev, orig_dev);
 5368	else
 5369		list_for_each_entry_safe(skb, next, head, list) {
 5370			skb_list_del_init(skb);
 5371			pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 5372		}
 5373}
 5374
 5375static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
 5376{
 5377	/* Fast-path assumptions:
 5378	 * - There is no RX handler.
 5379	 * - Only one packet_type matches.
 5380	 * If either of these fails, we will end up doing some per-packet
 5381	 * processing in-line, then handling the 'last ptype' for the whole
 5382	 * sublist.  This can't cause out-of-order delivery to any single ptype,
 5383	 * because the 'last ptype' must be constant across the sublist, and all
 5384	 * other ptypes are handled per-packet.
 5385	 */
 5386	/* Current (common) ptype of sublist */
 5387	struct packet_type *pt_curr = NULL;
 5388	/* Current (common) orig_dev of sublist */
 5389	struct net_device *od_curr = NULL;
 5390	struct list_head sublist;
 5391	struct sk_buff *skb, *next;
 5392
 5393	INIT_LIST_HEAD(&sublist);
 5394	list_for_each_entry_safe(skb, next, head, list) {
 5395		struct net_device *orig_dev = skb->dev;
 5396		struct packet_type *pt_prev = NULL;
 5397
 5398		skb_list_del_init(skb);
 5399		__netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
 5400		if (!pt_prev)
 5401			continue;
 5402		if (pt_curr != pt_prev || od_curr != orig_dev) {
 5403			/* dispatch old sublist */
 5404			__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
 5405			/* start new sublist */
 5406			INIT_LIST_HEAD(&sublist);
 5407			pt_curr = pt_prev;
 5408			od_curr = orig_dev;
 5409		}
 5410		list_add_tail(&skb->list, &sublist);
 5411	}
 5412
 5413	/* dispatch final sublist */
 5414	__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
 5415}
 5416
 5417static int __netif_receive_skb(struct sk_buff *skb)
 5418{
 5419	int ret;
 5420
 5421	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
 5422		unsigned int noreclaim_flag;
 5423
 5424		/*
 5425		 * PFMEMALLOC skbs are special, they should
 5426		 * - be delivered to SOCK_MEMALLOC sockets only
 5427		 * - stay away from userspace
 5428		 * - have bounded memory usage
 5429		 *
 5430		 * Use PF_MEMALLOC as this saves us from propagating the allocation
 5431		 * context down to all allocation sites.
 5432		 */
 5433		noreclaim_flag = memalloc_noreclaim_save();
 5434		ret = __netif_receive_skb_one_core(skb, true);
 5435		memalloc_noreclaim_restore(noreclaim_flag);
 5436	} else
 5437		ret = __netif_receive_skb_one_core(skb, false);
 5438
 5439	return ret;
 5440}
 5441
 5442static void __netif_receive_skb_list(struct list_head *head)
 5443{
 5444	unsigned long noreclaim_flag = 0;
 5445	struct sk_buff *skb, *next;
 5446	bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
 5447
 5448	list_for_each_entry_safe(skb, next, head, list) {
 5449		if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
 5450			struct list_head sublist;
 5451
 5452			/* Handle the previous sublist */
 5453			list_cut_before(&sublist, head, &skb->list);
 5454			if (!list_empty(&sublist))
 5455				__netif_receive_skb_list_core(&sublist, pfmemalloc);
 5456			pfmemalloc = !pfmemalloc;
 5457			/* See comments in __netif_receive_skb */
 5458			if (pfmemalloc)
 5459				noreclaim_flag = memalloc_noreclaim_save();
 5460			else
 5461				memalloc_noreclaim_restore(noreclaim_flag);
 5462		}
 5463	}
 5464	/* Handle the remaining sublist */
 5465	if (!list_empty(head))
 5466		__netif_receive_skb_list_core(head, pfmemalloc);
 5467	/* Restore pflags */
 5468	if (pfmemalloc)
 5469		memalloc_noreclaim_restore(noreclaim_flag);
 5470}
 5471
 5472static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
 5473{
 5474	struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
 5475	struct bpf_prog *new = xdp->prog;
 5476	int ret = 0;
 5477
 5478	if (new) {
 5479		u32 i;
 5480
 5481		mutex_lock(&new->aux->used_maps_mutex);
 5482
 5483		/* generic XDP does not work with DEVMAPs that can
 5484		 * have a bpf_prog installed on an entry
 5485		 */
 5486		for (i = 0; i < new->aux->used_map_cnt; i++) {
 5487			if (dev_map_can_have_prog(new->aux->used_maps[i]) ||
 5488			    cpu_map_prog_allowed(new->aux->used_maps[i])) {
 5489				mutex_unlock(&new->aux->used_maps_mutex);
 5490				return -EINVAL;
 5491			}
 5492		}
 5493
 5494		mutex_unlock(&new->aux->used_maps_mutex);
 5495	}
 5496
 5497	switch (xdp->command) {
 5498	case XDP_SETUP_PROG:
 5499		rcu_assign_pointer(dev->xdp_prog, new);
 5500		if (old)
 5501			bpf_prog_put(old);
 5502
 5503		if (old && !new) {
 5504			static_branch_dec(&generic_xdp_needed_key);
 5505		} else if (new && !old) {
 5506			static_branch_inc(&generic_xdp_needed_key);
 5507			dev_disable_lro(dev);
 5508			dev_disable_gro_hw(dev);
 5509		}
 5510		break;
 5511
 5512	default:
 5513		ret = -EINVAL;
 5514		break;
 5515	}
 5516
 5517	return ret;
 5518}
 5519
 5520static int netif_receive_skb_internal(struct sk_buff *skb)
 5521{
 5522	int ret;
 5523
 5524	net_timestamp_check(netdev_tstamp_prequeue, skb);
 5525
 5526	if (skb_defer_rx_timestamp(skb))
 5527		return NET_RX_SUCCESS;
 5528
 5529	rcu_read_lock();
 5530#ifdef CONFIG_RPS
 5531	if (static_branch_unlikely(&rps_needed)) {
 5532		struct rps_dev_flow voidflow, *rflow = &voidflow;
 5533		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
 5534
 5535		if (cpu >= 0) {
 5536			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 5537			rcu_read_unlock();
 5538			return ret;
 5539		}
 5540	}
 5541#endif
 5542	ret = __netif_receive_skb(skb);
 5543	rcu_read_unlock();
 5544	return ret;
 5545}
 5546
 5547static void netif_receive_skb_list_internal(struct list_head *head)
 5548{
 5549	struct sk_buff *skb, *next;
 5550	struct list_head sublist;
 5551
 5552	INIT_LIST_HEAD(&sublist);
 5553	list_for_each_entry_safe(skb, next, head, list) {
 5554		net_timestamp_check(netdev_tstamp_prequeue, skb);
 5555		skb_list_del_init(skb);
 5556		if (!skb_defer_rx_timestamp(skb))
 5557			list_add_tail(&skb->list, &sublist);
 5558	}
 5559	list_splice_init(&sublist, head);
 5560
 5561	rcu_read_lock();
 5562#ifdef CONFIG_RPS
 5563	if (static_branch_unlikely(&rps_needed)) {
 5564		list_for_each_entry_safe(skb, next, head, list) {
 5565			struct rps_dev_flow voidflow, *rflow = &voidflow;
 5566			int cpu = get_rps_cpu(skb->dev, skb, &rflow);
 5567
 5568			if (cpu >= 0) {
 5569				/* Will be handled, remove from list */
 5570				skb_list_del_init(skb);
 5571				enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 5572			}
 5573		}
 5574	}
 5575#endif
 5576	__netif_receive_skb_list(head);
 5577	rcu_read_unlock();
 5578}
 5579
 5580/**
 5581 *	netif_receive_skb - process receive buffer from network
 5582 *	@skb: buffer to process
 5583 *
 5584 *	netif_receive_skb() is the main receive data processing function.
 5585 *	It always succeeds. The buffer may be dropped during processing
 5586 *	for congestion control or by the protocol layers.
 5587 *
 5588 *	This function may only be called from softirq context and interrupts
 5589 *	should be enabled.
 5590 *
 5591 *	Return values (usually ignored):
 5592 *	NET_RX_SUCCESS: no congestion
 5593 *	NET_RX_DROP: packet was dropped
 5594 */
 5595int netif_receive_skb(struct sk_buff *skb)
 5596{
 5597	int ret;
 5598
 5599	trace_netif_receive_skb_entry(skb);
 5600
 5601	ret = netif_receive_skb_internal(skb);
 5602	trace_netif_receive_skb_exit(ret);
 5603
 5604	return ret;
 5605}
 5606EXPORT_SYMBOL(netif_receive_skb);
 5607
 5608/**
 5609 *	netif_receive_skb_list - process many receive buffers from network
 5610 *	@head: list of skbs to process.
 5611 *
 5612 *	Since return value of netif_receive_skb() is normally ignored, and
 5613 *	wouldn't be meaningful for a list, this function returns void.
 5614 *
 5615 *	This function may only be called from softirq context and interrupts
 5616 *	should be enabled.
 5617 */
 5618void netif_receive_skb_list(struct list_head *head)
 5619{
 5620	struct sk_buff *skb;
 5621
 5622	if (list_empty(head))
 5623		return;
 5624	if (trace_netif_receive_skb_list_entry_enabled()) {
 5625		list_for_each_entry(skb, head, list)
 5626			trace_netif_receive_skb_list_entry(skb);
 5627	}
 5628	netif_receive_skb_list_internal(head);
 5629	trace_netif_receive_skb_list_exit(0);
 5630}
 5631EXPORT_SYMBOL(netif_receive_skb_list);
 5632
 5633static DEFINE_PER_CPU(struct work_struct, flush_works);
 5634
 5635/* Network device is going away, flush any packets still pending */
 5636static void flush_backlog(struct work_struct *work)
 5637{
 5638	struct sk_buff *skb, *tmp;
 5639	struct softnet_data *sd;
 5640
 5641	local_bh_disable();
 5642	sd = this_cpu_ptr(&softnet_data);
 5643
 5644	local_irq_disable();
 5645	rps_lock(sd);
 5646	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
 5647		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 5648			__skb_unlink(skb, &sd->input_pkt_queue);
 5649			dev_kfree_skb_irq(skb);
 5650			input_queue_head_incr(sd);
 5651		}
 5652	}
 5653	rps_unlock(sd);
 5654	local_irq_enable();
 5655
 5656	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
 5657		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 5658			__skb_unlink(skb, &sd->process_queue);
 5659			kfree_skb(skb);
 5660			input_queue_head_incr(sd);
 5661		}
 5662	}
 5663	local_bh_enable();
 5664}
 5665
 5666static bool flush_required(int cpu)
 5667{
 5668#if IS_ENABLED(CONFIG_RPS)
 5669	struct softnet_data *sd = &per_cpu(softnet_data, cpu);
 5670	bool do_flush;
 5671
 5672	local_irq_disable();
 5673	rps_lock(sd);
 5674
 5675	/* as insertion into process_queue happens with the rps lock held,
 5676	 * process_queue access may race only with dequeue
 5677	 */
 5678	do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
 5679		   !skb_queue_empty_lockless(&sd->process_queue);
 5680	rps_unlock(sd);
 5681	local_irq_enable();
 5682
 5683	return do_flush;
 5684#endif
 5685	/* without RPS we can't safely check input_pkt_queue: during a
 5686	 * concurrent remote skb_queue_splice() we can detect as empty both
 5687	 * input_pkt_queue and process_queue even if the latter could end-up
 5688	 * containing a lot of packets.
 5689	 */
 5690	return true;
 5691}
 5692
 5693static void flush_all_backlogs(void)
 5694{
 5695	static cpumask_t flush_cpus;
 5696	unsigned int cpu;
 5697
 5698	/* since we are under rtnl lock protection we can use static data
 5699	 * for the cpumask and avoid allocating on stack the possibly
 5700	 * large mask
 5701	 */
 5702	ASSERT_RTNL();
 5703
 5704	get_online_cpus();
 5705
 5706	cpumask_clear(&flush_cpus);
 5707	for_each_online_cpu(cpu) {
 5708		if (flush_required(cpu)) {
 5709			queue_work_on(cpu, system_highpri_wq,
 5710				      per_cpu_ptr(&flush_works, cpu));
 5711			cpumask_set_cpu(cpu, &flush_cpus);
 5712		}
 5713	}
 5714
 5715	/* we can have in flight packet[s] on the cpus we are not flushing,
 5716	 * synchronize_net() in rollback_registered_many() will take care of
 5717	 * them
 5718	 */
 5719	for_each_cpu(cpu, &flush_cpus)
 5720		flush_work(per_cpu_ptr(&flush_works, cpu));
 5721
 5722	put_online_cpus();
 5723}
 5724
 5725/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
 5726static void gro_normal_list(struct napi_struct *napi)
 5727{
 5728	if (!napi->rx_count)
 5729		return;
 5730	netif_receive_skb_list_internal(&napi->rx_list);
 5731	INIT_LIST_HEAD(&napi->rx_list);
 5732	napi->rx_count = 0;
 5733}
 5734
 5735/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded,
 5736 * pass the whole batch up to the stack.
 5737 */
 5738static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb)
 5739{
 5740	list_add_tail(&skb->list, &napi->rx_list);
 5741	if (++napi->rx_count >= gro_normal_batch)
 5742		gro_normal_list(napi);
 5743}
 5744
 5745INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int));
 5746INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int));
 5747static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
 5748{
 5749	struct packet_offload *ptype;
 5750	__be16 type = skb->protocol;
 5751	struct list_head *head = &offload_base;
 5752	int err = -ENOENT;
 5753
 5754	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
 5755
 5756	if (NAPI_GRO_CB(skb)->count == 1) {
 5757		skb_shinfo(skb)->gso_size = 0;
 5758		goto out;
 5759	}
 5760
 5761	rcu_read_lock();
 5762	list_for_each_entry_rcu(ptype, head, list) {
 5763		if (ptype->type != type || !ptype->callbacks.gro_complete)
 5764			continue;
 5765
 5766		err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
 5767					 ipv6_gro_complete, inet_gro_complete,
 5768					 skb, 0);
 5769		break;
 5770	}
 5771	rcu_read_unlock();
 5772
 5773	if (err) {
 5774		WARN_ON(&ptype->list == head);
 5775		kfree_skb(skb);
 5776		return NET_RX_SUCCESS;
 5777	}
 5778
 5779out:
 5780	gro_normal_one(napi, skb);
 5781	return NET_RX_SUCCESS;
 5782}
 5783
 5784static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
 5785				   bool flush_old)
 5786{
 5787	struct list_head *head = &napi->gro_hash[index].list;
 5788	struct sk_buff *skb, *p;
 5789
 5790	list_for_each_entry_safe_reverse(skb, p, head, list) {
 5791		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
 5792			return;
 5793		skb_list_del_init(skb);
 5794		napi_gro_complete(napi, skb);
 5795		napi->gro_hash[index].count--;
 5796	}
 5797
 5798	if (!napi->gro_hash[index].count)
 5799		__clear_bit(index, &napi->gro_bitmask);
 5800}
 5801
 5802/* napi->gro_hash[].list contains packets ordered by age.
 5803 * youngest packets at the head of it.
 5804 * Complete skbs in reverse order to reduce latencies.
 5805 */
 5806void napi_gro_flush(struct napi_struct *napi, bool flush_old)
 5807{
 5808	unsigned long bitmask = napi->gro_bitmask;
 5809	unsigned int i, base = ~0U;
 5810
 5811	while ((i = ffs(bitmask)) != 0) {
 5812		bitmask >>= i;
 5813		base += i;
 5814		__napi_gro_flush_chain(napi, base, flush_old);
 5815	}
 5816}
 5817EXPORT_SYMBOL(napi_gro_flush);
 5818
 5819static struct list_head *gro_list_prepare(struct napi_struct *napi,
 5820					  struct sk_buff *skb)
 5821{
 5822	unsigned int maclen = skb->dev->hard_header_len;
 5823	u32 hash = skb_get_hash_raw(skb);
 5824	struct list_head *head;
 5825	struct sk_buff *p;
 5826
 5827	head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
 5828	list_for_each_entry(p, head, list) {
 5829		unsigned long diffs;
 5830
 5831		NAPI_GRO_CB(p)->flush = 0;
 5832
 5833		if (hash != skb_get_hash_raw(p)) {
 5834			NAPI_GRO_CB(p)->same_flow = 0;
 5835			continue;
 5836		}
 5837
 5838		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
 5839		diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
 5840		if (skb_vlan_tag_present(p))
 5841			diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb);
 5842		diffs |= skb_metadata_dst_cmp(p, skb);
 5843		diffs |= skb_metadata_differs(p, skb);
 5844		if (maclen == ETH_HLEN)
 5845			diffs |= compare_ether_header(skb_mac_header(p),
 5846						      skb_mac_header(skb));
 5847		else if (!diffs)
 5848			diffs = memcmp(skb_mac_header(p),
 5849				       skb_mac_header(skb),
 5850				       maclen);
 5851		NAPI_GRO_CB(p)->same_flow = !diffs;
 5852	}
 5853
 5854	return head;
 5855}
 5856
 5857static void skb_gro_reset_offset(struct sk_buff *skb)
 5858{
 5859	const struct skb_shared_info *pinfo = skb_shinfo(skb);
 5860	const skb_frag_t *frag0 = &pinfo->frags[0];
 5861
 5862	NAPI_GRO_CB(skb)->data_offset = 0;
 5863	NAPI_GRO_CB(skb)->frag0 = NULL;
 5864	NAPI_GRO_CB(skb)->frag0_len = 0;
 5865
 5866	if (!skb_headlen(skb) && pinfo->nr_frags &&
 5867	    !PageHighMem(skb_frag_page(frag0))) {
 5868		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
 5869		NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
 5870						    skb_frag_size(frag0),
 5871						    skb->end - skb->tail);
 5872	}
 5873}
 5874
 5875static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
 5876{
 5877	struct skb_shared_info *pinfo = skb_shinfo(skb);
 5878
 5879	BUG_ON(skb->end - skb->tail < grow);
 5880
 5881	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
 5882
 5883	skb->data_len -= grow;
 5884	skb->tail += grow;
 5885
 5886	skb_frag_off_add(&pinfo->frags[0], grow);
 5887	skb_frag_size_sub(&pinfo->frags[0], grow);
 5888
 5889	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
 5890		skb_frag_unref(skb, 0);
 5891		memmove(pinfo->frags, pinfo->frags + 1,
 5892			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
 5893	}
 5894}
 5895
 5896static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
 5897{
 5898	struct sk_buff *oldest;
 5899
 5900	oldest = list_last_entry(head, struct sk_buff, list);
 5901
 5902	/* We are called with head length >= MAX_GRO_SKBS, so this is
 5903	 * impossible.
 5904	 */
 5905	if (WARN_ON_ONCE(!oldest))
 5906		return;
 5907
 5908	/* Do not adjust napi->gro_hash[].count, caller is adding a new
 5909	 * SKB to the chain.
 5910	 */
 5911	skb_list_del_init(oldest);
 5912	napi_gro_complete(napi, oldest);
 5913}
 5914
 5915INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *,
 5916							   struct sk_buff *));
 5917INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *,
 5918							   struct sk_buff *));
 5919static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 5920{
 5921	u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
 5922	struct list_head *head = &offload_base;
 5923	struct packet_offload *ptype;
 5924	__be16 type = skb->protocol;
 5925	struct list_head *gro_head;
 5926	struct sk_buff *pp = NULL;
 5927	enum gro_result ret;
 5928	int same_flow;
 5929	int grow;
 5930
 5931	if (netif_elide_gro(skb->dev))
 5932		goto normal;
 5933
 5934	gro_head = gro_list_prepare(napi, skb);
 5935
 5936	rcu_read_lock();
 5937	list_for_each_entry_rcu(ptype, head, list) {
 5938		if (ptype->type != type || !ptype->callbacks.gro_receive)
 5939			continue;
 5940
 5941		skb_set_network_header(skb, skb_gro_offset(skb));
 5942		skb_reset_mac_len(skb);
 5943		NAPI_GRO_CB(skb)->same_flow = 0;
 5944		NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
 5945		NAPI_GRO_CB(skb)->free = 0;
 5946		NAPI_GRO_CB(skb)->encap_mark = 0;
 5947		NAPI_GRO_CB(skb)->recursion_counter = 0;
 5948		NAPI_GRO_CB(skb)->is_fou = 0;
 5949		NAPI_GRO_CB(skb)->is_atomic = 1;
 5950		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
 5951
 5952		/* Setup for GRO checksum validation */
 5953		switch (skb->ip_summed) {
 5954		case CHECKSUM_COMPLETE:
 5955			NAPI_GRO_CB(skb)->csum = skb->csum;
 5956			NAPI_GRO_CB(skb)->csum_valid = 1;
 5957			NAPI_GRO_CB(skb)->csum_cnt = 0;
 5958			break;
 5959		case CHECKSUM_UNNECESSARY:
 5960			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
 5961			NAPI_GRO_CB(skb)->csum_valid = 0;
 5962			break;
 5963		default:
 5964			NAPI_GRO_CB(skb)->csum_cnt = 0;
 5965			NAPI_GRO_CB(skb)->csum_valid = 0;
 5966		}
 5967
 5968		pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
 5969					ipv6_gro_receive, inet_gro_receive,
 5970					gro_head, skb);
 5971		break;
 5972	}
 5973	rcu_read_unlock();
 5974
 5975	if (&ptype->list == head)
 5976		goto normal;
 5977
 5978	if (PTR_ERR(pp) == -EINPROGRESS) {
 5979		ret = GRO_CONSUMED;
 5980		goto ok;
 5981	}
 5982
 5983	same_flow = NAPI_GRO_CB(skb)->same_flow;
 5984	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
 5985
 5986	if (pp) {
 5987		skb_list_del_init(pp);
 5988		napi_gro_complete(napi, pp);
 5989		napi->gro_hash[hash].count--;
 5990	}
 5991
 5992	if (same_flow)
 5993		goto ok;
 5994
 5995	if (NAPI_GRO_CB(skb)->flush)
 5996		goto normal;
 5997
 5998	if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
 5999		gro_flush_oldest(napi, gro_head);
 6000	} else {
 6001		napi->gro_hash[hash].count++;
 6002	}
 6003	NAPI_GRO_CB(skb)->count = 1;
 6004	NAPI_GRO_CB(skb)->age = jiffies;
 6005	NAPI_GRO_CB(skb)->last = skb;
 6006	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
 6007	list_add(&skb->list, gro_head);
 6008	ret = GRO_HELD;
 6009
 6010pull:
 6011	grow = skb_gro_offset(skb) - skb_headlen(skb);
 6012	if (grow > 0)
 6013		gro_pull_from_frag0(skb, grow);
 6014ok:
 6015	if (napi->gro_hash[hash].count) {
 6016		if (!test_bit(hash, &napi->gro_bitmask))
 6017			__set_bit(hash, &napi->gro_bitmask);
 6018	} else if (test_bit(hash, &napi->gro_bitmask)) {
 6019		__clear_bit(hash, &napi->gro_bitmask);
 6020	}
 6021
 6022	return ret;
 6023
 6024normal:
 6025	ret = GRO_NORMAL;
 6026	goto pull;
 6027}
 6028
 6029struct packet_offload *gro_find_receive_by_type(__be16 type)
 6030{
 6031	struct list_head *offload_head = &offload_base;
 6032	struct packet_offload *ptype;
 6033
 6034	list_for_each_entry_rcu(ptype, offload_head, list) {
 6035		if (ptype->type != type || !ptype->callbacks.gro_receive)
 6036			continue;
 6037		return ptype;
 6038	}
 6039	return NULL;
 6040}
 6041EXPORT_SYMBOL(gro_find_receive_by_type);
 6042
 6043struct packet_offload *gro_find_complete_by_type(__be16 type)
 6044{
 6045	struct list_head *offload_head = &offload_base;
 6046	struct packet_offload *ptype;
 6047
 6048	list_for_each_entry_rcu(ptype, offload_head, list) {
 6049		if (ptype->type != type || !ptype->callbacks.gro_complete)
 6050			continue;
 6051		return ptype;
 6052	}
 6053	return NULL;
 6054}
 6055EXPORT_SYMBOL(gro_find_complete_by_type);
 6056
 6057static void napi_skb_free_stolen_head(struct sk_buff *skb)
 6058{
 6059	skb_dst_drop(skb);
 6060	skb_ext_put(skb);
 6061	kmem_cache_free(skbuff_head_cache, skb);
 6062}
 6063
 6064static gro_result_t napi_skb_finish(struct napi_struct *napi,
 6065				    struct sk_buff *skb,
 6066				    gro_result_t ret)
 6067{
 6068	switch (ret) {
 6069	case GRO_NORMAL:
 6070		gro_normal_one(napi, skb);
 6071		break;
 6072
 6073	case GRO_DROP:
 6074		kfree_skb(skb);
 6075		break;
 6076
 6077	case GRO_MERGED_FREE:
 6078		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
 6079			napi_skb_free_stolen_head(skb);
 6080		else
 6081			__kfree_skb(skb);
 6082		break;
 6083
 6084	case GRO_HELD:
 6085	case GRO_MERGED:
 6086	case GRO_CONSUMED:
 6087		break;
 6088	}
 6089
 6090	return ret;
 6091}
 6092
 6093gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 6094{
 6095	gro_result_t ret;
 6096
 6097	skb_mark_napi_id(skb, napi);
 6098	trace_napi_gro_receive_entry(skb);
 6099
 6100	skb_gro_reset_offset(skb);
 6101
 6102	ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
 6103	trace_napi_gro_receive_exit(ret);
 6104
 6105	return ret;
 6106}
 6107EXPORT_SYMBOL(napi_gro_receive);
 6108
 6109static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
 6110{
 6111	if (unlikely(skb->pfmemalloc)) {
 6112		consume_skb(skb);
 6113		return;
 6114	}
 6115	__skb_pull(skb, skb_headlen(skb));
 6116	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
 6117	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
 6118	__vlan_hwaccel_clear_tag(skb);
 6119	skb->dev = napi->dev;
 6120	skb->skb_iif = 0;
 6121
 6122	/* eth_type_trans() assumes pkt_type is PACKET_HOST */
 6123	skb->pkt_type = PACKET_HOST;
 6124
 6125	skb->encapsulation = 0;
 6126	skb_shinfo(skb)->gso_type = 0;
 6127	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
 6128	skb_ext_reset(skb);
 6129
 6130	napi->skb = skb;
 6131}
 6132
 6133struct sk_buff *napi_get_frags(struct napi_struct *napi)
 6134{
 6135	struct sk_buff *skb = napi->skb;
 6136
 6137	if (!skb) {
 6138		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
 6139		if (skb) {
 6140			napi->skb = skb;
 6141			skb_mark_napi_id(skb, napi);
 6142		}
 6143	}
 6144	return skb;
 6145}
 6146EXPORT_SYMBOL(napi_get_frags);
 6147
 6148static gro_result_t napi_frags_finish(struct napi_struct *napi,
 6149				      struct sk_buff *skb,
 6150				      gro_result_t ret)
 6151{
 6152	switch (ret) {
 6153	case GRO_NORMAL:
 6154	case GRO_HELD:
 6155		__skb_push(skb, ETH_HLEN);
 6156		skb->protocol = eth_type_trans(skb, skb->dev);
 6157		if (ret == GRO_NORMAL)
 6158			gro_normal_one(napi, skb);
 6159		break;
 6160
 6161	case GRO_DROP:
 6162		napi_reuse_skb(napi, skb);
 6163		break;
 6164
 6165	case GRO_MERGED_FREE:
 6166		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
 6167			napi_skb_free_stolen_head(skb);
 6168		else
 6169			napi_reuse_skb(napi, skb);
 6170		break;
 6171
 6172	case GRO_MERGED:
 6173	case GRO_CONSUMED:
 6174		break;
 6175	}
 6176
 6177	return ret;
 6178}
 6179
 6180/* Upper GRO stack assumes network header starts at gro_offset=0
 6181 * Drivers could call both napi_gro_frags() and napi_gro_receive()
 6182 * We copy ethernet header into skb->data to have a common layout.
 6183 */
 6184static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
 6185{
 6186	struct sk_buff *skb = napi->skb;
 6187	const struct ethhdr *eth;
 6188	unsigned int hlen = sizeof(*eth);
 6189
 6190	napi->skb = NULL;
 6191
 6192	skb_reset_mac_header(skb);
 6193	skb_gro_reset_offset(skb);
 6194
 6195	if (unlikely(skb_gro_header_hard(skb, hlen))) {
 6196		eth = skb_gro_header_slow(skb, hlen, 0);
 6197		if (unlikely(!eth)) {
 6198			net_warn_ratelimited("%s: dropping impossible skb from %s\n",
 6199					     __func__, napi->dev->name);
 6200			napi_reuse_skb(napi, skb);
 6201			return NULL;
 6202		}
 6203	} else {
 6204		eth = (const struct ethhdr *)skb->data;
 6205		gro_pull_from_frag0(skb, hlen);
 6206		NAPI_GRO_CB(skb)->frag0 += hlen;
 6207		NAPI_GRO_CB(skb)->frag0_len -= hlen;
 6208	}
 6209	__skb_pull(skb, hlen);
 6210
 6211	/*
 6212	 * This works because the only protocols we care about don't require
 6213	 * special handling.
 6214	 * We'll fix it up properly in napi_frags_finish()
 6215	 */
 6216	skb->protocol = eth->h_proto;
 6217
 6218	return skb;
 6219}
 6220
 6221gro_result_t napi_gro_frags(struct napi_struct *napi)
 6222{
 6223	gro_result_t ret;
 6224	struct sk_buff *skb = napi_frags_skb(napi);
 6225
 6226	if (!skb)
 6227		return GRO_DROP;
 6228
 6229	trace_napi_gro_frags_entry(skb);
 6230
 6231	ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
 6232	trace_napi_gro_frags_exit(ret);
 6233
 6234	return ret;
 6235}
 6236EXPORT_SYMBOL(napi_gro_frags);
 6237
 6238/* Compute the checksum from gro_offset and return the folded value
 6239 * after adding in any pseudo checksum.
 6240 */
 6241__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
 6242{
 6243	__wsum wsum;
 6244	__sum16 sum;
 6245
 6246	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
 6247
 6248	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
 6249	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
 6250	/* See comments in __skb_checksum_complete(). */
 6251	if (likely(!sum)) {
 6252		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
 6253		    !skb->csum_complete_sw)
 6254			netdev_rx_csum_fault(skb->dev, skb);
 6255	}
 6256
 6257	NAPI_GRO_CB(skb)->csum = wsum;
 6258	NAPI_GRO_CB(skb)->csum_valid = 1;
 6259
 6260	return sum;
 6261}
 6262EXPORT_SYMBOL(__skb_gro_checksum_complete);
 6263
 6264static void net_rps_send_ipi(struct softnet_data *remsd)
 6265{
 6266#ifdef CONFIG_RPS
 6267	while (remsd) {
 6268		struct softnet_data *next = remsd->rps_ipi_next;
 6269
 6270		if (cpu_online(remsd->cpu))
 6271			smp_call_function_single_async(remsd->cpu, &remsd->csd);
 6272		remsd = next;
 6273	}
 6274#endif
 6275}
 6276
 6277/*
 6278 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 6279 * Note: called with local irq disabled, but exits with local irq enabled.
 6280 */
 6281static void net_rps_action_and_irq_enable(struct softnet_data *sd)
 6282{
 6283#ifdef CONFIG_RPS
 6284	struct softnet_data *remsd = sd->rps_ipi_list;
 6285
 6286	if (remsd) {
 6287		sd->rps_ipi_list = NULL;
 6288
 6289		local_irq_enable();
 6290
 6291		/* Send pending IPI's to kick RPS processing on remote cpus. */
 6292		net_rps_send_ipi(remsd);
 6293	} else
 6294#endif
 6295		local_irq_enable();
 6296}
 6297
 6298static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
 6299{
 6300#ifdef CONFIG_RPS
 6301	return sd->rps_ipi_list != NULL;
 6302#else
 6303	return false;
 6304#endif
 6305}
 6306
 6307static int process_backlog(struct napi_struct *napi, int quota)
 6308{
 6309	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
 6310	bool again = true;
 6311	int work = 0;
 6312
 6313	/* Check if we have pending ipi, its better to send them now,
 6314	 * not waiting net_rx_action() end.
 6315	 */
 6316	if (sd_has_rps_ipi_waiting(sd)) {
 6317		local_irq_disable();
 6318		net_rps_action_and_irq_enable(sd);
 6319	}
 6320
 6321	napi->weight = dev_rx_weight;
 6322	while (again) {
 6323		struct sk_buff *skb;
 6324
 6325		while ((skb = __skb_dequeue(&sd->process_queue))) {
 6326			rcu_read_lock();
 6327			__netif_receive_skb(skb);
 6328			rcu_read_unlock();
 6329			input_queue_head_incr(sd);
 6330			if (++work >= quota)
 6331				return work;
 6332
 6333		}
 6334
 6335		local_irq_disable();
 6336		rps_lock(sd);
 6337		if (skb_queue_empty(&sd->input_pkt_queue)) {
 6338			/*
 6339			 * Inline a custom version of __napi_complete().
 6340			 * only current cpu owns and manipulates this napi,
 6341			 * and NAPI_STATE_SCHED is the only possible flag set
 6342			 * on backlog.
 6343			 * We can use a plain write instead of clear_bit(),
 6344			 * and we dont need an smp_mb() memory barrier.
 6345			 */
 6346			napi->state = 0;
 6347			again = false;
 6348		} else {
 6349			skb_queue_splice_tail_init(&sd->input_pkt_queue,
 6350						   &sd->process_queue);
 6351		}
 6352		rps_unlock(sd);
 6353		local_irq_enable();
 6354	}
 6355
 6356	return work;
 6357}
 6358
 6359/**
 6360 * __napi_schedule - schedule for receive
 6361 * @n: entry to schedule
 6362 *
 6363 * The entry's receive function will be scheduled to run.
 6364 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
 6365 */
 6366void __napi_schedule(struct napi_struct *n)
 6367{
 6368	unsigned long flags;
 6369
 6370	local_irq_save(flags);
 6371	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 6372	local_irq_restore(flags);
 6373}
 6374EXPORT_SYMBOL(__napi_schedule);
 6375
 6376/**
 6377 *	napi_schedule_prep - check if napi can be scheduled
 6378 *	@n: napi context
 6379 *
 6380 * Test if NAPI routine is already running, and if not mark
 6381 * it as running.  This is used as a condition variable to
 6382 * insure only one NAPI poll instance runs.  We also make
 6383 * sure there is no pending NAPI disable.
 6384 */
 6385bool napi_schedule_prep(struct napi_struct *n)
 6386{
 6387	unsigned long val, new;
 6388
 6389	do {
 6390		val = READ_ONCE(n->state);
 6391		if (unlikely(val & NAPIF_STATE_DISABLE))
 6392			return false;
 6393		new = val | NAPIF_STATE_SCHED;
 6394
 6395		/* Sets STATE_MISSED bit if STATE_SCHED was already set
 6396		 * This was suggested by Alexander Duyck, as compiler
 6397		 * emits better code than :
 6398		 * if (val & NAPIF_STATE_SCHED)
 6399		 *     new |= NAPIF_STATE_MISSED;
 6400		 */
 6401		new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
 6402						   NAPIF_STATE_MISSED;
 6403	} while (cmpxchg(&n->state, val, new) != val);
 6404
 6405	return !(val & NAPIF_STATE_SCHED);
 6406}
 6407EXPORT_SYMBOL(napi_schedule_prep);
 6408
 6409/**
 6410 * __napi_schedule_irqoff - schedule for receive
 6411 * @n: entry to schedule
 6412 *
 6413 * Variant of __napi_schedule() assuming hard irqs are masked
 6414 */
 6415void __napi_schedule_irqoff(struct napi_struct *n)
 6416{
 6417	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 6418}
 6419EXPORT_SYMBOL(__napi_schedule_irqoff);
 6420
 6421bool napi_complete_done(struct napi_struct *n, int work_done)
 6422{
 6423	unsigned long flags, val, new, timeout = 0;
 6424	bool ret = true;
 6425
 6426	/*
 6427	 * 1) Don't let napi dequeue from the cpu poll list
 6428	 *    just in case its running on a different cpu.
 6429	 * 2) If we are busy polling, do nothing here, we have
 6430	 *    the guarantee we will be called later.
 6431	 */
 6432	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
 6433				 NAPIF_STATE_IN_BUSY_POLL)))
 6434		return false;
 6435
 6436	if (work_done) {
 6437		if (n->gro_bitmask)
 6438			timeout = READ_ONCE(n->dev->gro_flush_timeout);
 6439		n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
 6440	}
 6441	if (n->defer_hard_irqs_count > 0) {
 6442		n->defer_hard_irqs_count--;
 6443		timeout = READ_ONCE(n->dev->gro_flush_timeout);
 6444		if (timeout)
 6445			ret = false;
 6446	}
 6447	if (n->gro_bitmask) {
 6448		/* When the NAPI instance uses a timeout and keeps postponing
 6449		 * it, we need to bound somehow the time packets are kept in
 6450		 * the GRO layer
 6451		 */
 6452		napi_gro_flush(n, !!timeout);
 6453	}
 6454
 6455	gro_normal_list(n);
 6456
 6457	if (unlikely(!list_empty(&n->poll_list))) {
 6458		/* If n->poll_list is not empty, we need to mask irqs */
 6459		local_irq_save(flags);
 6460		list_del_init(&n->poll_list);
 6461		local_irq_restore(flags);
 6462	}
 6463
 6464	do {
 6465		val = READ_ONCE(n->state);
 6466
 6467		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
 6468
 6469		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
 6470			      NAPIF_STATE_PREFER_BUSY_POLL);
 6471
 6472		/* If STATE_MISSED was set, leave STATE_SCHED set,
 6473		 * because we will call napi->poll() one more time.
 6474		 * This C code was suggested by Alexander Duyck to help gcc.
 6475		 */
 6476		new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
 6477						    NAPIF_STATE_SCHED;
 6478	} while (cmpxchg(&n->state, val, new) != val);
 6479
 6480	if (unlikely(val & NAPIF_STATE_MISSED)) {
 6481		__napi_schedule(n);
 6482		return false;
 6483	}
 6484
 6485	if (timeout)
 6486		hrtimer_start(&n->timer, ns_to_ktime(timeout),
 6487			      HRTIMER_MODE_REL_PINNED);
 6488	return ret;
 6489}
 6490EXPORT_SYMBOL(napi_complete_done);
 6491
 6492/* must be called under rcu_read_lock(), as we dont take a reference */
 6493static struct napi_struct *napi_by_id(unsigned int napi_id)
 6494{
 6495	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
 6496	struct napi_struct *napi;
 6497
 6498	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
 6499		if (napi->napi_id == napi_id)
 6500			return napi;
 6501
 6502	return NULL;
 6503}
 6504
 6505#if defined(CONFIG_NET_RX_BUSY_POLL)
 6506
 6507static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
 6508{
 6509	if (!skip_schedule) {
 6510		gro_normal_list(napi);
 6511		__napi_schedule(napi);
 6512		return;
 6513	}
 6514
 6515	if (napi->gro_bitmask) {
 6516		/* flush too old packets
 6517		 * If HZ < 1000, flush all packets.
 6518		 */
 6519		napi_gro_flush(napi, HZ >= 1000);
 6520	}
 6521
 6522	gro_normal_list(napi);
 6523	clear_bit(NAPI_STATE_SCHED, &napi->state);
 6524}
 6525
 6526static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll,
 6527			   u16 budget)
 6528{
 6529	bool skip_schedule = false;
 6530	unsigned long timeout;
 6531	int rc;
 6532
 6533	/* Busy polling means there is a high chance device driver hard irq
 6534	 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
 6535	 * set in napi_schedule_prep().
 6536	 * Since we are about to call napi->poll() once more, we can safely
 6537	 * clear NAPI_STATE_MISSED.
 6538	 *
 6539	 * Note: x86 could use a single "lock and ..." instruction
 6540	 * to perform these two clear_bit()
 6541	 */
 6542	clear_bit(NAPI_STATE_MISSED, &napi->state);
 6543	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
 6544
 6545	local_bh_disable();
 6546
 6547	if (prefer_busy_poll) {
 6548		napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
 6549		timeout = READ_ONCE(napi->dev->gro_flush_timeout);
 6550		if (napi->defer_hard_irqs_count && timeout) {
 6551			hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
 6552			skip_schedule = true;
 6553		}
 6554	}
 6555
 6556	/* All we really want here is to re-enable device interrupts.
 6557	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
 6558	 */
 6559	rc = napi->poll(napi, budget);
 6560	/* We can't gro_normal_list() here, because napi->poll() might have
 6561	 * rearmed the napi (napi_complete_done()) in which case it could
 6562	 * already be running on another CPU.
 6563	 */
 6564	trace_napi_poll(napi, rc, budget);
 6565	netpoll_poll_unlock(have_poll_lock);
 6566	if (rc == budget)
 6567		__busy_poll_stop(napi, skip_schedule);
 6568	local_bh_enable();
 6569}
 6570
 6571void napi_busy_loop(unsigned int napi_id,
 6572		    bool (*loop_end)(void *, unsigned long),
 6573		    void *loop_end_arg, bool prefer_busy_poll, u16 budget)
 6574{
 6575	unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
 6576	int (*napi_poll)(struct napi_struct *napi, int budget);
 6577	void *have_poll_lock = NULL;
 6578	struct napi_struct *napi;
 6579
 6580restart:
 6581	napi_poll = NULL;
 6582
 6583	rcu_read_lock();
 6584
 6585	napi = napi_by_id(napi_id);
 6586	if (!napi)
 6587		goto out;
 6588
 6589	preempt_disable();
 6590	for (;;) {
 6591		int work = 0;
 6592
 6593		local_bh_disable();
 6594		if (!napi_poll) {
 6595			unsigned long val = READ_ONCE(napi->state);
 6596
 6597			/* If multiple threads are competing for this napi,
 6598			 * we avoid dirtying napi->state as much as we can.
 6599			 */
 6600			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
 6601				   NAPIF_STATE_IN_BUSY_POLL)) {
 6602				if (prefer_busy_poll)
 6603					set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
 6604				goto count;
 6605			}
 6606			if (cmpxchg(&napi->state, val,
 6607				    val | NAPIF_STATE_IN_BUSY_POLL |
 6608					  NAPIF_STATE_SCHED) != val) {
 6609				if (prefer_busy_poll)
 6610					set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
 6611				goto count;
 6612			}
 6613			have_poll_lock = netpoll_poll_lock(napi);
 6614			napi_poll = napi->poll;
 6615		}
 6616		work = napi_poll(napi, budget);
 6617		trace_napi_poll(napi, work, budget);
 6618		gro_normal_list(napi);
 6619count:
 6620		if (work > 0)
 6621			__NET_ADD_STATS(dev_net(napi->dev),
 6622					LINUX_MIB_BUSYPOLLRXPACKETS, work);
 6623		local_bh_enable();
 6624
 6625		if (!loop_end || loop_end(loop_end_arg, start_time))
 6626			break;
 6627
 6628		if (unlikely(need_resched())) {
 6629			if (napi_poll)
 6630				busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
 6631			preempt_enable();
 6632			rcu_read_unlock();
 6633			cond_resched();
 6634			if (loop_end(loop_end_arg, start_time))
 6635				return;
 6636			goto restart;
 6637		}
 6638		cpu_relax();
 6639	}
 6640	if (napi_poll)
 6641		busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
 6642	preempt_enable();
 6643out:
 6644	rcu_read_unlock();
 6645}
 6646EXPORT_SYMBOL(napi_busy_loop);
 6647
 6648#endif /* CONFIG_NET_RX_BUSY_POLL */
 6649
 6650static void napi_hash_add(struct napi_struct *napi)
 6651{
 6652	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
 6653		return;
 6654
 6655	spin_lock(&napi_hash_lock);
 6656
 6657	/* 0..NR_CPUS range is reserved for sender_cpu use */
 6658	do {
 6659		if (unlikely(++napi_gen_id < MIN_NAPI_ID))
 6660			napi_gen_id = MIN_NAPI_ID;
 6661	} while (napi_by_id(napi_gen_id));
 6662	napi->napi_id = napi_gen_id;
 6663
 6664	hlist_add_head_rcu(&napi->napi_hash_node,
 6665			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
 6666
 6667	spin_unlock(&napi_hash_lock);
 6668}
 6669
 6670/* Warning : caller is responsible to make sure rcu grace period
 6671 * is respected before freeing memory containing @napi
 6672 */
 6673static void napi_hash_del(struct napi_struct *napi)
 6674{
 6675	spin_lock(&napi_hash_lock);
 6676
 6677	hlist_del_init_rcu(&napi->napi_hash_node);
 6678
 6679	spin_unlock(&napi_hash_lock);
 6680}
 6681
 6682static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
 6683{
 6684	struct napi_struct *napi;
 6685
 6686	napi = container_of(timer, struct napi_struct, timer);
 6687
 6688	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
 6689	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
 6690	 */
 6691	if (!napi_disable_pending(napi) &&
 6692	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) {
 6693		clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
 6694		__napi_schedule_irqoff(napi);
 6695	}
 6696
 6697	return HRTIMER_NORESTART;
 6698}
 6699
 6700static void init_gro_hash(struct napi_struct *napi)
 6701{
 6702	int i;
 6703
 6704	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
 6705		INIT_LIST_HEAD(&napi->gro_hash[i].list);
 6706		napi->gro_hash[i].count = 0;
 6707	}
 6708	napi->gro_bitmask = 0;
 6709}
 6710
 6711void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 6712		    int (*poll)(struct napi_struct *, int), int weight)
 6713{
 6714	if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
 6715		return;
 6716
 6717	INIT_LIST_HEAD(&napi->poll_list);
 6718	INIT_HLIST_NODE(&napi->napi_hash_node);
 6719	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 6720	napi->timer.function = napi_watchdog;
 6721	init_gro_hash(napi);
 6722	napi->skb = NULL;
 6723	INIT_LIST_HEAD(&napi->rx_list);
 6724	napi->rx_count = 0;
 6725	napi->poll = poll;
 6726	if (weight > NAPI_POLL_WEIGHT)
 6727		netdev_err_once(dev, "%s() called with weight %d\n", __func__,
 6728				weight);
 6729	napi->weight = weight;
 6730	napi->dev = dev;
 6731#ifdef CONFIG_NETPOLL
 6732	napi->poll_owner = -1;
 6733#endif
 6734	set_bit(NAPI_STATE_SCHED, &napi->state);
 6735	set_bit(NAPI_STATE_NPSVC, &napi->state);
 6736	list_add_rcu(&napi->dev_list, &dev->napi_list);
 6737	napi_hash_add(napi);
 6738}
 6739EXPORT_SYMBOL(netif_napi_add);
 6740
 6741void napi_disable(struct napi_struct *n)
 6742{
 6743	might_sleep();
 6744	set_bit(NAPI_STATE_DISABLE, &n->state);
 6745
 6746	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
 6747		msleep(1);
 6748	while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
 6749		msleep(1);
 6750
 6751	hrtimer_cancel(&n->timer);
 6752
 6753	clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state);
 6754	clear_bit(NAPI_STATE_DISABLE, &n->state);
 6755}
 6756EXPORT_SYMBOL(napi_disable);
 6757
 6758static void flush_gro_hash(struct napi_struct *napi)
 6759{
 6760	int i;
 6761
 6762	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
 6763		struct sk_buff *skb, *n;
 6764
 6765		list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
 6766			kfree_skb(skb);
 6767		napi->gro_hash[i].count = 0;
 6768	}
 6769}
 6770
 6771/* Must be called in process context */
 6772void __netif_napi_del(struct napi_struct *napi)
 6773{
 6774	if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
 6775		return;
 6776
 6777	napi_hash_del(napi);
 6778	list_del_rcu(&napi->dev_list);
 6779	napi_free_frags(napi);
 6780
 6781	flush_gro_hash(napi);
 6782	napi->gro_bitmask = 0;
 6783}
 6784EXPORT_SYMBOL(__netif_napi_del);
 6785
 6786static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 6787{
 6788	void *have;
 6789	int work, weight;
 6790
 6791	list_del_init(&n->poll_list);
 6792
 6793	have = netpoll_poll_lock(n);
 6794
 6795	weight = n->weight;
 6796
 6797	/* This NAPI_STATE_SCHED test is for avoiding a race
 6798	 * with netpoll's poll_napi().  Only the entity which
 6799	 * obtains the lock and sees NAPI_STATE_SCHED set will
 6800	 * actually make the ->poll() call.  Therefore we avoid
 6801	 * accidentally calling ->poll() when NAPI is not scheduled.
 6802	 */
 6803	work = 0;
 6804	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
 6805		work = n->poll(n, weight);
 6806		trace_napi_poll(n, work, weight);
 6807	}
 6808
 6809	if (unlikely(work > weight))
 6810		pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
 6811			    n->poll, work, weight);
 6812
 6813	if (likely(work < weight))
 6814		goto out_unlock;
 6815
 6816	/* Drivers must not modify the NAPI state if they
 6817	 * consume the entire weight.  In such cases this code
 6818	 * still "owns" the NAPI instance and therefore can
 6819	 * move the instance around on the list at-will.
 6820	 */
 6821	if (unlikely(napi_disable_pending(n))) {
 6822		napi_complete(n);
 6823		goto out_unlock;
 6824	}
 6825
 6826	/* The NAPI context has more processing work, but busy-polling
 6827	 * is preferred. Exit early.
 6828	 */
 6829	if (napi_prefer_busy_poll(n)) {
 6830		if (napi_complete_done(n, work)) {
 6831			/* If timeout is not set, we need to make sure
 6832			 * that the NAPI is re-scheduled.
 6833			 */
 6834			napi_schedule(n);
 6835		}
 6836		goto out_unlock;
 6837	}
 6838
 6839	if (n->gro_bitmask) {
 6840		/* flush too old packets
 6841		 * If HZ < 1000, flush all packets.
 6842		 */
 6843		napi_gro_flush(n, HZ >= 1000);
 6844	}
 6845
 6846	gro_normal_list(n);
 6847
 6848	/* Some drivers may have called napi_schedule
 6849	 * prior to exhausting their budget.
 6850	 */
 6851	if (unlikely(!list_empty(&n->poll_list))) {
 6852		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
 6853			     n->dev ? n->dev->name : "backlog");
 6854		goto out_unlock;
 6855	}
 6856
 6857	list_add_tail(&n->poll_list, repoll);
 6858
 6859out_unlock:
 6860	netpoll_poll_unlock(have);
 6861
 6862	return work;
 6863}
 6864
 6865static __latent_entropy void net_rx_action(struct softirq_action *h)
 6866{
 6867	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
 6868	unsigned long time_limit = jiffies +
 6869		usecs_to_jiffies(netdev_budget_usecs);
 6870	int budget = netdev_budget;
 6871	LIST_HEAD(list);
 6872	LIST_HEAD(repoll);
 6873
 6874	local_irq_disable();
 6875	list_splice_init(&sd->poll_list, &list);
 6876	local_irq_enable();
 6877
 6878	for (;;) {
 6879		struct napi_struct *n;
 6880
 6881		if (list_empty(&list)) {
 6882			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
 6883				goto out;
 6884			break;
 6885		}
 6886
 6887		n = list_first_entry(&list, struct napi_struct, poll_list);
 6888		budget -= napi_poll(n, &repoll);
 6889
 6890		/* If softirq window is exhausted then punt.
 6891		 * Allow this to run for 2 jiffies since which will allow
 6892		 * an average latency of 1.5/HZ.
 6893		 */
 6894		if (unlikely(budget <= 0 ||
 6895			     time_after_eq(jiffies, time_limit))) {
 6896			sd->time_squeeze++;
 6897			break;
 6898		}
 6899	}
 6900
 6901	local_irq_disable();
 6902
 6903	list_splice_tail_init(&sd->poll_list, &list);
 6904	list_splice_tail(&repoll, &list);
 6905	list_splice(&list, &sd->poll_list);
 6906	if (!list_empty(&sd->poll_list))
 6907		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 6908
 6909	net_rps_action_and_irq_enable(sd);
 6910out:
 6911	__kfree_skb_flush();
 6912}
 6913
 6914struct netdev_adjacent {
 6915	struct net_device *dev;
 6916
 6917	/* upper master flag, there can only be one master device per list */
 6918	bool master;
 6919
 6920	/* lookup ignore flag */
 6921	bool ignore;
 6922
 6923	/* counter for the number of times this device was added to us */
 6924	u16 ref_nr;
 6925
 6926	/* private field for the users */
 6927	void *private;
 6928
 6929	struct list_head list;
 6930	struct rcu_head rcu;
 6931};
 6932
 6933static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
 6934						 struct list_head *adj_list)
 6935{
 6936	struct netdev_adjacent *adj;
 6937
 6938	list_for_each_entry(adj, adj_list, list) {
 6939		if (adj->dev == adj_dev)
 6940			return adj;
 6941	}
 6942	return NULL;
 6943}
 6944
 6945static int ____netdev_has_upper_dev(struct net_device *upper_dev,
 6946				    struct netdev_nested_priv *priv)
 6947{
 6948	struct net_device *dev = (struct net_device *)priv->data;
 6949
 6950	return upper_dev == dev;
 6951}
 6952
 6953/**
 6954 * netdev_has_upper_dev - Check if device is linked to an upper device
 6955 * @dev: device
 6956 * @upper_dev: upper device to check
 6957 *
 6958 * Find out if a device is linked to specified upper device and return true
 6959 * in case it is. Note that this checks only immediate upper device,
 6960 * not through a complete stack of devices. The caller must hold the RTNL lock.
 6961 */
 6962bool netdev_has_upper_dev(struct net_device *dev,
 6963			  struct net_device *upper_dev)
 6964{
 6965	struct netdev_nested_priv priv = {
 6966		.data = (void *)upper_dev,
 6967	};
 6968
 6969	ASSERT_RTNL();
 6970
 6971	return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
 6972					     &priv);
 6973}
 6974EXPORT_SYMBOL(netdev_has_upper_dev);
 6975
 6976/**
 6977 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
 6978 * @dev: device
 6979 * @upper_dev: upper device to check
 6980 *
 6981 * Find out if a device is linked to specified upper device and return true
 6982 * in case it is. Note that this checks the entire upper device chain.
 6983 * The caller must hold rcu lock.
 6984 */
 6985
 6986bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
 6987				  struct net_device *upper_dev)
 6988{
 6989	struct netdev_nested_priv priv = {
 6990		.data = (void *)upper_dev,
 6991	};
 6992
 6993	return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
 6994					       &priv);
 6995}
 6996EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
 6997
 6998/**
 6999 * netdev_has_any_upper_dev - Check if device is linked to some device
 7000 * @dev: device
 7001 *
 7002 * Find out if a device is linked to an upper device and return true in case
 7003 * it is. The caller must hold the RTNL lock.
 7004 */
 7005bool netdev_has_any_upper_dev(struct net_device *dev)
 7006{
 7007	ASSERT_RTNL();
 7008
 7009	return !list_empty(&dev->adj_list.upper);
 7010}
 7011EXPORT_SYMBOL(netdev_has_any_upper_dev);
 7012
 7013/**
 7014 * netdev_master_upper_dev_get - Get master upper device
 7015 * @dev: device
 7016 *
 7017 * Find a master upper device and return pointer to it or NULL in case
 7018 * it's not there. The caller must hold the RTNL lock.
 7019 */
 7020struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
 7021{
 7022	struct netdev_adjacent *upper;
 7023
 7024	ASSERT_RTNL();
 7025
 7026	if (list_empty(&dev->adj_list.upper))
 7027		return NULL;
 7028
 7029	upper = list_first_entry(&dev->adj_list.upper,
 7030				 struct netdev_adjacent, list);
 7031	if (likely(upper->master))
 7032		return upper->dev;
 7033	return NULL;
 7034}
 7035EXPORT_SYMBOL(netdev_master_upper_dev_get);
 7036
 7037static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
 7038{
 7039	struct netdev_adjacent *upper;
 7040
 7041	ASSERT_RTNL();
 7042
 7043	if (list_empty(&dev->adj_list.upper))
 7044		return NULL;
 7045
 7046	upper = list_first_entry(&dev->adj_list.upper,
 7047				 struct netdev_adjacent, list);
 7048	if (likely(upper->master) && !upper->ignore)
 7049		return upper->dev;
 7050	return NULL;
 7051}
 7052
 7053/**
 7054 * netdev_has_any_lower_dev - Check if device is linked to some device
 7055 * @dev: device
 7056 *
 7057 * Find out if a device is linked to a lower device and return true in case
 7058 * it is. The caller must hold the RTNL lock.
 7059 */
 7060static bool netdev_has_any_lower_dev(struct net_device *dev)
 7061{
 7062	ASSERT_RTNL();
 7063
 7064	return !list_empty(&dev->adj_list.lower);
 7065}
 7066
 7067void *netdev_adjacent_get_private(struct list_head *adj_list)
 7068{
 7069	struct netdev_adjacent *adj;
 7070
 7071	adj = list_entry(adj_list, struct netdev_adjacent, list);
 7072
 7073	return adj->private;
 7074}
 7075EXPORT_SYMBOL(netdev_adjacent_get_private);
 7076
 7077/**
 7078 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
 7079 * @dev: device
 7080 * @iter: list_head ** of the current position
 7081 *
 7082 * Gets the next device from the dev's upper list, starting from iter
 7083 * position. The caller must hold RCU read lock.
 7084 */
 7085struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
 7086						 struct list_head **iter)
 7087{
 7088	struct netdev_adjacent *upper;
 7089
 7090	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
 7091
 7092	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 7093
 7094	if (&upper->list == &dev->adj_list.upper)
 7095		return NULL;
 7096
 7097	*iter = &upper->list;
 7098
 7099	return upper->dev;
 7100}
 7101EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
 7102
 7103static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
 7104						  struct list_head **iter,
 7105						  bool *ignore)
 7106{
 7107	struct netdev_adjacent *upper;
 7108
 7109	upper = list_entry((*iter)->next, struct netdev_adjacent, list);
 7110
 7111	if (&upper->list == &dev->adj_list.upper)
 7112		return NULL;
 7113
 7114	*iter = &upper->list;
 7115	*ignore = upper->ignore;
 7116
 7117	return upper->dev;
 7118}
 7119
 7120static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
 7121						    struct list_head **iter)
 7122{
 7123	struct netdev_adjacent *upper;
 7124
 7125	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
 7126
 7127	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 7128
 7129	if (&upper->list == &dev->adj_list.upper)
 7130		return NULL;
 7131
 7132	*iter = &upper->list;
 7133
 7134	return upper->dev;
 7135}
 7136
 7137static int __netdev_walk_all_upper_dev(struct net_device *dev,
 7138				       int (*fn)(struct net_device *dev,
 7139					 struct netdev_nested_priv *priv),
 7140				       struct netdev_nested_priv *priv)
 7141{
 7142	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7143	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7144	int ret, cur = 0;
 7145	bool ignore;
 7146
 7147	now = dev;
 7148	iter = &dev->adj_list.upper;
 7149
 7150	while (1) {
 7151		if (now != dev) {
 7152			ret = fn(now, priv);
 7153			if (ret)
 7154				return ret;
 7155		}
 7156
 7157		next = NULL;
 7158		while (1) {
 7159			udev = __netdev_next_upper_dev(now, &iter, &ignore);
 7160			if (!udev)
 7161				break;
 7162			if (ignore)
 7163				continue;
 7164
 7165			next = udev;
 7166			niter = &udev->adj_list.upper;
 7167			dev_stack[cur] = now;
 7168			iter_stack[cur++] = iter;
 7169			break;
 7170		}
 7171
 7172		if (!next) {
 7173			if (!cur)
 7174				return 0;
 7175			next = dev_stack[--cur];
 7176			niter = iter_stack[cur];
 7177		}
 7178
 7179		now = next;
 7180		iter = niter;
 7181	}
 7182
 7183	return 0;
 7184}
 7185
 7186int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
 7187				  int (*fn)(struct net_device *dev,
 7188					    struct netdev_nested_priv *priv),
 7189				  struct netdev_nested_priv *priv)
 7190{
 7191	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7192	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7193	int ret, cur = 0;
 7194
 7195	now = dev;
 7196	iter = &dev->adj_list.upper;
 7197
 7198	while (1) {
 7199		if (now != dev) {
 7200			ret = fn(now, priv);
 7201			if (ret)
 7202				return ret;
 7203		}
 7204
 7205		next = NULL;
 7206		while (1) {
 7207			udev = netdev_next_upper_dev_rcu(now, &iter);
 7208			if (!udev)
 7209				break;
 7210
 7211			next = udev;
 7212			niter = &udev->adj_list.upper;
 7213			dev_stack[cur] = now;
 7214			iter_stack[cur++] = iter;
 7215			break;
 7216		}
 7217
 7218		if (!next) {
 7219			if (!cur)
 7220				return 0;
 7221			next = dev_stack[--cur];
 7222			niter = iter_stack[cur];
 7223		}
 7224
 7225		now = next;
 7226		iter = niter;
 7227	}
 7228
 7229	return 0;
 7230}
 7231EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
 7232
 7233static bool __netdev_has_upper_dev(struct net_device *dev,
 7234				   struct net_device *upper_dev)
 7235{
 7236	struct netdev_nested_priv priv = {
 7237		.flags = 0,
 7238		.data = (void *)upper_dev,
 7239	};
 7240
 7241	ASSERT_RTNL();
 7242
 7243	return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
 7244					   &priv);
 7245}
 7246
 7247/**
 7248 * netdev_lower_get_next_private - Get the next ->private from the
 7249 *				   lower neighbour list
 7250 * @dev: device
 7251 * @iter: list_head ** of the current position
 7252 *
 7253 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 7254 * list, starting from iter position. The caller must hold either hold the
 7255 * RTNL lock or its own locking that guarantees that the neighbour lower
 7256 * list will remain unchanged.
 7257 */
 7258void *netdev_lower_get_next_private(struct net_device *dev,
 7259				    struct list_head **iter)
 7260{
 7261	struct netdev_adjacent *lower;
 7262
 7263	lower = list_entry(*iter, struct netdev_adjacent, list);
 7264
 7265	if (&lower->list == &dev->adj_list.lower)
 7266		return NULL;
 7267
 7268	*iter = lower->list.next;
 7269
 7270	return lower->private;
 7271}
 7272EXPORT_SYMBOL(netdev_lower_get_next_private);
 7273
 7274/**
 7275 * netdev_lower_get_next_private_rcu - Get the next ->private from the
 7276 *				       lower neighbour list, RCU
 7277 *				       variant
 7278 * @dev: device
 7279 * @iter: list_head ** of the current position
 7280 *
 7281 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 7282 * list, starting from iter position. The caller must hold RCU read lock.
 7283 */
 7284void *netdev_lower_get_next_private_rcu(struct net_device *dev,
 7285					struct list_head **iter)
 7286{
 7287	struct netdev_adjacent *lower;
 7288
 7289	WARN_ON_ONCE(!rcu_read_lock_held());
 7290
 7291	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 7292
 7293	if (&lower->list == &dev->adj_list.lower)
 7294		return NULL;
 7295
 7296	*iter = &lower->list;
 7297
 7298	return lower->private;
 7299}
 7300EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
 7301
 7302/**
 7303 * netdev_lower_get_next - Get the next device from the lower neighbour
 7304 *                         list
 7305 * @dev: device
 7306 * @iter: list_head ** of the current position
 7307 *
 7308 * Gets the next netdev_adjacent from the dev's lower neighbour
 7309 * list, starting from iter position. The caller must hold RTNL lock or
 7310 * its own locking that guarantees that the neighbour lower
 7311 * list will remain unchanged.
 7312 */
 7313void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
 7314{
 7315	struct netdev_adjacent *lower;
 7316
 7317	lower = list_entry(*iter, struct netdev_adjacent, list);
 7318
 7319	if (&lower->list == &dev->adj_list.lower)
 7320		return NULL;
 7321
 7322	*iter = lower->list.next;
 7323
 7324	return lower->dev;
 7325}
 7326EXPORT_SYMBOL(netdev_lower_get_next);
 7327
 7328static struct net_device *netdev_next_lower_dev(struct net_device *dev,
 7329						struct list_head **iter)
 7330{
 7331	struct netdev_adjacent *lower;
 7332
 7333	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
 7334
 7335	if (&lower->list == &dev->adj_list.lower)
 7336		return NULL;
 7337
 7338	*iter = &lower->list;
 7339
 7340	return lower->dev;
 7341}
 7342
 7343static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
 7344						  struct list_head **iter,
 7345						  bool *ignore)
 7346{
 7347	struct netdev_adjacent *lower;
 7348
 7349	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
 7350
 7351	if (&lower->list == &dev->adj_list.lower)
 7352		return NULL;
 7353
 7354	*iter = &lower->list;
 7355	*ignore = lower->ignore;
 7356
 7357	return lower->dev;
 7358}
 7359
 7360int netdev_walk_all_lower_dev(struct net_device *dev,
 7361			      int (*fn)(struct net_device *dev,
 7362					struct netdev_nested_priv *priv),
 7363			      struct netdev_nested_priv *priv)
 7364{
 7365	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7366	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7367	int ret, cur = 0;
 7368
 7369	now = dev;
 7370	iter = &dev->adj_list.lower;
 7371
 7372	while (1) {
 7373		if (now != dev) {
 7374			ret = fn(now, priv);
 7375			if (ret)
 7376				return ret;
 7377		}
 7378
 7379		next = NULL;
 7380		while (1) {
 7381			ldev = netdev_next_lower_dev(now, &iter);
 7382			if (!ldev)
 7383				break;
 7384
 7385			next = ldev;
 7386			niter = &ldev->adj_list.lower;
 7387			dev_stack[cur] = now;
 7388			iter_stack[cur++] = iter;
 7389			break;
 7390		}
 7391
 7392		if (!next) {
 7393			if (!cur)
 7394				return 0;
 7395			next = dev_stack[--cur];
 7396			niter = iter_stack[cur];
 7397		}
 7398
 7399		now = next;
 7400		iter = niter;
 7401	}
 7402
 7403	return 0;
 7404}
 7405EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
 7406
 7407static int __netdev_walk_all_lower_dev(struct net_device *dev,
 7408				       int (*fn)(struct net_device *dev,
 7409					 struct netdev_nested_priv *priv),
 7410				       struct netdev_nested_priv *priv)
 7411{
 7412	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7413	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7414	int ret, cur = 0;
 7415	bool ignore;
 7416
 7417	now = dev;
 7418	iter = &dev->adj_list.lower;
 7419
 7420	while (1) {
 7421		if (now != dev) {
 7422			ret = fn(now, priv);
 7423			if (ret)
 7424				return ret;
 7425		}
 7426
 7427		next = NULL;
 7428		while (1) {
 7429			ldev = __netdev_next_lower_dev(now, &iter, &ignore);
 7430			if (!ldev)
 7431				break;
 7432			if (ignore)
 7433				continue;
 7434
 7435			next = ldev;
 7436			niter = &ldev->adj_list.lower;
 7437			dev_stack[cur] = now;
 7438			iter_stack[cur++] = iter;
 7439			break;
 7440		}
 7441
 7442		if (!next) {
 7443			if (!cur)
 7444				return 0;
 7445			next = dev_stack[--cur];
 7446			niter = iter_stack[cur];
 7447		}
 7448
 7449		now = next;
 7450		iter = niter;
 7451	}
 7452
 7453	return 0;
 7454}
 7455
 7456struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
 7457					     struct list_head **iter)
 7458{
 7459	struct netdev_adjacent *lower;
 7460
 7461	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 7462	if (&lower->list == &dev->adj_list.lower)
 7463		return NULL;
 7464
 7465	*iter = &lower->list;
 7466
 7467	return lower->dev;
 7468}
 7469EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
 7470
 7471static u8 __netdev_upper_depth(struct net_device *dev)
 7472{
 7473	struct net_device *udev;
 7474	struct list_head *iter;
 7475	u8 max_depth = 0;
 7476	bool ignore;
 7477
 7478	for (iter = &dev->adj_list.upper,
 7479	     udev = __netdev_next_upper_dev(dev, &iter, &ignore);
 7480	     udev;
 7481	     udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
 7482		if (ignore)
 7483			continue;
 7484		if (max_depth < udev->upper_level)
 7485			max_depth = udev->upper_level;
 7486	}
 7487
 7488	return max_depth;
 7489}
 7490
 7491static u8 __netdev_lower_depth(struct net_device *dev)
 7492{
 7493	struct net_device *ldev;
 7494	struct list_head *iter;
 7495	u8 max_depth = 0;
 7496	bool ignore;
 7497
 7498	for (iter = &dev->adj_list.lower,
 7499	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
 7500	     ldev;
 7501	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
 7502		if (ignore)
 7503			continue;
 7504		if (max_depth < ldev->lower_level)
 7505			max_depth = ldev->lower_level;
 7506	}
 7507
 7508	return max_depth;
 7509}
 7510
 7511static int __netdev_update_upper_level(struct net_device *dev,
 7512				       struct netdev_nested_priv *__unused)
 7513{
 7514	dev->upper_level = __netdev_upper_depth(dev) + 1;
 7515	return 0;
 7516}
 7517
 7518static int __netdev_update_lower_level(struct net_device *dev,
 7519				       struct netdev_nested_priv *priv)
 7520{
 7521	dev->lower_level = __netdev_lower_depth(dev) + 1;
 7522
 7523#ifdef CONFIG_LOCKDEP
 7524	if (!priv)
 7525		return 0;
 7526
 7527	if (priv->flags & NESTED_SYNC_IMM)
 7528		dev->nested_level = dev->lower_level - 1;
 7529	if (priv->flags & NESTED_SYNC_TODO)
 7530		net_unlink_todo(dev);
 7531#endif
 7532	return 0;
 7533}
 7534
 7535int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
 7536				  int (*fn)(struct net_device *dev,
 7537					    struct netdev_nested_priv *priv),
 7538				  struct netdev_nested_priv *priv)
 7539{
 7540	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7541	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7542	int ret, cur = 0;
 7543
 7544	now = dev;
 7545	iter = &dev->adj_list.lower;
 7546
 7547	while (1) {
 7548		if (now != dev) {
 7549			ret = fn(now, priv);
 7550			if (ret)
 7551				return ret;
 7552		}
 7553
 7554		next = NULL;
 7555		while (1) {
 7556			ldev = netdev_next_lower_dev_rcu(now, &iter);
 7557			if (!ldev)
 7558				break;
 7559
 7560			next = ldev;
 7561			niter = &ldev->adj_list.lower;
 7562			dev_stack[cur] = now;
 7563			iter_stack[cur++] = iter;
 7564			break;
 7565		}
 7566
 7567		if (!next) {
 7568			if (!cur)
 7569				return 0;
 7570			next = dev_stack[--cur];
 7571			niter = iter_stack[cur];
 7572		}
 7573
 7574		now = next;
 7575		iter = niter;
 7576	}
 7577
 7578	return 0;
 7579}
 7580EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
 7581
 7582/**
 7583 * netdev_lower_get_first_private_rcu - Get the first ->private from the
 7584 *				       lower neighbour list, RCU
 7585 *				       variant
 7586 * @dev: device
 7587 *
 7588 * Gets the first netdev_adjacent->private from the dev's lower neighbour
 7589 * list. The caller must hold RCU read lock.
 7590 */
 7591void *netdev_lower_get_first_private_rcu(struct net_device *dev)
 7592{
 7593	struct netdev_adjacent *lower;
 7594
 7595	lower = list_first_or_null_rcu(&dev->adj_list.lower,
 7596			struct netdev_adjacent, list);
 7597	if (lower)
 7598		return lower->private;
 7599	return NULL;
 7600}
 7601EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
 7602
 7603/**
 7604 * netdev_master_upper_dev_get_rcu - Get master upper device
 7605 * @dev: device
 7606 *
 7607 * Find a master upper device and return pointer to it or NULL in case
 7608 * it's not there. The caller must hold the RCU read lock.
 7609 */
 7610struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
 7611{
 7612	struct netdev_adjacent *upper;
 7613
 7614	upper = list_first_or_null_rcu(&dev->adj_list.upper,
 7615				       struct netdev_adjacent, list);
 7616	if (upper && likely(upper->master))
 7617		return upper->dev;
 7618	return NULL;
 7619}
 7620EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
 7621
 7622static int netdev_adjacent_sysfs_add(struct net_device *dev,
 7623			      struct net_device *adj_dev,
 7624			      struct list_head *dev_list)
 7625{
 7626	char linkname[IFNAMSIZ+7];
 7627
 7628	sprintf(linkname, dev_list == &dev->adj_list.upper ?
 7629		"upper_%s" : "lower_%s", adj_dev->name);
 7630	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
 7631				 linkname);
 7632}
 7633static void netdev_adjacent_sysfs_del(struct net_device *dev,
 7634			       char *name,
 7635			       struct list_head *dev_list)
 7636{
 7637	char linkname[IFNAMSIZ+7];
 7638
 7639	sprintf(linkname, dev_list == &dev->adj_list.upper ?
 7640		"upper_%s" : "lower_%s", name);
 7641	sysfs_remove_link(&(dev->dev.kobj), linkname);
 7642}
 7643
 7644static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
 7645						 struct net_device *adj_dev,
 7646						 struct list_head *dev_list)
 7647{
 7648	return (dev_list == &dev->adj_list.upper ||
 7649		dev_list == &dev->adj_list.lower) &&
 7650		net_eq(dev_net(dev), dev_net(adj_dev));
 7651}
 7652
 7653static int __netdev_adjacent_dev_insert(struct net_device *dev,
 7654					struct net_device *adj_dev,
 7655					struct list_head *dev_list,
 7656					void *private, bool master)
 7657{
 7658	struct netdev_adjacent *adj;
 7659	int ret;
 7660
 7661	adj = __netdev_find_adj(adj_dev, dev_list);
 7662
 7663	if (adj) {
 7664		adj->ref_nr += 1;
 7665		pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
 7666			 dev->name, adj_dev->name, adj->ref_nr);
 7667
 7668		return 0;
 7669	}
 7670
 7671	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
 7672	if (!adj)
 7673		return -ENOMEM;
 7674
 7675	adj->dev = adj_dev;
 7676	adj->master = master;
 7677	adj->ref_nr = 1;
 7678	adj->private = private;
 7679	adj->ignore = false;
 7680	dev_hold(adj_dev);
 7681
 7682	pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
 7683		 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
 7684
 7685	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
 7686		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
 7687		if (ret)
 7688			goto free_adj;
 7689	}
 7690
 7691	/* Ensure that master link is always the first item in list. */
 7692	if (master) {
 7693		ret = sysfs_create_link(&(dev->dev.kobj),
 7694					&(adj_dev->dev.kobj), "master");
 7695		if (ret)
 7696			goto remove_symlinks;
 7697
 7698		list_add_rcu(&adj->list, dev_list);
 7699	} else {
 7700		list_add_tail_rcu(&adj->list, dev_list);
 7701	}
 7702
 7703	return 0;
 7704
 7705remove_symlinks:
 7706	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
 7707		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
 7708free_adj:
 7709	kfree(adj);
 7710	dev_put(adj_dev);
 7711
 7712	return ret;
 7713}
 7714
 7715static void __netdev_adjacent_dev_remove(struct net_device *dev,
 7716					 struct net_device *adj_dev,
 7717					 u16 ref_nr,
 7718					 struct list_head *dev_list)
 7719{
 7720	struct netdev_adjacent *adj;
 7721
 7722	pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
 7723		 dev->name, adj_dev->name, ref_nr);
 7724
 7725	adj = __netdev_find_adj(adj_dev, dev_list);
 7726
 7727	if (!adj) {
 7728		pr_err("Adjacency does not exist for device %s from %s\n",
 7729		       dev->name, adj_dev->name);
 7730		WARN_ON(1);
 7731		return;
 7732	}
 7733
 7734	if (adj->ref_nr > ref_nr) {
 7735		pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
 7736			 dev->name, adj_dev->name, ref_nr,
 7737			 adj->ref_nr - ref_nr);
 7738		adj->ref_nr -= ref_nr;
 7739		return;
 7740	}
 7741
 7742	if (adj->master)
 7743		sysfs_remove_link(&(dev->dev.kobj), "master");
 7744
 7745	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
 7746		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
 7747
 7748	list_del_rcu(&adj->list);
 7749	pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
 7750		 adj_dev->name, dev->name, adj_dev->name);
 7751	dev_put(adj_dev);
 7752	kfree_rcu(adj, rcu);
 7753}
 7754
 7755static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
 7756					    struct net_device *upper_dev,
 7757					    struct list_head *up_list,
 7758					    struct list_head *down_list,
 7759					    void *private, bool master)
 7760{
 7761	int ret;
 7762
 7763	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
 7764					   private, master);
 7765	if (ret)
 7766		return ret;
 7767
 7768	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
 7769					   private, false);
 7770	if (ret) {
 7771		__netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
 7772		return ret;
 7773	}
 7774
 7775	return 0;
 7776}
 7777
 7778static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
 7779					       struct net_device *upper_dev,
 7780					       u16 ref_nr,
 7781					       struct list_head *up_list,
 7782					       struct list_head *down_list)
 7783{
 7784	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
 7785	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
 7786}
 7787
 7788static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
 7789						struct net_device *upper_dev,
 7790						void *private, bool master)
 7791{
 7792	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
 7793						&dev->adj_list.upper,
 7794						&upper_dev->adj_list.lower,
 7795						private, master);
 7796}
 7797
 7798static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
 7799						   struct net_device *upper_dev)
 7800{
 7801	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
 7802					   &dev->adj_list.upper,
 7803					   &upper_dev->adj_list.lower);
 7804}
 7805
 7806static int __netdev_upper_dev_link(struct net_device *dev,
 7807				   struct net_device *upper_dev, bool master,
 7808				   void *upper_priv, void *upper_info,
 7809				   struct netdev_nested_priv *priv,
 7810				   struct netlink_ext_ack *extack)
 7811{
 7812	struct netdev_notifier_changeupper_info changeupper_info = {
 7813		.info = {
 7814			.dev = dev,
 7815			.extack = extack,
 7816		},
 7817		.upper_dev = upper_dev,
 7818		.master = master,
 7819		.linking = true,
 7820		.upper_info = upper_info,
 7821	};
 7822	struct net_device *master_dev;
 7823	int ret = 0;
 7824
 7825	ASSERT_RTNL();
 7826
 7827	if (dev == upper_dev)
 7828		return -EBUSY;
 7829
 7830	/* To prevent loops, check if dev is not upper device to upper_dev. */
 7831	if (__netdev_has_upper_dev(upper_dev, dev))
 7832		return -EBUSY;
 7833
 7834	if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
 7835		return -EMLINK;
 7836
 7837	if (!master) {
 7838		if (__netdev_has_upper_dev(dev, upper_dev))
 7839			return -EEXIST;
 7840	} else {
 7841		master_dev = __netdev_master_upper_dev_get(dev);
 7842		if (master_dev)
 7843			return master_dev == upper_dev ? -EEXIST : -EBUSY;
 7844	}
 7845
 7846	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
 7847					    &changeupper_info.info);
 7848	ret = notifier_to_errno(ret);
 7849	if (ret)
 7850		return ret;
 7851
 7852	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
 7853						   master);
 7854	if (ret)
 7855		return ret;
 7856
 7857	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
 7858					    &changeupper_info.info);
 7859	ret = notifier_to_errno(ret);
 7860	if (ret)
 7861		goto rollback;
 7862
 7863	__netdev_update_upper_level(dev, NULL);
 7864	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
 7865
 7866	__netdev_update_lower_level(upper_dev, priv);
 7867	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
 7868				    priv);
 7869
 7870	return 0;
 7871
 7872rollback:
 7873	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 7874
 7875	return ret;
 7876}
 7877
 7878/**
 7879 * netdev_upper_dev_link - Add a link to the upper device
 7880 * @dev: device
 7881 * @upper_dev: new upper device
 7882 * @extack: netlink extended ack
 7883 *
 7884 * Adds a link to device which is upper to this one. The caller must hold
 7885 * the RTNL lock. On a failure a negative errno code is returned.
 7886 * On success the reference counts are adjusted and the function
 7887 * returns zero.
 7888 */
 7889int netdev_upper_dev_link(struct net_device *dev,
 7890			  struct net_device *upper_dev,
 7891			  struct netlink_ext_ack *extack)
 7892{
 7893	struct netdev_nested_priv priv = {
 7894		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
 7895		.data = NULL,
 7896	};
 7897
 7898	return __netdev_upper_dev_link(dev, upper_dev, false,
 7899				       NULL, NULL, &priv, extack);
 7900}
 7901EXPORT_SYMBOL(netdev_upper_dev_link);
 7902
 7903/**
 7904 * netdev_master_upper_dev_link - Add a master link to the upper device
 7905 * @dev: device
 7906 * @upper_dev: new upper device
 7907 * @upper_priv: upper device private
 7908 * @upper_info: upper info to be passed down via notifier
 7909 * @extack: netlink extended ack
 7910 *
 7911 * Adds a link to device which is upper to this one. In this case, only
 7912 * one master upper device can be linked, although other non-master devices
 7913 * might be linked as well. The caller must hold the RTNL lock.
 7914 * On a failure a negative errno code is returned. On success the reference
 7915 * counts are adjusted and the function returns zero.
 7916 */
 7917int netdev_master_upper_dev_link(struct net_device *dev,
 7918				 struct net_device *upper_dev,
 7919				 void *upper_priv, void *upper_info,
 7920				 struct netlink_ext_ack *extack)
 7921{
 7922	struct netdev_nested_priv priv = {
 7923		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
 7924		.data = NULL,
 7925	};
 7926
 7927	return __netdev_upper_dev_link(dev, upper_dev, true,
 7928				       upper_priv, upper_info, &priv, extack);
 7929}
 7930EXPORT_SYMBOL(netdev_master_upper_dev_link);
 7931
 7932static void __netdev_upper_dev_unlink(struct net_device *dev,
 7933				      struct net_device *upper_dev,
 7934				      struct netdev_nested_priv *priv)
 7935{
 7936	struct netdev_notifier_changeupper_info changeupper_info = {
 7937		.info = {
 7938			.dev = dev,
 7939		},
 7940		.upper_dev = upper_dev,
 7941		.linking = false,
 7942	};
 7943
 7944	ASSERT_RTNL();
 7945
 7946	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
 7947
 7948	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
 7949				      &changeupper_info.info);
 7950
 7951	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 7952
 7953	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
 7954				      &changeupper_info.info);
 7955
 7956	__netdev_update_upper_level(dev, NULL);
 7957	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
 7958
 7959	__netdev_update_lower_level(upper_dev, priv);
 7960	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
 7961				    priv);
 7962}
 7963
 7964/**
 7965 * netdev_upper_dev_unlink - Removes a link to upper device
 7966 * @dev: device
 7967 * @upper_dev: new upper device
 7968 *
 7969 * Removes a link to device which is upper to this one. The caller must hold
 7970 * the RTNL lock.
 7971 */
 7972void netdev_upper_dev_unlink(struct net_device *dev,
 7973			     struct net_device *upper_dev)
 7974{
 7975	struct netdev_nested_priv priv = {
 7976		.flags = NESTED_SYNC_TODO,
 7977		.data = NULL,
 7978	};
 7979
 7980	__netdev_upper_dev_unlink(dev, upper_dev, &priv);
 7981}
 7982EXPORT_SYMBOL(netdev_upper_dev_unlink);
 7983
 7984static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
 7985				      struct net_device *lower_dev,
 7986				      bool val)
 7987{
 7988	struct netdev_adjacent *adj;
 7989
 7990	adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
 7991	if (adj)
 7992		adj->ignore = val;
 7993
 7994	adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
 7995	if (adj)
 7996		adj->ignore = val;
 7997}
 7998
 7999static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
 8000					struct net_device *lower_dev)
 8001{
 8002	__netdev_adjacent_dev_set(upper_dev, lower_dev, true);
 8003}
 8004
 8005static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
 8006				       struct net_device *lower_dev)
 8007{
 8008	__netdev_adjacent_dev_set(upper_dev, lower_dev, false);
 8009}
 8010
 8011int netdev_adjacent_change_prepare(struct net_device *old_dev,
 8012				   struct net_device *new_dev,
 8013				   struct net_device *dev,
 8014				   struct netlink_ext_ack *extack)
 8015{
 8016	struct netdev_nested_priv priv = {
 8017		.flags = 0,
 8018		.data = NULL,
 8019	};
 8020	int err;
 8021
 8022	if (!new_dev)
 8023		return 0;
 8024
 8025	if (old_dev && new_dev != old_dev)
 8026		netdev_adjacent_dev_disable(dev, old_dev);
 8027	err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
 8028				      extack);
 8029	if (err) {
 8030		if (old_dev && new_dev != old_dev)
 8031			netdev_adjacent_dev_enable(dev, old_dev);
 8032		return err;
 8033	}
 8034
 8035	return 0;
 8036}
 8037EXPORT_SYMBOL(netdev_adjacent_change_prepare);
 8038
 8039void netdev_adjacent_change_commit(struct net_device *old_dev,
 8040				   struct net_device *new_dev,
 8041				   struct net_device *dev)
 8042{
 8043	struct netdev_nested_priv priv = {
 8044		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
 8045		.data = NULL,
 8046	};
 8047
 8048	if (!new_dev || !old_dev)
 8049		return;
 8050
 8051	if (new_dev == old_dev)
 8052		return;
 8053
 8054	netdev_adjacent_dev_enable(dev, old_dev);
 8055	__netdev_upper_dev_unlink(old_dev, dev, &priv);
 8056}
 8057EXPORT_SYMBOL(netdev_adjacent_change_commit);
 8058
 8059void netdev_adjacent_change_abort(struct net_device *old_dev,
 8060				  struct net_device *new_dev,
 8061				  struct net_device *dev)
 8062{
 8063	struct netdev_nested_priv priv = {
 8064		.flags = 0,
 8065		.data = NULL,
 8066	};
 8067
 8068	if (!new_dev)
 8069		return;
 8070
 8071	if (old_dev && new_dev != old_dev)
 8072		netdev_adjacent_dev_enable(dev, old_dev);
 8073
 8074	__netdev_upper_dev_unlink(new_dev, dev, &priv);
 8075}
 8076EXPORT_SYMBOL(netdev_adjacent_change_abort);
 8077
 8078/**
 8079 * netdev_bonding_info_change - Dispatch event about slave change
 8080 * @dev: device
 8081 * @bonding_info: info to dispatch
 8082 *
 8083 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
 8084 * The caller must hold the RTNL lock.
 8085 */
 8086void netdev_bonding_info_change(struct net_device *dev,
 8087				struct netdev_bonding_info *bonding_info)
 8088{
 8089	struct netdev_notifier_bonding_info info = {
 8090		.info.dev = dev,
 8091	};
 8092
 8093	memcpy(&info.bonding_info, bonding_info,
 8094	       sizeof(struct netdev_bonding_info));
 8095	call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
 8096				      &info.info);
 8097}
 8098EXPORT_SYMBOL(netdev_bonding_info_change);
 8099
 8100/**
 8101 * netdev_get_xmit_slave - Get the xmit slave of master device
 8102 * @dev: device
 8103 * @skb: The packet
 8104 * @all_slaves: assume all the slaves are active
 8105 *
 8106 * The reference counters are not incremented so the caller must be
 8107 * careful with locks. The caller must hold RCU lock.
 8108 * %NULL is returned if no slave is found.
 8109 */
 8110
 8111struct net_device *netdev_get_xmit_slave(struct net_device *dev,
 8112					 struct sk_buff *skb,
 8113					 bool all_slaves)
 8114{
 8115	const struct net_device_ops *ops = dev->netdev_ops;
 8116
 8117	if (!ops->ndo_get_xmit_slave)
 8118		return NULL;
 8119	return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
 8120}
 8121EXPORT_SYMBOL(netdev_get_xmit_slave);
 8122
 8123static void netdev_adjacent_add_links(struct net_device *dev)
 8124{
 8125	struct netdev_adjacent *iter;
 8126
 8127	struct net *net = dev_net(dev);
 8128
 8129	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 8130		if (!net_eq(net, dev_net(iter->dev)))
 8131			continue;
 8132		netdev_adjacent_sysfs_add(iter->dev, dev,
 8133					  &iter->dev->adj_list.lower);
 8134		netdev_adjacent_sysfs_add(dev, iter->dev,
 8135					  &dev->adj_list.upper);
 8136	}
 8137
 8138	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 8139		if (!net_eq(net, dev_net(iter->dev)))
 8140			continue;
 8141		netdev_adjacent_sysfs_add(iter->dev, dev,
 8142					  &iter->dev->adj_list.upper);
 8143		netdev_adjacent_sysfs_add(dev, iter->dev,
 8144					  &dev->adj_list.lower);
 8145	}
 8146}
 8147
 8148static void netdev_adjacent_del_links(struct net_device *dev)
 8149{
 8150	struct netdev_adjacent *iter;
 8151
 8152	struct net *net = dev_net(dev);
 8153
 8154	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 8155		if (!net_eq(net, dev_net(iter->dev)))
 8156			continue;
 8157		netdev_adjacent_sysfs_del(iter->dev, dev->name,
 8158					  &iter->dev->adj_list.lower);
 8159		netdev_adjacent_sysfs_del(dev, iter->dev->name,
 8160					  &dev->adj_list.upper);
 8161	}
 8162
 8163	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 8164		if (!net_eq(net, dev_net(iter->dev)))
 8165			continue;
 8166		netdev_adjacent_sysfs_del(iter->dev, dev->name,
 8167					  &iter->dev->adj_list.upper);
 8168		netdev_adjacent_sysfs_del(dev, iter->dev->name,
 8169					  &dev->adj_list.lower);
 8170	}
 8171}
 8172
 8173void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
 8174{
 8175	struct netdev_adjacent *iter;
 8176
 8177	struct net *net = dev_net(dev);
 8178
 8179	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 8180		if (!net_eq(net, dev_net(iter->dev)))
 8181			continue;
 8182		netdev_adjacent_sysfs_del(iter->dev, oldname,
 8183					  &iter->dev->adj_list.lower);
 8184		netdev_adjacent_sysfs_add(iter->dev, dev,
 8185					  &iter->dev->adj_list.lower);
 8186	}
 8187
 8188	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 8189		if (!net_eq(net, dev_net(iter->dev)))
 8190			continue;
 8191		netdev_adjacent_sysfs_del(iter->dev, oldname,
 8192					  &iter->dev->adj_list.upper);
 8193		netdev_adjacent_sysfs_add(iter->dev, dev,
 8194					  &iter->dev->adj_list.upper);
 8195	}
 8196}
 8197
 8198void *netdev_lower_dev_get_private(struct net_device *dev,
 8199				   struct net_device *lower_dev)
 8200{
 8201	struct netdev_adjacent *lower;
 8202
 8203	if (!lower_dev)
 8204		return NULL;
 8205	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
 8206	if (!lower)
 8207		return NULL;
 8208
 8209	return lower->private;
 8210}
 8211EXPORT_SYMBOL(netdev_lower_dev_get_private);
 8212
 8213
 8214/**
 8215 * netdev_lower_state_changed - Dispatch event about lower device state change
 8216 * @lower_dev: device
 8217 * @lower_state_info: state to dispatch
 8218 *
 8219 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
 8220 * The caller must hold the RTNL lock.
 8221 */
 8222void netdev_lower_state_changed(struct net_device *lower_dev,
 8223				void *lower_state_info)
 8224{
 8225	struct netdev_notifier_changelowerstate_info changelowerstate_info = {
 8226		.info.dev = lower_dev,
 8227	};
 8228
 8229	ASSERT_RTNL();
 8230	changelowerstate_info.lower_state_info = lower_state_info;
 8231	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
 8232				      &changelowerstate_info.info);
 8233}
 8234EXPORT_SYMBOL(netdev_lower_state_changed);
 8235
 8236static void dev_change_rx_flags(struct net_device *dev, int flags)
 8237{
 8238	const struct net_device_ops *ops = dev->netdev_ops;
 8239
 8240	if (ops->ndo_change_rx_flags)
 8241		ops->ndo_change_rx_flags(dev, flags);
 8242}
 8243
 8244static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
 8245{
 8246	unsigned int old_flags = dev->flags;
 8247	kuid_t uid;
 8248	kgid_t gid;
 8249
 8250	ASSERT_RTNL();
 8251
 8252	dev->flags |= IFF_PROMISC;
 8253	dev->promiscuity += inc;
 8254	if (dev->promiscuity == 0) {
 8255		/*
 8256		 * Avoid overflow.
 8257		 * If inc causes overflow, untouch promisc and return error.
 8258		 */
 8259		if (inc < 0)
 8260			dev->flags &= ~IFF_PROMISC;
 8261		else {
 8262			dev->promiscuity -= inc;
 8263			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
 8264				dev->name);
 8265			return -EOVERFLOW;
 8266		}
 8267	}
 8268	if (dev->flags != old_flags) {
 8269		pr_info("device %s %s promiscuous mode\n",
 8270			dev->name,
 8271			dev->flags & IFF_PROMISC ? "entered" : "left");
 8272		if (audit_enabled) {
 8273			current_uid_gid(&uid, &gid);
 8274			audit_log(audit_context(), GFP_ATOMIC,
 8275				  AUDIT_ANOM_PROMISCUOUS,
 8276				  "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
 8277				  dev->name, (dev->flags & IFF_PROMISC),
 8278				  (old_flags & IFF_PROMISC),
 8279				  from_kuid(&init_user_ns, audit_get_loginuid(current)),
 8280				  from_kuid(&init_user_ns, uid),
 8281				  from_kgid(&init_user_ns, gid),
 8282				  audit_get_sessionid(current));
 8283		}
 8284
 8285		dev_change_rx_flags(dev, IFF_PROMISC);
 8286	}
 8287	if (notify)
 8288		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
 8289	return 0;
 8290}
 8291
 8292/**
 8293 *	dev_set_promiscuity	- update promiscuity count on a device
 8294 *	@dev: device
 8295 *	@inc: modifier
 8296 *
 8297 *	Add or remove promiscuity from a device. While the count in the device
 8298 *	remains above zero the interface remains promiscuous. Once it hits zero
 8299 *	the device reverts back to normal filtering operation. A negative inc
 8300 *	value is used to drop promiscuity on the device.
 8301 *	Return 0 if successful or a negative errno code on error.
 8302 */
 8303int dev_set_promiscuity(struct net_device *dev, int inc)
 8304{
 8305	unsigned int old_flags = dev->flags;
 8306	int err;
 8307
 8308	err = __dev_set_promiscuity(dev, inc, true);
 8309	if (err < 0)
 8310		return err;
 8311	if (dev->flags != old_flags)
 8312		dev_set_rx_mode(dev);
 8313	return err;
 8314}
 8315EXPORT_SYMBOL(dev_set_promiscuity);
 8316
 8317static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
 8318{
 8319	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
 8320
 8321	ASSERT_RTNL();
 8322
 8323	dev->flags |= IFF_ALLMULTI;
 8324	dev->allmulti += inc;
 8325	if (dev->allmulti == 0) {
 8326		/*
 8327		 * Avoid overflow.
 8328		 * If inc causes overflow, untouch allmulti and return error.
 8329		 */
 8330		if (inc < 0)
 8331			dev->flags &= ~IFF_ALLMULTI;
 8332		else {
 8333			dev->allmulti -= inc;
 8334			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
 8335				dev->name);
 8336			return -EOVERFLOW;
 8337		}
 8338	}
 8339	if (dev->flags ^ old_flags) {
 8340		dev_change_rx_flags(dev, IFF_ALLMULTI);
 8341		dev_set_rx_mode(dev);
 8342		if (notify)
 8343			__dev_notify_flags(dev, old_flags,
 8344					   dev->gflags ^ old_gflags);
 8345	}
 8346	return 0;
 8347}
 8348
 8349/**
 8350 *	dev_set_allmulti	- update allmulti count on a device
 8351 *	@dev: device
 8352 *	@inc: modifier
 8353 *
 8354 *	Add or remove reception of all multicast frames to a device. While the
 8355 *	count in the device remains above zero the interface remains listening
 8356 *	to all interfaces. Once it hits zero the device reverts back to normal
 8357 *	filtering operation. A negative @inc value is used to drop the counter
 8358 *	when releasing a resource needing all multicasts.
 8359 *	Return 0 if successful or a negative errno code on error.
 8360 */
 8361
 8362int dev_set_allmulti(struct net_device *dev, int inc)
 8363{
 8364	return __dev_set_allmulti(dev, inc, true);
 8365}
 8366EXPORT_SYMBOL(dev_set_allmulti);
 8367
 8368/*
 8369 *	Upload unicast and multicast address lists to device and
 8370 *	configure RX filtering. When the device doesn't support unicast
 8371 *	filtering it is put in promiscuous mode while unicast addresses
 8372 *	are present.
 8373 */
 8374void __dev_set_rx_mode(struct net_device *dev)
 8375{
 8376	const struct net_device_ops *ops = dev->netdev_ops;
 8377
 8378	/* dev_open will call this function so the list will stay sane. */
 8379	if (!(dev->flags&IFF_UP))
 8380		return;
 8381
 8382	if (!netif_device_present(dev))
 8383		return;
 8384
 8385	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
 8386		/* Unicast addresses changes may only happen under the rtnl,
 8387		 * therefore calling __dev_set_promiscuity here is safe.
 8388		 */
 8389		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
 8390			__dev_set_promiscuity(dev, 1, false);
 8391			dev->uc_promisc = true;
 8392		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
 8393			__dev_set_promiscuity(dev, -1, false);
 8394			dev->uc_promisc = false;
 8395		}
 8396	}
 8397
 8398	if (ops->ndo_set_rx_mode)
 8399		ops->ndo_set_rx_mode(dev);
 8400}
 8401
/*
 *	Locked wrapper around __dev_set_rx_mode(): takes the device
 *	address lock (BH-disabling variant) for the duration of the call.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
 8408
 8409/**
 8410 *	dev_get_flags - get flags reported to userspace
 8411 *	@dev: device
 8412 *
 8413 *	Get the combination of flag bits exported through APIs to userspace.
 8414 */
 8415unsigned int dev_get_flags(const struct net_device *dev)
 8416{
 8417	unsigned int flags;
 8418
 8419	flags = (dev->flags & ~(IFF_PROMISC |
 8420				IFF_ALLMULTI |
 8421				IFF_RUNNING |
 8422				IFF_LOWER_UP |
 8423				IFF_DORMANT)) |
 8424		(dev->gflags & (IFF_PROMISC |
 8425				IFF_ALLMULTI));
 8426
 8427	if (netif_running(dev)) {
 8428		if (netif_oper_up(dev))
 8429			flags |= IFF_RUNNING;
 8430		if (netif_carrier_ok(dev))
 8431			flags |= IFF_LOWER_UP;
 8432		if (netif_dormant(dev))
 8433			flags |= IFF_DORMANT;
 8434	}
 8435
 8436	return flags;
 8437}
 8438EXPORT_SYMBOL(dev_get_flags);
 8439
 8440int __dev_change_flags(struct net_device *dev, unsigned int flags,
 8441		       struct netlink_ext_ack *extack)
 8442{
 8443	unsigned int old_flags = dev->flags;
 8444	int ret;
 8445
 8446	ASSERT_RTNL();
 8447
 8448	/*
 8449	 *	Set the flags on our device.
 8450	 */
 8451
 8452	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
 8453			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
 8454			       IFF_AUTOMEDIA)) |
 8455		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
 8456				    IFF_ALLMULTI));
 8457
 8458	/*
 8459	 *	Load in the correct multicast list now the flags have changed.
 8460	 */
 8461
 8462	if ((old_flags ^ flags) & IFF_MULTICAST)
 8463		dev_change_rx_flags(dev, IFF_MULTICAST);
 8464
 8465	dev_set_rx_mode(dev);
 8466
 8467	/*
 8468	 *	Have we downed the interface. We handle IFF_UP ourselves
 8469	 *	according to user attempts to set it, rather than blindly
 8470	 *	setting it.
 8471	 */
 8472
 8473	ret = 0;
 8474	if ((old_flags ^ flags) & IFF_UP) {
 8475		if (old_flags & IFF_UP)
 8476			__dev_close(dev);
 8477		else
 8478			ret = __dev_open(dev, extack);
 8479	}
 8480
 8481	if ((flags ^ dev->gflags) & IFF_PROMISC) {
 8482		int inc = (flags & IFF_PROMISC) ? 1 : -1;
 8483		unsigned int old_flags = dev->flags;
 8484
 8485		dev->gflags ^= IFF_PROMISC;
 8486
 8487		if (__dev_set_promiscuity(dev, inc, false) >= 0)
 8488			if (dev->flags != old_flags)
 8489				dev_set_rx_mode(dev);
 8490	}
 8491
 8492	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
 8493	 * is important. Some (broken) drivers set IFF_PROMISC, when
 8494	 * IFF_ALLMULTI is requested not asking us and not reporting.
 8495	 */
 8496	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
 8497		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
 8498
 8499		dev->gflags ^= IFF_ALLMULTI;
 8500		__dev_set_allmulti(dev, inc, false);
 8501	}
 8502
 8503	return ret;
 8504}
 8505
 8506void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
 8507			unsigned int gchanges)
 8508{
 8509	unsigned int changes = dev->flags ^ old_flags;
 8510
 8511	if (gchanges)
 8512		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
 8513
 8514	if (changes & IFF_UP) {
 8515		if (dev->flags & IFF_UP)
 8516			call_netdevice_notifiers(NETDEV_UP, dev);
 8517		else
 8518			call_netdevice_notifiers(NETDEV_DOWN, dev);
 8519	}
 8520
 8521	if (dev->flags & IFF_UP &&
 8522	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
 8523		struct netdev_notifier_change_info change_info = {
 8524			.info = {
 8525				.dev = dev,
 8526			},
 8527			.flags_changed = changes,
 8528		};
 8529
 8530		call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
 8531	}
 8532}
 8533
 8534/**
 8535 *	dev_change_flags - change device settings
 8536 *	@dev: device
 8537 *	@flags: device state flags
 8538 *	@extack: netlink extended ack
 8539 *
 8540 *	Change settings on device based state flags. The flags are
 8541 *	in the userspace exported format.
 8542 */
 8543int dev_change_flags(struct net_device *dev, unsigned int flags,
 8544		     struct netlink_ext_ack *extack)
 8545{
 8546	int ret;
 8547	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
 8548
 8549	ret = __dev_change_flags(dev, flags, extack);
 8550	if (ret < 0)
 8551		return ret;
 8552
 8553	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
 8554	__dev_notify_flags(dev, old_flags, changes);
 8555	return ret;
 8556}
 8557EXPORT_SYMBOL(dev_change_flags);
 8558
 8559int __dev_set_mtu(struct net_device *dev, int new_mtu)
 8560{
 8561	const struct net_device_ops *ops = dev->netdev_ops;
 8562
 8563	if (ops->ndo_change_mtu)
 8564		return ops->ndo_change_mtu(dev, new_mtu);
 8565
 8566	/* Pairs with all the lockless reads of dev->mtu in the stack */
 8567	WRITE_ONCE(dev->mtu, new_mtu);
 8568	return 0;
 8569}
 8570EXPORT_SYMBOL(__dev_set_mtu);
 8571
 8572int dev_validate_mtu(struct net_device *dev, int new_mtu,
 8573		     struct netlink_ext_ack *extack)
 8574{
 8575	/* MTU must be positive, and in range */
 8576	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
 8577		NL_SET_ERR_MSG(extack, "mtu less than device minimum");
 8578		return -EINVAL;
 8579	}
 8580
 8581	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
 8582		NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
 8583		return -EINVAL;
 8584	}
 8585	return 0;
 8586}
 8587
 8588/**
 8589 *	dev_set_mtu_ext - Change maximum transfer unit
 8590 *	@dev: device
 8591 *	@new_mtu: new transfer unit
 8592 *	@extack: netlink extended ack
 8593 *
 8594 *	Change the maximum transfer size of the network device.
 8595 */
 8596int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
 8597		    struct netlink_ext_ack *extack)
 8598{
 8599	int err, orig_mtu;
 8600
 8601	if (new_mtu == dev->mtu)
 8602		return 0;
 8603
 8604	err = dev_validate_mtu(dev, new_mtu, extack);
 8605	if (err)
 8606		return err;
 8607
 8608	if (!netif_device_present(dev))
 8609		return -ENODEV;
 8610
 8611	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
 8612	err = notifier_to_errno(err);
 8613	if (err)
 8614		return err;
 8615
 8616	orig_mtu = dev->mtu;
 8617	err = __dev_set_mtu(dev, new_mtu);
 8618
 8619	if (!err) {
 8620		err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
 8621						   orig_mtu);
 8622		err = notifier_to_errno(err);
 8623		if (err) {
 8624			/* setting mtu back and notifying everyone again,
 8625			 * so that they have a chance to revert changes.
 8626			 */
 8627			__dev_set_mtu(dev, orig_mtu);
 8628			call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
 8629						     new_mtu);
 8630		}
 8631	}
 8632	return err;
 8633}
 8634
 8635int dev_set_mtu(struct net_device *dev, int new_mtu)
 8636{
 8637	struct netlink_ext_ack extack;
 8638	int err;
 8639
 8640	memset(&extack, 0, sizeof(extack));
 8641	err = dev_set_mtu_ext(dev, new_mtu, &extack);
 8642	if (err && extack._msg)
 8643		net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
 8644	return err;
 8645}
 8646EXPORT_SYMBOL(dev_set_mtu);
 8647
 8648/**
 8649 *	dev_change_tx_queue_len - Change TX queue length of a netdevice
 8650 *	@dev: device
 8651 *	@new_len: new tx queue length
 8652 */
 8653int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
 8654{
 8655	unsigned int orig_len = dev->tx_queue_len;
 8656	int res;
 8657
 8658	if (new_len != (unsigned int)new_len)
 8659		return -ERANGE;
 8660
 8661	if (new_len != orig_len) {
 8662		dev->tx_queue_len = new_len;
 8663		res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
 8664		res = notifier_to_errno(res);
 8665		if (res)
 8666			goto err_rollback;
 8667		res = dev_qdisc_change_tx_queue_len(dev);
 8668		if (res)
 8669			goto err_rollback;
 8670	}
 8671
 8672	return 0;
 8673
 8674err_rollback:
 8675	netdev_err(dev, "refused to change device tx_queue_len\n");
 8676	dev->tx_queue_len = orig_len;
 8677	return res;
 8678}
 8679
/**
 *	dev_set_group - Change group this device belongs to
 *	@dev: device
 *	@new_group: group this device should belong to
 *
 *	Stores @new_group in dev->group; no validation or notification
 *	is performed here.
 */
void dev_set_group(struct net_device *dev, int new_group)
{
	dev->group = new_group;
}
EXPORT_SYMBOL(dev_set_group);
 8690
 8691/**
 8692 *	dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
 8693 *	@dev: device
 8694 *	@addr: new address
 8695 *	@extack: netlink extended ack
 8696 */
 8697int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
 8698			      struct netlink_ext_ack *extack)
 8699{
 8700	struct netdev_notifier_pre_changeaddr_info info = {
 8701		.info.dev = dev,
 8702		.info.extack = extack,
 8703		.dev_addr = addr,
 8704	};
 8705	int rc;
 8706
 8707	rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
 8708	return notifier_to_errno(rc);
 8709}
 8710EXPORT_SYMBOL(dev_pre_changeaddr_notify);
 8711
 8712/**
 8713 *	dev_set_mac_address - Change Media Access Control Address
 8714 *	@dev: device
 8715 *	@sa: new address
 8716 *	@extack: netlink extended ack
 8717 *
 8718 *	Change the hardware (MAC) address of the device
 8719 */
 8720int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
 8721			struct netlink_ext_ack *extack)
 8722{
 8723	const struct net_device_ops *ops = dev->netdev_ops;
 8724	int err;
 8725
 8726	if (!ops->ndo_set_mac_address)
 8727		return -EOPNOTSUPP;
 8728	if (sa->sa_family != dev->type)
 8729		return -EINVAL;
 8730	if (!netif_device_present(dev))
 8731		return -ENODEV;
 8732	err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
 8733	if (err)
 8734		return err;
 8735	err = ops->ndo_set_mac_address(dev, sa);
 8736	if (err)
 8737		return err;
 8738	dev->addr_assign_type = NET_ADDR_SET;
 8739	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
 8740	add_device_randomness(dev->dev_addr, dev->addr_len);
 8741	return 0;
 8742}
 8743EXPORT_SYMBOL(dev_set_mac_address);
 8744
 8745/**
 8746 *	dev_change_carrier - Change device carrier
 8747 *	@dev: device
 8748 *	@new_carrier: new value
 8749 *
 8750 *	Change device carrier
 8751 */
 8752int dev_change_carrier(struct net_device *dev, bool new_carrier)
 8753{
 8754	const struct net_device_ops *ops = dev->netdev_ops;
 8755
 8756	if (!ops->ndo_change_carrier)
 8757		return -EOPNOTSUPP;
 8758	if (!netif_device_present(dev))
 8759		return -ENODEV;
 8760	return ops->ndo_change_carrier(dev, new_carrier);
 8761}
 8762EXPORT_SYMBOL(dev_change_carrier);
 8763
 8764/**
 8765 *	dev_get_phys_port_id - Get device physical port ID
 8766 *	@dev: device
 8767 *	@ppid: port ID
 8768 *
 8769 *	Get device physical port ID
 8770 */
 8771int dev_get_phys_port_id(struct net_device *dev,
 8772			 struct netdev_phys_item_id *ppid)
 8773{
 8774	const struct net_device_ops *ops = dev->netdev_ops;
 8775
 8776	if (!ops->ndo_get_phys_port_id)
 8777		return -EOPNOTSUPP;
 8778	return ops->ndo_get_phys_port_id(dev, ppid);
 8779}
 8780EXPORT_SYMBOL(dev_get_phys_port_id);
 8781
 8782/**
 8783 *	dev_get_phys_port_name - Get device physical port name
 8784 *	@dev: device
 8785 *	@name: port name
 8786 *	@len: limit of bytes to copy to name
 8787 *
 8788 *	Get device physical port name
 8789 */
 8790int dev_get_phys_port_name(struct net_device *dev,
 8791			   char *name, size_t len)
 8792{
 8793	const struct net_device_ops *ops = dev->netdev_ops;
 8794	int err;
 8795
 8796	if (ops->ndo_get_phys_port_name) {
 8797		err = ops->ndo_get_phys_port_name(dev, name, len);
 8798		if (err != -EOPNOTSUPP)
 8799			return err;
 8800	}
 8801	return devlink_compat_phys_port_name_get(dev, name, len);
 8802}
 8803EXPORT_SYMBOL(dev_get_phys_port_name);
 8804
 8805/**
 8806 *	dev_get_port_parent_id - Get the device's port parent identifier
 8807 *	@dev: network device
 8808 *	@ppid: pointer to a storage for the port's parent identifier
 8809 *	@recurse: allow/disallow recursion to lower devices
 8810 *
 8811 *	Get the devices's port parent identifier
 8812 */
 8813int dev_get_port_parent_id(struct net_device *dev,
 8814			   struct netdev_phys_item_id *ppid,
 8815			   bool recurse)
 8816{
 8817	const struct net_device_ops *ops = dev->netdev_ops;
 8818	struct netdev_phys_item_id first = { };
 8819	struct net_device *lower_dev;
 8820	struct list_head *iter;
 8821	int err;
 8822
 8823	if (ops->ndo_get_port_parent_id) {
 8824		err = ops->ndo_get_port_parent_id(dev, ppid);
 8825		if (err != -EOPNOTSUPP)
 8826			return err;
 8827	}
 8828
 8829	err = devlink_compat_switch_id_get(dev, ppid);
 8830	if (!err || err != -EOPNOTSUPP)
 8831		return err;
 8832
 8833	if (!recurse)
 8834		return -EOPNOTSUPP;
 8835
 8836	netdev_for_each_lower_dev(dev, lower_dev, iter) {
 8837		err = dev_get_port_parent_id(lower_dev, ppid, recurse);
 8838		if (err)
 8839			break;
 8840		if (!first.id_len)
 8841			first = *ppid;
 8842		else if (memcmp(&first, ppid, sizeof(*ppid)))
 8843			return -EOPNOTSUPP;
 8844	}
 8845
 8846	return err;
 8847}
 8848EXPORT_SYMBOL(dev_get_port_parent_id);
 8849
 8850/**
 8851 *	netdev_port_same_parent_id - Indicate if two network devices have
 8852 *	the same port parent identifier
 8853 *	@a: first network device
 8854 *	@b: second network device
 8855 */
 8856bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
 8857{
 8858	struct netdev_phys_item_id a_id = { };
 8859	struct netdev_phys_item_id b_id = { };
 8860
 8861	if (dev_get_port_parent_id(a, &a_id, true) ||
 8862	    dev_get_port_parent_id(b, &b_id, true))
 8863		return false;
 8864
 8865	return netdev_phys_item_id_same(&a_id, &b_id);
 8866}
 8867EXPORT_SYMBOL(netdev_port_same_parent_id);
 8868
 8869/**
 8870 *	dev_change_proto_down - update protocol port state information
 8871 *	@dev: device
 8872 *	@proto_down: new value
 8873 *
 8874 *	This info can be used by switch drivers to set the phys state of the
 8875 *	port.
 8876 */
 8877int dev_change_proto_down(struct net_device *dev, bool proto_down)
 8878{
 8879	const struct net_device_ops *ops = dev->netdev_ops;
 8880
 8881	if (!ops->ndo_change_proto_down)
 8882		return -EOPNOTSUPP;
 8883	if (!netif_device_present(dev))
 8884		return -ENODEV;
 8885	return ops->ndo_change_proto_down(dev, proto_down);
 8886}
 8887EXPORT_SYMBOL(dev_change_proto_down);
 8888
 8889/**
 8890 *	dev_change_proto_down_generic - generic implementation for
 8891 * 	ndo_change_proto_down that sets carrier according to
 8892 * 	proto_down.
 8893 *
 8894 *	@dev: device
 8895 *	@proto_down: new value
 8896 */
 8897int dev_change_proto_down_generic(struct net_device *dev, bool proto_down)
 8898{
 8899	if (proto_down)
 8900		netif_carrier_off(dev);
 8901	else
 8902		netif_carrier_on(dev);
 8903	dev->proto_down = proto_down;
 8904	return 0;
 8905}
 8906EXPORT_SYMBOL(dev_change_proto_down_generic);
 8907
 8908/**
 8909 *	dev_change_proto_down_reason - proto down reason
 8910 *
 8911 *	@dev: device
 8912 *	@mask: proto down mask
 8913 *	@value: proto down value
 8914 */
 8915void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
 8916				  u32 value)
 8917{
 8918	int b;
 8919
 8920	if (!mask) {
 8921		dev->proto_down_reason = value;
 8922	} else {
 8923		for_each_set_bit(b, &mask, 32) {
 8924			if (value & (1 << b))
 8925				dev->proto_down_reason |= BIT(b);
 8926			else
 8927				dev->proto_down_reason &= ~BIT(b);
 8928		}
 8929	}
 8930}
 8931EXPORT_SYMBOL(dev_change_proto_down_reason);
 8932
/* XDP flavour of a BPF link: ties a bpf_link to a net_device. */
struct bpf_xdp_link {
	/* generic link; the callbacks recover this struct via container_of() */
	struct bpf_link link;
	struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
	/* XDP_FLAGS_* given at link creation; selects the attach mode */
	int flags;
};
 8938
 8939static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
 8940{
 8941	if (flags & XDP_FLAGS_HW_MODE)
 8942		return XDP_MODE_HW;
 8943	if (flags & XDP_FLAGS_DRV_MODE)
 8944		return XDP_MODE_DRV;
 8945	if (flags & XDP_FLAGS_SKB_MODE)
 8946		return XDP_MODE_SKB;
 8947	return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
 8948}
 8949
 8950static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
 8951{
 8952	switch (mode) {
 8953	case XDP_MODE_SKB:
 8954		return generic_xdp_install;
 8955	case XDP_MODE_DRV:
 8956	case XDP_MODE_HW:
 8957		return dev->netdev_ops->ndo_bpf;
 8958	default:
 8959		return NULL;
 8960	}
 8961}
 8962
/* Return the BPF link recorded for @mode on @dev, or NULL if none. */
static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
					 enum bpf_xdp_mode mode)
{
	return dev->xdp_state[mode].link;
}
 8968
 8969static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
 8970				     enum bpf_xdp_mode mode)
 8971{
 8972	struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
 8973
 8974	if (link)
 8975		return link->link.prog;
 8976	return dev->xdp_state[mode].prog;
 8977}
 8978
 8979static u8 dev_xdp_prog_count(struct net_device *dev)
 8980{
 8981	u8 count = 0;
 8982	int i;
 8983
 8984	for (i = 0; i < __MAX_XDP_MODE; i++)
 8985		if (dev->xdp_state[i].prog || dev->xdp_state[i].link)
 8986			count++;
 8987	return count;
 8988}
 8989
 8990u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
 8991{
 8992	struct bpf_prog *prog = dev_xdp_prog(dev, mode);
 8993
 8994	return prog ? prog->aux->id : 0;
 8995}
 8996
 8997static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
 8998			     struct bpf_xdp_link *link)
 8999{
 9000	dev->xdp_state[mode].link = link;
 9001	dev->xdp_state[mode].prog = NULL;
 9002}
 9003
 9004static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
 9005			     struct bpf_prog *prog)
 9006{
 9007	dev->xdp_state[mode].link = NULL;
 9008	dev->xdp_state[mode].prog = prog;
 9009}
 9010
 9011static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
 9012			   bpf_op_t bpf_op, struct netlink_ext_ack *extack,
 9013			   u32 flags, struct bpf_prog *prog)
 9014{
 9015	struct netdev_bpf xdp;
 9016	int err;
 9017
 9018	memset(&xdp, 0, sizeof(xdp));
 9019	xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
 9020	xdp.extack = extack;
 9021	xdp.flags = flags;
 9022	xdp.prog = prog;
 9023
 9024	/* Drivers assume refcnt is already incremented (i.e, prog pointer is
 9025	 * "moved" into driver), so they don't increment it on their own, but
 9026	 * they do decrement refcnt when program is detached or replaced.
 9027	 * Given net_device also owns link/prog, we need to bump refcnt here
 9028	 * to prevent drivers from underflowing it.
 9029	 */
 9030	if (prog)
 9031		bpf_prog_inc(prog);
 9032	err = bpf_op(dev, &xdp);
 9033	if (err) {
 9034		if (prog)
 9035			bpf_prog_put(prog);
 9036		return err;
 9037	}
 9038
 9039	if (mode != XDP_MODE_HW)
 9040		bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
 9041
 9042	return 0;
 9043}
 9044
 9045static void dev_xdp_uninstall(struct net_device *dev)
 9046{
 9047	struct bpf_xdp_link *link;
 9048	struct bpf_prog *prog;
 9049	enum bpf_xdp_mode mode;
 9050	bpf_op_t bpf_op;
 9051
 9052	ASSERT_RTNL();
 9053
 9054	for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
 9055		prog = dev_xdp_prog(dev, mode);
 9056		if (!prog)
 9057			continue;
 9058
 9059		bpf_op = dev_xdp_bpf_op(dev, mode);
 9060		if (!bpf_op)
 9061			continue;
 9062
 9063		WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
 9064
 9065		/* auto-detach link from net device */
 9066		link = dev_xdp_link(dev, mode);
 9067		if (link)
 9068			link->dev = NULL;
 9069		else
 9070			bpf_prog_put(prog);
 9071
 9072		dev_xdp_set_link(dev, mode, NULL);
 9073	}
 9074}
 9075
 9076static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
 9077			  struct bpf_xdp_link *link, struct bpf_prog *new_prog,
 9078			  struct bpf_prog *old_prog, u32 flags)
 9079{
 9080	unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
 9081	struct bpf_prog *cur_prog;
 9082	enum bpf_xdp_mode mode;
 9083	bpf_op_t bpf_op;
 9084	int err;
 9085
 9086	ASSERT_RTNL();
 9087
 9088	/* either link or prog attachment, never both */
 9089	if (link && (new_prog || old_prog))
 9090		return -EINVAL;
 9091	/* link supports only XDP mode flags */
 9092	if (link && (flags & ~XDP_FLAGS_MODES)) {
 9093		NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
 9094		return -EINVAL;
 9095	}
 9096	/* just one XDP mode bit should be set, zero defaults to drv/skb mode */
 9097	if (num_modes > 1) {
 9098		NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
 9099		return -EINVAL;
 9100	}
 9101	/* avoid ambiguity if offload + drv/skb mode progs are both loaded */
 9102	if (!num_modes && dev_xdp_prog_count(dev) > 1) {
 9103		NL_SET_ERR_MSG(extack,
 9104			       "More than one program loaded, unset mode is ambiguous");
 9105		return -EINVAL;
 9106	}
 9107	/* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
 9108	if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
 9109		NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
 9110		return -EINVAL;
 9111	}
 9112
 9113	mode = dev_xdp_mode(dev, flags);
 9114	/* can't replace attached link */
 9115	if (dev_xdp_link(dev, mode)) {
 9116		NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
 9117		return -EBUSY;
 9118	}
 9119
 9120	cur_prog = dev_xdp_prog(dev, mode);
 9121	/* can't replace attached prog with link */
 9122	if (link && cur_prog) {
 9123		NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
 9124		return -EBUSY;
 9125	}
 9126	if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
 9127		NL_SET_ERR_MSG(extack, "Active program does not match expected");
 9128		return -EEXIST;
 9129	}
 9130
 9131	/* put effective new program into new_prog */
 9132	if (link)
 9133		new_prog = link->link.prog;
 9134
 9135	if (new_prog) {
 9136		bool offload = mode == XDP_MODE_HW;
 9137		enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
 9138					       ? XDP_MODE_DRV : XDP_MODE_SKB;
 9139
 9140		if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
 9141			NL_SET_ERR_MSG(extack, "XDP program already attached");
 9142			return -EBUSY;
 9143		}
 9144		if (!offload && dev_xdp_prog(dev, other_mode)) {
 9145			NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
 9146			return -EEXIST;
 9147		}
 9148		if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) {
 9149			NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported");
 9150			return -EINVAL;
 9151		}
 9152		if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
 9153			NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
 9154			return -EINVAL;
 9155		}
 9156		if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
 9157			NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
 9158			return -EINVAL;
 9159		}
 9160	}
 9161
 9162	/* don't call drivers if the effective program didn't change */
 9163	if (new_prog != cur_prog) {
 9164		bpf_op = dev_xdp_bpf_op(dev, mode);
 9165		if (!bpf_op) {
 9166			NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
 9167			return -EOPNOTSUPP;
 9168		}
 9169
 9170		err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
 9171		if (err)
 9172			return err;
 9173	}
 9174
 9175	if (link)
 9176		dev_xdp_set_link(dev, mode, link);
 9177	else
 9178		dev_xdp_set_prog(dev, mode, new_prog);
 9179	if (cur_prog)
 9180		bpf_prog_put(cur_prog);
 9181
 9182	return 0;
 9183}
 9184
/*
 * Attach @link to @dev using the flags stored in the link. Thin wrapper
 * around dev_xdp_attach() with no bare programs supplied.
 */
static int dev_xdp_attach_link(struct net_device *dev,
			       struct netlink_ext_ack *extack,
			       struct bpf_xdp_link *link)
{
	return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
}
 9191
 9192static int dev_xdp_detach_link(struct net_device *dev,
 9193			       struct netlink_ext_ack *extack,
 9194			       struct bpf_xdp_link *link)
 9195{
 9196	enum bpf_xdp_mode mode;
 9197	bpf_op_t bpf_op;
 9198
 9199	ASSERT_RTNL();
 9200
 9201	mode = dev_xdp_mode(dev, link->flags);
 9202	if (dev_xdp_link(dev, mode) != link)
 9203		return -EINVAL;
 9204
 9205	bpf_op = dev_xdp_bpf_op(dev, mode);
 9206	WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
 9207	dev_xdp_set_link(dev, mode, NULL);
 9208	return 0;
 9209}
 9210
 9211static void bpf_xdp_link_release(struct bpf_link *link)
 9212{
 9213	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9214
 9215	rtnl_lock();
 9216
 9217	/* if racing with net_device's tear down, xdp_link->dev might be
 9218	 * already NULL, in which case link was already auto-detached
 9219	 */
 9220	if (xdp_link->dev) {
 9221		WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
 9222		xdp_link->dev = NULL;
 9223	}
 9224
 9225	rtnl_unlock();
 9226}
 9227
/*
 * bpf_link_ops.detach: same work as release; always reports success
 * (an already-detached link is a no-op in bpf_xdp_link_release()).
 */
static int bpf_xdp_link_detach(struct bpf_link *link)
{
	bpf_xdp_link_release(link);
	return 0;
}
 9233
 9234static void bpf_xdp_link_dealloc(struct bpf_link *link)
 9235{
 9236	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9237
 9238	kfree(xdp_link);
 9239}
 9240
 9241static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
 9242				     struct seq_file *seq)
 9243{
 9244	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9245	u32 ifindex = 0;
 9246
 9247	rtnl_lock();
 9248	if (xdp_link->dev)
 9249		ifindex = xdp_link->dev->ifindex;
 9250	rtnl_unlock();
 9251
 9252	seq_printf(seq, "ifindex:\t%u\n", ifindex);
 9253}
 9254
 9255static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
 9256				       struct bpf_link_info *info)
 9257{
 9258	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9259	u32 ifindex = 0;
 9260
 9261	rtnl_lock();
 9262	if (xdp_link->dev)
 9263		ifindex = xdp_link->dev->ifindex;
 9264	rtnl_unlock();
 9265
 9266	info->xdp.ifindex = ifindex;
 9267	return 0;
 9268}
 9269
 9270static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
 9271			       struct bpf_prog *old_prog)
 9272{
 9273	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9274	enum bpf_xdp_mode mode;
 9275	bpf_op_t bpf_op;
 9276	int err = 0;
 9277
 9278	rtnl_lock();
 9279
 9280	/* link might have been auto-released already, so fail */
 9281	if (!xdp_link->dev) {
 9282		err = -ENOLINK;
 9283		goto out_unlock;
 9284	}
 9285
 9286	if (old_prog && link->prog != old_prog) {
 9287		err = -EPERM;
 9288		goto out_unlock;
 9289	}
 9290	old_prog = link->prog;
 9291	if (old_prog == new_prog) {
 9292		/* no-op, don't disturb drivers */
 9293		bpf_prog_put(new_prog);
 9294		goto out_unlock;
 9295	}
 9296
 9297	mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
 9298	bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
 9299	err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
 9300			      xdp_link->flags, new_prog);
 9301	if (err)
 9302		goto out_unlock;
 9303
 9304	old_prog = xchg(&link->prog, new_prog);
 9305	bpf_prog_put(old_prog);
 9306
 9307out_unlock:
 9308	rtnl_unlock();
 9309	return err;
 9310}
 9311
/* Link operations installed on every XDP link by bpf_xdp_link_attach(). */
static const struct bpf_link_ops bpf_xdp_link_lops = {
	/* detach from the device (under RTNL) */
	.release = bpf_xdp_link_release,
	/* free the containing bpf_xdp_link */
	.dealloc = bpf_xdp_link_dealloc,
	/* explicit detach; same as release, returns 0 */
	.detach = bpf_xdp_link_detach,
	/* prints "ifindex" */
	.show_fdinfo = bpf_xdp_link_show_fdinfo,
	/* fills info->xdp.ifindex */
	.fill_link_info = bpf_xdp_link_fill_link_info,
	/* swaps the attached program */
	.update_prog = bpf_xdp_link_update,
};
 9320
 9321int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 9322{
 9323	struct net *net = current->nsproxy->net_ns;
 9324	struct bpf_link_primer link_primer;
 9325	struct bpf_xdp_link *link;
 9326	struct net_device *dev;
 9327	int err, fd;
 9328
 9329	dev = dev_get_by_index(net, attr->link_create.target_ifindex);
 9330	if (!dev)
 9331		return -EINVAL;
 9332
 9333	link = kzalloc(sizeof(*link), GFP_USER);
 9334	if (!link) {
 9335		err = -ENOMEM;
 9336		goto out_put_dev;
 9337	}
 9338
 9339	bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
 9340	link->dev = dev;
 9341	link->flags = attr->link_create.flags;
 9342
 9343	err = bpf_link_prime(&link->link, &link_primer);
 9344	if (err) {
 9345		kfree(link);
 9346		goto out_put_dev;
 9347	}
 9348
 9349	rtnl_lock();
 9350	err = dev_xdp_attach_link(dev, NULL, link);
 9351	rtnl_unlock();
 9352
 9353	if (err) {
 9354		bpf_link_cleanup(&link_primer);
 9355		goto out_put_dev;
 9356	}
 9357
 9358	fd = bpf_link_settle(&link_primer);
 9359	/* link itself doesn't hold dev's refcnt to not complicate shutdown */
 9360	dev_put(dev);
 9361	return fd;
 9362
 9363out_put_dev:
 9364	dev_put(dev);
 9365	return err;
 9366}
 9367
 9368/**
 9369 *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
 9370 *	@dev: device
 9371 *	@extack: netlink extended ack
 9372 *	@fd: new program fd or negative value to clear
 9373 *	@expected_fd: old program fd that userspace expects to replace or clear
 9374 *	@flags: xdp-related flags
 9375 *
 9376 *	Set or clear a bpf program for a device
 9377 */
 9378int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 9379		      int fd, int expected_fd, u32 flags)
 9380{
 9381	enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
 9382	struct bpf_prog *new_prog = NULL, *old_prog = NULL;
 9383	int err;
 9384
 9385	ASSERT_RTNL();
 9386
 9387	if (fd >= 0) {
 9388		new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
 9389						 mode != XDP_MODE_SKB);
 9390		if (IS_ERR(new_prog))
 9391			return PTR_ERR(new_prog);
 9392	}
 9393
 9394	if (expected_fd >= 0) {
 9395		old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
 9396						 mode != XDP_MODE_SKB);
 9397		if (IS_ERR(old_prog)) {
 9398			err = PTR_ERR(old_prog);
 9399			old_prog = NULL;
 9400			goto err_out;
 9401		}
 9402	}
 9403
 9404	err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
 9405
 9406err_out:
 9407	if (err && new_prog)
 9408		bpf_prog_put(new_prog);
 9409	if (old_prog)
 9410		bpf_prog_put(old_prog);
 9411	return err;
 9412}
 9413
 9414/**
 9415 *	dev_new_index	-	allocate an ifindex
 9416 *	@net: the applicable net namespace
 9417 *
 9418 *	Returns a suitable unique value for a new device interface
 9419 *	number.  The caller must hold the rtnl semaphore or the
 9420 *	dev_base_lock to be sure it remains unique.
 9421 */
 9422static int dev_new_index(struct net *net)
 9423{
 9424	int ifindex = net->ifindex;
 9425
 9426	for (;;) {
 9427		if (++ifindex <= 0)
 9428			ifindex = 1;
 9429		if (!__dev_get_by_index(net, ifindex))
 9430			return net->ifindex = ifindex;
 9431	}
 9432}
 9433
/* Delayed registration/unregistration */
/* Devices queued by net_set_todo(), linked through net_device::todo_list. */
static LIST_HEAD(net_todo_list);
/* NOTE(review): not used in this part of the file; presumably woken once
 * pending unregistrations complete - confirm against its users.
 */
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
 9437
 9438static void net_set_todo(struct net_device *dev)
 9439{
 9440	list_add_tail(&dev->todo_list, &net_todo_list);
 9441	dev_net(dev)->dev_unreg_count++;
 9442}
 9443
 9444static void rollback_registered_many(struct list_head *head)
 9445{
 9446	struct net_device *dev, *tmp;
 9447	LIST_HEAD(close_head);
 9448
 9449	BUG_ON(dev_boot_phase);
 9450	ASSERT_RTNL();
 9451
 9452	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
 9453		/* Some devices call without registering
 9454		 * for initialization unwind. Remove those
 9455		 * devices and proceed with the remaining.
 9456		 */
 9457		if (dev->reg_state == NETREG_UNINITIALIZED) {
 9458			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
 9459				 dev->name, dev);
 9460
 9461			WARN_ON(1);
 9462			list_del(&dev->unreg_list);
 9463			continue;
 9464		}
 9465		dev->dismantle = true;
 9466		BUG_ON(dev->reg_state != NETREG_REGISTERED);
 9467	}
 9468
 9469	/* If device is running, close it first. */
 9470	list_for_each_entry(dev, head, unreg_list)
 9471		list_add_tail(&dev->close_list, &close_head);
 9472	dev_close_many(&close_head, true);
 9473
 9474	list_for_each_entry(dev, head, unreg_list) {
 9475		/* And unlink it from device chain. */
 9476		unlist_netdevice(dev);
 9477
 9478		dev->reg_state = NETREG_UNREGISTERING;
 9479	}
 9480	flush_all_backlogs();
 9481
 9482	synchronize_net();
 9483
 9484	list_for_each_entry(dev, head, unreg_list) {
 9485		struct sk_buff *skb = NULL;
 9486
 9487		/* Shutdown queueing discipline. */
 9488		dev_shutdown(dev);
 9489
 9490		dev_xdp_uninstall(dev);
 9491
 9492		/* Notify protocols, that we are about to destroy
 9493		 * this device. They should clean all the things.
 9494		 */
 9495		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 9496
 9497		if (!dev->rtnl_link_ops ||
 9498		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
 9499			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
 9500						     GFP_KERNEL, NULL, 0);
 9501
 9502		/*
 9503		 *	Flush the unicast and multicast chains
 9504		 */
 9505		dev_uc_flush(dev);
 9506		dev_mc_flush(dev);
 9507
 9508		netdev_name_node_alt_flush(dev);
 9509		netdev_name_node_free(dev->name_node);
 9510
 9511		if (dev->netdev_ops->ndo_uninit)
 9512			dev->netdev_ops->ndo_uninit(dev);
 9513
 9514		if (skb)
 9515			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
 9516
 9517		/* Notifier chain MUST detach us all upper devices. */
 9518		WARN_ON(netdev_has_any_upper_dev(dev));
 9519		WARN_ON(netdev_has_any_lower_dev(dev));
 9520
 9521		/* Remove entries from kobject tree */
 9522		netdev_unregister_kobject(dev);
 9523#ifdef CONFIG_XPS
 9524		/* Remove XPS queueing entries */
 9525		netif_reset_xps_queues_gt(dev, 0);
 9526#endif
 9527	}
 9528
 9529	synchronize_net();
 9530
 9531	list_for_each_entry(dev, head, unreg_list)
 9532		dev_put(dev);
 9533}
 9534
 9535static void rollback_registered(struct net_device *dev)
 9536{
 9537	LIST_HEAD(single);
 9538
 9539	list_add(&dev->unreg_list, &single);
 9540	rollback_registered_many(&single);
 9541	list_del(&single);
 9542}
 9543
 9544static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
 9545	struct net_device *upper, netdev_features_t features)
 9546{
 9547	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
 9548	netdev_features_t feature;
 9549	int feature_bit;
 9550
 9551	for_each_netdev_feature(upper_disables, feature_bit) {
 9552		feature = __NETIF_F_BIT(feature_bit);
 9553		if (!(upper->wanted_features & feature)
 9554		    && (features & feature)) {
 9555			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
 9556				   &feature, upper->name);
 9557			features &= ~feature;
 9558		}
 9559	}
 9560
 9561	return features;
 9562}
 9563
 9564static void netdev_sync_lower_features(struct net_device *upper,
 9565	struct net_device *lower, netdev_features_t features)
 9566{
 9567	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
 9568	netdev_features_t feature;
 9569	int feature_bit;
 9570
 9571	for_each_netdev_feature(upper_disables, feature_bit) {
 9572		feature = __NETIF_F_BIT(feature_bit);
 9573		if (!(features & feature) && (lower->features & feature)) {
 9574			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
 9575				   &feature, lower->name);
 9576			lower->wanted_features &= ~feature;
 9577			__netdev_update_features(lower);
 9578
 9579			if (unlikely(lower->features & feature))
 9580				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
 9581					    &feature, lower->name);
 9582			else
 9583				netdev_features_change(lower);
 9584		}
 9585	}
 9586}
 9587
 9588static netdev_features_t netdev_fix_features(struct net_device *dev,
 9589	netdev_features_t features)
 9590{
 9591	/* Fix illegal checksum combinations */
 9592	if ((features & NETIF_F_HW_CSUM) &&
 9593	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
 9594		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
 9595		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
 9596	}
 9597
 9598	/* TSO requires that SG is present as well. */
 9599	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
 9600		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
 9601		features &= ~NETIF_F_ALL_TSO;
 9602	}
 9603
 9604	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
 9605					!(features & NETIF_F_IP_CSUM)) {
 9606		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
 9607		features &= ~NETIF_F_TSO;
 9608		features &= ~NETIF_F_TSO_ECN;
 9609	}
 9610
 9611	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
 9612					 !(features & NETIF_F_IPV6_CSUM)) {
 9613		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
 9614		features &= ~NETIF_F_TSO6;
 9615	}
 9616
 9617	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
 9618	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
 9619		features &= ~NETIF_F_TSO_MANGLEID;
 9620
 9621	/* TSO ECN requires that TSO is present as well. */
 9622	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
 9623		features &= ~NETIF_F_TSO_ECN;
 9624
 9625	/* Software GSO depends on SG. */
 9626	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
 9627		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
 9628		features &= ~NETIF_F_GSO;
 9629	}
 9630
 9631	/* GSO partial features require GSO partial be set */
 9632	if ((features & dev->gso_partial_features) &&
 9633	    !(features & NETIF_F_GSO_PARTIAL)) {
 9634		netdev_dbg(dev,
 9635			   "Dropping partially supported GSO features since no GSO partial.\n");
 9636		features &= ~dev->gso_partial_features;
 9637	}
 9638
 9639	if (!(features & NETIF_F_RXCSUM)) {
 9640		/* NETIF_F_GRO_HW implies doing RXCSUM since every packet
 9641		 * successfully merged by hardware must also have the
 9642		 * checksum verified by hardware.  If the user does not
 9643		 * want to enable RXCSUM, logically, we should disable GRO_HW.
 9644		 */
 9645		if (features & NETIF_F_GRO_HW) {
 9646			netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
 9647			features &= ~NETIF_F_GRO_HW;
 9648		}
 9649	}
 9650
 9651	/* LRO/HW-GRO features cannot be combined with RX-FCS */
 9652	if (features & NETIF_F_RXFCS) {
 9653		if (features & NETIF_F_LRO) {
 9654			netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
 9655			features &= ~NETIF_F_LRO;
 9656		}
 9657
 9658		if (features & NETIF_F_GRO_HW) {
 9659			netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
 9660			features &= ~NETIF_F_GRO_HW;
 9661		}
 9662	}
 9663
 9664	if (features & NETIF_F_HW_TLS_TX) {
 9665		bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) ==
 9666			(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
 9667		bool hw_csum = features & NETIF_F_HW_CSUM;
 9668
 9669		if (!ip_csum && !hw_csum) {
 9670			netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
 9671			features &= ~NETIF_F_HW_TLS_TX;
 9672		}
 9673	}
 9674
 9675	if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
 9676		netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n");
 9677		features &= ~NETIF_F_HW_TLS_RX;
 9678	}
 9679
 9680	return features;
 9681}
 9682
 9683int __netdev_update_features(struct net_device *dev)
 9684{
 9685	struct net_device *upper, *lower;
 9686	netdev_features_t features;
 9687	struct list_head *iter;
 9688	int err = -1;
 9689
 9690	ASSERT_RTNL();
 9691
 9692	features = netdev_get_wanted_features(dev);
 9693
 9694	if (dev->netdev_ops->ndo_fix_features)
 9695		features = dev->netdev_ops->ndo_fix_features(dev, features);
 9696
 9697	/* driver might be less strict about feature dependencies */
 9698	features = netdev_fix_features(dev, features);
 9699
 9700	/* some features can't be enabled if they're off on an upper device */
 9701	netdev_for_each_upper_dev_rcu(dev, upper, iter)
 9702		features = netdev_sync_upper_features(dev, upper, features);
 9703
 9704	if (dev->features == features)
 9705		goto sync_lower;
 9706
 9707	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
 9708		&dev->features, &features);
 9709
 9710	if (dev->netdev_ops->ndo_set_features)
 9711		err = dev->netdev_ops->ndo_set_features(dev, features);
 9712	else
 9713		err = 0;
 9714
 9715	if (unlikely(err < 0)) {
 9716		netdev_err(dev,
 9717			"set_features() failed (%d); wanted %pNF, left %pNF\n",
 9718			err, &features, &dev->features);
 9719		/* return non-0 since some features might have changed and
 9720		 * it's better to fire a spurious notification than miss it
 9721		 */
 9722		return -1;
 9723	}
 9724
 9725sync_lower:
 9726	/* some features must be disabled on lower devices when disabled
 9727	 * on an upper device (think: bonding master or bridge)
 9728	 */
 9729	netdev_for_each_lower_dev(dev, lower, iter)
 9730		netdev_sync_lower_features(dev, lower, features);
 9731
 9732	if (!err) {
 9733		netdev_features_t diff = features ^ dev->features;
 9734
 9735		if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
 9736			/* udp_tunnel_{get,drop}_rx_info both need
 9737			 * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
 9738			 * device, or they won't do anything.
 9739			 * Thus we need to update dev->features
 9740			 * *before* calling udp_tunnel_get_rx_info,
 9741			 * but *after* calling udp_tunnel_drop_rx_info.
 9742			 */
 9743			if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
 9744				dev->features = features;
 9745				udp_tunnel_get_rx_info(dev);
 9746			} else {
 9747				udp_tunnel_drop_rx_info(dev);
 9748			}
 9749		}
 9750
 9751		if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
 9752			if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
 9753				dev->features = features;
 9754				err |= vlan_get_rx_ctag_filter_info(dev);
 9755			} else {
 9756				vlan_drop_rx_ctag_filter_info(dev);
 9757			}
 9758		}
 9759
 9760		if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
 9761			if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
 9762				dev->features = features;
 9763				err |= vlan_get_rx_stag_filter_info(dev);
 9764			} else {
 9765				vlan_drop_rx_stag_filter_info(dev);
 9766			}
 9767		}
 9768
 9769		dev->features = features;
 9770	}
 9771
 9772	return err < 0 ? 0 : 1;
 9773}
 9774
 9775/**
 9776 *	netdev_update_features - recalculate device features
 9777 *	@dev: the device to check
 9778 *
 9779 *	Recalculate dev->features set and send notifications if it
 9780 *	has changed. Should be called after driver or hardware dependent
 9781 *	conditions might have changed that influence the features.
 9782 */
 9783void netdev_update_features(struct net_device *dev)
 9784{
 9785	if (__netdev_update_features(dev))
 9786		netdev_features_change(dev);
 9787}
 9788EXPORT_SYMBOL(netdev_update_features);
 9789
 9790/**
 9791 *	netdev_change_features - recalculate device features
 9792 *	@dev: the device to check
 9793 *
 9794 *	Recalculate dev->features set and send notifications even
 9795 *	if they have not changed. Should be called instead of
 9796 *	netdev_update_features() if also dev->vlan_features might
 9797 *	have changed to allow the changes to be propagated to stacked
 9798 *	VLAN devices.
 9799 */
 9800void netdev_change_features(struct net_device *dev)
 9801{
 9802	__netdev_update_features(dev);
 9803	netdev_features_change(dev);
 9804}
 9805EXPORT_SYMBOL(netdev_change_features);
 9806
 9807/**
 9808 *	netif_stacked_transfer_operstate -	transfer operstate
 9809 *	@rootdev: the root or lower level device to transfer state from
 9810 *	@dev: the device to transfer operstate to
 9811 *
 9812 *	Transfer operational state from root to device. This is normally
 9813 *	called when a stacking relationship exists between the root
 9814 *	device and the device(a leaf device).
 9815 */
 9816void netif_stacked_transfer_operstate(const struct net_device *rootdev,
 9817					struct net_device *dev)
 9818{
 9819	if (rootdev->operstate == IF_OPER_DORMANT)
 9820		netif_dormant_on(dev);
 9821	else
 9822		netif_dormant_off(dev);
 9823
 9824	if (rootdev->operstate == IF_OPER_TESTING)
 9825		netif_testing_on(dev);
 9826	else
 9827		netif_testing_off(dev);
 9828
 9829	if (netif_carrier_ok(rootdev))
 9830		netif_carrier_on(dev);
 9831	else
 9832		netif_carrier_off(dev);
 9833}
 9834EXPORT_SYMBOL(netif_stacked_transfer_operstate);
 9835
 9836static int netif_alloc_rx_queues(struct net_device *dev)
 9837{
 9838	unsigned int i, count = dev->num_rx_queues;
 9839	struct netdev_rx_queue *rx;
 9840	size_t sz = count * sizeof(*rx);
 9841	int err = 0;
 9842
 9843	BUG_ON(count < 1);
 9844
 9845	rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 9846	if (!rx)
 9847		return -ENOMEM;
 9848
 9849	dev->_rx = rx;
 9850
 9851	for (i = 0; i < count; i++) {
 9852		rx[i].dev = dev;
 9853
 9854		/* XDP RX-queue setup */
 9855		err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0);
 9856		if (err < 0)
 9857			goto err_rxq_info;
 9858	}
 9859	return 0;
 9860
 9861err_rxq_info:
 9862	/* Rollback successful reg's and free other resources */
 9863	while (i--)
 9864		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
 9865	kvfree(dev->_rx);
 9866	dev->_rx = NULL;
 9867	return err;
 9868}
 9869
 9870static void netif_free_rx_queues(struct net_device *dev)
 9871{
 9872	unsigned int i, count = dev->num_rx_queues;
 9873
 9874	/* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
 9875	if (!dev->_rx)
 9876		return;
 9877
 9878	for (i = 0; i < count; i++)
 9879		xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
 9880
 9881	kvfree(dev->_rx);
 9882}
 9883
 9884static void netdev_init_one_queue(struct net_device *dev,
 9885				  struct netdev_queue *queue, void *_unused)
 9886{
 9887	/* Initialize queue lock */
 9888	spin_lock_init(&queue->_xmit_lock);
 9889	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
 9890	queue->xmit_lock_owner = -1;
 9891	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
 9892	queue->dev = dev;
 9893#ifdef CONFIG_BQL
 9894	dql_init(&queue->dql, HZ);
 9895#endif
 9896}
 9897
 9898static void netif_free_tx_queues(struct net_device *dev)
 9899{
 9900	kvfree(dev->_tx);
 9901}
 9902
 9903static int netif_alloc_netdev_queues(struct net_device *dev)
 9904{
 9905	unsigned int count = dev->num_tx_queues;
 9906	struct netdev_queue *tx;
 9907	size_t sz = count * sizeof(*tx);
 9908
 9909	if (count < 1 || count > 0xffff)
 9910		return -EINVAL;
 9911
 9912	tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 9913	if (!tx)
 9914		return -ENOMEM;
 9915
 9916	dev->_tx = tx;
 9917
 9918	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
 9919	spin_lock_init(&dev->tx_global_lock);
 9920
 9921	return 0;
 9922}
 9923
 9924void netif_tx_stop_all_queues(struct net_device *dev)
 9925{
 9926	unsigned int i;
 9927
 9928	for (i = 0; i < dev->num_tx_queues; i++) {
 9929		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
 9930
 9931		netif_tx_stop_queue(txq);
 9932	}
 9933}
 9934EXPORT_SYMBOL(netif_tx_stop_all_queues);
 9935
 9936/**
 9937 *	register_netdevice	- register a network device
 9938 *	@dev: device to register
 9939 *
 9940 *	Take a completed network device structure and add it to the kernel
 9941 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 9942 *	chain. 0 is returned on success. A negative errno code is returned
 9943 *	on a failure to set up the device, or if the name is a duplicate.
 9944 *
 9945 *	Callers must hold the rtnl semaphore. You may want
 9946 *	register_netdev() instead of this.
 9947 *
 9948 *	BUGS:
 9949 *	The locking appears insufficient to guarantee two parallel registers
 9950 *	will not get the same name.
 9951 */
 9952
 9953int register_netdevice(struct net_device *dev)
 9954{
 9955	int ret;
 9956	struct net *net = dev_net(dev);
 9957
 9958	BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
 9959		     NETDEV_FEATURE_COUNT);
 9960	BUG_ON(dev_boot_phase);
 9961	ASSERT_RTNL();
 9962
 9963	might_sleep();
 9964
 9965	/* When net_device's are persistent, this will be fatal. */
 9966	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
 9967	BUG_ON(!net);
 9968
 9969	ret = ethtool_check_ops(dev->ethtool_ops);
 9970	if (ret)
 9971		return ret;
 9972
 9973	spin_lock_init(&dev->addr_list_lock);
 9974	netdev_set_addr_lockdep_class(dev);
 9975
 9976	ret = dev_get_valid_name(net, dev, dev->name);
 9977	if (ret < 0)
 9978		goto out;
 9979
 9980	ret = -ENOMEM;
 9981	dev->name_node = netdev_name_node_head_alloc(dev);
 9982	if (!dev->name_node)
 9983		goto out;
 9984
 9985	/* Init, if this function is available */
 9986	if (dev->netdev_ops->ndo_init) {
 9987		ret = dev->netdev_ops->ndo_init(dev);
 9988		if (ret) {
 9989			if (ret > 0)
 9990				ret = -EIO;
 9991			goto err_free_name;
 9992		}
 9993	}
 9994
 9995	if (((dev->hw_features | dev->features) &
 9996	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
 9997	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
 9998	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
 9999		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
10000		ret = -EINVAL;
10001		goto err_uninit;
10002	}
10003
10004	ret = -EBUSY;
10005	if (!dev->ifindex)
10006		dev->ifindex = dev_new_index(net);
10007	else if (__dev_get_by_index(net, dev->ifindex))
10008		goto err_uninit;
10009
10010	/* Transfer changeable features to wanted_features and enable
10011	 * software offloads (GSO and GRO).
10012	 */
10013	dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
10014	dev->features |= NETIF_F_SOFT_FEATURES;
10015
10016	if (dev->netdev_ops->ndo_udp_tunnel_add) {
10017		dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
10018		dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
10019	}
10020
10021	dev->wanted_features = dev->features & dev->hw_features;
10022
10023	if (!(dev->flags & IFF_LOOPBACK))
10024		dev->hw_features |= NETIF_F_NOCACHE_COPY;
10025
10026	/* If IPv4 TCP segmentation offload is supported we should also
10027	 * allow the device to enable segmenting the frame with the option
10028	 * of ignoring a static IP ID value.  This doesn't enable the
10029	 * feature itself but allows the user to enable it later.
10030	 */
10031	if (dev->hw_features & NETIF_F_TSO)
10032		dev->hw_features |= NETIF_F_TSO_MANGLEID;
10033	if (dev->vlan_features & NETIF_F_TSO)
10034		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
10035	if (dev->mpls_features & NETIF_F_TSO)
10036		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
10037	if (dev->hw_enc_features & NETIF_F_TSO)
10038		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
10039
10040	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
10041	 */
10042	dev->vlan_features |= NETIF_F_HIGHDMA;
10043
10044	/* Make NETIF_F_SG inheritable to tunnel devices.
10045	 */
10046	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
10047
10048	/* Make NETIF_F_SG inheritable to MPLS.
10049	 */
10050	dev->mpls_features |= NETIF_F_SG;
10051
10052	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
10053	ret = notifier_to_errno(ret);
10054	if (ret)
10055		goto err_uninit;
10056
10057	ret = netdev_register_kobject(dev);
10058	if (ret) {
10059		dev->reg_state = NETREG_UNREGISTERED;
10060		goto err_uninit;
10061	}
10062	dev->reg_state = NETREG_REGISTERED;
10063
10064	__netdev_update_features(dev);
10065
10066	/*
10067	 *	Default initial state at registry is that the
10068	 *	device is present.
10069	 */
10070
10071	set_bit(__LINK_STATE_PRESENT, &dev->state);
10072
10073	linkwatch_init_dev(dev);
10074
10075	dev_init_scheduler(dev);
10076	dev_hold(dev);
10077	list_netdevice(dev);
10078	add_device_randomness(dev->dev_addr, dev->addr_len);
10079
10080	/* If the device has permanent device address, driver should
10081	 * set dev_addr and also addr_assign_type should be set to
10082	 * NET_ADDR_PERM (default value).
10083	 */
10084	if (dev->addr_assign_type == NET_ADDR_PERM)
10085		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
10086
10087	/* Notify protocols, that a new device appeared. */
10088	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
10089	ret = notifier_to_errno(ret);
10090	if (ret) {
10091		/* Expect explicit free_netdev() on failure */
10092		dev->needs_free_netdev = false;
10093		rollback_registered(dev);
10094		net_set_todo(dev);
10095		goto out;
10096	}
10097	/*
10098	 *	Prevent userspace races by waiting until the network
10099	 *	device is fully setup before sending notifications.
10100	 */
10101	if (!dev->rtnl_link_ops ||
10102	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
10103		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
10104
10105out:
10106	return ret;
10107
10108err_uninit:
10109	if (dev->netdev_ops->ndo_uninit)
10110		dev->netdev_ops->ndo_uninit(dev);
10111	if (dev->priv_destructor)
10112		dev->priv_destructor(dev);
10113err_free_name:
10114	netdev_name_node_free(dev->name_node);
10115	goto out;
10116}
10117EXPORT_SYMBOL(register_netdevice);
10118
10119/**
10120 *	init_dummy_netdev	- init a dummy network device for NAPI
10121 *	@dev: device to init
10122 *
10123 *	This takes a network device structure and initialize the minimum
10124 *	amount of fields so it can be used to schedule NAPI polls without
10125 *	registering a full blown interface. This is to be used by drivers
10126 *	that need to tie several hardware interfaces to a single NAPI
10127 *	poll scheduler due to HW limitations.
10128 */
10129int init_dummy_netdev(struct net_device *dev)
10130{
10131	/* Clear everything. Note we don't initialize spinlocks
10132	 * are they aren't supposed to be taken by any of the
10133	 * NAPI code and this dummy netdev is supposed to be
10134	 * only ever used for NAPI polls
10135	 */
10136	memset(dev, 0, sizeof(struct net_device));
10137
10138	/* make sure we BUG if trying to hit standard
10139	 * register/unregister code path
10140	 */
10141	dev->reg_state = NETREG_DUMMY;
10142
10143	/* NAPI wants this */
10144	INIT_LIST_HEAD(&dev->napi_list);
10145
10146	/* a dummy interface is started by default */
10147	set_bit(__LINK_STATE_PRESENT, &dev->state);
10148	set_bit(__LINK_STATE_START, &dev->state);
10149
10150	/* napi_busy_loop stats accounting wants this */
10151	dev_net_set(dev, &init_net);
10152
10153	/* Note : We dont allocate pcpu_refcnt for dummy devices,
10154	 * because users of this 'device' dont need to change
10155	 * its refcount.
10156	 */
10157
10158	return 0;
10159}
10160EXPORT_SYMBOL_GPL(init_dummy_netdev);
10161
10162
10163/**
10164 *	register_netdev	- register a network device
10165 *	@dev: device to register
10166 *
10167 *	Take a completed network device structure and add it to the kernel
10168 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
10169 *	chain. 0 is returned on success. A negative errno code is returned
10170 *	on a failure to set up the device, or if the name is a duplicate.
10171 *
10172 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
10173 *	and expands the device name if you passed a format string to
10174 *	alloc_netdev.
10175 */
10176int register_netdev(struct net_device *dev)
10177{
10178	int err;
10179
10180	if (rtnl_lock_killable())
10181		return -EINTR;
10182	err = register_netdevice(dev);
10183	rtnl_unlock();
10184	return err;
10185}
10186EXPORT_SYMBOL(register_netdev);
10187
10188int netdev_refcnt_read(const struct net_device *dev)
10189{
10190	int i, refcnt = 0;
10191
10192	for_each_possible_cpu(i)
10193		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
10194	return refcnt;
10195}
10196EXPORT_SYMBOL(netdev_refcnt_read);
10197
10198#define WAIT_REFS_MIN_MSECS 1
10199#define WAIT_REFS_MAX_MSECS 250
10200/**
10201 * netdev_wait_allrefs - wait until all references are gone.
10202 * @dev: target net_device
10203 *
10204 * This is called when unregistering network devices.
10205 *
10206 * Any protocol or device that holds a reference should register
10207 * for netdevice notification, and cleanup and put back the
10208 * reference if they receive an UNREGISTER event.
10209 * We can get stuck here if buggy protocols don't correctly
10210 * call dev_put.
10211 */
10212static void netdev_wait_allrefs(struct net_device *dev)
10213{
10214	unsigned long rebroadcast_time, warning_time;
10215	int wait = 0, refcnt;
10216
10217	linkwatch_forget_dev(dev);
10218
10219	rebroadcast_time = warning_time = jiffies;
10220	refcnt = netdev_refcnt_read(dev);
10221
10222	while (refcnt != 0) {
10223		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
10224			rtnl_lock();
10225
10226			/* Rebroadcast unregister notification */
10227			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10228
10229			__rtnl_unlock();
10230			rcu_barrier();
10231			rtnl_lock();
10232
10233			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
10234				     &dev->state)) {
10235				/* We must not have linkwatch events
10236				 * pending on unregister. If this
10237				 * happens, we simply run the queue
10238				 * unscheduled, resulting in a noop
10239				 * for this device.
10240				 */
10241				linkwatch_run_queue();
10242			}
10243
10244			__rtnl_unlock();
10245
10246			rebroadcast_time = jiffies;
10247		}
10248
10249		if (!wait) {
10250			rcu_barrier();
10251			wait = WAIT_REFS_MIN_MSECS;
10252		} else {
10253			msleep(wait);
10254			wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
10255		}
10256
10257		refcnt = netdev_refcnt_read(dev);
10258
10259		if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) {
10260			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
10261				 dev->name, refcnt);
10262			warning_time = jiffies;
10263		}
10264	}
10265}
10266
10267/* The sequence is:
10268 *
10269 *	rtnl_lock();
10270 *	...
10271 *	register_netdevice(x1);
10272 *	register_netdevice(x2);
10273 *	...
10274 *	unregister_netdevice(y1);
10275 *	unregister_netdevice(y2);
10276 *      ...
10277 *	rtnl_unlock();
10278 *	free_netdev(y1);
10279 *	free_netdev(y2);
10280 *
10281 * We are invoked by rtnl_unlock().
10282 * This allows us to deal with problems:
10283 * 1) We can delete sysfs objects which invoke hotplug
10284 *    without deadlocking with linkwatch via keventd.
10285 * 2) Since we run with the RTNL semaphore not held, we can sleep
10286 *    safely in order to wait for the netdev refcnt to drop to zero.
10287 *
10288 * We must not return until all unregister events added during
10289 * the interval the lock was held have been completed.
10290 */
10291void netdev_run_todo(void)
10292{
10293	struct list_head list;
10294#ifdef CONFIG_LOCKDEP
10295	struct list_head unlink_list;
10296
10297	list_replace_init(&net_unlink_list, &unlink_list);
10298
10299	while (!list_empty(&unlink_list)) {
10300		struct net_device *dev = list_first_entry(&unlink_list,
10301							  struct net_device,
10302							  unlink_list);
10303		list_del_init(&dev->unlink_list);
10304		dev->nested_level = dev->lower_level - 1;
10305	}
10306#endif
10307
10308	/* Snapshot list, allow later requests */
10309	list_replace_init(&net_todo_list, &list);
10310
10311	__rtnl_unlock();
10312
10313
10314	/* Wait for rcu callbacks to finish before next phase */
10315	if (!list_empty(&list))
10316		rcu_barrier();
10317
10318	while (!list_empty(&list)) {
10319		struct net_device *dev
10320			= list_first_entry(&list, struct net_device, todo_list);
10321		list_del(&dev->todo_list);
10322
10323		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
10324			pr_err("network todo '%s' but state %d\n",
10325			       dev->name, dev->reg_state);
10326			dump_stack();
10327			continue;
10328		}
10329
10330		dev->reg_state = NETREG_UNREGISTERED;
10331
10332		netdev_wait_allrefs(dev);
10333
10334		/* paranoia */
10335		BUG_ON(netdev_refcnt_read(dev));
10336		BUG_ON(!list_empty(&dev->ptype_all));
10337		BUG_ON(!list_empty(&dev->ptype_specific));
10338		WARN_ON(rcu_access_pointer(dev->ip_ptr));
10339		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
10340#if IS_ENABLED(CONFIG_DECNET)
10341		WARN_ON(dev->dn_ptr);
10342#endif
10343		if (dev->priv_destructor)
10344			dev->priv_destructor(dev);
10345		if (dev->needs_free_netdev)
10346			free_netdev(dev);
10347
10348		/* Report a network device has been unregistered */
10349		rtnl_lock();
10350		dev_net(dev)->dev_unreg_count--;
10351		__rtnl_unlock();
10352		wake_up(&netdev_unregistering_wq);
10353
10354		/* Free network device */
10355		kobject_put(&dev->dev.kobj);
10356	}
10357}
10358
10359/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
10360 * all the same fields in the same order as net_device_stats, with only
10361 * the type differing, but rtnl_link_stats64 may have additional fields
10362 * at the end for newer counters.
10363 */
10364void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
10365			     const struct net_device_stats *netdev_stats)
10366{
10367#if BITS_PER_LONG == 64
10368	BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
10369	memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
10370	/* zero out counters that only exist in rtnl_link_stats64 */
10371	memset((char *)stats64 + sizeof(*netdev_stats), 0,
10372	       sizeof(*stats64) - sizeof(*netdev_stats));
10373#else
10374	size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
10375	const unsigned long *src = (const unsigned long *)netdev_stats;
10376	u64 *dst = (u64 *)stats64;
10377
10378	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
10379	for (i = 0; i < n; i++)
10380		dst[i] = src[i];
10381	/* zero out counters that only exist in rtnl_link_stats64 */
10382	memset((char *)stats64 + n * sizeof(u64), 0,
10383	       sizeof(*stats64) - n * sizeof(u64));
10384#endif
10385}
10386EXPORT_SYMBOL(netdev_stats_to_stats64);
10387
10388/**
10389 *	dev_get_stats	- get network device statistics
10390 *	@dev: device to get statistics from
10391 *	@storage: place to store stats
10392 *
10393 *	Get network statistics from device. Return @storage.
10394 *	The device driver may provide its own method by setting
10395 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
10396 *	otherwise the internal statistics structure is used.
10397 */
10398struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
10399					struct rtnl_link_stats64 *storage)
10400{
10401	const struct net_device_ops *ops = dev->netdev_ops;
10402
10403	if (ops->ndo_get_stats64) {
10404		memset(storage, 0, sizeof(*storage));
10405		ops->ndo_get_stats64(dev, storage);
10406	} else if (ops->ndo_get_stats) {
10407		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
10408	} else {
10409		netdev_stats_to_stats64(storage, &dev->stats);
10410	}
10411	storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
10412	storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
10413	storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
10414	return storage;
10415}
10416EXPORT_SYMBOL(dev_get_stats);
10417
10418/**
10419 *	dev_fetch_sw_netstats - get per-cpu network device statistics
10420 *	@s: place to store stats
10421 *	@netstats: per-cpu network stats to read from
10422 *
10423 *	Read per-cpu network statistics and populate the related fields in @s.
10424 */
10425void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
10426			   const struct pcpu_sw_netstats __percpu *netstats)
10427{
10428	int cpu;
10429
10430	for_each_possible_cpu(cpu) {
10431		const struct pcpu_sw_netstats *stats;
10432		struct pcpu_sw_netstats tmp;
10433		unsigned int start;
10434
10435		stats = per_cpu_ptr(netstats, cpu);
10436		do {
10437			start = u64_stats_fetch_begin_irq(&stats->syncp);
10438			tmp.rx_packets = stats->rx_packets;
10439			tmp.rx_bytes   = stats->rx_bytes;
10440			tmp.tx_packets = stats->tx_packets;
10441			tmp.tx_bytes   = stats->tx_bytes;
10442		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
10443
10444		s->rx_packets += tmp.rx_packets;
10445		s->rx_bytes   += tmp.rx_bytes;
10446		s->tx_packets += tmp.tx_packets;
10447		s->tx_bytes   += tmp.tx_bytes;
10448	}
10449}
10450EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);
10451
10452/**
10453 *	dev_get_tstats64 - ndo_get_stats64 implementation
10454 *	@dev: device to get statistics from
10455 *	@s: place to store stats
10456 *
10457 *	Populate @s from dev->stats and dev->tstats. Can be used as
10458 *	ndo_get_stats64() callback.
10459 */
10460void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s)
10461{
10462	netdev_stats_to_stats64(s, &dev->stats);
10463	dev_fetch_sw_netstats(s, dev->tstats);
10464}
10465EXPORT_SYMBOL_GPL(dev_get_tstats64);
10466
/* Return the ingress queue of @dev. With CONFIG_NET_CLS_ACT a missing
 * queue is allocated, initialised with noop_qdisc and published on the
 * device first; NULL is returned if that allocation fails. Without
 * CONFIG_NET_CLS_ACT the current queue (possibly NULL) is returned as is.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!queue) {
		queue = kzalloc(sizeof(*queue), GFP_KERNEL);
		if (queue) {
			netdev_init_one_queue(dev, queue, NULL);
			RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
			queue->qdisc_sleeping = &noop_qdisc;
			/* Publish only once the queue is fully set up */
			rcu_assign_pointer(dev->ingress_queue, queue);
		}
	}
#endif
	return queue;
}
10484
10485static const struct ethtool_ops default_ethtool_ops;
10486
10487void netdev_set_default_ethtool_ops(struct net_device *dev,
10488				    const struct ethtool_ops *ops)
10489{
10490	if (dev->ethtool_ops == &default_ethtool_ops)
10491		dev->ethtool_ops = ops;
10492}
10493EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
10494
10495void netdev_freemem(struct net_device *dev)
10496{
10497	char *addr = (char *)dev - dev->padded;
10498
10499	kvfree(addr);
10500}
10501
10502/**
10503 * alloc_netdev_mqs - allocate network device
10504 * @sizeof_priv: size of private data to allocate space for
10505 * @name: device name format string
10506 * @name_assign_type: origin of device name
10507 * @setup: callback to initialize device
10508 * @txqs: the number of TX subqueues to allocate
10509 * @rxqs: the number of RX subqueues to allocate
10510 *
10511 * Allocates a struct net_device with private data area for driver use
10512 * and performs basic initialization.  Also allocates subqueue structs
10513 * for each queue on the device.
10514 */
10515struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
10516		unsigned char name_assign_type,
10517		void (*setup)(struct net_device *),
10518		unsigned int txqs, unsigned int rxqs)
10519{
10520	struct net_device *dev;
10521	unsigned int alloc_size;
10522	struct net_device *p;
10523
10524	BUG_ON(strlen(name) >= sizeof(dev->name));
10525
10526	if (txqs < 1) {
10527		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
10528		return NULL;
10529	}
10530
10531	if (rxqs < 1) {
10532		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
10533		return NULL;
10534	}
10535
10536	alloc_size = sizeof(struct net_device);
10537	if (sizeof_priv) {
10538		/* ensure 32-byte alignment of private area */
10539		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
10540		alloc_size += sizeof_priv;
10541	}
10542	/* ensure 32-byte alignment of whole construct */
10543	alloc_size += NETDEV_ALIGN - 1;
10544
10545	p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
10546	if (!p)
10547		return NULL;
10548
10549	dev = PTR_ALIGN(p, NETDEV_ALIGN);
10550	dev->padded = (char *)dev - (char *)p;
10551
10552	dev->pcpu_refcnt = alloc_percpu(int);
10553	if (!dev->pcpu_refcnt)
10554		goto free_dev;
10555
10556	if (dev_addr_init(dev))
10557		goto free_pcpu;
10558
10559	dev_mc_init(dev);
10560	dev_uc_init(dev);
10561
10562	dev_net_set(dev, &init_net);
10563
10564	dev->gso_max_size = GSO_MAX_SIZE;
10565	dev->gso_max_segs = GSO_MAX_SEGS;
10566	dev->upper_level = 1;
10567	dev->lower_level = 1;
10568#ifdef CONFIG_LOCKDEP
10569	dev->nested_level = 0;
10570	INIT_LIST_HEAD(&dev->unlink_list);
10571#endif
10572
10573	INIT_LIST_HEAD(&dev->napi_list);
10574	INIT_LIST_HEAD(&dev->unreg_list);
10575	INIT_LIST_HEAD(&dev->close_list);
10576	INIT_LIST_HEAD(&dev->link_watch_list);
10577	INIT_LIST_HEAD(&dev->adj_list.upper);
10578	INIT_LIST_HEAD(&dev->adj_list.lower);
10579	INIT_LIST_HEAD(&dev->ptype_all);
10580	INIT_LIST_HEAD(&dev->ptype_specific);
10581	INIT_LIST_HEAD(&dev->net_notifier_list);
10582#ifdef CONFIG_NET_SCHED
10583	hash_init(dev->qdisc_hash);
10584#endif
10585	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
10586	setup(dev);
10587
10588	if (!dev->tx_queue_len) {
10589		dev->priv_flags |= IFF_NO_QUEUE;
10590		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
10591	}
10592
10593	dev->num_tx_queues = txqs;
10594	dev->real_num_tx_queues = txqs;
10595	if (netif_alloc_netdev_queues(dev))
10596		goto free_all;
10597
10598	dev->num_rx_queues = rxqs;
10599	dev->real_num_rx_queues = rxqs;
10600	if (netif_alloc_rx_queues(dev))
10601		goto free_all;
10602
10603	strcpy(dev->name, name);
10604	dev->name_assign_type = name_assign_type;
10605	dev->group = INIT_NETDEV_GROUP;
10606	if (!dev->ethtool_ops)
10607		dev->ethtool_ops = &default_ethtool_ops;
10608
10609	nf_hook_ingress_init(dev);
10610
10611	return dev;
10612
10613free_all:
10614	free_netdev(dev);
10615	return NULL;
10616
10617free_pcpu:
10618	free_percpu(dev->pcpu_refcnt);
10619free_dev:
10620	netdev_freemem(dev);
10621	return NULL;
10622}
10623EXPORT_SYMBOL(alloc_netdev_mqs);
10624
10625/**
10626 * free_netdev - free network device
10627 * @dev: device
10628 *
10629 * This function does the last stage of destroying an allocated device
10630 * interface. The reference to the device object is released. If this
10631 * is the last reference then it will be freed.Must be called in process
10632 * context.
10633 */
10634void free_netdev(struct net_device *dev)
10635{
10636	struct napi_struct *p, *n;
10637
10638	might_sleep();
10639
10640	/* When called immediately after register_netdevice() failed the unwind
10641	 * handling may still be dismantling the device. Handle that case by
10642	 * deferring the free.
10643	 */
10644	if (dev->reg_state == NETREG_UNREGISTERING) {
10645		ASSERT_RTNL();
10646		dev->needs_free_netdev = true;
10647		return;
10648	}
10649
10650	netif_free_tx_queues(dev);
10651	netif_free_rx_queues(dev);
10652
10653	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
10654
10655	/* Flush device addresses */
10656	dev_addr_flush(dev);
10657
10658	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
10659		netif_napi_del(p);
10660
10661	free_percpu(dev->pcpu_refcnt);
10662	dev->pcpu_refcnt = NULL;
10663	free_percpu(dev->xdp_bulkq);
10664	dev->xdp_bulkq = NULL;
10665
10666	/*  Compatibility with error handling in drivers */
10667	if (dev->reg_state == NETREG_UNINITIALIZED) {
10668		netdev_freemem(dev);
10669		return;
10670	}
10671
10672	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
10673	dev->reg_state = NETREG_RELEASED;
10674
10675	/* will free via device release */
10676	put_device(&dev->dev);
10677}
10678EXPORT_SYMBOL(free_netdev);
10679
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Waits until packets that are currently being received have been
 *	processed.  Packets arriving afterwards are not held back.
 *	The expedited RCU grace period is used when the rtnl lock is held.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
10695
10696/**
10697 *	unregister_netdevice_queue - remove device from the kernel
10698 *	@dev: device
10699 *	@head: list
10700 *
10701 *	This function shuts down a device interface and removes it
10702 *	from the kernel tables.
10703 *	If head not NULL, device is queued to be unregistered later.
10704 *
10705 *	Callers must hold the rtnl semaphore.  You may want
10706 *	unregister_netdev() instead of this.
10707 */
10708
10709void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
10710{
10711	ASSERT_RTNL();
10712
10713	if (head) {
10714		list_move_tail(&dev->unreg_list, head);
10715	} else {
10716		rollback_registered(dev);
10717		/* Finish processing unregister after unlock */
10718		net_set_todo(dev);
10719	}
10720}
10721EXPORT_SYMBOL(unregister_netdevice_queue);
10722
10723/**
10724 *	unregister_netdevice_many - unregister many devices
10725 *	@head: list of devices
10726 *
10727 *  Note: As most callers use a stack allocated list_head,
10728 *  we force a list_del() to make sure stack wont be corrupted later.
10729 */
10730void unregister_netdevice_many(struct list_head *head)
10731{
10732	struct net_device *dev;
10733
10734	if (!list_empty(head)) {
10735		rollback_registered_many(head);
10736		list_for_each_entry(dev, head, unreg_list)
10737			net_set_todo(dev);
10738		list_del(head);
10739	}
10740}
10741EXPORT_SYMBOL(unregister_netdevice_many);
10742
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shuts down a device interface and removes it from the kernel
 *	tables.
 *
 *	This is unregister_netdevice() wrapped in rtnl_lock() and
 *	rtnl_unlock().  In general you want to use this function rather
 *	than unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	/* unregister_netdevice() must run with the rtnl semaphore held. */
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
10761
10762/**
10763 *	dev_change_net_namespace - move device to different nethost namespace
10764 *	@dev: device
10765 *	@net: network namespace
10766 *	@pat: If not NULL name pattern to try if the current device name
10767 *	      is already taken in the destination network namespace.
10768 *
10769 *	This function shuts down a device interface and moves it
10770 *	to a new network namespace. On success 0 is returned, on
10771 *	a failure a netagive errno code is returned.
10772 *
10773 *	Callers must hold the rtnl semaphore.
10774 */
10775
10776int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
10777{
10778	struct net *net_old = dev_net(dev);
10779	int err, new_nsid, new_ifindex;
10780
10781	ASSERT_RTNL();
10782
10783	/* Don't allow namespace local devices to be moved. */
10784	err = -EINVAL;
10785	if (dev->features & NETIF_F_NETNS_LOCAL)
10786		goto out;
10787
10788	/* Ensure the device has been registrered */
10789	if (dev->reg_state != NETREG_REGISTERED)
10790		goto out;
10791
10792	/* Get out if there is nothing todo */
10793	err = 0;
10794	if (net_eq(net_old, net))
10795		goto out;
10796
10797	/* Pick the destination device name, and ensure
10798	 * we can use it in the destination network namespace.
10799	 */
10800	err = -EEXIST;
10801	if (__dev_get_by_name(net, dev->name)) {
10802		/* We get here if we can't use the current device name */
10803		if (!pat)
10804			goto out;
10805		err = dev_get_valid_name(net, dev, pat);
10806		if (err < 0)
10807			goto out;
10808	}
10809
10810	/*
10811	 * And now a mini version of register_netdevice unregister_netdevice.
10812	 */
10813
10814	/* If device is running close it first. */
10815	dev_close(dev);
10816
10817	/* And unlink it from device chain */
10818	unlist_netdevice(dev);
10819
10820	synchronize_net();
10821
10822	/* Shutdown queueing discipline. */
10823	dev_shutdown(dev);
10824
10825	/* Notify protocols, that we are about to destroy
10826	 * this device. They should clean all the things.
10827	 *
10828	 * Note that dev->reg_state stays at NETREG_REGISTERED.
10829	 * This is wanted because this way 8021q and macvlan know
10830	 * the device is just moving and can keep their slaves up.
10831	 */
10832	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10833	rcu_barrier();
10834
10835	new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
10836	/* If there is an ifindex conflict assign a new one */
10837	if (__dev_get_by_index(net, dev->ifindex))
10838		new_ifindex = dev_new_index(net);
10839	else
10840		new_ifindex = dev->ifindex;
10841
10842	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
10843			    new_ifindex);
10844
10845	/*
10846	 *	Flush the unicast and multicast chains
10847	 */
10848	dev_uc_flush(dev);
10849	dev_mc_flush(dev);
10850
10851	/* Send a netdev-removed uevent to the old namespace */
10852	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
10853	netdev_adjacent_del_links(dev);
10854
10855	/* Move per-net netdevice notifiers that are following the netdevice */
10856	move_netdevice_notifiers_dev_net(dev, net);
10857
10858	/* Actually switch the network namespace */
10859	dev_net_set(dev, net);
10860	dev->ifindex = new_ifindex;
10861
10862	/* Send a netdev-add uevent to the new namespace */
10863	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
10864	netdev_adjacent_add_links(dev);
10865
10866	/* Fixup kobjects */
10867	err = device_rename(&dev->dev, dev->name);
10868	WARN_ON(err);
10869
10870	/* Adapt owner in case owning user namespace of target network
10871	 * namespace is different from the original one.
10872	 */
10873	err = netdev_change_owner(dev, net_old, net);
10874	WARN_ON(err);
10875
10876	/* Add the device back in the hashes */
10877	list_netdevice(dev);
10878
10879	/* Notify protocols, that a new device appeared. */
10880	call_netdevice_notifiers(NETDEV_REGISTER, dev);
10881
10882	/*
10883	 *	Prevent userspace races by waiting until the network
10884	 *	device is fully setup before sending notifications.
10885	 */
10886	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
10887
10888	synchronize_net();
10889	err = 0;
10890out:
10891	return err;
10892}
10893EXPORT_SYMBOL_GPL(dev_change_net_namespace);
10894
10895static int dev_cpu_dead(unsigned int oldcpu)
10896{
10897	struct sk_buff **list_skb;
10898	struct sk_buff *skb;
10899	unsigned int cpu;
10900	struct softnet_data *sd, *oldsd, *remsd = NULL;
10901
10902	local_irq_disable();
10903	cpu = smp_processor_id();
10904	sd = &per_cpu(softnet_data, cpu);
10905	oldsd = &per_cpu(softnet_data, oldcpu);
10906
10907	/* Find end of our completion_queue. */
10908	list_skb = &sd->completion_queue;
10909	while (*list_skb)
10910		list_skb = &(*list_skb)->next;
10911	/* Append completion queue from offline CPU. */
10912	*list_skb = oldsd->completion_queue;
10913	oldsd->completion_queue = NULL;
10914
10915	/* Append output queue from offline CPU. */
10916	if (oldsd->output_queue) {
10917		*sd->output_queue_tailp = oldsd->output_queue;
10918		sd->output_queue_tailp = oldsd->output_queue_tailp;
10919		oldsd->output_queue = NULL;
10920		oldsd->output_queue_tailp = &oldsd->output_queue;
10921	}
10922	/* Append NAPI poll list from offline CPU, with one exception :
10923	 * process_backlog() must be called by cpu owning percpu backlog.
10924	 * We properly handle process_queue & input_pkt_queue later.
10925	 */
10926	while (!list_empty(&oldsd->poll_list)) {
10927		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
10928							    struct napi_struct,
10929							    poll_list);
10930
10931		list_del_init(&napi->poll_list);
10932		if (napi->poll == process_backlog)
10933			napi->state = 0;
10934		else
10935			____napi_schedule(sd, napi);
10936	}
10937
10938	raise_softirq_irqoff(NET_TX_SOFTIRQ);
10939	local_irq_enable();
10940
10941#ifdef CONFIG_RPS
10942	remsd = oldsd->rps_ipi_list;
10943	oldsd->rps_ipi_list = NULL;
10944#endif
10945	/* send out pending IPI's on offline CPU */
10946	net_rps_send_ipi(remsd);
10947
10948	/* Process offline CPU's input_pkt_queue */
10949	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
10950		netif_rx_ni(skb);
10951		input_queue_head_incr(oldsd);
10952	}
10953	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
10954		netif_rx_ni(skb);
10955		input_queue_head_incr(oldsd);
10956	}
10957
10958	return 0;
10959}
10960
10961/**
10962 *	netdev_increment_features - increment feature set by one
10963 *	@all: current feature set
10964 *	@one: new feature set
10965 *	@mask: mask feature set
10966 *
10967 *	Computes a new feature set after adding a device with feature set
10968 *	@one to the master device with current feature set @all.  Will not
10969 *	enable anything that is off in @mask. Returns the new feature set.
10970 */
10971netdev_features_t netdev_increment_features(netdev_features_t all,
10972	netdev_features_t one, netdev_features_t mask)
10973{
10974	if (mask & NETIF_F_HW_CSUM)
10975		mask |= NETIF_F_CSUM_MASK;
10976	mask |= NETIF_F_VLAN_CHALLENGED;
10977
10978	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
10979	all &= one | ~NETIF_F_ALL_FOR_ALL;
10980
10981	/* If one device supports hw checksumming, set for all. */
10982	if (all & NETIF_F_HW_CSUM)
10983		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
10984
10985	return all;
10986}
10987EXPORT_SYMBOL(netdev_increment_features);
10988
10989static struct hlist_head * __net_init netdev_create_hash(void)
10990{
10991	int i;
10992	struct hlist_head *hash;
10993
10994	hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
10995	if (hash != NULL)
10996		for (i = 0; i < NETDEV_HASHENTRIES; i++)
10997			INIT_HLIST_HEAD(&hash[i]);
10998
10999	return hash;
11000}
11001
11002/* Initialize per network namespace state */
11003static int __net_init netdev_init(struct net *net)
11004{
11005	BUILD_BUG_ON(GRO_HASH_BUCKETS >
11006		     8 * sizeof_field(struct napi_struct, gro_bitmask));
11007
11008	if (net != &init_net)
11009		INIT_LIST_HEAD(&net->dev_base_head);
11010
11011	net->dev_name_head = netdev_create_hash();
11012	if (net->dev_name_head == NULL)
11013		goto err_name;
11014
11015	net->dev_index_head = netdev_create_hash();
11016	if (net->dev_index_head == NULL)
11017		goto err_idx;
11018
11019	RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
11020
11021	return 0;
11022
11023err_idx:
11024	kfree(net->dev_name_head);
11025err_name:
11026	return -ENOMEM;
11027}
11028
11029/**
11030 *	netdev_drivername - network driver for the device
11031 *	@dev: network device
11032 *
11033 *	Determine network driver for device.
11034 */
11035const char *netdev_drivername(const struct net_device *dev)
11036{
11037	const struct device_driver *driver;
11038	const struct device *parent;
11039	const char *empty = "";
11040
11041	parent = dev->dev.parent;
11042	if (!parent)
11043		return empty;
11044
11045	driver = parent->driver;
11046	if (driver && driver->name)
11047		return driver->name;
11048	return empty;
11049}
11050
11051static void __netdev_printk(const char *level, const struct net_device *dev,
11052			    struct va_format *vaf)
11053{
11054	if (dev && dev->dev.parent) {
11055		dev_printk_emit(level[1] - '0',
11056				dev->dev.parent,
11057				"%s %s %s%s: %pV",
11058				dev_driver_string(dev->dev.parent),
11059				dev_name(dev->dev.parent),
11060				netdev_name(dev), netdev_reg_state(dev),
11061				vaf);
11062	} else if (dev) {
11063		printk("%s%s%s: %pV",
11064		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
11065	} else {
11066		printk("%s(NULL net_device): %pV", level, vaf);
11067	}
11068}
11069
11070void netdev_printk(const char *level, const struct net_device *dev,
11071		   const char *format, ...)
11072{
11073	struct va_format vaf;
11074	va_list args;
11075
11076	va_start(args, format);
11077
11078	vaf.fmt = format;
11079	vaf.va = &args;
11080
11081	__netdev_printk(level, dev, &vaf);
11082
11083	va_end(args);
11084}
11085EXPORT_SYMBOL(netdev_printk);
11086
11087#define define_netdev_printk_level(func, level)			\
11088void func(const struct net_device *dev, const char *fmt, ...)	\
11089{								\
11090	struct va_format vaf;					\
11091	va_list args;						\
11092								\
11093	va_start(args, fmt);					\
11094								\
11095	vaf.fmt = fmt;						\
11096	vaf.va = &args;						\
11097								\
11098	__netdev_printk(level, dev, &vaf);			\
11099								\
11100	va_end(args);						\
11101}								\
11102EXPORT_SYMBOL(func);
11103
11104define_netdev_printk_level(netdev_emerg, KERN_EMERG);
11105define_netdev_printk_level(netdev_alert, KERN_ALERT);
11106define_netdev_printk_level(netdev_crit, KERN_CRIT);
11107define_netdev_printk_level(netdev_err, KERN_ERR);
11108define_netdev_printk_level(netdev_warn, KERN_WARNING);
11109define_netdev_printk_level(netdev_notice, KERN_NOTICE);
11110define_netdev_printk_level(netdev_info, KERN_INFO);
11111
11112static void __net_exit netdev_exit(struct net *net)
11113{
11114	kfree(net->dev_name_head);
11115	kfree(net->dev_index_head);
11116	if (net != &init_net)
11117		WARN_ON_ONCE(!list_empty(&net->dev_base_head));
11118}
11119
11120static struct pernet_operations __net_initdata netdev_net_ops = {
11121	.init = netdev_init,
11122	.exit = netdev_exit,
11123};
11124
11125static void __net_exit default_device_exit(struct net *net)
11126{
11127	struct net_device *dev, *aux;
11128	/*
11129	 * Push all migratable network devices back to the
11130	 * initial network namespace
11131	 */
11132	rtnl_lock();
11133	for_each_netdev_safe(net, dev, aux) {
11134		int err;
11135		char fb_name[IFNAMSIZ];
11136
11137		/* Ignore unmoveable devices (i.e. loopback) */
11138		if (dev->features & NETIF_F_NETNS_LOCAL)
11139			continue;
11140
11141		/* Leave virtual devices for the generic cleanup */
11142		if (dev->rtnl_link_ops)
11143			continue;
11144
11145		/* Push remaining network devices to init_net */
11146		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
11147		if (__dev_get_by_name(&init_net, fb_name))
11148			snprintf(fb_name, IFNAMSIZ, "dev%%d");
11149		err = dev_change_net_namespace(dev, &init_net, fb_name);
11150		if (err) {
11151			pr_emerg("%s: failed to move %s to init_net: %d\n",
11152				 __func__, dev->name, err);
11153			BUG();
11154		}
11155	}
11156	rtnl_unlock();
11157}
11158
11159static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
11160{
11161	/* Return with the rtnl_lock held when there are no network
11162	 * devices unregistering in any network namespace in net_list.
11163	 */
11164	struct net *net;
11165	bool unregistering;
11166	DEFINE_WAIT_FUNC(wait, woken_wake_function);
11167
11168	add_wait_queue(&netdev_unregistering_wq, &wait);
11169	for (;;) {
11170		unregistering = false;
11171		rtnl_lock();
11172		list_for_each_entry(net, net_list, exit_list) {
11173			if (net->dev_unreg_count > 0) {
11174				unregistering = true;
11175				break;
11176			}
11177		}
11178		if (!unregistering)
11179			break;
11180		__rtnl_unlock();
11181
11182		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
11183	}
11184	remove_wait_queue(&netdev_unregistering_wq, &wait);
11185}
11186
11187static void __net_exit default_device_exit_batch(struct list_head *net_list)
11188{
11189	/* At exit all network devices most be removed from a network
11190	 * namespace.  Do this in the reverse order of registration.
11191	 * Do this across as many network namespaces as possible to
11192	 * improve batching efficiency.
11193	 */
11194	struct net_device *dev;
11195	struct net *net;
11196	LIST_HEAD(dev_kill_list);
11197
11198	/* To prevent network device cleanup code from dereferencing
11199	 * loopback devices or network devices that have been freed
11200	 * wait here for all pending unregistrations to complete,
11201	 * before unregistring the loopback device and allowing the
11202	 * network namespace be freed.
11203	 *
11204	 * The netdev todo list containing all network devices
11205	 * unregistrations that happen in default_device_exit_batch
11206	 * will run in the rtnl_unlock() at the end of
11207	 * default_device_exit_batch.
11208	 */
11209	rtnl_lock_unregistering(net_list);
11210	list_for_each_entry(net, net_list, exit_list) {
11211		for_each_netdev_reverse(net, dev) {
11212			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
11213				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
11214			else
11215				unregister_netdevice_queue(dev, &dev_kill_list);
11216		}
11217	}
11218	unregister_netdevice_many(&dev_kill_list);
11219	rtnl_unlock();
11220}
11221
11222static struct pernet_operations __net_initdata default_device_ops = {
11223	.exit = default_device_exit,
11224	.exit_batch = default_device_exit_batch,
11225};
11226
11227/*
11228 *	Initialize the DEV module. At boot time this walks the device list and
11229 *	unhooks any devices that fail to initialise (normally hardware not
11230 *	present) and leaves us with a valid list of present and active devices.
11231 *
11232 */
11233
11234/*
11235 *       This is called single threaded during boot, so no need
11236 *       to take the rtnl semaphore.
11237 */
11238static int __init net_dev_init(void)
11239{
11240	int i, rc = -ENOMEM;
11241
11242	BUG_ON(!dev_boot_phase);
11243
11244	if (dev_proc_init())
11245		goto out;
11246
11247	if (netdev_kobject_init())
11248		goto out;
11249
11250	INIT_LIST_HEAD(&ptype_all);
11251	for (i = 0; i < PTYPE_HASH_SIZE; i++)
11252		INIT_LIST_HEAD(&ptype_base[i]);
11253
11254	INIT_LIST_HEAD(&offload_base);
11255
11256	if (register_pernet_subsys(&netdev_net_ops))
11257		goto out;
11258
11259	/*
11260	 *	Initialise the packet receive queues.
11261	 */
11262
11263	for_each_possible_cpu(i) {
11264		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
11265		struct softnet_data *sd = &per_cpu(softnet_data, i);
11266
11267		INIT_WORK(flush, flush_backlog);
11268
11269		skb_queue_head_init(&sd->input_pkt_queue);
11270		skb_queue_head_init(&sd->process_queue);
11271#ifdef CONFIG_XFRM_OFFLOAD
11272		skb_queue_head_init(&sd->xfrm_backlog);
11273#endif
11274		INIT_LIST_HEAD(&sd->poll_list);
11275		sd->output_queue_tailp = &sd->output_queue;
11276#ifdef CONFIG_RPS
11277		INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
11278		sd->cpu = i;
11279#endif
11280
11281		init_gro_hash(&sd->backlog);
11282		sd->backlog.poll = process_backlog;
11283		sd->backlog.weight = weight_p;
11284	}
11285
11286	dev_boot_phase = 0;
11287
11288	/* The loopback device is special if any other network devices
11289	 * is present in a network namespace the loopback device must
11290	 * be present. Since we now dynamically allocate and free the
11291	 * loopback device ensure this invariant is maintained by
11292	 * keeping the loopback device as the first device on the
11293	 * list of network devices.  Ensuring the loopback devices
11294	 * is the first device that appears and the last network device
11295	 * that disappears.
11296	 */
11297	if (register_pernet_device(&loopback_net_ops))
11298		goto out;
11299
11300	if (register_pernet_device(&default_device_ops))
11301		goto out;
11302
11303	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
11304	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
11305
11306	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
11307				       NULL, dev_cpu_dead);
11308	WARN_ON(rc < 0);
11309	rc = 0;
11310out:
11311	return rc;
11312}
11313
11314subsys_initcall(net_dev_init);