net/core/dev.c at v5.10-rc3 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / core / dev.c
at v5.10-rc3 286 kB view raw
    1// SPDX-License-Identifier: GPL-2.0-or-later
    2/*
    3 *      NET3    Protocol independent device support routines.
    4 *
    5 *	Derived from the non IP parts of dev.c 1.0.19
    6 *              Authors:	Ross Biro
    7 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
    8 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
    9 *
   10 *	Additional Authors:
   11 *		Florian la Roche <rzsfl@rz.uni-sb.de>
   12 *		Alan Cox <gw4pts@gw4pts.ampr.org>
   13 *		David Hinds <dahinds@users.sourceforge.net>
   14 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
   15 *		Adam Sulmicki <adam@cfar.umd.edu>
   16 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
   17 *
   18 *	Changes:
   19 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
   20 *                                      to 2 if register_netdev gets called
   21 *                                      before net_dev_init & also removed a
   22 *                                      few lines of code in the process.
   23 *		Alan Cox	:	device private ioctl copies fields back.
   24 *		Alan Cox	:	Transmit queue code does relevant
   25 *					stunts to keep the queue safe.
   26 *		Alan Cox	:	Fixed double lock.
   27 *		Alan Cox	:	Fixed promisc NULL pointer trap
   28 *		????????	:	Support the full private ioctl range
   29 *		Alan Cox	:	Moved ioctl permission check into
   30 *					drivers
   31 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
   32 *		Alan Cox	:	100 backlog just doesn't cut it when
   33 *					you start doing multicast video 8)
   34 *		Alan Cox	:	Rewrote net_bh and list manager.
   35 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
   36 *		Alan Cox	:	Took out transmit every packet pass
   37 *					Saved a few bytes in the ioctl handler
   38 *		Alan Cox	:	Network driver sets packet type before
   39 *					calling netif_rx. Saves a function
   40 *					call a packet.
   41 *		Alan Cox	:	Hashed net_bh()
   42 *		Richard Kooijman:	Timestamp fixes.
   43 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
   44 *		Alan Cox	:	Device lock protection.
   45 *              Alan Cox        :       Fixed nasty side effect of device close
   46 *					changes.
   47 *		Rudi Cilibrasi	:	Pass the right thing to
   48 *					set_mac_address()
   49 *		Dave Miller	:	32bit quantity for the device lock to
   50 *					make it work out on a Sparc.
   51 *		Bjorn Ekwall	:	Added KERNELD hack.
   52 *		Alan Cox	:	Cleaned up the backlog initialise.
   53 *		Craig Metz	:	SIOCGIFCONF fix if space for under
   54 *					1 device.
   55 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
   56 *					is no device open function.
   57 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
   58 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
   59 *		Cyrus Durgin	:	Cleaned for KMOD
   60 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
   61 *					A network device unload needs to purge
   62 *					the backlog queue.
   63 *	Paul Rusty Russell	:	SIOCSIFNAME
   64 *              Pekka Riikonen  :	Netdev boot-time settings code
   65 *              Andrew Morton   :       Make unregister_netdevice wait
   66 *                                      indefinitely on dev->refcnt
   67 *              J Hadi Salim    :       - Backlog queue sampling
   68 *				        - netif_rx() feedback
   69 */
   70
   71#include <linux/uaccess.h>
   72#include <linux/bitops.h>
   73#include <linux/capability.h>
   74#include <linux/cpu.h>
   75#include <linux/types.h>
   76#include <linux/kernel.h>
   77#include <linux/hash.h>
   78#include <linux/slab.h>
   79#include <linux/sched.h>
   80#include <linux/sched/mm.h>
   81#include <linux/mutex.h>
   82#include <linux/rwsem.h>
   83#include <linux/string.h>
   84#include <linux/mm.h>
   85#include <linux/socket.h>
   86#include <linux/sockios.h>
   87#include <linux/errno.h>
   88#include <linux/interrupt.h>
   89#include <linux/if_ether.h>
   90#include <linux/netdevice.h>
   91#include <linux/etherdevice.h>
   92#include <linux/ethtool.h>
   93#include <linux/skbuff.h>
   94#include <linux/bpf.h>
   95#include <linux/bpf_trace.h>
   96#include <net/net_namespace.h>
   97#include <net/sock.h>
   98#include <net/busy_poll.h>
   99#include <linux/rtnetlink.h>
  100#include <linux/stat.h>
  101#include <net/dsa.h>
  102#include <net/dst.h>
  103#include <net/dst_metadata.h>
  104#include <net/pkt_sched.h>
  105#include <net/pkt_cls.h>
  106#include <net/checksum.h>
  107#include <net/xfrm.h>
  108#include <linux/highmem.h>
  109#include <linux/init.h>
  110#include <linux/module.h>
  111#include <linux/netpoll.h>
  112#include <linux/rcupdate.h>
  113#include <linux/delay.h>
  114#include <net/iw_handler.h>
  115#include <asm/current.h>
  116#include <linux/audit.h>
  117#include <linux/dmaengine.h>
  118#include <linux/err.h>
  119#include <linux/ctype.h>
  120#include <linux/if_arp.h>
  121#include <linux/if_vlan.h>
  122#include <linux/ip.h>
  123#include <net/ip.h>
  124#include <net/mpls.h>
  125#include <linux/ipv6.h>
  126#include <linux/in.h>
  127#include <linux/jhash.h>
  128#include <linux/random.h>
  129#include <trace/events/napi.h>
  130#include <trace/events/net.h>
  131#include <trace/events/skb.h>
  132#include <linux/inetdevice.h>
  133#include <linux/cpu_rmap.h>
  134#include <linux/static_key.h>
  135#include <linux/hashtable.h>
  136#include <linux/vmalloc.h>
  137#include <linux/if_macvlan.h>
  138#include <linux/errqueue.h>
  139#include <linux/hrtimer.h>
  140#include <linux/netfilter_ingress.h>
  141#include <linux/crash_dump.h>
  142#include <linux/sctp.h>
  143#include <net/udp_tunnel.h>
  144#include <linux/net_namespace.h>
  145#include <linux/indirect_call_wrapper.h>
  146#include <net/devlink.h>
  147#include <linux/pm_runtime.h>
  148#include <linux/prandom.h>
  149
  150#include "net-sysfs.h"
  151
/* NOTE(review): presumably a cap on skbs held for GRO; its use is not visible here. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

/* Taken by dev_add_pack()/__dev_remove_pack() around packet type list updates. */
static DEFINE_SPINLOCK(ptype_lock);
/* Taken by dev_add_offload()/__dev_remove_offload() around offload_base updates. */
static DEFINE_SPINLOCK(offload_lock);
/* Handlers for specific protocols, bucketed by protocol id (see ptype_head()). */
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;	/* Taps */
/* Registered packet_offload entries, inserted in priority order. */
static struct list_head offload_base __read_mostly;

/* Forward declarations for helpers defined later in this file. */
static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
					 struct netdev_notifier_info *info);
static int call_netdevice_notifiers_extack(unsigned long val,
					   struct net_device *dev,
					   struct netlink_ext_ack *extack);
static struct napi_struct *napi_by_id(unsigned int napi_id);
  170
  171/*
  172 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
  173 * semaphore.
  174 *
  175 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
  176 *
  177 * Writers must hold the rtnl semaphore while they loop through the
  178 * dev_base_head list, and hold dev_base_lock for writing when they do the
  179 * actual updates.  This allows pure readers to access the list even
  180 * while a writer is preparing to update it.
  181 *
  182 * To put it another way, dev_base_lock is held for writing only to
  183 * protect against pure readers; the rtnl semaphore provides the
  184 * protection against other writers.
  185 *
  186 * See, for example usages, register_netdevice() and
  187 * unregister_netdevice(), which must be called with the rtnl
  188 * semaphore held.
  189 */
  190DEFINE_RWLOCK(dev_base_lock);
  191EXPORT_SYMBOL(dev_base_lock);
  192
  193static DEFINE_MUTEX(ifalias_mutex);
  194
  195/* protects napi_hash addition/deletion and napi_gen_id */
  196static DEFINE_SPINLOCK(napi_hash_lock);
  197
  198static unsigned int napi_gen_id = NR_CPUS;
  199static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
  200
  201static DECLARE_RWSEM(devnet_rename_sem);
  202
  203static inline void dev_base_seq_inc(struct net *net)
  204{
  205	while (++net->dev_base_seq == 0)
  206		;
  207}
  208
  209static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
  210{
  211	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
  212
  213	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
  214}
  215
  216static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
  217{
  218	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
  219}
  220
  221static inline void rps_lock(struct softnet_data *sd)
  222{
  223#ifdef CONFIG_RPS
  224	spin_lock(&sd->input_pkt_queue.lock);
  225#endif
  226}
  227
  228static inline void rps_unlock(struct softnet_data *sd)
  229{
  230#ifdef CONFIG_RPS
  231	spin_unlock(&sd->input_pkt_queue.lock);
  232#endif
  233}
  234
  235static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
  236						       const char *name)
  237{
  238	struct netdev_name_node *name_node;
  239
  240	name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
  241	if (!name_node)
  242		return NULL;
  243	INIT_HLIST_NODE(&name_node->hlist);
  244	name_node->dev = dev;
  245	name_node->name = name;
  246	return name_node;
  247}
  248
  249static struct netdev_name_node *
  250netdev_name_node_head_alloc(struct net_device *dev)
  251{
  252	struct netdev_name_node *name_node;
  253
  254	name_node = netdev_name_node_alloc(dev, dev->name);
  255	if (!name_node)
  256		return NULL;
  257	INIT_LIST_HEAD(&name_node->list);
  258	return name_node;
  259}
  260
  261static void netdev_name_node_free(struct netdev_name_node *name_node)
  262{
  263	kfree(name_node);
  264}
  265
  266static void netdev_name_node_add(struct net *net,
  267				 struct netdev_name_node *name_node)
  268{
  269	hlist_add_head_rcu(&name_node->hlist,
  270			   dev_name_hash(net, name_node->name));
  271}
  272
  273static void netdev_name_node_del(struct netdev_name_node *name_node)
  274{
  275	hlist_del_rcu(&name_node->hlist);
  276}
  277
  278static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
  279							const char *name)
  280{
  281	struct hlist_head *head = dev_name_hash(net, name);
  282	struct netdev_name_node *name_node;
  283
  284	hlist_for_each_entry(name_node, head, hlist)
  285		if (!strcmp(name_node->name, name))
  286			return name_node;
  287	return NULL;
  288}
  289
  290static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
  291							    const char *name)
  292{
  293	struct hlist_head *head = dev_name_hash(net, name);
  294	struct netdev_name_node *name_node;
  295
  296	hlist_for_each_entry_rcu(name_node, head, hlist)
  297		if (!strcmp(name_node->name, name))
  298			return name_node;
  299	return NULL;
  300}
  301
  302int netdev_name_node_alt_create(struct net_device *dev, const char *name)
  303{
  304	struct netdev_name_node *name_node;
  305	struct net *net = dev_net(dev);
  306
  307	name_node = netdev_name_node_lookup(net, name);
  308	if (name_node)
  309		return -EEXIST;
  310	name_node = netdev_name_node_alloc(dev, name);
  311	if (!name_node)
  312		return -ENOMEM;
  313	netdev_name_node_add(net, name_node);
  314	/* The node that holds dev->name acts as a head of per-device list. */
  315	list_add_tail(&name_node->list, &dev->name_node->list);
  316
  317	return 0;
  318}
  319EXPORT_SYMBOL(netdev_name_node_alt_create);
  320
  321static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
  322{
  323	list_del(&name_node->list);
  324	netdev_name_node_del(name_node);
  325	kfree(name_node->name);
  326	netdev_name_node_free(name_node);
  327}
  328
  329int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
  330{
  331	struct netdev_name_node *name_node;
  332	struct net *net = dev_net(dev);
  333
  334	name_node = netdev_name_node_lookup(net, name);
  335	if (!name_node)
  336		return -ENOENT;
  337	/* lookup might have found our primary name or a name belonging
  338	 * to another device.
  339	 */
  340	if (name_node == dev->name_node || name_node->dev != dev)
  341		return -EINVAL;
  342
  343	__netdev_name_node_alt_destroy(name_node);
  344
  345	return 0;
  346}
  347EXPORT_SYMBOL(netdev_name_node_alt_destroy);
  348
  349static void netdev_name_node_alt_flush(struct net_device *dev)
  350{
  351	struct netdev_name_node *name_node, *tmp;
  352
  353	list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
  354		__netdev_name_node_alt_destroy(name_node);
  355}
  356
  357/* Device list insertion */
  358static void list_netdevice(struct net_device *dev)
  359{
  360	struct net *net = dev_net(dev);
  361
  362	ASSERT_RTNL();
  363
  364	write_lock_bh(&dev_base_lock);
  365	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
  366	netdev_name_node_add(net, dev->name_node);
  367	hlist_add_head_rcu(&dev->index_hlist,
  368			   dev_index_hash(net, dev->ifindex));
  369	write_unlock_bh(&dev_base_lock);
  370
  371	dev_base_seq_inc(net);
  372}
  373
  374/* Device list removal
  375 * caller must respect a RCU grace period before freeing/reusing dev
  376 */
  377static void unlist_netdevice(struct net_device *dev)
  378{
  379	ASSERT_RTNL();
  380
  381	/* Unlink dev from the device chain */
  382	write_lock_bh(&dev_base_lock);
  383	list_del_rcu(&dev->dev_list);
  384	netdev_name_node_del(dev->name_node);
  385	hlist_del_rcu(&dev->index_hlist);
  386	write_unlock_bh(&dev_base_lock);
  387
  388	dev_base_seq_inc(dev_net(dev));
  389}
  390
  391/*
  392 *	Our notifier list
  393 */
  394
  395static RAW_NOTIFIER_HEAD(netdev_chain);
  396
  397/*
  398 *	Device drivers call our routines to queue packets here. We empty the
  399 *	queue in the local softnet handler.
  400 */
  401
  402DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
  403EXPORT_PER_CPU_SYMBOL(softnet_data);
  404
  405#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 *
 * netdev_lock_type[] and netdev_lock_name[] are parallel tables: the
 * index returned by netdev_lock_pos() selects both the lock class key and
 * the class name, so entry i of one table must describe entry i of the
 * other.  The last entry is the fallback for device types that are not
 * listed.
 */
static const unsigned short netdev_lock_type[] = {
	 ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

/* Lockdep class names, in the same order as netdev_lock_type[]. */
static const char *const netdev_lock_name[] = {
	"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	"_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	"_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	"_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	"_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	"_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	"_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	"_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	"_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	"_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	"_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	"_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	"_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	"_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	"_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

/* One lock class key per table entry, for xmit locks and address-list locks. */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
  446
  447static inline unsigned short netdev_lock_pos(unsigned short dev_type)
  448{
  449	int i;
  450
  451	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
  452		if (netdev_lock_type[i] == dev_type)
  453			return i;
  454	/* the last key is used by default */
  455	return ARRAY_SIZE(netdev_lock_type) - 1;
  456}
  457
  458static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
  459						 unsigned short dev_type)
  460{
  461	int i;
  462
  463	i = netdev_lock_pos(dev_type);
  464	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
  465				   netdev_lock_name[i]);
  466}
  467
  468static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
  469{
  470	int i;
  471
  472	i = netdev_lock_pos(dev->type);
  473	lockdep_set_class_and_name(&dev->addr_list_lock,
  474				   &netdev_addr_lock_key[i],
  475				   netdev_lock_name[i]);
  476}
  477#else
/* Without CONFIG_LOCKDEP there is no lock class to set: empty stub. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}

/* Without CONFIG_LOCKDEP there is no lock class to set: empty stub. */
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
  486#endif
  487
  488/*******************************************************************************
  489 *
  490 *		Protocol management and registration routines
  491 *
  492 *******************************************************************************/
  493
  494
  495/*
  496 *	Add a protocol ID to the list. Now that the input handler is
  497 *	smarter we can dispense with all the messy stuff that used to be
  498 *	here.
  499 *
  500 *	BEWARE!!! Protocol handlers, mangling input packets,
  501 *	MUST BE last in hash buckets and checking protocol handlers
  502 *	MUST start from promiscuous ptype_all chain in net_bh.
  503 *	It is true now, do not change it.
  504 *	Explanation follows: if protocol handler, mangling packet, will
  505 *	be the first on list, it is not able to sense, that packet
  506 *	is cloned and should be copied-on-write, so that it will
  507 *	change it and subsequent readers will get broken packet.
  508 *							--ANK (980803)
  509 */
  510
  511static inline struct list_head *ptype_head(const struct packet_type *pt)
  512{
  513	if (pt->type == htons(ETH_P_ALL))
  514		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
  515	else
  516		return pt->dev ? &pt->dev->ptype_specific :
  517				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
  518}
  519
  520/**
  521 *	dev_add_pack - add packet handler
  522 *	@pt: packet type declaration
  523 *
  524 *	Add a protocol handler to the networking stack. The passed &packet_type
  525 *	is linked into kernel lists and may not be freed until it has been
  526 *	removed from the kernel lists.
  527 *
  528 *	This call does not sleep therefore it can not
  529 *	guarantee all CPU's that are in middle of receiving packets
  530 *	will see the new packet type (until the next received packet).
  531 */
  532
  533void dev_add_pack(struct packet_type *pt)
  534{
  535	struct list_head *head = ptype_head(pt);
  536
  537	spin_lock(&ptype_lock);
  538	list_add_rcu(&pt->list, head);
  539	spin_unlock(&ptype_lock);
  540}
  541EXPORT_SYMBOL(dev_add_pack);
  542
  543/**
  544 *	__dev_remove_pack	 - remove packet handler
  545 *	@pt: packet type declaration
  546 *
  547 *	Remove a protocol handler that was previously added to the kernel
  548 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
  549 *	from the kernel lists and can be freed or reused once this function
  550 *	returns.
  551 *
  552 *      The packet type might still be in use by receivers
  553 *	and must not be freed until after all the CPU's have gone
  554 *	through a quiescent state.
  555 */
  556void __dev_remove_pack(struct packet_type *pt)
  557{
  558	struct list_head *head = ptype_head(pt);
  559	struct packet_type *pt1;
  560
  561	spin_lock(&ptype_lock);
  562
  563	list_for_each_entry(pt1, head, list) {
  564		if (pt == pt1) {
  565			list_del_rcu(&pt->list);
  566			goto out;
  567		}
  568	}
  569
  570	pr_warn("dev_remove_pack: %p not found\n", pt);
  571out:
  572	spin_unlock(&ptype_lock);
  573}
  574EXPORT_SYMBOL(__dev_remove_pack);
  575
  576/**
  577 *	dev_remove_pack	 - remove packet handler
  578 *	@pt: packet type declaration
  579 *
  580 *	Remove a protocol handler that was previously added to the kernel
  581 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
  582 *	from the kernel lists and can be freed or reused once this function
  583 *	returns.
  584 *
  585 *	This call sleeps to guarantee that no CPU is looking at the packet
  586 *	type after return.
  587 */
  588void dev_remove_pack(struct packet_type *pt)
  589{
  590	__dev_remove_pack(pt);
  591
  592	synchronize_net();
  593}
  594EXPORT_SYMBOL(dev_remove_pack);
  595
  596
  597/**
  598 *	dev_add_offload - register offload handlers
  599 *	@po: protocol offload declaration
  600 *
  601 *	Add protocol offload handlers to the networking stack. The passed
  602 *	&proto_offload is linked into kernel lists and may not be freed until
  603 *	it has been removed from the kernel lists.
  604 *
  605 *	This call does not sleep therefore it can not
  606 *	guarantee all CPU's that are in middle of receiving packets
  607 *	will see the new offload handlers (until the next received packet).
  608 */
  609void dev_add_offload(struct packet_offload *po)
  610{
  611	struct packet_offload *elem;
  612
  613	spin_lock(&offload_lock);
  614	list_for_each_entry(elem, &offload_base, list) {
  615		if (po->priority < elem->priority)
  616			break;
  617	}
  618	list_add_rcu(&po->list, elem->list.prev);
  619	spin_unlock(&offload_lock);
  620}
  621EXPORT_SYMBOL(dev_add_offload);
  622
  623/**
  624 *	__dev_remove_offload	 - remove offload handler
  625 *	@po: packet offload declaration
  626 *
  627 *	Remove a protocol offload handler that was previously added to the
  628 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
  629 *	is removed from the kernel lists and can be freed or reused once this
  630 *	function returns.
  631 *
  632 *      The packet type might still be in use by receivers
  633 *	and must not be freed until after all the CPU's have gone
  634 *	through a quiescent state.
  635 */
  636static void __dev_remove_offload(struct packet_offload *po)
  637{
  638	struct list_head *head = &offload_base;
  639	struct packet_offload *po1;
  640
  641	spin_lock(&offload_lock);
  642
  643	list_for_each_entry(po1, head, list) {
  644		if (po == po1) {
  645			list_del_rcu(&po->list);
  646			goto out;
  647		}
  648	}
  649
  650	pr_warn("dev_remove_offload: %p not found\n", po);
  651out:
  652	spin_unlock(&offload_lock);
  653}
  654
  655/**
  656 *	dev_remove_offload	 - remove packet offload handler
  657 *	@po: packet offload declaration
  658 *
  659 *	Remove a packet offload handler that was previously added to the kernel
  660 *	offload handlers by dev_add_offload(). The passed &offload_type is
  661 *	removed from the kernel lists and can be freed or reused once this
  662 *	function returns.
  663 *
  664 *	This call sleeps to guarantee that no CPU is looking at the packet
  665 *	type after return.
  666 */
  667void dev_remove_offload(struct packet_offload *po)
  668{
  669	__dev_remove_offload(po);
  670
  671	synchronize_net();
  672}
  673EXPORT_SYMBOL(dev_remove_offload);
  674
  675/******************************************************************************
  676 *
  677 *		      Device Boot-time Settings Routines
  678 *
  679 ******************************************************************************/
  680
  681/* Boot time configuration table */
  682static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
  683
  684/**
  685 *	netdev_boot_setup_add	- add new setup entry
  686 *	@name: name of the device
  687 *	@map: configured settings for the device
  688 *
  689 *	Adds new setup entry to the dev_boot_setup list.  The function
  690 *	returns 0 on error and 1 on success.  This is a generic routine to
  691 *	all netdevices.
  692 */
  693static int netdev_boot_setup_add(char *name, struct ifmap *map)
  694{
  695	struct netdev_boot_setup *s;
  696	int i;
  697
  698	s = dev_boot_setup;
  699	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
  700		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
  701			memset(s[i].name, 0, sizeof(s[i].name));
  702			strlcpy(s[i].name, name, IFNAMSIZ);
  703			memcpy(&s[i].map, map, sizeof(s[i].map));
  704			break;
  705		}
  706	}
  707
  708	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
  709}
  710
  711/**
  712 * netdev_boot_setup_check	- check boot time settings
  713 * @dev: the netdevice
  714 *
  715 * Check boot time settings for the device.
  716 * The found settings are set for the device to be used
  717 * later in the device probing.
  718 * Returns 0 if no settings found, 1 if they are.
  719 */
  720int netdev_boot_setup_check(struct net_device *dev)
  721{
  722	struct netdev_boot_setup *s = dev_boot_setup;
  723	int i;
  724
  725	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
  726		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
  727		    !strcmp(dev->name, s[i].name)) {
  728			dev->irq = s[i].map.irq;
  729			dev->base_addr = s[i].map.base_addr;
  730			dev->mem_start = s[i].map.mem_start;
  731			dev->mem_end = s[i].map.mem_end;
  732			return 1;
  733		}
  734	}
  735	return 0;
  736}
  737EXPORT_SYMBOL(netdev_boot_setup_check);
  738
  739
  740/**
  741 * netdev_boot_base	- get address from boot time settings
  742 * @prefix: prefix for network device
  743 * @unit: id for network device
  744 *
  745 * Check boot time settings for the base address of device.
  746 * The found settings are set for the device to be used
  747 * later in the device probing.
  748 * Returns 0 if no settings found.
  749 */
  750unsigned long netdev_boot_base(const char *prefix, int unit)
  751{
  752	const struct netdev_boot_setup *s = dev_boot_setup;
  753	char name[IFNAMSIZ];
  754	int i;
  755
  756	sprintf(name, "%s%d", prefix, unit);
  757
  758	/*
  759	 * If device already registered then return base of 1
  760	 * to indicate not to probe for this interface
  761	 */
  762	if (__dev_get_by_name(&init_net, name))
  763		return 1;
  764
  765	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
  766		if (!strcmp(name, s[i].name))
  767			return s[i].map.base_addr;
  768	return 0;
  769}
  770
  771/*
  772 * Saves at boot time configured settings for any netdevice.
  773 */
  774int __init netdev_boot_setup(char *str)
  775{
  776	int ints[5];
  777	struct ifmap map;
  778
  779	str = get_options(str, ARRAY_SIZE(ints), ints);
  780	if (!str || !*str)
  781		return 0;
  782
  783	/* Save settings */
  784	memset(&map, 0, sizeof(map));
  785	if (ints[0] > 0)
  786		map.irq = ints[1];
  787	if (ints[0] > 1)
  788		map.base_addr = ints[2];
  789	if (ints[0] > 2)
  790		map.mem_start = ints[3];
  791	if (ints[0] > 3)
  792		map.mem_end = ints[4];
  793
  794	/* Add new entry to the list */
  795	return netdev_boot_setup_add(str, &map);
  796}
  797
  798__setup("netdev=", netdev_boot_setup);
  799
  800/*******************************************************************************
  801 *
  802 *			    Device Interface Subroutines
  803 *
  804 *******************************************************************************/
  805
  806/**
  807 *	dev_get_iflink	- get 'iflink' value of a interface
  808 *	@dev: targeted interface
  809 *
  810 *	Indicates the ifindex the interface is linked to.
  811 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
  812 */
  813
  814int dev_get_iflink(const struct net_device *dev)
  815{
  816	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
  817		return dev->netdev_ops->ndo_get_iflink(dev);
  818
  819	return dev->ifindex;
  820}
  821EXPORT_SYMBOL(dev_get_iflink);
  822
  823/**
  824 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
  825 *	@dev: targeted interface
  826 *	@skb: The packet.
  827 *
  828 *	For better visibility of tunnel traffic OVS needs to retrieve
  829 *	egress tunnel information for a packet. Following API allows
  830 *	user to get this info.
  831 */
  832int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
  833{
  834	struct ip_tunnel_info *info;
  835
  836	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
  837		return -EINVAL;
  838
  839	info = skb_tunnel_info_unclone(skb);
  840	if (!info)
  841		return -ENOMEM;
  842	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
  843		return -EINVAL;
  844
  845	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
  846}
  847EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
  848
  849/**
  850 *	__dev_get_by_name	- find a device by its name
  851 *	@net: the applicable net namespace
  852 *	@name: name to find
  853 *
  854 *	Find an interface by name. Must be called under RTNL semaphore
  855 *	or @dev_base_lock. If the name is found a pointer to the device
  856 *	is returned. If the name is not found then %NULL is returned. The
  857 *	reference counters are not incremented so the caller must be
  858 *	careful with locks.
  859 */
  860
  861struct net_device *__dev_get_by_name(struct net *net, const char *name)
  862{
  863	struct netdev_name_node *node_name;
  864
  865	node_name = netdev_name_node_lookup(net, name);
  866	return node_name ? node_name->dev : NULL;
  867}
  868EXPORT_SYMBOL(__dev_get_by_name);
  869
  870/**
  871 * dev_get_by_name_rcu	- find a device by its name
  872 * @net: the applicable net namespace
  873 * @name: name to find
  874 *
  875 * Find an interface by name.
  876 * If the name is found a pointer to the device is returned.
  877 * If the name is not found then %NULL is returned.
  878 * The reference counters are not incremented so the caller must be
  879 * careful with locks. The caller must hold RCU lock.
  880 */
  881
  882struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
  883{
  884	struct netdev_name_node *node_name;
  885
  886	node_name = netdev_name_node_lookup_rcu(net, name);
  887	return node_name ? node_name->dev : NULL;
  888}
  889EXPORT_SYMBOL(dev_get_by_name_rcu);
  890
  891/**
  892 *	dev_get_by_name		- find a device by its name
  893 *	@net: the applicable net namespace
  894 *	@name: name to find
  895 *
  896 *	Find an interface by name. This can be called from any
  897 *	context and does its own locking. The returned handle has
  898 *	the usage count incremented and the caller must use dev_put() to
  899 *	release it when it is no longer needed. %NULL is returned if no
  900 *	matching device is found.
  901 */
  902
  903struct net_device *dev_get_by_name(struct net *net, const char *name)
  904{
  905	struct net_device *dev;
  906
  907	rcu_read_lock();
  908	dev = dev_get_by_name_rcu(net, name);
  909	if (dev)
  910		dev_hold(dev);
  911	rcu_read_unlock();
  912	return dev;
  913}
  914EXPORT_SYMBOL(dev_get_by_name);
  915
  916/**
  917 *	__dev_get_by_index - find a device by its ifindex
  918 *	@net: the applicable net namespace
  919 *	@ifindex: index of device
  920 *
  921 *	Search for an interface by index. Returns %NULL if the device
  922 *	is not found or a pointer to the device. The device has not
  923 *	had its reference counter increased so the caller must be careful
  924 *	about locking. The caller must hold either the RTNL semaphore
  925 *	or @dev_base_lock.
  926 */
  927
  928struct net_device *__dev_get_by_index(struct net *net, int ifindex)
  929{
  930	struct net_device *dev;
  931	struct hlist_head *head = dev_index_hash(net, ifindex);
  932
  933	hlist_for_each_entry(dev, head, index_hlist)
  934		if (dev->ifindex == ifindex)
  935			return dev;
  936
  937	return NULL;
  938}
  939EXPORT_SYMBOL(__dev_get_by_index);
  940
  941/**
  942 *	dev_get_by_index_rcu - find a device by its ifindex
  943 *	@net: the applicable net namespace
  944 *	@ifindex: index of device
  945 *
  946 *	Search for an interface by index. Returns %NULL if the device
  947 *	is not found or a pointer to the device. The device has not
  948 *	had its reference counter increased so the caller must be careful
  949 *	about locking. The caller must hold RCU lock.
  950 */
  951
  952struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
  953{
  954	struct net_device *dev;
  955	struct hlist_head *head = dev_index_hash(net, ifindex);
  956
  957	hlist_for_each_entry_rcu(dev, head, index_hlist)
  958		if (dev->ifindex == ifindex)
  959			return dev;
  960
  961	return NULL;
  962}
  963EXPORT_SYMBOL(dev_get_by_index_rcu);
  964
  965
  966/**
  967 *	dev_get_by_index - find a device by its ifindex
  968 *	@net: the applicable net namespace
  969 *	@ifindex: index of device
  970 *
  971 *	Search for an interface by index. Returns NULL if the device
  972 *	is not found or a pointer to the device. The device returned has
  973 *	had a reference added and the pointer is safe until the user calls
  974 *	dev_put to indicate they have finished with it.
  975 */
  976
  977struct net_device *dev_get_by_index(struct net *net, int ifindex)
  978{
  979	struct net_device *dev;
  980
  981	rcu_read_lock();
  982	dev = dev_get_by_index_rcu(net, ifindex);
  983	if (dev)
  984		dev_hold(dev);
  985	rcu_read_unlock();
  986	return dev;
  987}
  988EXPORT_SYMBOL(dev_get_by_index);
  989
  990/**
  991 *	dev_get_by_napi_id - find a device by napi_id
  992 *	@napi_id: ID of the NAPI struct
  993 *
  994 *	Search for an interface by NAPI ID. Returns %NULL if the device
  995 *	is not found or a pointer to the device. The device has not had
  996 *	its reference counter increased so the caller must be careful
  997 *	about locking. The caller must hold RCU lock.
  998 */
  999
 1000struct net_device *dev_get_by_napi_id(unsigned int napi_id)
 1001{
 1002	struct napi_struct *napi;
 1003
 1004	WARN_ON_ONCE(!rcu_read_lock_held());
 1005
 1006	if (napi_id < MIN_NAPI_ID)
 1007		return NULL;
 1008
 1009	napi = napi_by_id(napi_id);
 1010
 1011	return napi ? napi->dev : NULL;
 1012}
 1013EXPORT_SYMBOL(dev_get_by_napi_id);
 1014
 1015/**
 1016 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 1017 *	@net: network namespace
 1018 *	@name: a pointer to the buffer where the name will be stored.
 1019 *	@ifindex: the ifindex of the interface to get the name from.
 1020 */
 1021int netdev_get_name(struct net *net, char *name, int ifindex)
 1022{
 1023	struct net_device *dev;
 1024	int ret;
 1025
 1026	down_read(&devnet_rename_sem);
 1027	rcu_read_lock();
 1028
 1029	dev = dev_get_by_index_rcu(net, ifindex);
 1030	if (!dev) {
 1031		ret = -ENODEV;
 1032		goto out;
 1033	}
 1034
 1035	strcpy(name, dev->name);
 1036
 1037	ret = 0;
 1038out:
 1039	rcu_read_unlock();
 1040	up_read(&devnet_rename_sem);
 1041	return ret;
 1042}
 1043
 1044/**
 1045 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 1046 *	@net: the applicable net namespace
 1047 *	@type: media type of device
 1048 *	@ha: hardware address
 1049 *
 1050 *	Search for an interface by MAC address. Returns NULL if the device
 1051 *	is not found or a pointer to the device.
 1052 *	The caller must hold RCU or RTNL.
 1053 *	The returned device has not had its ref count increased
 1054 *	and the caller must therefore be careful about locking
 1055 *
 1056 */
 1057
 1058struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 1059				       const char *ha)
 1060{
 1061	struct net_device *dev;
 1062
 1063	for_each_netdev_rcu(net, dev)
 1064		if (dev->type == type &&
 1065		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 1066			return dev;
 1067
 1068	return NULL;
 1069}
 1070EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 1071
 1072struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 1073{
 1074	struct net_device *dev;
 1075
 1076	ASSERT_RTNL();
 1077	for_each_netdev(net, dev)
 1078		if (dev->type == type)
 1079			return dev;
 1080
 1081	return NULL;
 1082}
 1083EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 1084
 1085struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 1086{
 1087	struct net_device *dev, *ret = NULL;
 1088
 1089	rcu_read_lock();
 1090	for_each_netdev_rcu(net, dev)
 1091		if (dev->type == type) {
 1092			dev_hold(dev);
 1093			ret = dev;
 1094			break;
 1095		}
 1096	rcu_read_unlock();
 1097	return ret;
 1098}
 1099EXPORT_SYMBOL(dev_getfirstbyhwtype);
 1100
 1101/**
 1102 *	__dev_get_by_flags - find any device with given flags
 1103 *	@net: the applicable net namespace
 1104 *	@if_flags: IFF_* values
 1105 *	@mask: bitmask of bits in if_flags to check
 1106 *
 1107 *	Search for any interface with the given flags. Returns NULL if a device
 1108 *	is not found or a pointer to the device. Must be called inside
 1109 *	rtnl_lock(), and result refcount is unchanged.
 1110 */
 1111
 1112struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 1113				      unsigned short mask)
 1114{
 1115	struct net_device *dev, *ret;
 1116
 1117	ASSERT_RTNL();
 1118
 1119	ret = NULL;
 1120	for_each_netdev(net, dev) {
 1121		if (((dev->flags ^ if_flags) & mask) == 0) {
 1122			ret = dev;
 1123			break;
 1124		}
 1125	}
 1126	return ret;
 1127}
 1128EXPORT_SYMBOL(__dev_get_by_flags);
 1129
 1130/**
 1131 *	dev_valid_name - check if name is okay for network device
 1132 *	@name: name string
 1133 *
 1134 *	Network device names need to be valid file names to
 1135 *	allow sysfs to work.  We also disallow any kind of
 1136 *	whitespace.
 1137 */
 1138bool dev_valid_name(const char *name)
 1139{
 1140	if (*name == '\0')
 1141		return false;
 1142	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
 1143		return false;
 1144	if (!strcmp(name, ".") || !strcmp(name, ".."))
 1145		return false;
 1146
 1147	while (*name) {
 1148		if (*name == '/' || *name == ':' || isspace(*name))
 1149			return false;
 1150		name++;
 1151	}
 1152	return true;
 1153}
 1154EXPORT_SYMBOL(dev_valid_name);
 1155
/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string; written with a
 *	       limit of IFNAMSIZ bytes
 *
 *	Passed a format string - eg "lt%d" - it will try to find a suitable
 *	id. It scans the list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	A @name that contains no '%' is formatted as-is and reported as unit 0.
 *
 *	Returns the number of the unit assigned or a negative errno code:
 *	-EINVAL if @name is not a valid device name or not a valid format,
 *	-ENOMEM if the bitmap page cannot be allocated, and -ENFILE if the
 *	resulting name is already in use.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	if (!dev_valid_name(name))
		return -EINVAL;

	p = strchr(name, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be exactly one "%d" and no other
		 * "%" characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		/* Mark every unit number already taken by a device in @net */
		for_each_netdev(net, d) {
			/* Skip names from which no unit number was parsed */
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/*  avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		/* Lowest unit number not marked above */
		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	/* Build the candidate name; i is still 0 when @name had no '%' */
	snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}
 1223
 1224static int dev_alloc_name_ns(struct net *net,
 1225			     struct net_device *dev,
 1226			     const char *name)
 1227{
 1228	char buf[IFNAMSIZ];
 1229	int ret;
 1230
 1231	BUG_ON(!net);
 1232	ret = __dev_alloc_name(net, name, buf);
 1233	if (ret >= 0)
 1234		strlcpy(dev->name, buf, IFNAMSIZ);
 1235	return ret;
 1236}
 1237
 1238/**
 1239 *	dev_alloc_name - allocate a name for a device
 1240 *	@dev: device
 1241 *	@name: name format string
 1242 *
 1243 *	Passed a format string - eg "lt%d" it will try and find a suitable
 1244 *	id. It scans list of devices to build up a free map, then chooses
 1245 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 1246 *	while allocating the name and adding the device in order to avoid
 1247 *	duplicates.
 1248 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 1249 *	Returns the number of the unit assigned or a negative errno code.
 1250 */
 1251
 1252int dev_alloc_name(struct net_device *dev, const char *name)
 1253{
 1254	return dev_alloc_name_ns(dev_net(dev), dev, name);
 1255}
 1256EXPORT_SYMBOL(dev_alloc_name);
 1257
 1258static int dev_get_valid_name(struct net *net, struct net_device *dev,
 1259			      const char *name)
 1260{
 1261	BUG_ON(!net);
 1262
 1263	if (!dev_valid_name(name))
 1264		return -EINVAL;
 1265
 1266	if (strchr(name, '%'))
 1267		return dev_alloc_name_ns(net, dev, name);
 1268	else if (__dev_get_by_name(net, name))
 1269		return -EEXIST;
 1270	else if (dev->name != name)
 1271		strlcpy(dev->name, name, IFNAMSIZ);
 1272
 1273	return 0;
 1274}
 1275
/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: new name, or a format string such as "eth%d" for
 *		  wildcarding. The rollback path copies IFNAMSIZ bytes
 *		  from it, so the buffer must be at least IFNAMSIZ bytes.
 *
 *	Change the name of a device. The caller must hold RTNL (asserted).
 *	If the NETDEV_CHANGENAME notifiers reject the new name, the old
 *	name is restored and the notifiers are called once more.
 *
 *	Returns a non-negative value on success (0 if the name is
 *	unchanged, otherwise the result of dev_get_valid_name()) or a
 *	negative errno: -EBUSY if the device is up and does not allow
 *	live renaming, the error from name validation or
 *	device_rename(), or the error reported by the notifiers.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	unsigned char old_assign_type;
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);

	/* Some auto-enslaved devices e.g. failover slaves are
	 * special, as userspace might rename the device after
	 * the interface had been brought up and running since
	 * the point kernel initiated auto-enslavement. Allow
	 * live name change even when these slave devices are
	 * up and running.
	 *
	 * Typically, users of these auto-enslaving devices
	 * don't actually care about slave name change, as
	 * they are supposed to operate on master interface
	 * directly.
	 */
	if (dev->flags & IFF_UP &&
	    likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
		return -EBUSY;

	down_write(&devnet_rename_sem);

	/* Renaming to the current name is a no-op */
	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		up_write(&devnet_rename_sem);
		return 0;
	}

	/* Remember the current name so it can be restored on failure */
	memcpy(oldname, dev->name, IFNAMSIZ);

	/* Validates newname and, on success, writes it into dev->name */
	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		up_write(&devnet_rename_sem);
		return err;
	}

	if (oldname[0] && !strchr(oldname, '%'))
		netdev_info(dev, "renamed from %s\n", oldname);

	old_assign_type = dev->name_assign_type;
	dev->name_assign_type = NET_NAME_RENAMED;

	/* Entered a second time, with the old name back in dev->name, when
	 * the notifiers below reject the change.
	 */
rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		dev->name_assign_type = old_assign_type;
		up_write(&devnet_rename_sem);
		return ret;
	}

	up_write(&devnet_rename_sem);

	netdev_adjacent_rename_links(dev, oldname);

	/* Take the name node out and put it back in under dev_base_lock,
	 * with an RCU grace period in between.
	 */
	write_lock_bh(&dev_base_lock);
	netdev_name_node_del(dev->name_node);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	netdev_name_node_add(net, dev->name_node);
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			/* First failure: swap old and new state and redo the
			 * rename in the opposite direction. err is now
			 * negative, so a second failure ends up in the
			 * else branch instead of looping.
			 */
			err = ret;
			down_write(&devnet_rename_sem);
			memcpy(dev->name, oldname, IFNAMSIZ);
			memcpy(oldname, newname, IFNAMSIZ);
			dev->name_assign_type = old_assign_type;
			old_assign_type = NET_NAME_RENAMED;
			goto rollback;
		} else {
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}
 1378
 1379/**
 1380 *	dev_set_alias - change ifalias of a device
 1381 *	@dev: device
 1382 *	@alias: name up to IFALIASZ
 1383 *	@len: limit of bytes to copy from info
 1384 *
 1385 *	Set ifalias for a device,
 1386 */
 1387int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
 1388{
 1389	struct dev_ifalias *new_alias = NULL;
 1390
 1391	if (len >= IFALIASZ)
 1392		return -EINVAL;
 1393
 1394	if (len) {
 1395		new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
 1396		if (!new_alias)
 1397			return -ENOMEM;
 1398
 1399		memcpy(new_alias->ifalias, alias, len);
 1400		new_alias->ifalias[len] = 0;
 1401	}
 1402
 1403	mutex_lock(&ifalias_mutex);
 1404	new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
 1405					mutex_is_locked(&ifalias_mutex));
 1406	mutex_unlock(&ifalias_mutex);
 1407
 1408	if (new_alias)
 1409		kfree_rcu(new_alias, rcuhead);
 1410
 1411	return len;
 1412}
 1413EXPORT_SYMBOL(dev_set_alias);
 1414
 1415/**
 1416 *	dev_get_alias - get ifalias of a device
 1417 *	@dev: device
 1418 *	@name: buffer to store name of ifalias
 1419 *	@len: size of buffer
 1420 *
 1421 *	get ifalias for a device.  Caller must make sure dev cannot go
 1422 *	away,  e.g. rcu read lock or own a reference count to device.
 1423 */
 1424int dev_get_alias(const struct net_device *dev, char *name, size_t len)
 1425{
 1426	const struct dev_ifalias *alias;
 1427	int ret = 0;
 1428
 1429	rcu_read_lock();
 1430	alias = rcu_dereference(dev->ifalias);
 1431	if (alias)
 1432		ret = snprintf(name, len, "%s", alias->ifalias);
 1433	rcu_read_unlock();
 1434
 1435	return ret;
 1436}
 1437
 1438/**
 1439 *	netdev_features_change - device changes features
 1440 *	@dev: device to cause notification
 1441 *
 1442 *	Called to indicate a device has changed features.
 1443 */
 1444void netdev_features_change(struct net_device *dev)
 1445{
 1446	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
 1447}
 1448EXPORT_SYMBOL(netdev_features_change);
 1449
 1450/**
 1451 *	netdev_state_change - device changes state
 1452 *	@dev: device to cause notification
 1453 *
 1454 *	Called to indicate a device has changed state. This function calls
 1455 *	the notifier chains for netdev_chain and sends a NEWLINK message
 1456 *	to the routing socket.
 1457 */
 1458void netdev_state_change(struct net_device *dev)
 1459{
 1460	if (dev->flags & IFF_UP) {
 1461		struct netdev_notifier_change_info change_info = {
 1462			.info.dev = dev,
 1463		};
 1464
 1465		call_netdevice_notifiers_info(NETDEV_CHANGE,
 1466					      &change_info.info);
 1467		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
 1468	}
 1469}
 1470EXPORT_SYMBOL(netdev_state_change);
 1471
 1472/**
 1473 * netdev_notify_peers - notify network peers about existence of @dev
 1474 * @dev: network device
 1475 *
 1476 * Generate traffic such that interested network peers are aware of
 1477 * @dev, such as by generating a gratuitous ARP. This may be used when
 1478 * a device wants to inform the rest of the network about some sort of
 1479 * reconfiguration such as a failover event or virtual machine
 1480 * migration.
 1481 */
 1482void netdev_notify_peers(struct net_device *dev)
 1483{
 1484	rtnl_lock();
 1485	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
 1486	call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
 1487	rtnl_unlock();
 1488}
 1489EXPORT_SYMBOL(netdev_notify_peers);
 1490
 1491static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
 1492{
 1493	const struct net_device_ops *ops = dev->netdev_ops;
 1494	int ret;
 1495
 1496	ASSERT_RTNL();
 1497
 1498	if (!netif_device_present(dev)) {
 1499		/* may be detached because parent is runtime-suspended */
 1500		if (dev->dev.parent)
 1501			pm_runtime_resume(dev->dev.parent);
 1502		if (!netif_device_present(dev))
 1503			return -ENODEV;
 1504	}
 1505
 1506	/* Block netpoll from trying to do any rx path servicing.
 1507	 * If we don't do this there is a chance ndo_poll_controller
 1508	 * or ndo_poll may be running while we open the device
 1509	 */
 1510	netpoll_poll_disable(dev);
 1511
 1512	ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
 1513	ret = notifier_to_errno(ret);
 1514	if (ret)
 1515		return ret;
 1516
 1517	set_bit(__LINK_STATE_START, &dev->state);
 1518
 1519	if (ops->ndo_validate_addr)
 1520		ret = ops->ndo_validate_addr(dev);
 1521
 1522	if (!ret && ops->ndo_open)
 1523		ret = ops->ndo_open(dev);
 1524
 1525	netpoll_poll_enable(dev);
 1526
 1527	if (ret)
 1528		clear_bit(__LINK_STATE_START, &dev->state);
 1529	else {
 1530		dev->flags |= IFF_UP;
 1531		dev_set_rx_mode(dev);
 1532		dev_activate(dev);
 1533		add_device_randomness(dev->dev_addr, dev->addr_len);
 1534	}
 1535
 1536	return ret;
 1537}
 1538
 1539/**
 1540 *	dev_open	- prepare an interface for use.
 1541 *	@dev: device to open
 1542 *	@extack: netlink extended ack
 1543 *
 1544 *	Takes a device from down to up state. The device's private open
 1545 *	function is invoked and then the multicast lists are loaded. Finally
 1546 *	the device is moved into the up state and a %NETDEV_UP message is
 1547 *	sent to the netdev notifier chain.
 1548 *
 1549 *	Calling this function on an active interface is a nop. On a failure
 1550 *	a negative errno code is returned.
 1551 */
 1552int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
 1553{
 1554	int ret;
 1555
 1556	if (dev->flags & IFF_UP)
 1557		return 0;
 1558
 1559	ret = __dev_open(dev, extack);
 1560	if (ret < 0)
 1561		return ret;
 1562
 1563	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
 1564	call_netdevice_notifiers(NETDEV_UP, dev);
 1565
 1566	return ret;
 1567}
 1568EXPORT_SYMBOL(dev_open);
 1569
/*
 * Take every device linked on @head (through close_list) down: send
 * NETDEV_GOING_DOWN and clear __LINK_STATE_START for each device,
 * deactivate them all in one batch, then call each driver's ndo_stop
 * (when present) and clear IFF_UP. NETDEV_DOWN and RTM_NEWLINK are not
 * sent from here. The caller must hold RTNL (asserted); may sleep.
 */
static void __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	ASSERT_RTNL();
	might_sleep();

	list_for_each_entry(dev, head, close_list) {
		/* Temporarily disable netpoll until the interface is down */
		netpoll_poll_disable(dev);

		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * can be even on different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of its
		 * napi_struct instances on this device.
		 */
		smp_mb__after_atomic(); /* Commit netif_running(). */
	}

	dev_deactivate_many(head);

	list_for_each_entry(dev, head, close_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 *	Call the device specific close. This cannot fail.
		 *	Only if device is UP
		 *
		 *	We allow it to be called even after a DETACH hot-plug
		 *	event.
		 */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		dev->flags &= ~IFF_UP;
		/* Pairs with the netpoll_poll_disable() in the first loop */
		netpoll_poll_enable(dev);
	}
}
 1613
 1614static void __dev_close(struct net_device *dev)
 1615{
 1616	LIST_HEAD(single);
 1617
 1618	list_add(&dev->close_list, &single);
 1619	__dev_close_many(&single);
 1620	list_del(&single);
 1621}
 1622
 1623void dev_close_many(struct list_head *head, bool unlink)
 1624{
 1625	struct net_device *dev, *tmp;
 1626
 1627	/* Remove the devices that don't need to be closed */
 1628	list_for_each_entry_safe(dev, tmp, head, close_list)
 1629		if (!(dev->flags & IFF_UP))
 1630			list_del_init(&dev->close_list);
 1631
 1632	__dev_close_many(head);
 1633
 1634	list_for_each_entry_safe(dev, tmp, head, close_list) {
 1635		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
 1636		call_netdevice_notifiers(NETDEV_DOWN, dev);
 1637		if (unlink)
 1638			list_del_init(&dev->close_list);
 1639	}
 1640}
 1641EXPORT_SYMBOL(dev_close_many);
 1642
 1643/**
 1644 *	dev_close - shutdown an interface.
 1645 *	@dev: device to shutdown
 1646 *
 1647 *	This function moves an active device into down state. A
 1648 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 1649 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 1650 *	chain.
 1651 */
 1652void dev_close(struct net_device *dev)
 1653{
 1654	if (dev->flags & IFF_UP) {
 1655		LIST_HEAD(single);
 1656
 1657		list_add(&dev->close_list, &single);
 1658		dev_close_many(&single, true);
 1659		list_del(&single);
 1660	}
 1661}
 1662EXPORT_SYMBOL(dev_close);
 1663
 1664
 1665/**
 1666 *	dev_disable_lro - disable Large Receive Offload on a device
 1667 *	@dev: device
 1668 *
 1669 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 1670 *	called under RTNL.  This is needed if received packets may be
 1671 *	forwarded to another interface.
 1672 */
 1673void dev_disable_lro(struct net_device *dev)
 1674{
 1675	struct net_device *lower_dev;
 1676	struct list_head *iter;
 1677
 1678	dev->wanted_features &= ~NETIF_F_LRO;
 1679	netdev_update_features(dev);
 1680
 1681	if (unlikely(dev->features & NETIF_F_LRO))
 1682		netdev_WARN(dev, "failed to disable LRO!\n");
 1683
 1684	netdev_for_each_lower_dev(dev, lower_dev, iter)
 1685		dev_disable_lro(lower_dev);
 1686}
 1687EXPORT_SYMBOL(dev_disable_lro);
 1688
 1689/**
 1690 *	dev_disable_gro_hw - disable HW Generic Receive Offload on a device
 1691 *	@dev: device
 1692 *
 1693 *	Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
 1694 *	called under RTNL.  This is needed if Generic XDP is installed on
 1695 *	the device.
 1696 */
 1697static void dev_disable_gro_hw(struct net_device *dev)
 1698{
 1699	dev->wanted_features &= ~NETIF_F_GRO_HW;
 1700	netdev_update_features(dev);
 1701
 1702	if (unlikely(dev->features & NETIF_F_GRO_HW))
 1703		netdev_WARN(dev, "failed to disable GRO_HW!\n");
 1704}
 1705
 1706const char *netdev_cmd_to_name(enum netdev_cmd cmd)
 1707{
 1708#define N(val) 						\
 1709	case NETDEV_##val:				\
 1710		return "NETDEV_" __stringify(val);
 1711	switch (cmd) {
 1712	N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
 1713	N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
 1714	N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
 1715	N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
 1716	N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
 1717	N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
 1718	N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
 1719	N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
 1720	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
 1721	N(PRE_CHANGEADDR)
 1722	}
 1723#undef N
 1724	return "UNKNOWN_NETDEV_EVENT";
 1725}
 1726EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
 1727
 1728static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
 1729				   struct net_device *dev)
 1730{
 1731	struct netdev_notifier_info info = {
 1732		.dev = dev,
 1733	};
 1734
 1735	return nb->notifier_call(nb, val, &info);
 1736}
 1737
 1738static int call_netdevice_register_notifiers(struct notifier_block *nb,
 1739					     struct net_device *dev)
 1740{
 1741	int err;
 1742
 1743	err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
 1744	err = notifier_to_errno(err);
 1745	if (err)
 1746		return err;
 1747
 1748	if (!(dev->flags & IFF_UP))
 1749		return 0;
 1750
 1751	call_netdevice_notifier(nb, NETDEV_UP, dev);
 1752	return 0;
 1753}
 1754
 1755static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
 1756						struct net_device *dev)
 1757{
 1758	if (dev->flags & IFF_UP) {
 1759		call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
 1760					dev);
 1761		call_netdevice_notifier(nb, NETDEV_DOWN, dev);
 1762	}
 1763	call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
 1764}
 1765
 1766static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
 1767						 struct net *net)
 1768{
 1769	struct net_device *dev;
 1770	int err;
 1771
 1772	for_each_netdev(net, dev) {
 1773		err = call_netdevice_register_notifiers(nb, dev);
 1774		if (err)
 1775			goto rollback;
 1776	}
 1777	return 0;
 1778
 1779rollback:
 1780	for_each_netdev_continue_reverse(net, dev)
 1781		call_netdevice_unregister_notifiers(nb, dev);
 1782	return err;
 1783}
 1784
 1785static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
 1786						    struct net *net)
 1787{
 1788	struct net_device *dev;
 1789
 1790	for_each_netdev(net, dev)
 1791		call_netdevice_unregister_notifiers(nb, dev);
 1792}
 1793
/* While non-zero, the notifier registration functions in this file only
 * link the notifier into its chain and skip replaying REGISTER/UP events
 * for existing devices. Starts out as 1; the place where it is cleared is
 * not in this part of the file.
 */
static int dev_boot_phase = 1;
 1795
 1796/**
 1797 * register_netdevice_notifier - register a network notifier block
 1798 * @nb: notifier
 1799 *
 1800 * Register a notifier to be called when network device events occur.
 1801 * The notifier passed is linked into the kernel structures and must
 1802 * not be reused until it has been unregistered. A negative errno code
 1803 * is returned on a failure.
 1804 *
 1805 * When registered all registration and up events are replayed
 1806 * to the new notifier to allow device to have a race free
 1807 * view of the network device list.
 1808 */
 1809
 1810int register_netdevice_notifier(struct notifier_block *nb)
 1811{
 1812	struct net *net;
 1813	int err;
 1814
 1815	/* Close race with setup_net() and cleanup_net() */
 1816	down_write(&pernet_ops_rwsem);
 1817	rtnl_lock();
 1818	err = raw_notifier_chain_register(&netdev_chain, nb);
 1819	if (err)
 1820		goto unlock;
 1821	if (dev_boot_phase)
 1822		goto unlock;
 1823	for_each_net(net) {
 1824		err = call_netdevice_register_net_notifiers(nb, net);
 1825		if (err)
 1826			goto rollback;
 1827	}
 1828
 1829unlock:
 1830	rtnl_unlock();
 1831	up_write(&pernet_ops_rwsem);
 1832	return err;
 1833
 1834rollback:
 1835	for_each_net_continue_reverse(net)
 1836		call_netdevice_unregister_net_notifiers(nb, net);
 1837
 1838	raw_notifier_chain_unregister(&netdev_chain, nb);
 1839	goto unlock;
 1840}
 1841EXPORT_SYMBOL(register_netdevice_notifier);
 1842
 1843/**
 1844 * unregister_netdevice_notifier - unregister a network notifier block
 1845 * @nb: notifier
 1846 *
 1847 * Unregister a notifier previously registered by
 1848 * register_netdevice_notifier(). The notifier is unlinked into the
 1849 * kernel structures and may then be reused. A negative errno code
 1850 * is returned on a failure.
 1851 *
 1852 * After unregistering unregister and down device events are synthesized
 1853 * for all devices on the device list to the removed notifier to remove
 1854 * the need for special case cleanup code.
 1855 */
 1856
 1857int unregister_netdevice_notifier(struct notifier_block *nb)
 1858{
 1859	struct net *net;
 1860	int err;
 1861
 1862	/* Close race with setup_net() and cleanup_net() */
 1863	down_write(&pernet_ops_rwsem);
 1864	rtnl_lock();
 1865	err = raw_notifier_chain_unregister(&netdev_chain, nb);
 1866	if (err)
 1867		goto unlock;
 1868
 1869	for_each_net(net)
 1870		call_netdevice_unregister_net_notifiers(nb, net);
 1871
 1872unlock:
 1873	rtnl_unlock();
 1874	up_write(&pernet_ops_rwsem);
 1875	return err;
 1876}
 1877EXPORT_SYMBOL(unregister_netdevice_notifier);
 1878
 1879static int __register_netdevice_notifier_net(struct net *net,
 1880					     struct notifier_block *nb,
 1881					     bool ignore_call_fail)
 1882{
 1883	int err;
 1884
 1885	err = raw_notifier_chain_register(&net->netdev_chain, nb);
 1886	if (err)
 1887		return err;
 1888	if (dev_boot_phase)
 1889		return 0;
 1890
 1891	err = call_netdevice_register_net_notifiers(nb, net);
 1892	if (err && !ignore_call_fail)
 1893		goto chain_unregister;
 1894
 1895	return 0;
 1896
 1897chain_unregister:
 1898	raw_notifier_chain_unregister(&net->netdev_chain, nb);
 1899	return err;
 1900}
 1901
 1902static int __unregister_netdevice_notifier_net(struct net *net,
 1903					       struct notifier_block *nb)
 1904{
 1905	int err;
 1906
 1907	err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
 1908	if (err)
 1909		return err;
 1910
 1911	call_netdevice_unregister_net_notifiers(nb, net);
 1912	return 0;
 1913}
 1914
 1915/**
 1916 * register_netdevice_notifier_net - register a per-netns network notifier block
 1917 * @net: network namespace
 1918 * @nb: notifier
 1919 *
 1920 * Register a notifier to be called when network device events occur.
 1921 * The notifier passed is linked into the kernel structures and must
 1922 * not be reused until it has been unregistered. A negative errno code
 1923 * is returned on a failure.
 1924 *
 1925 * When registered all registration and up events are replayed
 1926 * to the new notifier to allow device to have a race free
 1927 * view of the network device list.
 1928 */
 1929
 1930int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
 1931{
 1932	int err;
 1933
 1934	rtnl_lock();
 1935	err = __register_netdevice_notifier_net(net, nb, false);
 1936	rtnl_unlock();
 1937	return err;
 1938}
 1939EXPORT_SYMBOL(register_netdevice_notifier_net);
 1940
 1941/**
 1942 * unregister_netdevice_notifier_net - unregister a per-netns
 1943 *                                     network notifier block
 1944 * @net: network namespace
 1945 * @nb: notifier
 1946 *
 1947 * Unregister a notifier previously registered by
 1948 * register_netdevice_notifier(). The notifier is unlinked into the
 1949 * kernel structures and may then be reused. A negative errno code
 1950 * is returned on a failure.
 1951 *
 1952 * After unregistering unregister and down device events are synthesized
 1953 * for all devices on the device list to the removed notifier to remove
 1954 * the need for special case cleanup code.
 1955 */
 1956
 1957int unregister_netdevice_notifier_net(struct net *net,
 1958				      struct notifier_block *nb)
 1959{
 1960	int err;
 1961
 1962	rtnl_lock();
 1963	err = __unregister_netdevice_notifier_net(net, nb);
 1964	rtnl_unlock();
 1965	return err;
 1966}
 1967EXPORT_SYMBOL(unregister_netdevice_notifier_net);
 1968
 1969int register_netdevice_notifier_dev_net(struct net_device *dev,
 1970					struct notifier_block *nb,
 1971					struct netdev_net_notifier *nn)
 1972{
 1973	int err;
 1974
 1975	rtnl_lock();
 1976	err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
 1977	if (!err) {
 1978		nn->nb = nb;
 1979		list_add(&nn->list, &dev->net_notifier_list);
 1980	}
 1981	rtnl_unlock();
 1982	return err;
 1983}
 1984EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
 1985
 1986int unregister_netdevice_notifier_dev_net(struct net_device *dev,
 1987					  struct notifier_block *nb,
 1988					  struct netdev_net_notifier *nn)
 1989{
 1990	int err;
 1991
 1992	rtnl_lock();
 1993	list_del(&nn->list);
 1994	err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
 1995	rtnl_unlock();
 1996	return err;
 1997}
 1998EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
 1999
 2000static void move_netdevice_notifiers_dev_net(struct net_device *dev,
 2001					     struct net *net)
 2002{
 2003	struct netdev_net_notifier *nn;
 2004
 2005	list_for_each_entry(nn, &dev->net_notifier_list, list) {
 2006		__unregister_netdevice_notifier_net(dev_net(dev), nn->nb);
 2007		__register_netdevice_notifier_net(net, nn->nb, true);
 2008	}
 2009}
 2010
 2011/**
 2012 *	call_netdevice_notifiers_info - call all network notifier blocks
 2013 *	@val: value passed unmodified to notifier function
 2014 *	@info: notifier information data
 2015 *
 2016 *	Call all network notifier blocks.  Parameters and return value
 2017 *	are as for raw_notifier_call_chain().
 2018 */
 2019
 2020static int call_netdevice_notifiers_info(unsigned long val,
 2021					 struct netdev_notifier_info *info)
 2022{
 2023	struct net *net = dev_net(info->dev);
 2024	int ret;
 2025
 2026	ASSERT_RTNL();
 2027
 2028	/* Run per-netns notifier block chain first, then run the global one.
 2029	 * Hopefully, one day, the global one is going to be removed after
 2030	 * all notifier block registrators get converted to be per-netns.
 2031	 */
 2032	ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
 2033	if (ret & NOTIFY_STOP_MASK)
 2034		return ret;
 2035	return raw_notifier_call_chain(&netdev_chain, val, info);
 2036}
 2037
 2038static int call_netdevice_notifiers_extack(unsigned long val,
 2039					   struct net_device *dev,
 2040					   struct netlink_ext_ack *extack)
 2041{
 2042	struct netdev_notifier_info info = {
 2043		.dev = dev,
 2044		.extack = extack,
 2045	};
 2046
 2047	return call_netdevice_notifiers_info(val, &info);
 2048}
 2049
 2050/**
 2051 *	call_netdevice_notifiers - call all network notifier blocks
 2052 *      @val: value passed unmodified to notifier function
 2053 *      @dev: net_device pointer passed unmodified to notifier function
 2054 *
 2055 *	Call all network notifier blocks.  Parameters and return value
 2056 *	are as for raw_notifier_call_chain().
 2057 */
 2058
 2059int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
 2060{
 2061	return call_netdevice_notifiers_extack(val, dev, NULL);
 2062}
 2063EXPORT_SYMBOL(call_netdevice_notifiers);
 2064
 2065/**
 2066 *	call_netdevice_notifiers_mtu - call all network notifier blocks
 2067 *	@val: value passed unmodified to notifier function
 2068 *	@dev: net_device pointer passed unmodified to notifier function
 2069 *	@arg: additional u32 argument passed to the notifier function
 2070 *
 2071 *	Call all network notifier blocks.  Parameters and return value
 2072 *	are as for raw_notifier_call_chain().
 2073 */
 2074static int call_netdevice_notifiers_mtu(unsigned long val,
 2075					struct net_device *dev, u32 arg)
 2076{
 2077	struct netdev_notifier_info_ext info = {
 2078		.info.dev = dev,
 2079		.ext.mtu = arg,
 2080	};
 2081
 2082	BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
 2083
 2084	return call_netdevice_notifiers_info(val, &info.info);
 2085}
 2086
#ifdef CONFIG_NET_INGRESS
/* Static key, initially false, adjusted by the two helpers below. */
static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);

/* Increment ingress_needed_key via static_branch_inc(). */
void net_inc_ingress_queue(void)
{
	static_branch_inc(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

/* Decrement ingress_needed_key via static_branch_dec(). */
void net_dec_ingress_queue(void)
{
	static_branch_dec(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif
 2102
#ifdef CONFIG_NET_EGRESS
/* Static key, initially false, adjusted by the two helpers below. */
static DEFINE_STATIC_KEY_FALSE(egress_needed_key);

/* Increment egress_needed_key via static_branch_inc(). */
void net_inc_egress_queue(void)
{
	static_branch_inc(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

/* Decrement egress_needed_key via static_branch_dec(). */
void net_dec_egress_queue(void)
{
	static_branch_dec(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
 2118
 2119static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
 2120#ifdef CONFIG_JUMP_LABEL
 2121static atomic_t netstamp_needed_deferred;
 2122static atomic_t netstamp_wanted;
 2123static void netstamp_clear(struct work_struct *work)
 2124{
 2125	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
 2126	int wanted;
 2127
 2128	wanted = atomic_add_return(deferred, &netstamp_wanted);
 2129	if (wanted > 0)
 2130		static_branch_enable(&netstamp_needed_key);
 2131	else
 2132		static_branch_disable(&netstamp_needed_key);
 2133}
 2134static DECLARE_WORK(netstamp_work, netstamp_clear);
 2135#endif
 2136
 2137void net_enable_timestamp(void)
 2138{
 2139#ifdef CONFIG_JUMP_LABEL
 2140	int wanted;
 2141
 2142	while (1) {
 2143		wanted = atomic_read(&netstamp_wanted);
 2144		if (wanted <= 0)
 2145			break;
 2146		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
 2147			return;
 2148	}
 2149	atomic_inc(&netstamp_needed_deferred);
 2150	schedule_work(&netstamp_work);
 2151#else
 2152	static_branch_inc(&netstamp_needed_key);
 2153#endif
 2154}
 2155EXPORT_SYMBOL(net_enable_timestamp);
 2156
 2157void net_disable_timestamp(void)
 2158{
 2159#ifdef CONFIG_JUMP_LABEL
 2160	int wanted;
 2161
 2162	while (1) {
 2163		wanted = atomic_read(&netstamp_wanted);
 2164		if (wanted <= 1)
 2165			break;
 2166		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
 2167			return;
 2168	}
 2169	atomic_dec(&netstamp_needed_deferred);
 2170	schedule_work(&netstamp_work);
 2171#else
 2172	static_branch_dec(&netstamp_needed_key);
 2173#endif
 2174}
 2175EXPORT_SYMBOL(net_disable_timestamp);
 2176
/* Clear skb->tstamp, then stamp the skb through __net_timestamp() only
 * when netstamp_needed_key is enabled.
 */
static inline void net_timestamp_set(struct sk_buff *skb)
{
	skb->tstamp = 0;
	if (static_branch_unlikely(&netstamp_needed_key))
		__net_timestamp(skb);
}
 2183
 2184#define net_timestamp_check(COND, SKB)				\
 2185	if (static_branch_unlikely(&netstamp_needed_key)) {	\
 2186		if ((COND) && !(SKB)->tstamp)			\
 2187			__net_timestamp(SKB);			\
 2188	}							\
 2189
 2190bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
 2191{
 2192	unsigned int len;
 2193
 2194	if (!(dev->flags & IFF_UP))
 2195		return false;
 2196
 2197	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
 2198	if (skb->len <= len)
 2199		return true;
 2200
 2201	/* if TSO is enabled, we don't care about the length as the packet
 2202	 * could be forwarded without being segmented before
 2203	 */
 2204	if (skb_is_gso(skb))
 2205		return true;
 2206
 2207	return false;
 2208}
 2209EXPORT_SYMBOL_GPL(is_skb_forwardable);
 2210
 2211int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 2212{
 2213	int ret = ____dev_forward_skb(dev, skb);
 2214
 2215	if (likely(!ret)) {
 2216		skb->protocol = eth_type_trans(skb, dev);
 2217		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 2218	}
 2219
 2220	return ret;
 2221}
 2222EXPORT_SYMBOL_GPL(__dev_forward_skb);
 2223
 2224/**
 2225 * dev_forward_skb - loopback an skb to another netif
 2226 *
 2227 * @dev: destination network device
 2228 * @skb: buffer to forward
 2229 *
 2230 * return values:
 2231 *	NET_RX_SUCCESS	(no congestion)
 2232 *	NET_RX_DROP     (packet was dropped, but freed)
 2233 *
 2234 * dev_forward_skb can be used for injecting an skb from the
 2235 * start_xmit function of one device into the receive queue
 2236 * of another device.
 2237 *
 2238 * The receiving device may be in another namespace, so
 2239 * we have to clear all information in the skb that could
 2240 * impact namespace isolation.
 2241 */
 2242int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 2243{
 2244	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
 2245}
 2246EXPORT_SYMBOL_GPL(dev_forward_skb);
 2247
 2248static inline int deliver_skb(struct sk_buff *skb,
 2249			      struct packet_type *pt_prev,
 2250			      struct net_device *orig_dev)
 2251{
 2252	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
 2253		return -ENOMEM;
 2254	refcount_inc(&skb->users);
 2255	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 2256}
 2257
 2258static inline void deliver_ptype_list_skb(struct sk_buff *skb,
 2259					  struct packet_type **pt,
 2260					  struct net_device *orig_dev,
 2261					  __be16 type,
 2262					  struct list_head *ptype_list)
 2263{
 2264	struct packet_type *ptype, *pt_prev = *pt;
 2265
 2266	list_for_each_entry_rcu(ptype, ptype_list, list) {
 2267		if (ptype->type != type)
 2268			continue;
 2269		if (pt_prev)
 2270			deliver_skb(skb, pt_prev, orig_dev);
 2271		pt_prev = ptype;
 2272	}
 2273	*pt = pt_prev;
 2274}
 2275
 2276static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
 2277{
 2278	if (!ptype->af_packet_priv || !skb->sk)
 2279		return false;
 2280
 2281	if (ptype->id_match)
 2282		return ptype->id_match(ptype, skb->sk);
 2283	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
 2284		return true;
 2285
 2286	return false;
 2287}
 2288
 2289/**
 2290 * dev_nit_active - return true if any network interface taps are in use
 2291 *
 2292 * @dev: network device to check for the presence of taps
 2293 */
 2294bool dev_nit_active(struct net_device *dev)
 2295{
 2296	return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
 2297}
 2298EXPORT_SYMBOL_GPL(dev_nit_active);
 2299
/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 *
 *	The global ptype_all list is walked first, then dev->ptype_all.
 *	The skb is cloned at most once, when the first eligible tap is
 *	found, and every tap is handed that clone.
 */

void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;
	struct sk_buff *skb2 = NULL;
	struct packet_type *pt_prev = NULL;
	struct list_head *ptype_list = &ptype_all;

	rcu_read_lock();
again:
	list_for_each_entry_rcu(ptype, ptype_list, list) {
		if (ptype->ignore_outgoing)
			continue;

		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if (skb_loop_sk(ptype, skb))
			continue;

		/* A clone already exists: deliver it to the previously
		 * found tap and remember the current one for later.
		 */
		if (pt_prev) {
			deliver_skb(skb2, pt_prev, skb->dev);
			pt_prev = ptype;
			continue;
		}

		/* need to clone skb, done only once */
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (!skb2)
			goto out_unlock;

		net_timestamp_set(skb2);

		/* skb->nh should be correctly
		 * set by sender, so that the second statement is
		 * just protection against buggy protocols.
		 */
		skb_reset_mac_header(skb2);

		if (skb_network_header(skb2) < skb2->data ||
		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
					     ntohs(skb2->protocol),
					     dev->name);
			skb_reset_network_header(skb2);
		}

		skb2->transport_header = skb2->network_header;
		skb2->pkt_type = PACKET_OUTGOING;
		pt_prev = ptype;
	}

	/* Second pass: the taps bound to this device. */
	if (ptype_list == &ptype_all) {
		ptype_list = &dev->ptype_all;
		goto again;
	}
out_unlock:
	/* The last tap gets the clone itself (no extra reference taken);
	 * the clone is freed here if its frags cannot be orphaned.
	 */
	if (pt_prev) {
		if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
			pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
		else
			kfree_skb(skb2);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
 2370
 2371/**
 2372 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 2373 * @dev: Network device
 2374 * @txq: number of queues available
 2375 *
 2376 * If real_num_tx_queues is changed the tc mappings may no longer be
 2377 * valid. To resolve this verify the tc mapping remains valid and if
 2378 * not NULL the mapping. With no priorities mapping to this
 2379 * offset/count pair it will no longer be used. In the worst case TC0
 2380 * is invalid nothing can be done so disable priority mappings. If is
 2381 * expected that drivers will fix this mapping if they can before
 2382 * calling netif_set_real_num_tx_queues.
 2383 */
 2384static void netif_setup_tc(struct net_device *dev, unsigned int txq)
 2385{
 2386	int i;
 2387	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
 2388
 2389	/* If TC0 is invalidated disable TC mapping */
 2390	if (tc->offset + tc->count > txq) {
 2391		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
 2392		dev->num_tc = 0;
 2393		return;
 2394	}
 2395
 2396	/* Invalidated prio to tc mappings set to TC0 */
 2397	for (i = 1; i < TC_BITMASK + 1; i++) {
 2398		int q = netdev_get_prio_tc_map(dev, i);
 2399
 2400		tc = &dev->tc_to_txq[q];
 2401		if (tc->offset + tc->count > txq) {
 2402			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
 2403				i, q);
 2404			netdev_set_prio_tc_map(dev, i, 0);
 2405		}
 2406	}
 2407}
 2408
 2409int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
 2410{
 2411	if (dev->num_tc) {
 2412		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
 2413		int i;
 2414
 2415		/* walk through the TCs and see if it falls into any of them */
 2416		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
 2417			if ((txq - tc->offset) < tc->count)
 2418				return i;
 2419		}
 2420
 2421		/* didn't find it, just return -1 to indicate no match */
 2422		return -1;
 2423	}
 2424
 2425	return 0;
 2426}
 2427EXPORT_SYMBOL(netdev_txq_to_tc);
 2428
 2429#ifdef CONFIG_XPS
/* Static keys incremented in __netif_set_xps_queue() when a device gets
 * its first map of a kind and decremented in reset_xps_maps().
 */
struct static_key xps_needed __read_mostly;
EXPORT_SYMBOL(xps_needed);
struct static_key xps_rxqs_needed __read_mostly;
EXPORT_SYMBOL(xps_rxqs_needed);
/* Held around the XPS map updates done in this file. */
static DEFINE_MUTEX(xps_map_mutex);
/* Dereference an XPS map pointer; lockdep checks xps_map_mutex is held. */
#define xmap_dereference(P)		\
	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
 2437
 2438static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
 2439			     int tci, u16 index)
 2440{
 2441	struct xps_map *map = NULL;
 2442	int pos;
 2443
 2444	if (dev_maps)
 2445		map = xmap_dereference(dev_maps->attr_map[tci]);
 2446	if (!map)
 2447		return false;
 2448
 2449	for (pos = map->len; pos--;) {
 2450		if (map->queues[pos] != index)
 2451			continue;
 2452
 2453		if (map->len > 1) {
 2454			map->queues[pos] = map->queues[--map->len];
 2455			break;
 2456		}
 2457
 2458		RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
 2459		kfree_rcu(map, rcu);
 2460		return false;
 2461	}
 2462
 2463	return true;
 2464}
 2465
/* Remove Tx queues [@offset, @offset + @count) from every traffic-class
 * map belonging to @cpu. Returns true when, for at least one traffic
 * class, remove_xps_queue() returned true for every queue in the range.
 *
 * NOTE(review): the outer loop counts num_tc down to zero. dev->num_tc
 * is set negative by netdev_set_sb_channel() and this path is reached
 * for such a device via netdev_unbind_sb_channel(); a negative value
 * would not terminate the loop at the expected count - confirm that a
 * subordinate device cannot reach this with maps installed.
 */
static bool remove_xps_queue_cpu(struct net_device *dev,
				 struct xps_dev_maps *dev_maps,
				 int cpu, u16 offset, u16 count)
{
	int num_tc = dev->num_tc ? : 1;
	bool active = false;
	int tci;

	for (tci = cpu * num_tc; num_tc--; tci++) {
		int i, j;

		/* Stop at the first queue for which no map is left at tci. */
		for (i = count, j = offset; i--; j++) {
			if (!remove_xps_queue(dev_maps, tci, j))
				break;
		}

		/* i is negative only when the inner loop ran to completion. */
		active |= i < 0;
	}

	return active;
}
 2487
 2488static void reset_xps_maps(struct net_device *dev,
 2489			   struct xps_dev_maps *dev_maps,
 2490			   bool is_rxqs_map)
 2491{
 2492	if (is_rxqs_map) {
 2493		static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
 2494		RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
 2495	} else {
 2496		RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
 2497	}
 2498	static_key_slow_dec_cpuslocked(&xps_needed);
 2499	kfree_rcu(dev_maps, rcu);
 2500}
 2501
 2502static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
 2503			   struct xps_dev_maps *dev_maps, unsigned int nr_ids,
 2504			   u16 offset, u16 count, bool is_rxqs_map)
 2505{
 2506	bool active = false;
 2507	int i, j;
 2508
 2509	for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
 2510	     j < nr_ids;)
 2511		active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
 2512					       count);
 2513	if (!active)
 2514		reset_xps_maps(dev, dev_maps, is_rxqs_map);
 2515
 2516	if (!is_rxqs_map) {
 2517		for (i = offset + (count - 1); count--; i--) {
 2518			netdev_queue_numa_node_write(
 2519				netdev_get_tx_queue(dev, i),
 2520				NUMA_NO_NODE);
 2521		}
 2522	}
 2523}
 2524
 2525static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
 2526				   u16 count)
 2527{
 2528	const unsigned long *possible_mask = NULL;
 2529	struct xps_dev_maps *dev_maps;
 2530	unsigned int nr_ids;
 2531
 2532	if (!static_key_false(&xps_needed))
 2533		return;
 2534
 2535	cpus_read_lock();
 2536	mutex_lock(&xps_map_mutex);
 2537
 2538	if (static_key_false(&xps_rxqs_needed)) {
 2539		dev_maps = xmap_dereference(dev->xps_rxqs_map);
 2540		if (dev_maps) {
 2541			nr_ids = dev->num_rx_queues;
 2542			clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
 2543				       offset, count, true);
 2544		}
 2545	}
 2546
 2547	dev_maps = xmap_dereference(dev->xps_cpus_map);
 2548	if (!dev_maps)
 2549		goto out_no_maps;
 2550
 2551	if (num_possible_cpus() > 1)
 2552		possible_mask = cpumask_bits(cpu_possible_mask);
 2553	nr_ids = nr_cpu_ids;
 2554	clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
 2555		       false);
 2556
 2557out_no_maps:
 2558	mutex_unlock(&xps_map_mutex);
 2559	cpus_read_unlock();
 2560}
 2561
 2562static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
 2563{
 2564	netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
 2565}
 2566
 2567static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
 2568				      u16 index, bool is_rxqs_map)
 2569{
 2570	struct xps_map *new_map;
 2571	int alloc_len = XPS_MIN_MAP_ALLOC;
 2572	int i, pos;
 2573
 2574	for (pos = 0; map && pos < map->len; pos++) {
 2575		if (map->queues[pos] != index)
 2576			continue;
 2577		return map;
 2578	}
 2579
 2580	/* Need to add tx-queue to this CPU's/rx-queue's existing map */
 2581	if (map) {
 2582		if (pos < map->alloc_len)
 2583			return map;
 2584
 2585		alloc_len = map->alloc_len * 2;
 2586	}
 2587
 2588	/* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
 2589	 *  map
 2590	 */
 2591	if (is_rxqs_map)
 2592		new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
 2593	else
 2594		new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
 2595				       cpu_to_node(attr_index));
 2596	if (!new_map)
 2597		return NULL;
 2598
 2599	for (i = 0; i < pos; i++)
 2600		new_map->queues[i] = map->queues[i];
 2601	new_map->alloc_len = alloc_len;
 2602	new_map->len = pos;
 2603
 2604	return new_map;
 2605}
 2606
/* Install the XPS mapping of Tx queue @index for the CPUs (or, when
 * @is_rxqs_map is true, the Rx queues) set in @mask.
 *
 * A new xps_dev_maps is built from the current one, published with
 * rcu_assign_pointer(), and the old maps are freed after a grace period.
 * @index is then removed from the ids that are not in @mask (or not
 * online).
 *
 * Returns 0 on success, -EINVAL when @dev is a subordinate device
 * (negative num_tc) or @index maps to no traffic class, and -ENOMEM when
 * an allocation fails.
 *
 * Must be called under cpus_read_lock
 */
int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
			  u16 index, bool is_rxqs_map)
{
	const unsigned long *online_mask = NULL, *possible_mask = NULL;
	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
	/* numa_node_id: -2 = no CPU seen yet, -1 = CPUs on different nodes */
	int i, j, tci, numa_node_id = -2;
	int maps_sz, num_tc = 1, tc = 0;
	struct xps_map *map, *new_map;
	bool active = false;
	unsigned int nr_ids;

	if (dev->num_tc) {
		/* Do not allow XPS on subordinate device directly */
		num_tc = dev->num_tc;
		if (num_tc < 0)
			return -EINVAL;

		/* If queue belongs to subordinate dev use its map */
		dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;

		tc = netdev_txq_to_tc(dev, index);
		if (tc < 0)
			return -EINVAL;
	}

	mutex_lock(&xps_map_mutex);
	if (is_rxqs_map) {
		maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
		dev_maps = xmap_dereference(dev->xps_rxqs_map);
		nr_ids = dev->num_rx_queues;
	} else {
		maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
		if (num_possible_cpus() > 1) {
			online_mask = cpumask_bits(cpu_online_mask);
			possible_mask = cpumask_bits(cpu_possible_mask);
		}
		dev_maps = xmap_dereference(dev->xps_cpus_map);
		nr_ids = nr_cpu_ids;
	}

	/* Allocate at least L1_CACHE_BYTES for the device maps. */
	if (maps_sz < L1_CACHE_BYTES)
		maps_sz = L1_CACHE_BYTES;

	/* allocate memory for queue storage */
	for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
	     j < nr_ids;) {
		/* new_dev_maps is allocated lazily, on the first selected id. */
		if (!new_dev_maps)
			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
		if (!new_dev_maps) {
			mutex_unlock(&xps_map_mutex);
			return -ENOMEM;
		}

		tci = j * num_tc + tc;
		map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
				 NULL;

		map = expand_xps_map(map, j, index, is_rxqs_map);
		if (!map)
			goto error;

		RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
	}

	/* No id selected: nothing to add, only removals may be needed. */
	if (!new_dev_maps)
		goto out_no_new_maps;

	if (!dev_maps) {
		/* Increment static keys at most once per type */
		static_key_slow_inc_cpuslocked(&xps_needed);
		if (is_rxqs_map)
			static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
	}

	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
	     j < nr_ids;) {
		/* copy maps belonging to foreign traffic classes */
		for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->attr_map[tci]);
			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
		}

		/* We need to explicitly update tci as previous loop
		 * could break out early if dev_maps is NULL.
		 */
		tci = j * num_tc + tc;

		if (netif_attr_test_mask(j, mask, nr_ids) &&
		    netif_attr_test_online(j, online_mask, nr_ids)) {
			/* add tx-queue to CPU/rx-queue maps */
			int pos = 0;

			map = xmap_dereference(new_dev_maps->attr_map[tci]);
			while ((pos < map->len) && (map->queues[pos] != index))
				pos++;

			/* Append index only when it is not already present. */
			if (pos == map->len)
				map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
			if (!is_rxqs_map) {
				if (numa_node_id == -2)
					numa_node_id = cpu_to_node(j);
				else if (numa_node_id != cpu_to_node(j))
					numa_node_id = -1;
			}
#endif
		} else if (dev_maps) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->attr_map[tci]);
			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
		}

		/* copy maps belonging to foreign traffic classes */
		for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->attr_map[tci]);
			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
		}
	}

	/* Publish the new maps. */
	if (is_rxqs_map)
		rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
	else
		rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);

	/* Cleanup old maps */
	if (!dev_maps)
		goto out_no_old_maps;

	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
	     j < nr_ids;) {
		for (i = num_tc, tci = j * num_tc; i--; tci++) {
			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
			map = xmap_dereference(dev_maps->attr_map[tci]);
			/* Free only the maps that were replaced. */
			if (map && map != new_map)
				kfree_rcu(map, rcu);
		}
	}

	kfree_rcu(dev_maps, rcu);

out_no_old_maps:
	dev_maps = new_dev_maps;
	active = true;

out_no_new_maps:
	if (!is_rxqs_map) {
		/* update Tx queue numa node */
		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
					     (numa_node_id >= 0) ?
					     numa_node_id : NUMA_NO_NODE);
	}

	if (!dev_maps)
		goto out_no_maps;

	/* removes tx-queue from unused CPUs/rx-queues */
	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
	     j < nr_ids;) {
		for (i = tc, tci = j * num_tc; i--; tci++)
			active |= remove_xps_queue(dev_maps, tci, index);
		if (!netif_attr_test_mask(j, mask, nr_ids) ||
		    !netif_attr_test_online(j, online_mask, nr_ids))
			active |= remove_xps_queue(dev_maps, tci, index);
		for (i = num_tc - tc, tci++; --i; tci++)
			active |= remove_xps_queue(dev_maps, tci, index);
	}

	/* free map if not active */
	if (!active)
		reset_xps_maps(dev, dev_maps, is_rxqs_map);

out_no_maps:
	mutex_unlock(&xps_map_mutex);

	return 0;
error:
	/* remove any maps that we added */
	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
	     j < nr_ids;) {
		for (i = num_tc, tci = j * num_tc; i--; tci++) {
			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
			map = dev_maps ?
			      xmap_dereference(dev_maps->attr_map[tci]) :
			      NULL;
			/* Maps shared with the old dev_maps are left alone. */
			if (new_map && new_map != map)
				kfree(new_map);
		}
	}

	mutex_unlock(&xps_map_mutex);

	kfree(new_dev_maps);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
 2805
 2806int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
 2807			u16 index)
 2808{
 2809	int ret;
 2810
 2811	cpus_read_lock();
 2812	ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
 2813	cpus_read_unlock();
 2814
 2815	return ret;
 2816}
 2817EXPORT_SYMBOL(netif_set_xps_queue);
 2818
 2819#endif
 2820static void netdev_unbind_all_sb_channels(struct net_device *dev)
 2821{
 2822	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
 2823
 2824	/* Unbind any subordinate channels */
 2825	while (txq-- != &dev->_tx[0]) {
 2826		if (txq->sb_dev)
 2827			netdev_unbind_sb_channel(dev, txq->sb_dev);
 2828	}
 2829}
 2830
/* Drop the whole traffic-class configuration of @dev: reset the XPS maps
 * of all its Tx queues (when CONFIG_XPS is set), unbind every
 * subordinate channel, then zero num_tc, tc_to_txq and prio_tc_map.
 */
void netdev_reset_tc(struct net_device *dev)
{
#ifdef CONFIG_XPS
	netif_reset_xps_queues_gt(dev, 0);
#endif
	netdev_unbind_all_sb_channels(dev);

	/* Reset TC configuration of device */
	dev->num_tc = 0;
	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
}
EXPORT_SYMBOL(netdev_reset_tc);
 2844
 2845int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
 2846{
 2847	if (tc >= dev->num_tc)
 2848		return -EINVAL;
 2849
 2850#ifdef CONFIG_XPS
 2851	netif_reset_xps_queues(dev, offset, count);
 2852#endif
 2853	dev->tc_to_txq[tc].count = count;
 2854	dev->tc_to_txq[tc].offset = offset;
 2855	return 0;
 2856}
 2857EXPORT_SYMBOL(netdev_set_tc_queue);
 2858
/* Set the number of traffic classes of @dev to @num_tc. The XPS maps of
 * all Tx queues are reset (when CONFIG_XPS is set) and every subordinate
 * channel is unbound before the new value is stored.
 * Returns -EINVAL when @num_tc exceeds TC_MAX_QUEUE, 0 otherwise.
 */
int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
{
	if (num_tc > TC_MAX_QUEUE)
		return -EINVAL;

#ifdef CONFIG_XPS
	netif_reset_xps_queues_gt(dev, 0);
#endif
	netdev_unbind_all_sb_channels(dev);

	dev->num_tc = num_tc;
	return 0;
}
EXPORT_SYMBOL(netdev_set_num_tc);
 2873
 2874void netdev_unbind_sb_channel(struct net_device *dev,
 2875			      struct net_device *sb_dev)
 2876{
 2877	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
 2878
 2879#ifdef CONFIG_XPS
 2880	netif_reset_xps_queues_gt(sb_dev, 0);
 2881#endif
 2882	memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
 2883	memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
 2884
 2885	while (txq-- != &dev->_tx[0]) {
 2886		if (txq->sb_dev == sb_dev)
 2887			txq->sb_dev = NULL;
 2888	}
 2889}
 2890EXPORT_SYMBOL(netdev_unbind_sb_channel);
 2891
 2892int netdev_bind_sb_channel_queue(struct net_device *dev,
 2893				 struct net_device *sb_dev,
 2894				 u8 tc, u16 count, u16 offset)
 2895{
 2896	/* Make certain the sb_dev and dev are already configured */
 2897	if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
 2898		return -EINVAL;
 2899
 2900	/* We cannot hand out queues we don't have */
 2901	if ((offset + count) > dev->real_num_tx_queues)
 2902		return -EINVAL;
 2903
 2904	/* Record the mapping */
 2905	sb_dev->tc_to_txq[tc].count = count;
 2906	sb_dev->tc_to_txq[tc].offset = offset;
 2907
 2908	/* Provide a way for Tx queue to find the tc_to_txq map or
 2909	 * XPS map for itself.
 2910	 */
 2911	while (count--)
 2912		netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
 2913
 2914	return 0;
 2915}
 2916EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
 2917
/* Mark @dev as subordinate channel @channel by storing the negated
 * channel number in dev->num_tc.
 * Returns -ENODEV for a multiqueue device, -EINVAL when @channel exceeds
 * S16_MAX, 0 otherwise.
 */
int netdev_set_sb_channel(struct net_device *dev, u16 channel)
{
	/* Do not use a multiqueue device to represent a subordinate channel */
	if (netif_is_multiqueue(dev))
		return -ENODEV;

	/* We allow channels 1 - 32767 to be used for subordinate channels.
	 * Channel 0 is meant to be "native" mode and used only to represent
	 * the main root device. We allow writing 0 to reset the device back
	 * to normal mode after being used as a subordinate channel.
	 */
	if (channel > S16_MAX)
		return -EINVAL;

	dev->num_tc = -channel;

	return 0;
}
EXPORT_SYMBOL(netdev_set_sb_channel);
 2937
 2938/*
 2939 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 2940 * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
 2941 */
 2942int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
 2943{
 2944	bool disabling;
 2945	int rc;
 2946
 2947	disabling = txq < dev->real_num_tx_queues;
 2948
 2949	if (txq < 1 || txq > dev->num_tx_queues)
 2950		return -EINVAL;
 2951
 2952	if (dev->reg_state == NETREG_REGISTERED ||
 2953	    dev->reg_state == NETREG_UNREGISTERING) {
 2954		ASSERT_RTNL();
 2955
 2956		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
 2957						  txq);
 2958		if (rc)
 2959			return rc;
 2960
 2961		if (dev->num_tc)
 2962			netif_setup_tc(dev, txq);
 2963
 2964		dev->real_num_tx_queues = txq;
 2965
 2966		if (disabling) {
 2967			synchronize_net();
 2968			qdisc_reset_all_tx_gt(dev, txq);
 2969#ifdef CONFIG_XPS
 2970			netif_reset_xps_queues_gt(dev, txq);
 2971#endif
 2972		}
 2973	} else {
 2974		dev->real_num_tx_queues = txq;
 2975	}
 2976
 2977	return 0;
 2978}
 2979EXPORT_SYMBOL(netif_set_real_num_tx_queues);
 2980
 2981#ifdef CONFIG_SYSFS
 2982/**
 2983 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 2984 *	@dev: Network device
 2985 *	@rxq: Actual number of RX queues
 2986 *
 2987 *	This must be called either with the rtnl_lock held or before
 2988 *	registration of the net device.  Returns 0 on success, or a
 2989 *	negative error code.  If called before registration, it always
 2990 *	succeeds.
 2991 */
 2992int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
 2993{
 2994	int rc;
 2995
 2996	if (rxq < 1 || rxq > dev->num_rx_queues)
 2997		return -EINVAL;
 2998
 2999	if (dev->reg_state == NETREG_REGISTERED) {
 3000		ASSERT_RTNL();
 3001
 3002		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
 3003						  rxq);
 3004		if (rc)
 3005			return rc;
 3006	}
 3007
 3008	dev->real_num_rx_queues = rxq;
 3009	return 0;
 3010}
 3011EXPORT_SYMBOL(netif_set_real_num_rx_queues);
 3012#endif
 3013
 3014/**
 3015 * netif_get_num_default_rss_queues - default number of RSS queues
 3016 *
 3017 * This routine should set an upper limit on the number of RSS queues
 3018 * used by default by multiqueue devices.
 3019 */
 3020int netif_get_num_default_rss_queues(void)
 3021{
 3022	return is_kdump_kernel() ?
 3023		1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
 3024}
 3025EXPORT_SYMBOL(netif_get_num_default_rss_queues);
 3026
/* Append @q to the tail of this CPU's softnet_data output queue and
 * raise NET_TX_SOFTIRQ. Local interrupts are disabled around the list
 * update.
 */
static void __netif_reschedule(struct Qdisc *q)
{
	struct softnet_data *sd;
	unsigned long flags;

	local_irq_save(flags);
	sd = this_cpu_ptr(&softnet_data);
	/* q becomes the new last element of the singly linked list. */
	q->next_sched = NULL;
	*sd->output_queue_tailp = q;
	sd->output_queue_tailp = &q->next_sched;
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}
 3040
 3041void __netif_schedule(struct Qdisc *q)
 3042{
 3043	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
 3044		__netif_reschedule(q);
 3045}
 3046EXPORT_SYMBOL(__netif_schedule);
 3047
/* Layout given to skb->cb by get_kfree_skb_cb(): the reason the skb is
 * being freed.
 */
struct dev_kfree_skb_cb {
	enum skb_free_reason reason;
};

/* View the control buffer of @skb as a struct dev_kfree_skb_cb. */
static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
{
	return (struct dev_kfree_skb_cb *)skb->cb;
}
 3056
 3057void netif_schedule_queue(struct netdev_queue *txq)
 3058{
 3059	rcu_read_lock();
 3060	if (!netif_xmit_stopped(txq)) {
 3061		struct Qdisc *q = rcu_dereference(txq->qdisc);
 3062
 3063		__netif_schedule(q);
 3064	}
 3065	rcu_read_unlock();
 3066}
 3067EXPORT_SYMBOL(netif_schedule_queue);
 3068
 3069void netif_tx_wake_queue(struct netdev_queue *dev_queue)
 3070{
 3071	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
 3072		struct Qdisc *q;
 3073
 3074		rcu_read_lock();
 3075		q = rcu_dereference(dev_queue->qdisc);
 3076		__netif_schedule(q);
 3077		rcu_read_unlock();
 3078	}
 3079}
 3080EXPORT_SYMBOL(netif_tx_wake_queue);
 3081
/* Drop one reference on @skb; when it was the last one, record @reason in
 * skb->cb, push the skb onto this CPU's softnet_data.completion_queue and
 * raise NET_TX_SOFTIRQ. A NULL @skb is ignored.
 *
 * NOTE(review): presumably the NET_TX softirq handler performs the actual
 * free of queued skbs - that code is not part of this function.
 */
void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
{
	unsigned long flags;

	if (unlikely(!skb))
		return;

	/* Single reference: set the count to zero directly. Otherwise drop
	 * one reference and return unless that was the final one.
	 */
	if (likely(refcount_read(&skb->users) == 1)) {
		smp_rmb();
		refcount_set(&skb->users, 0);
	} else if (likely(!refcount_dec_and_test(&skb->users))) {
		return;
	}
	get_kfree_skb_cb(skb)->reason = reason;
	/* The per-CPU list head is updated with local interrupts disabled. */
	local_irq_save(flags);
	skb->next = __this_cpu_read(softnet_data.completion_queue);
	__this_cpu_write(softnet_data.completion_queue, skb);
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(__dev_kfree_skb_irq);
 3103
 3104void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
 3105{
 3106	if (in_irq() || irqs_disabled())
 3107		__dev_kfree_skb_irq(skb, reason);
 3108	else
 3109		dev_kfree_skb(skb);
 3110}
 3111EXPORT_SYMBOL(__dev_kfree_skb_any);
 3112
 3113
 3114/**
 3115 * netif_device_detach - mark device as removed
 3116 * @dev: network device
 3117 *
 3118 * Mark device as removed from system and therefore no longer available.
 3119 */
 3120void netif_device_detach(struct net_device *dev)
 3121{
 3122	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
 3123	    netif_running(dev)) {
 3124		netif_tx_stop_all_queues(dev);
 3125	}
 3126}
 3127EXPORT_SYMBOL(netif_device_detach);
 3128
 3129/**
 3130 * netif_device_attach - mark device as attached
 3131 * @dev: network device
 3132 *
 3133 * Mark device as attached from system and restart if needed.
 3134 */
 3135void netif_device_attach(struct net_device *dev)
 3136{
 3137	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
 3138	    netif_running(dev)) {
 3139		netif_tx_wake_all_queues(dev);
 3140		__netdev_watchdog_up(dev);
 3141	}
 3142}
 3143EXPORT_SYMBOL(netif_device_attach);
 3144
 3145/*
 3146 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
 3147 * to be used as a distribution range.
 3148 */
 3149static u16 skb_tx_hash(const struct net_device *dev,
 3150		       const struct net_device *sb_dev,
 3151		       struct sk_buff *skb)
 3152{
 3153	u32 hash;
 3154	u16 qoffset = 0;
 3155	u16 qcount = dev->real_num_tx_queues;
 3156
 3157	if (dev->num_tc) {
 3158		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
 3159
 3160		qoffset = sb_dev->tc_to_txq[tc].offset;
 3161		qcount = sb_dev->tc_to_txq[tc].count;
 3162	}
 3163
 3164	if (skb_rx_queue_recorded(skb)) {
 3165		hash = skb_get_rx_queue(skb);
 3166		if (hash >= qoffset)
 3167			hash -= qoffset;
 3168		while (unlikely(hash >= qcount))
 3169			hash -= qcount;
 3170		return hash + qoffset;
 3171	}
 3172
 3173	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
 3174}
 3175
 3176static void skb_warn_bad_offload(const struct sk_buff *skb)
 3177{
 3178	static const netdev_features_t null_features;
 3179	struct net_device *dev = skb->dev;
 3180	const char *name = "";
 3181
 3182	if (!net_ratelimit())
 3183		return;
 3184
 3185	if (dev) {
 3186		if (dev->dev.parent)
 3187			name = dev_driver_string(dev->dev.parent);
 3188		else
 3189			name = netdev_name(dev);
 3190	}
 3191	skb_dump(KERN_WARNING, skb, false);
 3192	WARN(1, "%s: caps=(%pNF, %pNF)\n",
 3193	     name, dev ? &dev->features : &null_features,
 3194	     skb->sk ? &skb->sk->sk_route_caps : &null_features);
 3195}
 3196
 3197/*
 3198 * Invalidate hardware checksum when packet is to be mangled, and
 3199 * complete checksum manually on outgoing path.
 3200 */
 3201int skb_checksum_help(struct sk_buff *skb)
 3202{
 3203	__wsum csum;
 3204	int ret = 0, offset;
 3205
 3206	if (skb->ip_summed == CHECKSUM_COMPLETE)
 3207		goto out_set_summed;
 3208
 3209	if (unlikely(skb_shinfo(skb)->gso_size)) {
 3210		skb_warn_bad_offload(skb);
 3211		return -EINVAL;
 3212	}
 3213
 3214	/* Before computing a checksum, we should make sure no frag could
 3215	 * be modified by an external entity : checksum could be wrong.
 3216	 */
 3217	if (skb_has_shared_frag(skb)) {
 3218		ret = __skb_linearize(skb);
 3219		if (ret)
 3220			goto out;
 3221	}
 3222
 3223	offset = skb_checksum_start_offset(skb);
 3224	BUG_ON(offset >= skb_headlen(skb));
 3225	csum = skb_checksum(skb, offset, skb->len - offset, 0);
 3226
 3227	offset += skb->csum_offset;
 3228	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
 3229
 3230	ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
 3231	if (ret)
 3232		goto out;
 3233
 3234	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
 3235out_set_summed:
 3236	skb->ip_summed = CHECKSUM_NONE;
 3237out:
 3238	return ret;
 3239}
 3240EXPORT_SYMBOL(skb_checksum_help);
 3241
 3242int skb_crc32c_csum_help(struct sk_buff *skb)
 3243{
 3244	__le32 crc32c_csum;
 3245	int ret = 0, offset, start;
 3246
 3247	if (skb->ip_summed != CHECKSUM_PARTIAL)
 3248		goto out;
 3249
 3250	if (unlikely(skb_is_gso(skb)))
 3251		goto out;
 3252
 3253	/* Before computing a checksum, we should make sure no frag could
 3254	 * be modified by an external entity : checksum could be wrong.
 3255	 */
 3256	if (unlikely(skb_has_shared_frag(skb))) {
 3257		ret = __skb_linearize(skb);
 3258		if (ret)
 3259			goto out;
 3260	}
 3261	start = skb_checksum_start_offset(skb);
 3262	offset = start + offsetof(struct sctphdr, checksum);
 3263	if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
 3264		ret = -EINVAL;
 3265		goto out;
 3266	}
 3267
 3268	ret = skb_ensure_writable(skb, offset + sizeof(__le32));
 3269	if (ret)
 3270		goto out;
 3271
 3272	crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
 3273						  skb->len - start, ~(__u32)0,
 3274						  crc32c_csum_stub));
 3275	*(__le32 *)(skb->data + offset) = crc32c_csum;
 3276	skb->ip_summed = CHECKSUM_NONE;
 3277	skb->csum_not_inet = 0;
 3278out:
 3279	return ret;
 3280}
 3281
 3282__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
 3283{
 3284	__be16 type = skb->protocol;
 3285
 3286	/* Tunnel gso handlers can set protocol to ethernet. */
 3287	if (type == htons(ETH_P_TEB)) {
 3288		struct ethhdr *eth;
 3289
 3290		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
 3291			return 0;
 3292
 3293		eth = (struct ethhdr *)skb->data;
 3294		type = eth->h_proto;
 3295	}
 3296
 3297	return __vlan_get_protocol(skb, type, depth);
 3298}
 3299
 3300/**
 3301 *	skb_mac_gso_segment - mac layer segmentation handler.
 3302 *	@skb: buffer to segment
 3303 *	@features: features for the output path (see dev->features)
 3304 */
 3305struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
 3306				    netdev_features_t features)
 3307{
 3308	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
 3309	struct packet_offload *ptype;
 3310	int vlan_depth = skb->mac_len;
 3311	__be16 type = skb_network_protocol(skb, &vlan_depth);
 3312
 3313	if (unlikely(!type))
 3314		return ERR_PTR(-EINVAL);
 3315
 3316	__skb_pull(skb, vlan_depth);
 3317
 3318	rcu_read_lock();
 3319	list_for_each_entry_rcu(ptype, &offload_base, list) {
 3320		if (ptype->type == type && ptype->callbacks.gso_segment) {
 3321			segs = ptype->callbacks.gso_segment(skb, features);
 3322			break;
 3323		}
 3324	}
 3325	rcu_read_unlock();
 3326
 3327	__skb_push(skb, skb->data - skb_mac_header(skb));
 3328
 3329	return segs;
 3330}
 3331EXPORT_SYMBOL(skb_mac_gso_segment);
 3332
 3333
 3334/* openvswitch calls this on rx path, so we need a different check.
 3335 */
 3336static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
 3337{
 3338	if (tx_path)
 3339		return skb->ip_summed != CHECKSUM_PARTIAL &&
 3340		       skb->ip_summed != CHECKSUM_UNNECESSARY;
 3341
 3342	return skb->ip_summed == CHECKSUM_NONE;
 3343}
 3344
 3345/**
 3346 *	__skb_gso_segment - Perform segmentation on skb.
 3347 *	@skb: buffer to segment
 3348 *	@features: features for the output path (see dev->features)
 3349 *	@tx_path: whether it is called in TX path
 3350 *
 3351 *	This function segments the given skb and returns a list of segments.
 3352 *
 3353 *	It may return NULL if the skb requires no segmentation.  This is
 3354 *	only possible when GSO is used for verifying header integrity.
 3355 *
 3356 *	Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
 3357 */
 3358struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
 3359				  netdev_features_t features, bool tx_path)
 3360{
 3361	struct sk_buff *segs;
 3362
 3363	if (unlikely(skb_needs_check(skb, tx_path))) {
 3364		int err;
 3365
 3366		/* We're going to init ->check field in TCP or UDP header */
 3367		err = skb_cow_head(skb, 0);
 3368		if (err < 0)
 3369			return ERR_PTR(err);
 3370	}
 3371
 3372	/* Only report GSO partial support if it will enable us to
 3373	 * support segmentation on this frame without needing additional
 3374	 * work.
 3375	 */
 3376	if (features & NETIF_F_GSO_PARTIAL) {
 3377		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
 3378		struct net_device *dev = skb->dev;
 3379
 3380		partial_features |= dev->features & dev->gso_partial_features;
 3381		if (!skb_gso_ok(skb, features | partial_features))
 3382			features &= ~NETIF_F_GSO_PARTIAL;
 3383	}
 3384
 3385	BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
 3386		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
 3387
 3388	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
 3389	SKB_GSO_CB(skb)->encap_level = 0;
 3390
 3391	skb_reset_mac_header(skb);
 3392	skb_reset_mac_len(skb);
 3393
 3394	segs = skb_mac_gso_segment(skb, features);
 3395
 3396	if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
 3397		skb_warn_bad_offload(skb);
 3398
 3399	return segs;
 3400}
 3401EXPORT_SYMBOL(__skb_gso_segment);
 3402
 3403/* Take action when hardware reception checksum errors are detected. */
 3404#ifdef CONFIG_BUG
 3405void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
 3406{
 3407	if (net_ratelimit()) {
 3408		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
 3409		skb_dump(KERN_ERR, skb, true);
 3410		dump_stack();
 3411	}
 3412}
 3413EXPORT_SYMBOL(netdev_rx_csum_fault);
 3414#endif
 3415
 3416/* XXX: check that highmem exists at all on the given machine. */
 3417static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 3418{
 3419#ifdef CONFIG_HIGHMEM
 3420	int i;
 3421
 3422	if (!(dev->features & NETIF_F_HIGHDMA)) {
 3423		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 3424			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 3425
 3426			if (PageHighMem(skb_frag_page(frag)))
 3427				return 1;
 3428		}
 3429	}
 3430#endif
 3431	return 0;
 3432}
 3433
 3434/* If MPLS offload request, verify we are testing hardware MPLS features
 3435 * instead of standard features for the netdev.
 3436 */
 3437#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
 3438static netdev_features_t net_mpls_features(struct sk_buff *skb,
 3439					   netdev_features_t features,
 3440					   __be16 type)
 3441{
 3442	if (eth_p_mpls(type))
 3443		features &= skb->dev->mpls_features;
 3444
 3445	return features;
 3446}
 3447#else
 3448static netdev_features_t net_mpls_features(struct sk_buff *skb,
 3449					   netdev_features_t features,
 3450					   __be16 type)
 3451{
 3452	return features;
 3453}
 3454#endif
 3455
 3456static netdev_features_t harmonize_features(struct sk_buff *skb,
 3457	netdev_features_t features)
 3458{
 3459	__be16 type;
 3460
 3461	type = skb_network_protocol(skb, NULL);
 3462	features = net_mpls_features(skb, features, type);
 3463
 3464	if (skb->ip_summed != CHECKSUM_NONE &&
 3465	    !can_checksum_protocol(features, type)) {
 3466		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
 3467	}
 3468	if (illegal_highdma(skb->dev, skb))
 3469		features &= ~NETIF_F_SG;
 3470
 3471	return features;
 3472}
 3473
/* Features-check helper that accepts everything: returns @features
 * unchanged and ignores @skb and @dev.
 */
netdev_features_t passthru_features_check(struct sk_buff *skb,
					  struct net_device *dev,
					  netdev_features_t features)
{
	return features;
}
EXPORT_SYMBOL(passthru_features_check);
 3481
/* Default features check, used by netif_skb_features() when the device
 * has no ndo_features_check callback: defers to vlan_features_check().
 * @dev is unused.
 */
static netdev_features_t dflt_features_check(struct sk_buff *skb,
					     struct net_device *dev,
					     netdev_features_t features)
{
	return vlan_features_check(skb, features);
}
 3488
 3489static netdev_features_t gso_features_check(const struct sk_buff *skb,
 3490					    struct net_device *dev,
 3491					    netdev_features_t features)
 3492{
 3493	u16 gso_segs = skb_shinfo(skb)->gso_segs;
 3494
 3495	if (gso_segs > dev->gso_max_segs)
 3496		return features & ~NETIF_F_GSO_MASK;
 3497
 3498	/* Support for GSO partial features requires software
 3499	 * intervention before we can actually process the packets
 3500	 * so we need to strip support for any partial features now
 3501	 * and we can pull them back in after we have partially
 3502	 * segmented the frame.
 3503	 */
 3504	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
 3505		features &= ~dev->gso_partial_features;
 3506
 3507	/* Make sure to clear the IPv4 ID mangling feature if the
 3508	 * IPv4 header has the potential to be fragmented.
 3509	 */
 3510	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
 3511		struct iphdr *iph = skb->encapsulation ?
 3512				    inner_ip_hdr(skb) : ip_hdr(skb);
 3513
 3514		if (!(iph->frag_off & htons(IP_DF)))
 3515			features &= ~NETIF_F_TSO_MANGLEID;
 3516	}
 3517
 3518	return features;
 3519}
 3520
 3521netdev_features_t netif_skb_features(struct sk_buff *skb)
 3522{
 3523	struct net_device *dev = skb->dev;
 3524	netdev_features_t features = dev->features;
 3525
 3526	if (skb_is_gso(skb))
 3527		features = gso_features_check(skb, dev, features);
 3528
 3529	/* If encapsulation offload request, verify we are testing
 3530	 * hardware encapsulation features instead of standard
 3531	 * features for the netdev
 3532	 */
 3533	if (skb->encapsulation)
 3534		features &= dev->hw_enc_features;
 3535
 3536	if (skb_vlan_tagged(skb))
 3537		features = netdev_intersect_features(features,
 3538						     dev->vlan_features |
 3539						     NETIF_F_HW_VLAN_CTAG_TX |
 3540						     NETIF_F_HW_VLAN_STAG_TX);
 3541
 3542	if (dev->netdev_ops->ndo_features_check)
 3543		features &= dev->netdev_ops->ndo_features_check(skb, dev,
 3544								features);
 3545	else
 3546		features &= dflt_features_check(skb, dev, features);
 3547
 3548	return harmonize_features(skb, features);
 3549}
 3550EXPORT_SYMBOL(netif_skb_features);
 3551
 3552static int xmit_one(struct sk_buff *skb, struct net_device *dev,
 3553		    struct netdev_queue *txq, bool more)
 3554{
 3555	unsigned int len;
 3556	int rc;
 3557
 3558	if (dev_nit_active(dev))
 3559		dev_queue_xmit_nit(skb, dev);
 3560
 3561	len = skb->len;
 3562	PRANDOM_ADD_NOISE(skb, dev, txq, len + jiffies);
 3563	trace_net_dev_start_xmit(skb, dev);
 3564	rc = netdev_start_xmit(skb, dev, txq, more);
 3565	trace_net_dev_xmit(skb, rc, dev, len);
 3566
 3567	return rc;
 3568}
 3569
 3570struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
 3571				    struct netdev_queue *txq, int *ret)
 3572{
 3573	struct sk_buff *skb = first;
 3574	int rc = NETDEV_TX_OK;
 3575
 3576	while (skb) {
 3577		struct sk_buff *next = skb->next;
 3578
 3579		skb_mark_not_on_list(skb);
 3580		rc = xmit_one(skb, dev, txq, next != NULL);
 3581		if (unlikely(!dev_xmit_complete(rc))) {
 3582			skb->next = next;
 3583			goto out;
 3584		}
 3585
 3586		skb = next;
 3587		if (netif_tx_queue_stopped(txq) && skb) {
 3588			rc = NETDEV_TX_BUSY;
 3589			break;
 3590		}
 3591	}
 3592
 3593out:
 3594	*ret = rc;
 3595	return skb;
 3596}
 3597
 3598static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
 3599					  netdev_features_t features)
 3600{
 3601	if (skb_vlan_tag_present(skb) &&
 3602	    !vlan_hw_offload_capable(features, skb->vlan_proto))
 3603		skb = __vlan_hwaccel_push_inside(skb);
 3604	return skb;
 3605}
 3606
 3607int skb_csum_hwoffload_help(struct sk_buff *skb,
 3608			    const netdev_features_t features)
 3609{
 3610	if (unlikely(skb->csum_not_inet))
 3611		return !!(features & NETIF_F_SCTP_CRC) ? 0 :
 3612			skb_crc32c_csum_help(skb);
 3613
 3614	return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
 3615}
 3616EXPORT_SYMBOL(skb_csum_hwoffload_help);
 3617
 3618static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
 3619{
 3620	netdev_features_t features;
 3621
 3622	features = netif_skb_features(skb);
 3623	skb = validate_xmit_vlan(skb, features);
 3624	if (unlikely(!skb))
 3625		goto out_null;
 3626
 3627	skb = sk_validate_xmit_skb(skb, dev);
 3628	if (unlikely(!skb))
 3629		goto out_null;
 3630
 3631	if (netif_needs_gso(skb, features)) {
 3632		struct sk_buff *segs;
 3633
 3634		segs = skb_gso_segment(skb, features);
 3635		if (IS_ERR(segs)) {
 3636			goto out_kfree_skb;
 3637		} else if (segs) {
 3638			consume_skb(skb);
 3639			skb = segs;
 3640		}
 3641	} else {
 3642		if (skb_needs_linearize(skb, features) &&
 3643		    __skb_linearize(skb))
 3644			goto out_kfree_skb;
 3645
 3646		/* If packet is not checksummed and device does not
 3647		 * support checksumming for this protocol, complete
 3648		 * checksumming here.
 3649		 */
 3650		if (skb->ip_summed == CHECKSUM_PARTIAL) {
 3651			if (skb->encapsulation)
 3652				skb_set_inner_transport_header(skb,
 3653							       skb_checksum_start_offset(skb));
 3654			else
 3655				skb_set_transport_header(skb,
 3656							 skb_checksum_start_offset(skb));
 3657			if (skb_csum_hwoffload_help(skb, features))
 3658				goto out_kfree_skb;
 3659		}
 3660	}
 3661
 3662	skb = validate_xmit_xfrm(skb, features, again);
 3663
 3664	return skb;
 3665
 3666out_kfree_skb:
 3667	kfree_skb(skb);
 3668out_null:
 3669	atomic_long_inc(&dev->tx_dropped);
 3670	return NULL;
 3671}
 3672
 3673struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
 3674{
 3675	struct sk_buff *next, *head = NULL, *tail;
 3676
 3677	for (; skb != NULL; skb = next) {
 3678		next = skb->next;
 3679		skb_mark_not_on_list(skb);
 3680
 3681		/* in case skb wont be segmented, point to itself */
 3682		skb->prev = skb;
 3683
 3684		skb = validate_xmit_skb(skb, dev, again);
 3685		if (!skb)
 3686			continue;
 3687
 3688		if (!head)
 3689			head = skb;
 3690		else
 3691			tail->next = skb;
 3692		/* If skb was segmented, skb->prev points to
 3693		 * the last segment. If not, it still contains skb.
 3694		 */
 3695		tail = skb->prev;
 3696	}
 3697	return head;
 3698}
 3699EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
 3700
 3701static void qdisc_pkt_len_init(struct sk_buff *skb)
 3702{
 3703	const struct skb_shared_info *shinfo = skb_shinfo(skb);
 3704
 3705	qdisc_skb_cb(skb)->pkt_len = skb->len;
 3706
 3707	/* To get more precise estimation of bytes sent on wire,
 3708	 * we add to pkt_len the headers size of all segments
 3709	 */
 3710	if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
 3711		unsigned int hdr_len;
 3712		u16 gso_segs = shinfo->gso_segs;
 3713
 3714		/* mac layer + network layer */
 3715		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
 3716
 3717		/* + transport layer */
 3718		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
 3719			const struct tcphdr *th;
 3720			struct tcphdr _tcphdr;
 3721
 3722			th = skb_header_pointer(skb, skb_transport_offset(skb),
 3723						sizeof(_tcphdr), &_tcphdr);
 3724			if (likely(th))
 3725				hdr_len += __tcp_hdrlen(th);
 3726		} else {
 3727			struct udphdr _udphdr;
 3728
 3729			if (skb_header_pointer(skb, skb_transport_offset(skb),
 3730					       sizeof(_udphdr), &_udphdr))
 3731				hdr_len += sizeof(struct udphdr);
 3732		}
 3733
 3734		if (shinfo->gso_type & SKB_GSO_DODGY)
 3735			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
 3736						shinfo->gso_size);
 3737
 3738		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
 3739	}
 3740}
 3741
/* Hand @skb to qdisc @q (attached to @txq of @dev) and try to run it.
 *
 * Returns the enqueue result masked with NET_XMIT_MASK, NET_XMIT_DROP if
 * the qdisc is deactivated, or NET_XMIT_SUCCESS when the skb was sent via
 * the bypass path. skbs collected on the local to_free list are freed
 * before returning.
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	struct sk_buff *to_free = NULL;
	bool contended;
	int rc;

	qdisc_calculate_pkt_len(skb, q);

	/* TCQ_F_NOLOCK qdiscs: enqueue and run without taking root_lock. */
	if (q->flags & TCQ_F_NOLOCK) {
		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
		qdisc_run(q);

		if (unlikely(to_free))
			kfree_skb_list(to_free);
		return rc;
	}

	/*
	 * Heuristic to force contended enqueues to serialize on a
	 * separate lock before trying to get qdisc main lock.
	 * This permits qdisc->running owner to get the lock more
	 * often and dequeue packets faster.
	 */
	contended = qdisc_is_running(q);
	if (unlikely(contended))
		spin_lock(&q->busylock);

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		__qdisc_drop(skb, &to_free);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   qdisc_run_begin(q)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */

		qdisc_bstats_update(q, skb);

		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
			/* busylock is released before running the qdisc;
			 * clearing 'contended' prevents a second unlock below.
			 */
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		}

		qdisc_run_end(q);
		rc = NET_XMIT_SUCCESS;
	} else {
		/* Regular path: enqueue, then run the qdisc if nobody else
		 * is already running it.
		 */
		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
		if (qdisc_run_begin(q)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
			qdisc_run_end(q);
		}
	}
	spin_unlock(root_lock);
	/* Free dropped skbs only after root_lock has been released. */
	if (unlikely(to_free))
		kfree_skb_list(to_free);
	if (unlikely(contended))
		spin_unlock(&q->busylock);
	return rc;
}
 3814
 3815#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 3816static void skb_update_prio(struct sk_buff *skb)
 3817{
 3818	const struct netprio_map *map;
 3819	const struct sock *sk;
 3820	unsigned int prioidx;
 3821
 3822	if (skb->priority)
 3823		return;
 3824	map = rcu_dereference_bh(skb->dev->priomap);
 3825	if (!map)
 3826		return;
 3827	sk = skb_to_full_sk(skb);
 3828	if (!sk)
 3829		return;
 3830
 3831	prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
 3832
 3833	if (prioidx < map->priomap_len)
 3834		skb->priority = map->priomap[prioidx];
 3835}
 3836#else
 3837#define skb_update_prio(skb)
 3838#endif
 3839
/**
 *	dev_loopback_xmit - loop back @skb
 *	@net: network namespace this loopback is happening in
 *	@sk:  sk needed to be a netfilter okfn
 *	@skb: buffer to transmit
 *
 *	Resets the MAC header, pulls @skb up to its network header, marks it
 *	as PACKET_LOOPBACK with CHECKSUM_UNNECESSARY and passes it to
 *	netif_rx_ni(). Warns if @skb has no dst. @net and @sk are not used
 *	by the body. Always returns 0.
 */
int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb_reset_mac_header(skb);
	__skb_pull(skb, skb_network_offset(skb));
	skb->pkt_type = PACKET_LOOPBACK;
	skb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(skb));
	skb_dst_force(skb);
	netif_rx_ni(skb);
	return 0;
}
EXPORT_SYMBOL(dev_loopback_xmit);
 3858
 3859#ifdef CONFIG_NET_EGRESS
 3860static struct sk_buff *
 3861sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 3862{
 3863	struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
 3864	struct tcf_result cl_res;
 3865
 3866	if (!miniq)
 3867		return skb;
 3868
 3869	/* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
 3870	mini_qdisc_bstats_cpu_update(miniq, skb);
 3871
 3872	switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
 3873	case TC_ACT_OK:
 3874	case TC_ACT_RECLASSIFY:
 3875		skb->tc_index = TC_H_MIN(cl_res.classid);
 3876		break;
 3877	case TC_ACT_SHOT:
 3878		mini_qdisc_qstats_cpu_drop(miniq);
 3879		*ret = NET_XMIT_DROP;
 3880		kfree_skb(skb);
 3881		return NULL;
 3882	case TC_ACT_STOLEN:
 3883	case TC_ACT_QUEUED:
 3884	case TC_ACT_TRAP:
 3885		*ret = NET_XMIT_SUCCESS;
 3886		consume_skb(skb);
 3887		return NULL;
 3888	case TC_ACT_REDIRECT:
 3889		/* No need to push/pop skb's mac_header here on egress! */
 3890		skb_do_redirect(skb);
 3891		*ret = NET_XMIT_SUCCESS;
 3892		return NULL;
 3893	default:
 3894		break;
 3895	}
 3896
 3897	return skb;
 3898}
 3899#endif /* CONFIG_NET_EGRESS */
 3900
 3901#ifdef CONFIG_XPS
 3902static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
 3903			       struct xps_dev_maps *dev_maps, unsigned int tci)
 3904{
 3905	struct xps_map *map;
 3906	int queue_index = -1;
 3907
 3908	if (dev->num_tc) {
 3909		tci *= dev->num_tc;
 3910		tci += netdev_get_prio_tc_map(dev, skb->priority);
 3911	}
 3912
 3913	map = rcu_dereference(dev_maps->attr_map[tci]);
 3914	if (map) {
 3915		if (map->len == 1)
 3916			queue_index = map->queues[0];
 3917		else
 3918			queue_index = map->queues[reciprocal_scale(
 3919						skb_get_hash(skb), map->len)];
 3920		if (unlikely(queue_index >= dev->real_num_tx_queues))
 3921			queue_index = -1;
 3922	}
 3923	return queue_index;
 3924}
 3925#endif
 3926
 3927static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
 3928			 struct sk_buff *skb)
 3929{
 3930#ifdef CONFIG_XPS
 3931	struct xps_dev_maps *dev_maps;
 3932	struct sock *sk = skb->sk;
 3933	int queue_index = -1;
 3934
 3935	if (!static_key_false(&xps_needed))
 3936		return -1;
 3937
 3938	rcu_read_lock();
 3939	if (!static_key_false(&xps_rxqs_needed))
 3940		goto get_cpus_map;
 3941
 3942	dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
 3943	if (dev_maps) {
 3944		int tci = sk_rx_queue_get(sk);
 3945
 3946		if (tci >= 0 && tci < dev->num_rx_queues)
 3947			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
 3948							  tci);
 3949	}
 3950
 3951get_cpus_map:
 3952	if (queue_index < 0) {
 3953		dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
 3954		if (dev_maps) {
 3955			unsigned int tci = skb->sender_cpu - 1;
 3956
 3957			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
 3958							  tci);
 3959		}
 3960	}
 3961	rcu_read_unlock();
 3962
 3963	return queue_index;
 3964#else
 3965	return -1;
 3966#endif
 3967}
 3968
/* Queue selection helper that always picks Tx queue 0; all arguments are
 * ignored.
 */
u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
		     struct net_device *sb_dev)
{
	return 0;
}
EXPORT_SYMBOL(dev_pick_tx_zero);
 3975
 3976u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
 3977		       struct net_device *sb_dev)
 3978{
 3979	return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
 3980}
 3981EXPORT_SYMBOL(dev_pick_tx_cpu_id);
 3982
 3983u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
 3984		     struct net_device *sb_dev)
 3985{
 3986	struct sock *sk = skb->sk;
 3987	int queue_index = sk_tx_queue_get(sk);
 3988
 3989	sb_dev = sb_dev ? : dev;
 3990
 3991	if (queue_index < 0 || skb->ooo_okay ||
 3992	    queue_index >= dev->real_num_tx_queues) {
 3993		int new_index = get_xps_queue(dev, sb_dev, skb);
 3994
 3995		if (new_index < 0)
 3996			new_index = skb_tx_hash(dev, sb_dev, skb);
 3997
 3998		if (queue_index != new_index && sk &&
 3999		    sk_fullsock(sk) &&
 4000		    rcu_access_pointer(sk->sk_dst_cache))
 4001			sk_tx_queue_set(sk, new_index);
 4002
 4003		queue_index = new_index;
 4004	}
 4005
 4006	return queue_index;
 4007}
 4008EXPORT_SYMBOL(netdev_pick_tx);
 4009
 4010struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
 4011					 struct sk_buff *skb,
 4012					 struct net_device *sb_dev)
 4013{
 4014	int queue_index = 0;
 4015
 4016#ifdef CONFIG_XPS
 4017	u32 sender_cpu = skb->sender_cpu - 1;
 4018
 4019	if (sender_cpu >= (u32)NR_CPUS)
 4020		skb->sender_cpu = raw_smp_processor_id() + 1;
 4021#endif
 4022
 4023	if (dev->real_num_tx_queues != 1) {
 4024		const struct net_device_ops *ops = dev->netdev_ops;
 4025
 4026		if (ops->ndo_select_queue)
 4027			queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
 4028		else
 4029			queue_index = netdev_pick_tx(dev, skb, sb_dev);
 4030
 4031		queue_index = netdev_cap_txqueue(dev, queue_index);
 4032	}
 4033
 4034	skb_set_queue_mapping(skb, queue_index);
 4035	return netdev_get_tx_queue(dev, queue_index);
 4036}
 4037
/**
 *	__dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *	@sb_dev: subordinate device used for L2 forwarding offload
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 */
static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;
	bool again = false;

	skb_reset_mac_header(skb);

	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	skb_update_prio(skb);

	qdisc_pkt_len_init(skb);
#ifdef CONFIG_NET_CLS_ACT
	skb->tc_at_ingress = 0;
# ifdef CONFIG_NET_EGRESS
	/* The egress classifier may dispose of the skb; rc is then set by
	 * sch_handle_egress().
	 */
	if (static_branch_unlikely(&egress_needed_key)) {
		skb = sch_handle_egress(skb, &rc, dev);
		if (!skb)
			goto out;
	}
# endif
#endif
	/* If device/qdisc don't need skb->dst, release it right now while
	 * its hot in this cpu cache.
	 */
	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
		skb_dst_drop(skb);
	else
		skb_dst_force(skb);

	txq = netdev_core_pick_tx(dev, skb, sb_dev);
	q = rcu_dereference_bh(txq->qdisc);

	trace_net_dev_queue(skb);
	/* A qdisc with an enqueue op takes the skb from here. */
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	 * loopback, all the sorts of tunnels...
	 *
	 * Really, it is unlikely that netif_tx_lock protection is necessary
	 * here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	 * counters.)
	 * However, it is possible, that they rely on protection
	 * made by us here.
	 *
	 * Check this and shot the lock. It is not prone from deadlocks.
	 * Either shot noqueue qdisc, it is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		/* This CPU already owning the xmit lock means we recursed
		 * into the same queue.
		 */
		if (txq->xmit_lock_owner != cpu) {
			if (dev_xmit_recursion())
				goto recursion_alert;

			skb = validate_xmit_skb(skb, dev, &again);
			if (!skb)
				goto out;

			PRANDOM_ADD_NOISE(skb, dev, txq, jiffies);
			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_xmit_stopped(txq)) {
				dev_xmit_recursion_inc();
				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
				dev_xmit_recursion_dec();
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
					     dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
recursion_alert:
			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
					     dev->name);
		}
	}

	/* Failure path: the device is down, the transmit was not completed,
	 * or recursion was detected. Count the drop and free what is left.
	 */
	rc = -ENETDOWN;
	rcu_read_unlock_bh();

	atomic_long_inc(&dev->tx_dropped);
	kfree_skb_list(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
 4170
 4171int dev_queue_xmit(struct sk_buff *skb)
 4172{
 4173	return __dev_queue_xmit(skb, NULL);
 4174}
 4175EXPORT_SYMBOL(dev_queue_xmit);
 4176
 4177int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
 4178{
 4179	return __dev_queue_xmit(skb, sb_dev);
 4180}
 4181EXPORT_SYMBOL(dev_queue_xmit_accel);
 4182
 4183int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
 4184{
 4185	struct net_device *dev = skb->dev;
 4186	struct sk_buff *orig_skb = skb;
 4187	struct netdev_queue *txq;
 4188	int ret = NETDEV_TX_BUSY;
 4189	bool again = false;
 4190
 4191	if (unlikely(!netif_running(dev) ||
 4192		     !netif_carrier_ok(dev)))
 4193		goto drop;
 4194
 4195	skb = validate_xmit_skb_list(skb, dev, &again);
 4196	if (skb != orig_skb)
 4197		goto drop;
 4198
 4199	skb_set_queue_mapping(skb, queue_id);
 4200	txq = skb_get_tx_queue(dev, skb);
 4201	PRANDOM_ADD_NOISE(skb, dev, txq, jiffies);
 4202
 4203	local_bh_disable();
 4204
 4205	dev_xmit_recursion_inc();
 4206	HARD_TX_LOCK(dev, txq, smp_processor_id());
 4207	if (!netif_xmit_frozen_or_drv_stopped(txq))
 4208		ret = netdev_start_xmit(skb, dev, txq, false);
 4209	HARD_TX_UNLOCK(dev, txq);
 4210	dev_xmit_recursion_dec();
 4211
 4212	local_bh_enable();
 4213
 4214	if (!dev_xmit_complete(ret))
 4215		kfree_skb(skb);
 4216
 4217	return ret;
 4218drop:
 4219	atomic_long_inc(&dev->tx_dropped);
 4220	kfree_skb_list(skb);
 4221	return NET_XMIT_DROP;
 4222}
 4223EXPORT_SYMBOL(dev_direct_xmit);
 4224
 4225/*************************************************************************
 4226 *			Receiver routines
 4227 *************************************************************************/
 4228
/* Backlog threshold: enqueue_to_backlog() drops a packet once the target
 * CPU's input_pkt_queue is already longer than this.
 */
int netdev_max_backlog __read_mostly = 1000;
EXPORT_SYMBOL(netdev_max_backlog);

/* Selects where the RX timestamp check runs: it is passed as the
 * condition to net_timestamp_check() in netif_rx_internal(), and negated
 * in __netif_receive_skb_core().
 */
int netdev_tstamp_prequeue __read_mostly = 1;
int netdev_budget __read_mostly = 300;
/* Must be at least 2 jiffies to guarantee 1 jiffy timeout */
unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
int weight_p __read_mostly = 64;           /* old backlog weight */
int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
int dev_rx_weight __read_mostly = 64;
int dev_tx_weight __read_mostly = 64;
/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
int gro_normal_batch __read_mostly = 8;
 4243
 4244/* Called with irq disabled */
 4245static inline void ____napi_schedule(struct softnet_data *sd,
 4246				     struct napi_struct *napi)
 4247{
 4248	list_add_tail(&napi->poll_list, &sd->poll_list);
 4249	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 4250}
 4251
 4252#ifdef CONFIG_RPS
 4253
/* One global table that all flow-based protocols share. */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
/* Mask selecting the bits of a sock-flow-table entry that hold the CPU
 * number; get_rps_cpu() compares the remaining bits against the packet
 * hash to decide whether the entry matches.
 */
u32 rps_cpu_mask __read_mostly;
EXPORT_SYMBOL(rps_cpu_mask);

/* Static key tested by netif_rx_internal() to choose the RPS path. */
struct static_key_false rps_needed __read_mostly;
EXPORT_SYMBOL(rps_needed);
struct static_key_false rfs_needed __read_mostly;
EXPORT_SYMBOL(rfs_needed);
 4264
/* Record @next_cpu as the CPU handling the flow described by @rflow.
 *
 * With CONFIG_RFS_ACCEL, and only when @next_cpu is a valid CPU id, the
 * driver may additionally be asked to steer the flow to the hardware RX
 * queue that cpu_rmap_lookup_index() returns for @next_cpu; on success
 * the flow entry of that queue's table replaces @rflow.
 *
 * Returns the flow entry that was updated (either @rflow or the entry in
 * the new queue's flow table).
 */
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
	    struct rps_dev_flow *rflow, u16 next_cpu)
{
	if (next_cpu < nr_cpu_ids) {
#ifdef CONFIG_RFS_ACCEL
		struct netdev_rx_queue *rxqueue;
		struct rps_dev_flow_table *flow_table;
		struct rps_dev_flow *old_rflow;
		u32 flow_id;
		u16 rxq_index;
		int rc;

		/* Should we steer this flow to a different hardware queue? */
		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
		    !(dev->features & NETIF_F_NTUPLE))
			goto out;
		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
		/* Already arriving on the queue mapped to next_cpu. */
		if (rxq_index == skb_get_rx_queue(skb))
			goto out;

		rxqueue = dev->_rx + rxq_index;
		flow_table = rcu_dereference(rxqueue->rps_flow_table);
		if (!flow_table)
			goto out;
		flow_id = skb_get_hash(skb) & flow_table->mask;
		/* A negative return means no filter was installed; a
		 * non-negative return is stored as the filter id.
		 */
		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
							rxq_index, flow_id);
		if (rc < 0)
			goto out;
		old_rflow = rflow;
		rflow = &flow_table->flows[flow_id];
		rflow->filter = rc;
		/* If the old entry carried the same filter id, clear it there
		 * so the id is referenced by the new entry only.
		 */
		if (old_rflow->filter == rflow->filter)
			old_rflow->filter = RPS_NO_FILTER;
	out:
#endif
		/* Snapshot the target CPU's input_queue_head as the flow's
		 * last_qtail.
		 */
		rflow->last_qtail =
			per_cpu(softnet_data, next_cpu).input_queue_head;
	}

	rflow->cpu = next_cpu;
	return rflow;
}
 4309
/*
 * get_rps_cpu is called from netif_receive_skb and returns the target
 * CPU from the RPS map of the receiving queue for a given skb.
 * rcu_read_lock must be held on entry.
 *
 * Returns -1 when no CPU was selected (invalid recorded queue, neither a
 * flow table nor an RPS map on the queue, a zero hash, or no online
 * candidate).  *rflowp is written only when the CPU was chosen through
 * the flow tables (RFS); on every other path it is left untouched.
 */
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
		       struct rps_dev_flow **rflowp)
{
	const struct rps_sock_flow_table *sock_flow_table;
	struct netdev_rx_queue *rxqueue = dev->_rx;
	struct rps_dev_flow_table *flow_table;
	struct rps_map *map;
	int cpu = -1;
	u32 tcpu;
	u32 hash;

	/* Without a recorded RX queue the first queue (dev->_rx) is used. */
	if (skb_rx_queue_recorded(skb)) {
		u16 index = skb_get_rx_queue(skb);

		if (unlikely(index >= dev->real_num_rx_queues)) {
			WARN_ONCE(dev->real_num_rx_queues > 1,
				  "%s received packet on queue %u, but number "
				  "of RX queues is %u\n",
				  dev->name, index, dev->real_num_rx_queues);
			goto done;
		}
		rxqueue += index;
	}

	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */

	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	map = rcu_dereference(rxqueue->rps_map);
	if (!flow_table && !map)
		goto done;

	skb_reset_network_header(skb);
	hash = skb_get_hash(skb);
	if (!hash)
		goto done;

	sock_flow_table = rcu_dereference(rps_sock_flow_table);
	if (flow_table && sock_flow_table) {
		struct rps_dev_flow *rflow;
		u32 next_cpu;
		u32 ident;

		/* First check into global flow table if there is a match */
		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
		/* The bits outside rps_cpu_mask must equal those of the hash,
		 * otherwise the entry belongs to a different flow.
		 */
		if ((ident ^ hash) & ~rps_cpu_mask)
			goto try_rps;

		next_cpu = ident & rps_cpu_mask;

		/* OK, now we know there is a match,
		 * we can look at the local (per receive queue) flow table
		 */
		rflow = &flow_table->flows[hash & flow_table->mask];
		tcpu = rflow->cpu;

		/*
		 * If the desired CPU (where last recvmsg was done) is
		 * different from current CPU (one in the rx-queue flow
		 * table entry), switch if one of the following holds:
		 *   - Current CPU is unset (>= nr_cpu_ids).
		 *   - Current CPU is offline.
		 *   - The current CPU's queue tail has advanced beyond the
		 *     last packet that was enqueued using this table entry.
		 *     This guarantees that all previous packets for the flow
		 *     have been dequeued, thus preserving in order delivery.
		 */
		if (unlikely(tcpu != next_cpu) &&
		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
		      rflow->last_qtail)) >= 0)) {
			tcpu = next_cpu;
			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
		}

		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
			*rflowp = rflow;
			cpu = tcpu;
			goto done;
		}
	}

try_rps:

	/* Plain RPS: scale the hash onto the queue's CPU map. */
	if (map) {
		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
		if (cpu_online(tcpu)) {
			cpu = tcpu;
			goto done;
		}
	}

done:
	return cpu;
}
 4409
 4410#ifdef CONFIG_RFS_ACCEL
 4411
 4412/**
 4413 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
 4414 * @dev: Device on which the filter was set
 4415 * @rxq_index: RX queue index
 4416 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
 4417 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
 4418 *
 4419 * Drivers that implement ndo_rx_flow_steer() should periodically call
 4420 * this function for each installed filter and remove the filters for
 4421 * which it returns %true.
 4422 */
 4423bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
 4424			 u32 flow_id, u16 filter_id)
 4425{
 4426	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
 4427	struct rps_dev_flow_table *flow_table;
 4428	struct rps_dev_flow *rflow;
 4429	bool expire = true;
 4430	unsigned int cpu;
 4431
 4432	rcu_read_lock();
 4433	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 4434	if (flow_table && flow_id <= flow_table->mask) {
 4435		rflow = &flow_table->flows[flow_id];
 4436		cpu = READ_ONCE(rflow->cpu);
 4437		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
 4438		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
 4439			   rflow->last_qtail) <
 4440		     (int)(10 * flow_table->mask)))
 4441			expire = false;
 4442	}
 4443	rcu_read_unlock();
 4444	return expire;
 4445}
 4446EXPORT_SYMBOL(rps_may_expire_flow);
 4447
 4448#endif /* CONFIG_RFS_ACCEL */
 4449
 4450/* Called from hardirq (IPI) context */
 4451static void rps_trigger_softirq(void *data)
 4452{
 4453	struct softnet_data *sd = data;
 4454
 4455	____napi_schedule(sd, &sd->backlog);
 4456	sd->received_rps++;
 4457}
 4458
 4459#endif /* CONFIG_RPS */
 4460
 4461/*
 4462 * Check if this softnet_data structure is another cpu one
 4463 * If yes, queue it to our IPI list and return 1
 4464 * If no, return 0
 4465 */
 4466static int rps_ipi_queued(struct softnet_data *sd)
 4467{
 4468#ifdef CONFIG_RPS
 4469	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
 4470
 4471	if (sd != mysd) {
 4472		sd->rps_ipi_next = mysd->rps_ipi_list;
 4473		mysd->rps_ipi_list = sd;
 4474
 4475		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 4476		return 1;
 4477	}
 4478#endif /* CONFIG_RPS */
 4479	return 0;
 4480}
 4481
#ifdef CONFIG_NET_FLOW_LIMIT
/* Defaults to 4096 (1 << 12).
 * NOTE(review): presumably the bucket count used when a flow-limit table
 * is allocated; the consumer is outside this part of the file, confirm
 * there.
 */
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif
 4485
 4486static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
 4487{
 4488#ifdef CONFIG_NET_FLOW_LIMIT
 4489	struct sd_flow_limit *fl;
 4490	struct softnet_data *sd;
 4491	unsigned int old_flow, new_flow;
 4492
 4493	if (qlen < (netdev_max_backlog >> 1))
 4494		return false;
 4495
 4496	sd = this_cpu_ptr(&softnet_data);
 4497
 4498	rcu_read_lock();
 4499	fl = rcu_dereference(sd->flow_limit);
 4500	if (fl) {
 4501		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
 4502		old_flow = fl->history[fl->history_head];
 4503		fl->history[fl->history_head] = new_flow;
 4504
 4505		fl->history_head++;
 4506		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
 4507
 4508		if (likely(fl->buckets[old_flow]))
 4509			fl->buckets[old_flow]--;
 4510
 4511		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
 4512			fl->count++;
 4513			rcu_read_unlock();
 4514			return true;
 4515		}
 4516	}
 4517	rcu_read_unlock();
 4518#endif
 4519	return false;
 4520}
 4521
 4522/*
 4523 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
 4524 * queue (may be a remote CPU queue).
 4525 */
 4526static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 4527			      unsigned int *qtail)
 4528{
 4529	struct softnet_data *sd;
 4530	unsigned long flags;
 4531	unsigned int qlen;
 4532
 4533	sd = &per_cpu(softnet_data, cpu);
 4534
 4535	local_irq_save(flags);
 4536
 4537	rps_lock(sd);
 4538	if (!netif_running(skb->dev))
 4539		goto drop;
 4540	qlen = skb_queue_len(&sd->input_pkt_queue);
 4541	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
 4542		if (qlen) {
 4543enqueue:
 4544			__skb_queue_tail(&sd->input_pkt_queue, skb);
 4545			input_queue_tail_incr_save(sd, qtail);
 4546			rps_unlock(sd);
 4547			local_irq_restore(flags);
 4548			return NET_RX_SUCCESS;
 4549		}
 4550
 4551		/* Schedule NAPI for backlog device
 4552		 * We can use non atomic operation since we own the queue lock
 4553		 */
 4554		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
 4555			if (!rps_ipi_queued(sd))
 4556				____napi_schedule(sd, &sd->backlog);
 4557		}
 4558		goto enqueue;
 4559	}
 4560
 4561drop:
 4562	sd->dropped++;
 4563	rps_unlock(sd);
 4564
 4565	local_irq_restore(flags);
 4566
 4567	atomic_long_inc(&skb->dev->rx_dropped);
 4568	kfree_skb(skb);
 4569	return NET_RX_DROP;
 4570}
 4571
 4572static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
 4573{
 4574	struct net_device *dev = skb->dev;
 4575	struct netdev_rx_queue *rxqueue;
 4576
 4577	rxqueue = dev->_rx;
 4578
 4579	if (skb_rx_queue_recorded(skb)) {
 4580		u16 index = skb_get_rx_queue(skb);
 4581
 4582		if (unlikely(index >= dev->real_num_rx_queues)) {
 4583			WARN_ONCE(dev->real_num_rx_queues > 1,
 4584				  "%s received packet on queue %u, but number "
 4585				  "of RX queues is %u\n",
 4586				  dev->name, index, dev->real_num_rx_queues);
 4587
 4588			return rxqueue; /* Return first rxqueue */
 4589		}
 4590		rxqueue += index;
 4591	}
 4592	return rxqueue;
 4593}
 4594
/* Run @xdp_prog on @skb in generic (skb based) XDP mode.
 *
 * If needed the skb is first made private, linear and given at least
 * XDP_PACKET_HEADROOM bytes of headroom.  @xdp is then filled in to
 * describe the packet starting at the MAC header, the program is run and
 * any head/tail adjustment it made is applied back to the skb.
 *
 * Returns the XDP action.  The skb is freed here for XDP_DROP,
 * XDP_ABORTED and unknown actions (and when the preparation above
 * fails, in which case XDP_DROP is returned); for XDP_PASS, XDP_TX and
 * XDP_REDIRECT it is left to the caller.
 */
static u32 netif_receive_generic_xdp(struct sk_buff *skb,
				     struct xdp_buff *xdp,
				     struct bpf_prog *xdp_prog)
{
	struct netdev_rx_queue *rxqueue;
	void *orig_data, *orig_data_end;
	/* act starts as XDP_DROP so the early "goto do_drop" exits return it */
	u32 metalen, act = XDP_DROP;
	__be16 orig_eth_type;
	struct ethhdr *eth;
	bool orig_bcast;
	int hlen, off;
	u32 mac_len;

	/* Reinjected packets coming from act_mirred or similar should
	 * not get XDP generic processing.
	 */
	if (skb_is_redirected(skb))
		return XDP_PASS;

	/* XDP packets must be linear and must have sufficient headroom
	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
	 * native XDP provides, thus we need to do it here as well.
	 */
	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
		int troom = skb->tail + skb->data_len - skb->end;

		/* In case we have to go down the path and also linearize,
		 * then lets do the pskb_expand_head() work just once here.
		 */
		if (pskb_expand_head(skb,
				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
			goto do_drop;
		if (skb_linearize(skb))
			goto do_drop;
	}

	/* The XDP program wants to see the packet starting at the MAC
	 * header.
	 */
	mac_len = skb->data - skb_mac_header(skb);
	hlen = skb_headlen(skb) + mac_len;
	xdp->data = skb->data - mac_len;
	/* No metadata yet: data_meta coincides with data. */
	xdp->data_meta = xdp->data;
	xdp->data_end = xdp->data + hlen;
	xdp->data_hard_start = skb->data - skb_headroom(skb);

	/* SKB "head" area always have tailroom for skb_shared_info */
	xdp->frame_sz  = (void *)skb_end_pointer(skb) - xdp->data_hard_start;
	xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	/* Snapshot what the program may change, for comparison afterwards. */
	orig_data_end = xdp->data_end;
	orig_data = xdp->data;
	eth = (struct ethhdr *)xdp->data;
	orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
	orig_eth_type = eth->h_proto;

	rxqueue = netif_get_rxqueue(skb);
	xdp->rxq = &rxqueue->xdp_rxq;

	act = bpf_prog_run_xdp(xdp_prog, xdp);

	/* check if bpf_xdp_adjust_head was used */
	off = xdp->data - orig_data;
	if (off) {
		if (off > 0)
			__skb_pull(skb, off);
		else if (off < 0)
			__skb_push(skb, -off);

		skb->mac_header += off;
		skb_reset_network_header(skb);
	}

	/* check if bpf_xdp_adjust_tail was used */
	off = xdp->data_end - orig_data_end;
	if (off != 0) {
		skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
		skb->len += off; /* positive on grow, negative on shrink */
	}

	/* check if XDP changed eth hdr such SKB needs update */
	eth = (struct ethhdr *)xdp->data;
	if ((orig_eth_type != eth->h_proto) ||
	    (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
		__skb_push(skb, ETH_HLEN);
		skb->protocol = eth_type_trans(skb, skb->dev);
	}

	switch (act) {
	case XDP_REDIRECT:
	case XDP_TX:
		/* Put the MAC header back in front of skb->data. */
		__skb_push(skb, mac_len);
		break;
	case XDP_PASS:
		metalen = xdp->data - xdp->data_meta;
		if (metalen)
			skb_metadata_set(skb, metalen);
		break;
	default:
		bpf_warn_invalid_xdp_action(act);
		fallthrough;
	case XDP_ABORTED:
		trace_xdp_exception(skb->dev, xdp_prog, act);
		fallthrough;
	case XDP_DROP:
	do_drop:
		kfree_skb(skb);
		break;
	}

	return act;
}
 4710
 4711/* When doing generic XDP we have to bypass the qdisc layer and the
 4712 * network taps in order to match in-driver-XDP behavior.
 4713 */
 4714void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
 4715{
 4716	struct net_device *dev = skb->dev;
 4717	struct netdev_queue *txq;
 4718	bool free_skb = true;
 4719	int cpu, rc;
 4720
 4721	txq = netdev_core_pick_tx(dev, skb, NULL);
 4722	cpu = smp_processor_id();
 4723	HARD_TX_LOCK(dev, txq, cpu);
 4724	if (!netif_xmit_stopped(txq)) {
 4725		rc = netdev_start_xmit(skb, dev, txq, 0);
 4726		if (dev_xmit_complete(rc))
 4727			free_skb = false;
 4728	}
 4729	HARD_TX_UNLOCK(dev, txq);
 4730	if (free_skb) {
 4731		trace_xdp_exception(dev, xdp_prog, XDP_TX);
 4732		kfree_skb(skb);
 4733	}
 4734}
 4735
/* Static key, false by default, that gates the generic XDP hook in
 * __netif_receive_skb_core().
 */
static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
 4737
 4738int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
 4739{
 4740	if (xdp_prog) {
 4741		struct xdp_buff xdp;
 4742		u32 act;
 4743		int err;
 4744
 4745		act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
 4746		if (act != XDP_PASS) {
 4747			switch (act) {
 4748			case XDP_REDIRECT:
 4749				err = xdp_do_generic_redirect(skb->dev, skb,
 4750							      &xdp, xdp_prog);
 4751				if (err)
 4752					goto out_redir;
 4753				break;
 4754			case XDP_TX:
 4755				generic_xdp_tx(skb, xdp_prog);
 4756				break;
 4757			}
 4758			return XDP_DROP;
 4759		}
 4760	}
 4761	return XDP_PASS;
 4762out_redir:
 4763	kfree_skb(skb);
 4764	return XDP_DROP;
 4765}
 4766EXPORT_SYMBOL_GPL(do_xdp_generic);
 4767
 4768static int netif_rx_internal(struct sk_buff *skb)
 4769{
 4770	int ret;
 4771
 4772	net_timestamp_check(netdev_tstamp_prequeue, skb);
 4773
 4774	trace_netif_rx(skb);
 4775
 4776#ifdef CONFIG_RPS
 4777	if (static_branch_unlikely(&rps_needed)) {
 4778		struct rps_dev_flow voidflow, *rflow = &voidflow;
 4779		int cpu;
 4780
 4781		preempt_disable();
 4782		rcu_read_lock();
 4783
 4784		cpu = get_rps_cpu(skb->dev, skb, &rflow);
 4785		if (cpu < 0)
 4786			cpu = smp_processor_id();
 4787
 4788		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 4789
 4790		rcu_read_unlock();
 4791		preempt_enable();
 4792	} else
 4793#endif
 4794	{
 4795		unsigned int qtail;
 4796
 4797		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
 4798		put_cpu();
 4799	}
 4800	return ret;
 4801}
 4802
 4803/**
 4804 *	netif_rx	-	post buffer to the network code
 4805 *	@skb: buffer to post
 4806 *
 4807 *	This function receives a packet from a device driver and queues it for
 4808 *	the upper (protocol) levels to process.  It always succeeds. The buffer
 4809 *	may be dropped during processing for congestion control or by the
 4810 *	protocol layers.
 4811 *
 4812 *	return values:
 4813 *	NET_RX_SUCCESS	(no congestion)
 4814 *	NET_RX_DROP     (packet was dropped)
 4815 *
 4816 */
 4817
 4818int netif_rx(struct sk_buff *skb)
 4819{
 4820	int ret;
 4821
 4822	trace_netif_rx_entry(skb);
 4823
 4824	ret = netif_rx_internal(skb);
 4825	trace_netif_rx_exit(ret);
 4826
 4827	return ret;
 4828}
 4829EXPORT_SYMBOL(netif_rx);
 4830
 4831int netif_rx_ni(struct sk_buff *skb)
 4832{
 4833	int err;
 4834
 4835	trace_netif_rx_ni_entry(skb);
 4836
 4837	preempt_disable();
 4838	err = netif_rx_internal(skb);
 4839	if (local_softirq_pending())
 4840		do_softirq();
 4841	preempt_enable();
 4842	trace_netif_rx_ni_exit(err);
 4843
 4844	return err;
 4845}
 4846EXPORT_SYMBOL(netif_rx_ni);
 4847
 4848int netif_rx_any_context(struct sk_buff *skb)
 4849{
 4850	/*
 4851	 * If invoked from contexts which do not invoke bottom half
 4852	 * processing either at return from interrupt or when softrqs are
 4853	 * reenabled, use netif_rx_ni() which invokes bottomhalf processing
 4854	 * directly.
 4855	 */
 4856	if (in_interrupt())
 4857		return netif_rx(skb);
 4858	else
 4859		return netif_rx_ni(skb);
 4860}
 4861EXPORT_SYMBOL(netif_rx_any_context);
 4862
/* Softirq action for the transmit side (NOTE(review): presumably
 * registered for NET_TX_SOFTIRQ; the registration is outside this view).
 *
 * Works on this CPU's softnet_data in three steps:
 *   1. detach the completion_queue and free every skb on it;
 *   2. detach the output_queue and run each qdisc linked on it;
 *   3. call xfrm_dev_backlog().
 */
static __latent_entropy void net_tx_action(struct softirq_action *h)
{
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);

	if (sd->completion_queue) {
		struct sk_buff *clist;

		/* Detach the whole list with IRQs off, then process it with
		 * IRQs enabled again.
		 */
		local_irq_disable();
		clist = sd->completion_queue;
		sd->completion_queue = NULL;
		local_irq_enable();

		while (clist) {
			struct sk_buff *skb = clist;

			clist = clist->next;

			/* skbs on this list are expected to have no users left */
			WARN_ON(refcount_read(&skb->users));
			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
				trace_consume_skb(skb);
			else
				trace_kfree_skb(skb, net_tx_action);

			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
				__kfree_skb(skb);
			else
				__kfree_skb_defer(skb);
		}

		__kfree_skb_flush();
	}

	if (sd->output_queue) {
		struct Qdisc *head;

		/* Same detach pattern; the tail pointer is reset so new
		 * entries start a fresh list.
		 */
		local_irq_disable();
		head = sd->output_queue;
		sd->output_queue = NULL;
		sd->output_queue_tailp = &sd->output_queue;
		local_irq_enable();

		while (head) {
			struct Qdisc *q = head;
			spinlock_t *root_lock = NULL;

			head = head->next_sched;

			/* Qdiscs flagged TCQ_F_NOLOCK are run without taking
			 * the qdisc lock.
			 */
			if (!(q->flags & TCQ_F_NOLOCK)) {
				root_lock = qdisc_lock(q);
				spin_lock(root_lock);
			}
			/* We need to make sure head->next_sched is read
			 * before clearing __QDISC_STATE_SCHED
			 */
			smp_mb__before_atomic();
			clear_bit(__QDISC_STATE_SCHED, &q->state);
			qdisc_run(q);
			if (root_lock)
				spin_unlock(root_lock);
		}
	}

	xfrm_dev_backlog(sd);
}
 4927
#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
/* This hook is defined here for ATM LANE.
 * It is only built when both CONFIG_BRIDGE and CONFIG_ATM_LANE are
 * enabled, and holds no function until something assigns one.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
 4934
/* Run the ingress classifier attached to skb->dev, if any
 * (CONFIG_NET_CLS_ACT only; otherwise @skb is returned unchanged).
 *
 * A packet type pending in *@pt_prev is delivered first, with the result
 * stored in *@ret, and *@pt_prev is cleared.
 *
 * Returns @skb when processing should continue, or NULL when the skb was
 * dropped, consumed, or taken over by a redirect.  *@another is set when
 * a redirect reports -EAGAIN, in which case @skb is returned with the L2
 * header pulled again; the caller in this file then restarts receive
 * processing.
 */
static inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
		   struct net_device *orig_dev, bool *another)
{
#ifdef CONFIG_NET_CLS_ACT
	struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
	struct tcf_result cl_res;

	/* If there's at least one ingress present somewhere (so
	 * we get here via enabled static key), remaining devices
	 * that are not configured with an ingress qdisc will bail
	 * out here.
	 */
	if (!miniq)
		return skb;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	qdisc_skb_cb(skb)->pkt_len = skb->len;
	skb->tc_at_ingress = 1;
	mini_qdisc_bstats_cpu_update(miniq, skb);

	switch (tcf_classify_ingress(skb, miniq->block, miniq->filter_list,
				     &cl_res, false)) {
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(cl_res.classid);
		break;
	case TC_ACT_SHOT:
		/* Dropped by the classifier: count it and free as a drop. */
		mini_qdisc_qstats_cpu_drop(miniq);
		kfree_skb(skb);
		return NULL;
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
	case TC_ACT_TRAP:
		/* Released with consume_skb(), i.e. not reported as a drop. */
		consume_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* skb_mac_header check was done by cls/act_bpf, so
		 * we can safely push the L2 header back before
		 * redirecting to another netdev
		 */
		__skb_push(skb, skb->mac_len);
		if (skb_do_redirect(skb) == -EAGAIN) {
			__skb_pull(skb, skb->mac_len);
			*another = true;
			break;
		}
		return NULL;
	case TC_ACT_CONSUMED:
		/* Nothing is freed here for this verdict. */
		return NULL;
	default:
		break;
	}
#endif /* CONFIG_NET_CLS_ACT */
	return skb;
}
 4995
 4996/**
 4997 *	netdev_is_rx_handler_busy - check if receive handler is registered
 4998 *	@dev: device to check
 4999 *
 5000 *	Check if a receive handler is already registered for a given device.
 5001 *	Return true if there one.
 5002 *
 5003 *	The caller must hold the rtnl_mutex.
 5004 */
 5005bool netdev_is_rx_handler_busy(struct net_device *dev)
 5006{
 5007	ASSERT_RTNL();
 5008	return dev && rtnl_dereference(dev->rx_handler);
 5009}
 5010EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
 5011
 5012/**
 5013 *	netdev_rx_handler_register - register receive handler
 5014 *	@dev: device to register a handler for
 5015 *	@rx_handler: receive handler to register
 5016 *	@rx_handler_data: data pointer that is used by rx handler
 5017 *
 5018 *	Register a receive handler for a device. This handler will then be
 5019 *	called from __netif_receive_skb. A negative errno code is returned
 5020 *	on a failure.
 5021 *
 5022 *	The caller must hold the rtnl_mutex.
 5023 *
 5024 *	For a general description of rx_handler, see enum rx_handler_result.
 5025 */
 5026int netdev_rx_handler_register(struct net_device *dev,
 5027			       rx_handler_func_t *rx_handler,
 5028			       void *rx_handler_data)
 5029{
 5030	if (netdev_is_rx_handler_busy(dev))
 5031		return -EBUSY;
 5032
 5033	if (dev->priv_flags & IFF_NO_RX_HANDLER)
 5034		return -EINVAL;
 5035
 5036	/* Note: rx_handler_data must be set before rx_handler */
 5037	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
 5038	rcu_assign_pointer(dev->rx_handler, rx_handler);
 5039
 5040	return 0;
 5041}
 5042EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
 5043
/**
 *	netdev_rx_handler_unregister - unregister receive handler
 *	@dev: device to unregister a handler from
 *
 *	Unregister a receive handler from a device.  The handler pointer
 *	is cleared first and the handler data only after synchronize_net()
 *	has returned, so this function may sleep.
 *
 *	The caller must hold the rtnl_mutex.
 */
void netdev_rx_handler_unregister(struct net_device *dev)
{

	ASSERT_RTNL();
	/* Step 1: stop new readers from picking up the handler. */
	RCU_INIT_POINTER(dev->rx_handler, NULL);
	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
	 * section has a guarantee to see a non NULL rx_handler_data
	 * as well.
	 */
	synchronize_net();
	/* Step 2: the data pointer is cleared only after the wait above. */
	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
 5065
 5066/*
 5067 * Limit the use of PFMEMALLOC reserves to those protocols that implement
 5068 * the special handling of PFMEMALLOC skbs.
 5069 */
 5070static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
 5071{
 5072	switch (skb->protocol) {
 5073	case htons(ETH_P_ARP):
 5074	case htons(ETH_P_IP):
 5075	case htons(ETH_P_IPV6):
 5076	case htons(ETH_P_8021Q):
 5077	case htons(ETH_P_8021AD):
 5078		return true;
 5079	default:
 5080		return false;
 5081	}
 5082}
 5083
 5084static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
 5085			     int *ret, struct net_device *orig_dev)
 5086{
 5087	if (nf_hook_ingress_active(skb)) {
 5088		int ingress_retval;
 5089
 5090		if (*pt_prev) {
 5091			*ret = deliver_skb(skb, *pt_prev, orig_dev);
 5092			*pt_prev = NULL;
 5093		}
 5094
 5095		rcu_read_lock();
 5096		ingress_retval = nf_hook_ingress(skb);
 5097		rcu_read_unlock();
 5098		return ingress_retval;
 5099	}
 5100	return 0;
 5101}
 5102
 5103static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
 5104				    struct packet_type **ppt_prev)
 5105{
 5106	struct packet_type *ptype, *pt_prev;
 5107	rx_handler_func_t *rx_handler;
 5108	struct sk_buff *skb = *pskb;
 5109	struct net_device *orig_dev;
 5110	bool deliver_exact = false;
 5111	int ret = NET_RX_DROP;
 5112	__be16 type;
 5113
 5114	net_timestamp_check(!netdev_tstamp_prequeue, skb);
 5115
 5116	trace_netif_receive_skb(skb);
 5117
 5118	orig_dev = skb->dev;
 5119
 5120	skb_reset_network_header(skb);
 5121	if (!skb_transport_header_was_set(skb))
 5122		skb_reset_transport_header(skb);
 5123	skb_reset_mac_len(skb);
 5124
 5125	pt_prev = NULL;
 5126
 5127another_round:
 5128	skb->skb_iif = skb->dev->ifindex;
 5129
 5130	__this_cpu_inc(softnet_data.processed);
 5131
 5132	if (static_branch_unlikely(&generic_xdp_needed_key)) {
 5133		int ret2;
 5134
 5135		preempt_disable();
 5136		ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
 5137		preempt_enable();
 5138
 5139		if (ret2 != XDP_PASS) {
 5140			ret = NET_RX_DROP;
 5141			goto out;
 5142		}
 5143		skb_reset_mac_len(skb);
 5144	}
 5145
 5146	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
 5147	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
 5148		skb = skb_vlan_untag(skb);
 5149		if (unlikely(!skb))
 5150			goto out;
 5151	}
 5152
 5153	if (skb_skip_tc_classify(skb))
 5154		goto skip_classify;
 5155
 5156	if (pfmemalloc)
 5157		goto skip_taps;
 5158
 5159	list_for_each_entry_rcu(ptype, &ptype_all, list) {
 5160		if (pt_prev)
 5161			ret = deliver_skb(skb, pt_prev, orig_dev);
 5162		pt_prev = ptype;
 5163	}
 5164
 5165	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
 5166		if (pt_prev)
 5167			ret = deliver_skb(skb, pt_prev, orig_dev);
 5168		pt_prev = ptype;
 5169	}
 5170
 5171skip_taps:
 5172#ifdef CONFIG_NET_INGRESS
 5173	if (static_branch_unlikely(&ingress_needed_key)) {
 5174		bool another = false;
 5175
 5176		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
 5177					 &another);
 5178		if (another)
 5179			goto another_round;
 5180		if (!skb)
 5181			goto out;
 5182
 5183		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
 5184			goto out;
 5185	}
 5186#endif
 5187	skb_reset_redirect(skb);
 5188skip_classify:
 5189	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
 5190		goto drop;
 5191
 5192	if (skb_vlan_tag_present(skb)) {
 5193		if (pt_prev) {
 5194			ret = deliver_skb(skb, pt_prev, orig_dev);
 5195			pt_prev = NULL;
 5196		}
 5197		if (vlan_do_receive(&skb))
 5198			goto another_round;
 5199		else if (unlikely(!skb))
 5200			goto out;
 5201	}
 5202
 5203	rx_handler = rcu_dereference(skb->dev->rx_handler);
 5204	if (rx_handler) {
 5205		if (pt_prev) {
 5206			ret = deliver_skb(skb, pt_prev, orig_dev);
 5207			pt_prev = NULL;
 5208		}
 5209		switch (rx_handler(&skb)) {
 5210		case RX_HANDLER_CONSUMED:
 5211			ret = NET_RX_SUCCESS;
 5212			goto out;
 5213		case RX_HANDLER_ANOTHER:
 5214			goto another_round;
 5215		case RX_HANDLER_EXACT:
 5216			deliver_exact = true;
 5217		case RX_HANDLER_PASS:
 5218			break;
 5219		default:
 5220			BUG();
 5221		}
 5222	}
 5223
 5224	if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
 5225check_vlan_id:
 5226		if (skb_vlan_tag_get_id(skb)) {
 5227			/* Vlan id is non 0 and vlan_do_receive() above couldn't
 5228			 * find vlan device.
 5229			 */
 5230			skb->pkt_type = PACKET_OTHERHOST;
 5231		} else if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
 5232			   skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
 5233			/* Outer header is 802.1P with vlan 0, inner header is
 5234			 * 802.1Q or 802.1AD and vlan_do_receive() above could
 5235			 * not find vlan dev for vlan id 0.
 5236			 */
 5237			__vlan_hwaccel_clear_tag(skb);
 5238			skb = skb_vlan_untag(skb);
 5239			if (unlikely(!skb))
 5240				goto out;
 5241			if (vlan_do_receive(&skb))
 5242				/* After stripping off 802.1P header with vlan 0
 5243				 * vlan dev is found for inner header.
 5244				 */
 5245				goto another_round;
 5246			else if (unlikely(!skb))
 5247				goto out;
 5248			else
 5249				/* We have stripped outer 802.1P vlan 0 header.
 5250				 * But could not find vlan dev.
 5251				 * check again for vlan id to set OTHERHOST.
 5252				 */
 5253				goto check_vlan_id;
 5254		}
 5255		/* Note: we might in the future use prio bits
 5256		 * and set skb->priority like in vlan_do_receive()
 5257		 * For the time being, just ignore Priority Code Point
 5258		 */
 5259		__vlan_hwaccel_clear_tag(skb);
 5260	}
 5261
 5262	type = skb->protocol;
 5263
 5264	/* deliver only exact match when indicated */
 5265	if (likely(!deliver_exact)) {
 5266		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 5267				       &ptype_base[ntohs(type) &
 5268						   PTYPE_HASH_MASK]);
 5269	}
 5270
 5271	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 5272			       &orig_dev->ptype_specific);
 5273
 5274	if (unlikely(skb->dev != orig_dev)) {
 5275		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 5276				       &skb->dev->ptype_specific);
 5277	}
 5278
 5279	if (pt_prev) {
 5280		if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
 5281			goto drop;
 5282		*ppt_prev = pt_prev;
 5283	} else {
 5284drop:
 5285		if (!deliver_exact)
 5286			atomic_long_inc(&skb->dev->rx_dropped);
 5287		else
 5288			atomic_long_inc(&skb->dev->rx_nohandler);
 5289		kfree_skb(skb);
 5290		/* Jamal, now you will not able to escape explaining
 5291		 * me how you were going to use this. :-)
 5292		 */
 5293		ret = NET_RX_DROP;
 5294	}
 5295
 5296out:
 5297	/* The invariant here is that if *ppt_prev is not NULL
 5298	 * then skb should also be non-NULL.
 5299	 *
 5300	 * Apparently *ppt_prev assignment above holds this invariant due to
 5301	 * skb dereferencing near it.
 5302	 */
 5303	*pskb = skb;
 5304	return ret;
 5305}
 5306
 5307static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
 5308{
 5309	struct net_device *orig_dev = skb->dev;
 5310	struct packet_type *pt_prev = NULL;
 5311	int ret;
 5312
 5313	ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
 5314	if (pt_prev)
 5315		ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
 5316					 skb->dev, pt_prev, orig_dev);
 5317	return ret;
 5318}
 5319
 5320/**
 5321 *	netif_receive_skb_core - special purpose version of netif_receive_skb
 5322 *	@skb: buffer to process
 5323 *
 5324 *	More direct receive version of netif_receive_skb().  It should
 5325 *	only be used by callers that have a need to skip RPS and Generic XDP.
 5326 *	Caller must also take care of handling if ``(page_is_)pfmemalloc``.
 5327 *
 5328 *	This function may only be called from softirq context and interrupts
 5329 *	should be enabled.
 5330 *
 5331 *	Return values (usually ignored):
 5332 *	NET_RX_SUCCESS: no congestion
 5333 *	NET_RX_DROP: packet was dropped
 5334 */
 5335int netif_receive_skb_core(struct sk_buff *skb)
 5336{
 5337	int ret;
 5338
 5339	rcu_read_lock();
 5340	ret = __netif_receive_skb_one_core(skb, false);
 5341	rcu_read_unlock();
 5342
 5343	return ret;
 5344}
 5345EXPORT_SYMBOL(netif_receive_skb_core);
 5346
 5347static inline void __netif_receive_skb_list_ptype(struct list_head *head,
 5348						  struct packet_type *pt_prev,
 5349						  struct net_device *orig_dev)
 5350{
 5351	struct sk_buff *skb, *next;
 5352
 5353	if (!pt_prev)
 5354		return;
 5355	if (list_empty(head))
 5356		return;
 5357	if (pt_prev->list_func != NULL)
 5358		INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
 5359				   ip_list_rcv, head, pt_prev, orig_dev);
 5360	else
 5361		list_for_each_entry_safe(skb, next, head, list) {
 5362			skb_list_del_init(skb);
 5363			pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 5364		}
 5365}
 5366
 5367static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
 5368{
 5369	/* Fast-path assumptions:
 5370	 * - There is no RX handler.
 5371	 * - Only one packet_type matches.
 5372	 * If either of these fails, we will end up doing some per-packet
 5373	 * processing in-line, then handling the 'last ptype' for the whole
 5374	 * sublist.  This can't cause out-of-order delivery to any single ptype,
 5375	 * because the 'last ptype' must be constant across the sublist, and all
 5376	 * other ptypes are handled per-packet.
 5377	 */
 5378	/* Current (common) ptype of sublist */
 5379	struct packet_type *pt_curr = NULL;
 5380	/* Current (common) orig_dev of sublist */
 5381	struct net_device *od_curr = NULL;
 5382	struct list_head sublist;
 5383	struct sk_buff *skb, *next;
 5384
 5385	INIT_LIST_HEAD(&sublist);
 5386	list_for_each_entry_safe(skb, next, head, list) {
 5387		struct net_device *orig_dev = skb->dev;
 5388		struct packet_type *pt_prev = NULL;
 5389
 5390		skb_list_del_init(skb);
 5391		__netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
 5392		if (!pt_prev)
 5393			continue;
 5394		if (pt_curr != pt_prev || od_curr != orig_dev) {
 5395			/* dispatch old sublist */
 5396			__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
 5397			/* start new sublist */
 5398			INIT_LIST_HEAD(&sublist);
 5399			pt_curr = pt_prev;
 5400			od_curr = orig_dev;
 5401		}
 5402		list_add_tail(&skb->list, &sublist);
 5403	}
 5404
 5405	/* dispatch final sublist */
 5406	__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
 5407}
 5408
 5409static int __netif_receive_skb(struct sk_buff *skb)
 5410{
 5411	int ret;
 5412
 5413	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
 5414		unsigned int noreclaim_flag;
 5415
 5416		/*
 5417		 * PFMEMALLOC skbs are special, they should
 5418		 * - be delivered to SOCK_MEMALLOC sockets only
 5419		 * - stay away from userspace
 5420		 * - have bounded memory usage
 5421		 *
 5422		 * Use PF_MEMALLOC as this saves us from propagating the allocation
 5423		 * context down to all allocation sites.
 5424		 */
 5425		noreclaim_flag = memalloc_noreclaim_save();
 5426		ret = __netif_receive_skb_one_core(skb, true);
 5427		memalloc_noreclaim_restore(noreclaim_flag);
 5428	} else
 5429		ret = __netif_receive_skb_one_core(skb, false);
 5430
 5431	return ret;
 5432}
 5433
 5434static void __netif_receive_skb_list(struct list_head *head)
 5435{
 5436	unsigned long noreclaim_flag = 0;
 5437	struct sk_buff *skb, *next;
 5438	bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
 5439
 5440	list_for_each_entry_safe(skb, next, head, list) {
 5441		if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
 5442			struct list_head sublist;
 5443
 5444			/* Handle the previous sublist */
 5445			list_cut_before(&sublist, head, &skb->list);
 5446			if (!list_empty(&sublist))
 5447				__netif_receive_skb_list_core(&sublist, pfmemalloc);
 5448			pfmemalloc = !pfmemalloc;
 5449			/* See comments in __netif_receive_skb */
 5450			if (pfmemalloc)
 5451				noreclaim_flag = memalloc_noreclaim_save();
 5452			else
 5453				memalloc_noreclaim_restore(noreclaim_flag);
 5454		}
 5455	}
 5456	/* Handle the remaining sublist */
 5457	if (!list_empty(head))
 5458		__netif_receive_skb_list_core(head, pfmemalloc);
 5459	/* Restore pflags */
 5460	if (pfmemalloc)
 5461		memalloc_noreclaim_restore(noreclaim_flag);
 5462}
 5463
 5464static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
 5465{
 5466	struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
 5467	struct bpf_prog *new = xdp->prog;
 5468	int ret = 0;
 5469
 5470	if (new) {
 5471		u32 i;
 5472
 5473		mutex_lock(&new->aux->used_maps_mutex);
 5474
 5475		/* generic XDP does not work with DEVMAPs that can
 5476		 * have a bpf_prog installed on an entry
 5477		 */
 5478		for (i = 0; i < new->aux->used_map_cnt; i++) {
 5479			if (dev_map_can_have_prog(new->aux->used_maps[i]) ||
 5480			    cpu_map_prog_allowed(new->aux->used_maps[i])) {
 5481				mutex_unlock(&new->aux->used_maps_mutex);
 5482				return -EINVAL;
 5483			}
 5484		}
 5485
 5486		mutex_unlock(&new->aux->used_maps_mutex);
 5487	}
 5488
 5489	switch (xdp->command) {
 5490	case XDP_SETUP_PROG:
 5491		rcu_assign_pointer(dev->xdp_prog, new);
 5492		if (old)
 5493			bpf_prog_put(old);
 5494
 5495		if (old && !new) {
 5496			static_branch_dec(&generic_xdp_needed_key);
 5497		} else if (new && !old) {
 5498			static_branch_inc(&generic_xdp_needed_key);
 5499			dev_disable_lro(dev);
 5500			dev_disable_gro_hw(dev);
 5501		}
 5502		break;
 5503
 5504	default:
 5505		ret = -EINVAL;
 5506		break;
 5507	}
 5508
 5509	return ret;
 5510}
 5511
 5512static int netif_receive_skb_internal(struct sk_buff *skb)
 5513{
 5514	int ret;
 5515
 5516	net_timestamp_check(netdev_tstamp_prequeue, skb);
 5517
 5518	if (skb_defer_rx_timestamp(skb))
 5519		return NET_RX_SUCCESS;
 5520
 5521	rcu_read_lock();
 5522#ifdef CONFIG_RPS
 5523	if (static_branch_unlikely(&rps_needed)) {
 5524		struct rps_dev_flow voidflow, *rflow = &voidflow;
 5525		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
 5526
 5527		if (cpu >= 0) {
 5528			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 5529			rcu_read_unlock();
 5530			return ret;
 5531		}
 5532	}
 5533#endif
 5534	ret = __netif_receive_skb(skb);
 5535	rcu_read_unlock();
 5536	return ret;
 5537}
 5538
 5539static void netif_receive_skb_list_internal(struct list_head *head)
 5540{
 5541	struct sk_buff *skb, *next;
 5542	struct list_head sublist;
 5543
 5544	INIT_LIST_HEAD(&sublist);
 5545	list_for_each_entry_safe(skb, next, head, list) {
 5546		net_timestamp_check(netdev_tstamp_prequeue, skb);
 5547		skb_list_del_init(skb);
 5548		if (!skb_defer_rx_timestamp(skb))
 5549			list_add_tail(&skb->list, &sublist);
 5550	}
 5551	list_splice_init(&sublist, head);
 5552
 5553	rcu_read_lock();
 5554#ifdef CONFIG_RPS
 5555	if (static_branch_unlikely(&rps_needed)) {
 5556		list_for_each_entry_safe(skb, next, head, list) {
 5557			struct rps_dev_flow voidflow, *rflow = &voidflow;
 5558			int cpu = get_rps_cpu(skb->dev, skb, &rflow);
 5559
 5560			if (cpu >= 0) {
 5561				/* Will be handled, remove from list */
 5562				skb_list_del_init(skb);
 5563				enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 5564			}
 5565		}
 5566	}
 5567#endif
 5568	__netif_receive_skb_list(head);
 5569	rcu_read_unlock();
 5570}
 5571
 5572/**
 5573 *	netif_receive_skb - process receive buffer from network
 5574 *	@skb: buffer to process
 5575 *
 5576 *	netif_receive_skb() is the main receive data processing function.
 5577 *	It always succeeds. The buffer may be dropped during processing
 5578 *	for congestion control or by the protocol layers.
 5579 *
 5580 *	This function may only be called from softirq context and interrupts
 5581 *	should be enabled.
 5582 *
 5583 *	Return values (usually ignored):
 5584 *	NET_RX_SUCCESS: no congestion
 5585 *	NET_RX_DROP: packet was dropped
 5586 */
 5587int netif_receive_skb(struct sk_buff *skb)
 5588{
 5589	int ret;
 5590
 5591	trace_netif_receive_skb_entry(skb);
 5592
 5593	ret = netif_receive_skb_internal(skb);
 5594	trace_netif_receive_skb_exit(ret);
 5595
 5596	return ret;
 5597}
 5598EXPORT_SYMBOL(netif_receive_skb);
 5599
 5600/**
 5601 *	netif_receive_skb_list - process many receive buffers from network
 5602 *	@head: list of skbs to process.
 5603 *
 5604 *	Since return value of netif_receive_skb() is normally ignored, and
 5605 *	wouldn't be meaningful for a list, this function returns void.
 5606 *
 5607 *	This function may only be called from softirq context and interrupts
 5608 *	should be enabled.
 5609 */
 5610void netif_receive_skb_list(struct list_head *head)
 5611{
 5612	struct sk_buff *skb;
 5613
 5614	if (list_empty(head))
 5615		return;
 5616	if (trace_netif_receive_skb_list_entry_enabled()) {
 5617		list_for_each_entry(skb, head, list)
 5618			trace_netif_receive_skb_list_entry(skb);
 5619	}
 5620	netif_receive_skb_list_internal(head);
 5621	trace_netif_receive_skb_list_exit(0);
 5622}
 5623EXPORT_SYMBOL(netif_receive_skb_list);
 5624
 5625static DEFINE_PER_CPU(struct work_struct, flush_works);
 5626
 5627/* Network device is going away, flush any packets still pending */
 5628static void flush_backlog(struct work_struct *work)
 5629{
 5630	struct sk_buff *skb, *tmp;
 5631	struct softnet_data *sd;
 5632
 5633	local_bh_disable();
 5634	sd = this_cpu_ptr(&softnet_data);
 5635
 5636	local_irq_disable();
 5637	rps_lock(sd);
 5638	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
 5639		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 5640			__skb_unlink(skb, &sd->input_pkt_queue);
 5641			dev_kfree_skb_irq(skb);
 5642			input_queue_head_incr(sd);
 5643		}
 5644	}
 5645	rps_unlock(sd);
 5646	local_irq_enable();
 5647
 5648	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
 5649		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 5650			__skb_unlink(skb, &sd->process_queue);
 5651			kfree_skb(skb);
 5652			input_queue_head_incr(sd);
 5653		}
 5654	}
 5655	local_bh_enable();
 5656}
 5657
 5658static bool flush_required(int cpu)
 5659{
 5660#if IS_ENABLED(CONFIG_RPS)
 5661	struct softnet_data *sd = &per_cpu(softnet_data, cpu);
 5662	bool do_flush;
 5663
 5664	local_irq_disable();
 5665	rps_lock(sd);
 5666
 5667	/* as insertion into process_queue happens with the rps lock held,
 5668	 * process_queue access may race only with dequeue
 5669	 */
 5670	do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
 5671		   !skb_queue_empty_lockless(&sd->process_queue);
 5672	rps_unlock(sd);
 5673	local_irq_enable();
 5674
 5675	return do_flush;
 5676#endif
 5677	/* without RPS we can't safely check input_pkt_queue: during a
 5678	 * concurrent remote skb_queue_splice() we can detect as empty both
 5679	 * input_pkt_queue and process_queue even if the latter could end-up
 5680	 * containing a lot of packets.
 5681	 */
 5682	return true;
 5683}
 5684
 5685static void flush_all_backlogs(void)
 5686{
 5687	static cpumask_t flush_cpus;
 5688	unsigned int cpu;
 5689
 5690	/* since we are under rtnl lock protection we can use static data
 5691	 * for the cpumask and avoid allocating on stack the possibly
 5692	 * large mask
 5693	 */
 5694	ASSERT_RTNL();
 5695
 5696	get_online_cpus();
 5697
 5698	cpumask_clear(&flush_cpus);
 5699	for_each_online_cpu(cpu) {
 5700		if (flush_required(cpu)) {
 5701			queue_work_on(cpu, system_highpri_wq,
 5702				      per_cpu_ptr(&flush_works, cpu));
 5703			cpumask_set_cpu(cpu, &flush_cpus);
 5704		}
 5705	}
 5706
 5707	/* we can have in flight packet[s] on the cpus we are not flushing,
 5708	 * synchronize_net() in rollback_registered_many() will take care of
 5709	 * them
 5710	 */
 5711	for_each_cpu(cpu, &flush_cpus)
 5712		flush_work(per_cpu_ptr(&flush_works, cpu));
 5713
 5714	put_online_cpus();
 5715}
 5716
 5717/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
 5718static void gro_normal_list(struct napi_struct *napi)
 5719{
 5720	if (!napi->rx_count)
 5721		return;
 5722	netif_receive_skb_list_internal(&napi->rx_list);
 5723	INIT_LIST_HEAD(&napi->rx_list);
 5724	napi->rx_count = 0;
 5725}
 5726
 5727/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded,
 5728 * pass the whole batch up to the stack.
 5729 */
 5730static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb)
 5731{
 5732	list_add_tail(&skb->list, &napi->rx_list);
 5733	if (++napi->rx_count >= gro_normal_batch)
 5734		gro_normal_list(napi);
 5735}
 5736
 5737INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int));
 5738INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int));
 5739static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
 5740{
 5741	struct packet_offload *ptype;
 5742	__be16 type = skb->protocol;
 5743	struct list_head *head = &offload_base;
 5744	int err = -ENOENT;
 5745
 5746	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
 5747
 5748	if (NAPI_GRO_CB(skb)->count == 1) {
 5749		skb_shinfo(skb)->gso_size = 0;
 5750		goto out;
 5751	}
 5752
 5753	rcu_read_lock();
 5754	list_for_each_entry_rcu(ptype, head, list) {
 5755		if (ptype->type != type || !ptype->callbacks.gro_complete)
 5756			continue;
 5757
 5758		err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
 5759					 ipv6_gro_complete, inet_gro_complete,
 5760					 skb, 0);
 5761		break;
 5762	}
 5763	rcu_read_unlock();
 5764
 5765	if (err) {
 5766		WARN_ON(&ptype->list == head);
 5767		kfree_skb(skb);
 5768		return NET_RX_SUCCESS;
 5769	}
 5770
 5771out:
 5772	gro_normal_one(napi, skb);
 5773	return NET_RX_SUCCESS;
 5774}
 5775
 5776static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
 5777				   bool flush_old)
 5778{
 5779	struct list_head *head = &napi->gro_hash[index].list;
 5780	struct sk_buff *skb, *p;
 5781
 5782	list_for_each_entry_safe_reverse(skb, p, head, list) {
 5783		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
 5784			return;
 5785		skb_list_del_init(skb);
 5786		napi_gro_complete(napi, skb);
 5787		napi->gro_hash[index].count--;
 5788	}
 5789
 5790	if (!napi->gro_hash[index].count)
 5791		__clear_bit(index, &napi->gro_bitmask);
 5792}
 5793
 5794/* napi->gro_hash[].list contains packets ordered by age.
 5795 * youngest packets at the head of it.
 5796 * Complete skbs in reverse order to reduce latencies.
 5797 */
 5798void napi_gro_flush(struct napi_struct *napi, bool flush_old)
 5799{
 5800	unsigned long bitmask = napi->gro_bitmask;
 5801	unsigned int i, base = ~0U;
 5802
 5803	while ((i = ffs(bitmask)) != 0) {
 5804		bitmask >>= i;
 5805		base += i;
 5806		__napi_gro_flush_chain(napi, base, flush_old);
 5807	}
 5808}
 5809EXPORT_SYMBOL(napi_gro_flush);
 5810
 5811static struct list_head *gro_list_prepare(struct napi_struct *napi,
 5812					  struct sk_buff *skb)
 5813{
 5814	unsigned int maclen = skb->dev->hard_header_len;
 5815	u32 hash = skb_get_hash_raw(skb);
 5816	struct list_head *head;
 5817	struct sk_buff *p;
 5818
 5819	head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
 5820	list_for_each_entry(p, head, list) {
 5821		unsigned long diffs;
 5822
 5823		NAPI_GRO_CB(p)->flush = 0;
 5824
 5825		if (hash != skb_get_hash_raw(p)) {
 5826			NAPI_GRO_CB(p)->same_flow = 0;
 5827			continue;
 5828		}
 5829
 5830		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
 5831		diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
 5832		if (skb_vlan_tag_present(p))
 5833			diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb);
 5834		diffs |= skb_metadata_dst_cmp(p, skb);
 5835		diffs |= skb_metadata_differs(p, skb);
 5836		if (maclen == ETH_HLEN)
 5837			diffs |= compare_ether_header(skb_mac_header(p),
 5838						      skb_mac_header(skb));
 5839		else if (!diffs)
 5840			diffs = memcmp(skb_mac_header(p),
 5841				       skb_mac_header(skb),
 5842				       maclen);
 5843		NAPI_GRO_CB(p)->same_flow = !diffs;
 5844	}
 5845
 5846	return head;
 5847}
 5848
 5849static void skb_gro_reset_offset(struct sk_buff *skb)
 5850{
 5851	const struct skb_shared_info *pinfo = skb_shinfo(skb);
 5852	const skb_frag_t *frag0 = &pinfo->frags[0];
 5853
 5854	NAPI_GRO_CB(skb)->data_offset = 0;
 5855	NAPI_GRO_CB(skb)->frag0 = NULL;
 5856	NAPI_GRO_CB(skb)->frag0_len = 0;
 5857
 5858	if (!skb_headlen(skb) && pinfo->nr_frags &&
 5859	    !PageHighMem(skb_frag_page(frag0))) {
 5860		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
 5861		NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
 5862						    skb_frag_size(frag0),
 5863						    skb->end - skb->tail);
 5864	}
 5865}
 5866
 5867static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
 5868{
 5869	struct skb_shared_info *pinfo = skb_shinfo(skb);
 5870
 5871	BUG_ON(skb->end - skb->tail < grow);
 5872
 5873	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
 5874
 5875	skb->data_len -= grow;
 5876	skb->tail += grow;
 5877
 5878	skb_frag_off_add(&pinfo->frags[0], grow);
 5879	skb_frag_size_sub(&pinfo->frags[0], grow);
 5880
 5881	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
 5882		skb_frag_unref(skb, 0);
 5883		memmove(pinfo->frags, pinfo->frags + 1,
 5884			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
 5885	}
 5886}
 5887
 5888static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
 5889{
 5890	struct sk_buff *oldest;
 5891
 5892	oldest = list_last_entry(head, struct sk_buff, list);
 5893
 5894	/* We are called with head length >= MAX_GRO_SKBS, so this is
 5895	 * impossible.
 5896	 */
 5897	if (WARN_ON_ONCE(!oldest))
 5898		return;
 5899
 5900	/* Do not adjust napi->gro_hash[].count, caller is adding a new
 5901	 * SKB to the chain.
 5902	 */
 5903	skb_list_del_init(oldest);
 5904	napi_gro_complete(napi, oldest);
 5905}
 5906
 5907INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *,
 5908							   struct sk_buff *));
 5909INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *,
 5910							   struct sk_buff *));
 5911static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 5912{
 5913	u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
 5914	struct list_head *head = &offload_base;
 5915	struct packet_offload *ptype;
 5916	__be16 type = skb->protocol;
 5917	struct list_head *gro_head;
 5918	struct sk_buff *pp = NULL;
 5919	enum gro_result ret;
 5920	int same_flow;
 5921	int grow;
 5922
 5923	if (netif_elide_gro(skb->dev))
 5924		goto normal;
 5925
 5926	gro_head = gro_list_prepare(napi, skb);
 5927
 5928	rcu_read_lock();
 5929	list_for_each_entry_rcu(ptype, head, list) {
 5930		if (ptype->type != type || !ptype->callbacks.gro_receive)
 5931			continue;
 5932
 5933		skb_set_network_header(skb, skb_gro_offset(skb));
 5934		skb_reset_mac_len(skb);
 5935		NAPI_GRO_CB(skb)->same_flow = 0;
 5936		NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
 5937		NAPI_GRO_CB(skb)->free = 0;
 5938		NAPI_GRO_CB(skb)->encap_mark = 0;
 5939		NAPI_GRO_CB(skb)->recursion_counter = 0;
 5940		NAPI_GRO_CB(skb)->is_fou = 0;
 5941		NAPI_GRO_CB(skb)->is_atomic = 1;
 5942		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
 5943
 5944		/* Setup for GRO checksum validation */
 5945		switch (skb->ip_summed) {
 5946		case CHECKSUM_COMPLETE:
 5947			NAPI_GRO_CB(skb)->csum = skb->csum;
 5948			NAPI_GRO_CB(skb)->csum_valid = 1;
 5949			NAPI_GRO_CB(skb)->csum_cnt = 0;
 5950			break;
 5951		case CHECKSUM_UNNECESSARY:
 5952			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
 5953			NAPI_GRO_CB(skb)->csum_valid = 0;
 5954			break;
 5955		default:
 5956			NAPI_GRO_CB(skb)->csum_cnt = 0;
 5957			NAPI_GRO_CB(skb)->csum_valid = 0;
 5958		}
 5959
 5960		pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
 5961					ipv6_gro_receive, inet_gro_receive,
 5962					gro_head, skb);
 5963		break;
 5964	}
 5965	rcu_read_unlock();
 5966
 5967	if (&ptype->list == head)
 5968		goto normal;
 5969
 5970	if (PTR_ERR(pp) == -EINPROGRESS) {
 5971		ret = GRO_CONSUMED;
 5972		goto ok;
 5973	}
 5974
 5975	same_flow = NAPI_GRO_CB(skb)->same_flow;
 5976	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
 5977
 5978	if (pp) {
 5979		skb_list_del_init(pp);
 5980		napi_gro_complete(napi, pp);
 5981		napi->gro_hash[hash].count--;
 5982	}
 5983
 5984	if (same_flow)
 5985		goto ok;
 5986
 5987	if (NAPI_GRO_CB(skb)->flush)
 5988		goto normal;
 5989
 5990	if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
 5991		gro_flush_oldest(napi, gro_head);
 5992	} else {
 5993		napi->gro_hash[hash].count++;
 5994	}
 5995	NAPI_GRO_CB(skb)->count = 1;
 5996	NAPI_GRO_CB(skb)->age = jiffies;
 5997	NAPI_GRO_CB(skb)->last = skb;
 5998	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
 5999	list_add(&skb->list, gro_head);
 6000	ret = GRO_HELD;
 6001
 6002pull:
 6003	grow = skb_gro_offset(skb) - skb_headlen(skb);
 6004	if (grow > 0)
 6005		gro_pull_from_frag0(skb, grow);
 6006ok:
 6007	if (napi->gro_hash[hash].count) {
 6008		if (!test_bit(hash, &napi->gro_bitmask))
 6009			__set_bit(hash, &napi->gro_bitmask);
 6010	} else if (test_bit(hash, &napi->gro_bitmask)) {
 6011		__clear_bit(hash, &napi->gro_bitmask);
 6012	}
 6013
 6014	return ret;
 6015
 6016normal:
 6017	ret = GRO_NORMAL;
 6018	goto pull;
 6019}
 6020
 6021struct packet_offload *gro_find_receive_by_type(__be16 type)
 6022{
 6023	struct list_head *offload_head = &offload_base;
 6024	struct packet_offload *ptype;
 6025
 6026	list_for_each_entry_rcu(ptype, offload_head, list) {
 6027		if (ptype->type != type || !ptype->callbacks.gro_receive)
 6028			continue;
 6029		return ptype;
 6030	}
 6031	return NULL;
 6032}
 6033EXPORT_SYMBOL(gro_find_receive_by_type);
 6034
 6035struct packet_offload *gro_find_complete_by_type(__be16 type)
 6036{
 6037	struct list_head *offload_head = &offload_base;
 6038	struct packet_offload *ptype;
 6039
 6040	list_for_each_entry_rcu(ptype, offload_head, list) {
 6041		if (ptype->type != type || !ptype->callbacks.gro_complete)
 6042			continue;
 6043		return ptype;
 6044	}
 6045	return NULL;
 6046}
 6047EXPORT_SYMBOL(gro_find_complete_by_type);
 6048
 6049static void napi_skb_free_stolen_head(struct sk_buff *skb)
 6050{
 6051	skb_dst_drop(skb);
 6052	skb_ext_put(skb);
 6053	kmem_cache_free(skbuff_head_cache, skb);
 6054}
 6055
 6056static gro_result_t napi_skb_finish(struct napi_struct *napi,
 6057				    struct sk_buff *skb,
 6058				    gro_result_t ret)
 6059{
 6060	switch (ret) {
 6061	case GRO_NORMAL:
 6062		gro_normal_one(napi, skb);
 6063		break;
 6064
 6065	case GRO_DROP:
 6066		kfree_skb(skb);
 6067		break;
 6068
 6069	case GRO_MERGED_FREE:
 6070		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
 6071			napi_skb_free_stolen_head(skb);
 6072		else
 6073			__kfree_skb(skb);
 6074		break;
 6075
 6076	case GRO_HELD:
 6077	case GRO_MERGED:
 6078	case GRO_CONSUMED:
 6079		break;
 6080	}
 6081
 6082	return ret;
 6083}
 6084
 6085gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 6086{
 6087	gro_result_t ret;
 6088
 6089	skb_mark_napi_id(skb, napi);
 6090	trace_napi_gro_receive_entry(skb);
 6091
 6092	skb_gro_reset_offset(skb);
 6093
 6094	ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
 6095	trace_napi_gro_receive_exit(ret);
 6096
 6097	return ret;
 6098}
 6099EXPORT_SYMBOL(napi_gro_receive);
 6100
 6101static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
 6102{
 6103	if (unlikely(skb->pfmemalloc)) {
 6104		consume_skb(skb);
 6105		return;
 6106	}
 6107	__skb_pull(skb, skb_headlen(skb));
 6108	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
 6109	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
 6110	__vlan_hwaccel_clear_tag(skb);
 6111	skb->dev = napi->dev;
 6112	skb->skb_iif = 0;
 6113
 6114	/* eth_type_trans() assumes pkt_type is PACKET_HOST */
 6115	skb->pkt_type = PACKET_HOST;
 6116
 6117	skb->encapsulation = 0;
 6118	skb_shinfo(skb)->gso_type = 0;
 6119	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
 6120	skb_ext_reset(skb);
 6121
 6122	napi->skb = skb;
 6123}
 6124
 6125struct sk_buff *napi_get_frags(struct napi_struct *napi)
 6126{
 6127	struct sk_buff *skb = napi->skb;
 6128
 6129	if (!skb) {
 6130		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
 6131		if (skb) {
 6132			napi->skb = skb;
 6133			skb_mark_napi_id(skb, napi);
 6134		}
 6135	}
 6136	return skb;
 6137}
 6138EXPORT_SYMBOL(napi_get_frags);
 6139
 6140static gro_result_t napi_frags_finish(struct napi_struct *napi,
 6141				      struct sk_buff *skb,
 6142				      gro_result_t ret)
 6143{
 6144	switch (ret) {
 6145	case GRO_NORMAL:
 6146	case GRO_HELD:
 6147		__skb_push(skb, ETH_HLEN);
 6148		skb->protocol = eth_type_trans(skb, skb->dev);
 6149		if (ret == GRO_NORMAL)
 6150			gro_normal_one(napi, skb);
 6151		break;
 6152
 6153	case GRO_DROP:
 6154		napi_reuse_skb(napi, skb);
 6155		break;
 6156
 6157	case GRO_MERGED_FREE:
 6158		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
 6159			napi_skb_free_stolen_head(skb);
 6160		else
 6161			napi_reuse_skb(napi, skb);
 6162		break;
 6163
 6164	case GRO_MERGED:
 6165	case GRO_CONSUMED:
 6166		break;
 6167	}
 6168
 6169	return ret;
 6170}
 6171
 6172/* Upper GRO stack assumes network header starts at gro_offset=0
 6173 * Drivers could call both napi_gro_frags() and napi_gro_receive()
 6174 * We copy ethernet header into skb->data to have a common layout.
 6175 */
 6176static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
 6177{
 6178	struct sk_buff *skb = napi->skb;
 6179	const struct ethhdr *eth;
 6180	unsigned int hlen = sizeof(*eth);
 6181
 6182	napi->skb = NULL;
 6183
 6184	skb_reset_mac_header(skb);
 6185	skb_gro_reset_offset(skb);
 6186
 6187	if (unlikely(skb_gro_header_hard(skb, hlen))) {
 6188		eth = skb_gro_header_slow(skb, hlen, 0);
 6189		if (unlikely(!eth)) {
 6190			net_warn_ratelimited("%s: dropping impossible skb from %s\n",
 6191					     __func__, napi->dev->name);
 6192			napi_reuse_skb(napi, skb);
 6193			return NULL;
 6194		}
 6195	} else {
 6196		eth = (const struct ethhdr *)skb->data;
 6197		gro_pull_from_frag0(skb, hlen);
 6198		NAPI_GRO_CB(skb)->frag0 += hlen;
 6199		NAPI_GRO_CB(skb)->frag0_len -= hlen;
 6200	}
 6201	__skb_pull(skb, hlen);
 6202
 6203	/*
 6204	 * This works because the only protocols we care about don't require
 6205	 * special handling.
 6206	 * We'll fix it up properly in napi_frags_finish()
 6207	 */
 6208	skb->protocol = eth->h_proto;
 6209
 6210	return skb;
 6211}
 6212
 6213gro_result_t napi_gro_frags(struct napi_struct *napi)
 6214{
 6215	gro_result_t ret;
 6216	struct sk_buff *skb = napi_frags_skb(napi);
 6217
 6218	if (!skb)
 6219		return GRO_DROP;
 6220
 6221	trace_napi_gro_frags_entry(skb);
 6222
 6223	ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
 6224	trace_napi_gro_frags_exit(ret);
 6225
 6226	return ret;
 6227}
 6228EXPORT_SYMBOL(napi_gro_frags);
 6229
 6230/* Compute the checksum from gro_offset and return the folded value
 6231 * after adding in any pseudo checksum.
 6232 */
 6233__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
 6234{
 6235	__wsum wsum;
 6236	__sum16 sum;
 6237
 6238	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
 6239
 6240	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
 6241	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
 6242	/* See comments in __skb_checksum_complete(). */
 6243	if (likely(!sum)) {
 6244		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
 6245		    !skb->csum_complete_sw)
 6246			netdev_rx_csum_fault(skb->dev, skb);
 6247	}
 6248
 6249	NAPI_GRO_CB(skb)->csum = wsum;
 6250	NAPI_GRO_CB(skb)->csum_valid = 1;
 6251
 6252	return sum;
 6253}
 6254EXPORT_SYMBOL(__skb_gro_checksum_complete);
 6255
 6256static void net_rps_send_ipi(struct softnet_data *remsd)
 6257{
 6258#ifdef CONFIG_RPS
 6259	while (remsd) {
 6260		struct softnet_data *next = remsd->rps_ipi_next;
 6261
 6262		if (cpu_online(remsd->cpu))
 6263			smp_call_function_single_async(remsd->cpu, &remsd->csd);
 6264		remsd = next;
 6265	}
 6266#endif
 6267}
 6268
 6269/*
 6270 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 6271 * Note: called with local irq disabled, but exits with local irq enabled.
 6272 */
 6273static void net_rps_action_and_irq_enable(struct softnet_data *sd)
 6274{
 6275#ifdef CONFIG_RPS
 6276	struct softnet_data *remsd = sd->rps_ipi_list;
 6277
 6278	if (remsd) {
 6279		sd->rps_ipi_list = NULL;
 6280
 6281		local_irq_enable();
 6282
 6283		/* Send pending IPI's to kick RPS processing on remote cpus. */
 6284		net_rps_send_ipi(remsd);
 6285	} else
 6286#endif
 6287		local_irq_enable();
 6288}
 6289
 6290static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
 6291{
 6292#ifdef CONFIG_RPS
 6293	return sd->rps_ipi_list != NULL;
 6294#else
 6295	return false;
 6296#endif
 6297}
 6298
 6299static int process_backlog(struct napi_struct *napi, int quota)
 6300{
 6301	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
 6302	bool again = true;
 6303	int work = 0;
 6304
 6305	/* Check if we have pending ipi, its better to send them now,
 6306	 * not waiting net_rx_action() end.
 6307	 */
 6308	if (sd_has_rps_ipi_waiting(sd)) {
 6309		local_irq_disable();
 6310		net_rps_action_and_irq_enable(sd);
 6311	}
 6312
 6313	napi->weight = dev_rx_weight;
 6314	while (again) {
 6315		struct sk_buff *skb;
 6316
 6317		while ((skb = __skb_dequeue(&sd->process_queue))) {
 6318			rcu_read_lock();
 6319			__netif_receive_skb(skb);
 6320			rcu_read_unlock();
 6321			input_queue_head_incr(sd);
 6322			if (++work >= quota)
 6323				return work;
 6324
 6325		}
 6326
 6327		local_irq_disable();
 6328		rps_lock(sd);
 6329		if (skb_queue_empty(&sd->input_pkt_queue)) {
 6330			/*
 6331			 * Inline a custom version of __napi_complete().
 6332			 * only current cpu owns and manipulates this napi,
 6333			 * and NAPI_STATE_SCHED is the only possible flag set
 6334			 * on backlog.
 6335			 * We can use a plain write instead of clear_bit(),
 6336			 * and we dont need an smp_mb() memory barrier.
 6337			 */
 6338			napi->state = 0;
 6339			again = false;
 6340		} else {
 6341			skb_queue_splice_tail_init(&sd->input_pkt_queue,
 6342						   &sd->process_queue);
 6343		}
 6344		rps_unlock(sd);
 6345		local_irq_enable();
 6346	}
 6347
 6348	return work;
 6349}
 6350
 6351/**
 6352 * __napi_schedule - schedule for receive
 6353 * @n: entry to schedule
 6354 *
 6355 * The entry's receive function will be scheduled to run.
 6356 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
 6357 */
 6358void __napi_schedule(struct napi_struct *n)
 6359{
 6360	unsigned long flags;
 6361
 6362	local_irq_save(flags);
 6363	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 6364	local_irq_restore(flags);
 6365}
 6366EXPORT_SYMBOL(__napi_schedule);
 6367
 6368/**
 6369 *	napi_schedule_prep - check if napi can be scheduled
 6370 *	@n: napi context
 6371 *
 6372 * Test if NAPI routine is already running, and if not mark
 6373 * it as running.  This is used as a condition variable to
 6374 * insure only one NAPI poll instance runs.  We also make
 6375 * sure there is no pending NAPI disable.
 6376 */
 6377bool napi_schedule_prep(struct napi_struct *n)
 6378{
 6379	unsigned long val, new;
 6380
 6381	do {
 6382		val = READ_ONCE(n->state);
 6383		if (unlikely(val & NAPIF_STATE_DISABLE))
 6384			return false;
 6385		new = val | NAPIF_STATE_SCHED;
 6386
 6387		/* Sets STATE_MISSED bit if STATE_SCHED was already set
 6388		 * This was suggested by Alexander Duyck, as compiler
 6389		 * emits better code than :
 6390		 * if (val & NAPIF_STATE_SCHED)
 6391		 *     new |= NAPIF_STATE_MISSED;
 6392		 */
 6393		new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
 6394						   NAPIF_STATE_MISSED;
 6395	} while (cmpxchg(&n->state, val, new) != val);
 6396
 6397	return !(val & NAPIF_STATE_SCHED);
 6398}
 6399EXPORT_SYMBOL(napi_schedule_prep);
 6400
 6401/**
 6402 * __napi_schedule_irqoff - schedule for receive
 6403 * @n: entry to schedule
 6404 *
 6405 * Variant of __napi_schedule() assuming hard irqs are masked
 6406 */
 6407void __napi_schedule_irqoff(struct napi_struct *n)
 6408{
 6409	____napi_schedule(this_cpu_ptr(&softnet_data), n);
 6410}
 6411EXPORT_SYMBOL(__napi_schedule_irqoff);
 6412
/* Finish a poll round for @n.
 *
 * @n:         NAPI context being completed
 * @work_done: work count reported by the caller; this function only tests
 *             whether it is non-zero
 *
 * Flushes GRO state, calls gro_normal_list(), unlinks @n from its poll list
 * if it is still linked, then atomically clears NAPIF_STATE_SCHED and
 * NAPIF_STATE_MISSED.  If MISSED was set, SCHED is left set and @n is
 * rescheduled instead.  When a timeout was selected, n->timer is started
 * for that many nanoseconds.
 *
 * Returns false when NPSVC or IN_BUSY_POLL is set (nothing is done), when
 * @n had to be rescheduled because of MISSED, or when hard irq deferral is
 * in effect with a non-zero gro_flush_timeout; true otherwise.
 */
bool napi_complete_done(struct napi_struct *n, int work_done)
{
	unsigned long flags, val, new, timeout = 0;
	bool ret = true;

	/*
	 * 1) Don't let napi dequeue from the cpu poll list
	 *    just in case it's running on a different cpu.
	 * 2) If we are busy polling, do nothing here, we have
	 *    the guarantee we will be called later.
	 */
	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
				 NAPIF_STATE_IN_BUSY_POLL)))
		return false;

	if (work_done) {
		/* Work was done: pick up the GRO flush timeout (only if GRO
		 * holds packets) and reload the hard irq deferral counter
		 * from the device setting.
		 */
		if (n->gro_bitmask)
			timeout = READ_ONCE(n->dev->gro_flush_timeout);
		n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
	}
	if (n->defer_hard_irqs_count > 0) {
		/* Consume one deferral; with a non-zero timeout the caller
		 * is told the completion did not fully happen (ret = false).
		 */
		n->defer_hard_irqs_count--;
		timeout = READ_ONCE(n->dev->gro_flush_timeout);
		if (timeout)
			ret = false;
	}
	if (n->gro_bitmask) {
		/* When the NAPI instance uses a timeout and keeps postponing
		 * it, we need to bound somehow the time packets are kept in
		 * the GRO layer
		 */
		napi_gro_flush(n, !!timeout);
	}

	gro_normal_list(n);

	if (unlikely(!list_empty(&n->poll_list))) {
		/* If n->poll_list is not empty, we need to mask irqs */
		local_irq_save(flags);
		list_del_init(&n->poll_list);
		local_irq_restore(flags);
	}

	/* Atomically drop SCHED/MISSED; retry if the state word changed. */
	do {
		val = READ_ONCE(n->state);

		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));

		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);

		/* If STATE_MISSED was set, leave STATE_SCHED set,
		 * because we will call napi->poll() one more time.
		 * This C code was suggested by Alexander Duyck to help gcc.
		 */
		new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
						    NAPIF_STATE_SCHED;
	} while (cmpxchg(&n->state, val, new) != val);

	if (unlikely(val & NAPIF_STATE_MISSED)) {
		/* SCHED is still set (see above), so queue @n again. */
		__napi_schedule(n);
		return false;
	}

	/* timeout is in nanoseconds (converted with ns_to_ktime()). */
	if (timeout)
		hrtimer_start(&n->timer, ns_to_ktime(timeout),
			      HRTIMER_MODE_REL_PINNED);
	return ret;
}
 6481EXPORT_SYMBOL(napi_complete_done);
 6482
 6483/* must be called under rcu_read_lock(), as we dont take a reference */
 6484static struct napi_struct *napi_by_id(unsigned int napi_id)
 6485{
 6486	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
 6487	struct napi_struct *napi;
 6488
 6489	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
 6490		if (napi->napi_id == napi_id)
 6491			return napi;
 6492
 6493	return NULL;
 6494}
 6495
 6496#if defined(CONFIG_NET_RX_BUSY_POLL)
 6497
 6498#define BUSY_POLL_BUDGET 8
 6499
/* Leave busy-poll mode for @napi.
 *
 * @napi:           instance that napi_busy_loop() has been polling
 * @have_poll_lock: cookie obtained from netpoll_poll_lock(), released here
 *
 * Clears NAPI_STATE_MISSED and NAPI_STATE_IN_BUSY_POLL, then calls
 * napi->poll() one more time with bottom halves disabled.  If that poll
 * consumed the whole BUSY_POLL_BUDGET, the rx_list is processed and the
 * instance is rescheduled.
 */
static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
{
	int rc;

	/* Busy polling means there is a high chance device driver hard irq
	 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
	 * set in napi_schedule_prep().
	 * Since we are about to call napi->poll() once more, we can safely
	 * clear NAPI_STATE_MISSED.
	 *
	 * Note: x86 could use a single "lock and ..." instruction
	 * to perform these two clear_bit()
	 */
	clear_bit(NAPI_STATE_MISSED, &napi->state);
	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);

	local_bh_disable();

	/* All we really want here is to re-enable device interrupts.
	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
	 */
	rc = napi->poll(napi, BUSY_POLL_BUDGET);
	/* We can't gro_normal_list() here, because napi->poll() might have
	 * rearmed the napi (napi_complete_done()) in which case it could
	 * already be running on another CPU.
	 */
	trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
	netpoll_poll_unlock(have_poll_lock);
	if (rc == BUSY_POLL_BUDGET) {
		/* As the whole budget was spent, we still own the napi so can
		 * safely handle the rx_list.
		 */
		gro_normal_list(napi);
		__napi_schedule(napi);
	}
	local_bh_enable();
}
 6537
/* Busy-poll the NAPI instance identified by @napi_id.
 *
 * @napi_id:      id looked up with napi_by_id(); nothing happens if no
 *                instance has this id
 * @loop_end:     callback invoked as loop_end(@loop_end_arg, start_time)
 *                after each iteration; a true result ends the loop.  May be
 *                NULL, in which case exactly one iteration is performed.
 * @loop_end_arg: opaque argument forwarded to @loop_end
 *
 * The caller takes ownership of the instance by atomically setting
 * NAPIF_STATE_IN_BUSY_POLL | NAPIF_STATE_SCHED; if the instance is disabled,
 * already scheduled or already busy-polled, the iteration does no polling.
 * Ownership is given back through busy_poll_stop().
 */
void napi_busy_loop(unsigned int napi_id,
		    bool (*loop_end)(void *, unsigned long),
		    void *loop_end_arg)
{
	unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
	int (*napi_poll)(struct napi_struct *napi, int budget);
	void *have_poll_lock = NULL;
	struct napi_struct *napi;

restart:
	/* napi_poll doubles as the "we own this napi" flag. */
	napi_poll = NULL;

	rcu_read_lock();

	napi = napi_by_id(napi_id);
	if (!napi)
		goto out;

	preempt_disable();
	for (;;) {
		int work = 0;

		local_bh_disable();
		if (!napi_poll) {
			unsigned long val = READ_ONCE(napi->state);

			/* If multiple threads are competing for this napi,
			 * we avoid dirtying napi->state as much as we can.
			 */
			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
				   NAPIF_STATE_IN_BUSY_POLL))
				goto count;
			if (cmpxchg(&napi->state, val,
				    val | NAPIF_STATE_IN_BUSY_POLL |
					  NAPIF_STATE_SCHED) != val)
				goto count;
			have_poll_lock = netpoll_poll_lock(napi);
			napi_poll = napi->poll;
		}
		work = napi_poll(napi, BUSY_POLL_BUDGET);
		trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
		gro_normal_list(napi);
count:
		/* work stays 0 when ownership could not be taken above. */
		if (work > 0)
			__NET_ADD_STATS(dev_net(napi->dev),
					LINUX_MIB_BUSYPOLLRXPACKETS, work);
		local_bh_enable();

		if (!loop_end || loop_end(loop_end_arg, start_time))
			break;

		if (unlikely(need_resched())) {
			/* Release everything before yielding, then start
			 * over with a fresh lookup.  loop_end is known to be
			 * non-NULL here because of the break above.
			 */
			if (napi_poll)
				busy_poll_stop(napi, have_poll_lock);
			preempt_enable();
			rcu_read_unlock();
			cond_resched();
			if (loop_end(loop_end_arg, start_time))
				return;
			goto restart;
		}
		cpu_relax();
	}
	if (napi_poll)
		busy_poll_stop(napi, have_poll_lock);
	preempt_enable();
out:
	rcu_read_unlock();
}
 6607EXPORT_SYMBOL(napi_busy_loop);
 6608
 6609#endif /* CONFIG_NET_RX_BUSY_POLL */
 6610
 6611static void napi_hash_add(struct napi_struct *napi)
 6612{
 6613	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
 6614		return;
 6615
 6616	spin_lock(&napi_hash_lock);
 6617
 6618	/* 0..NR_CPUS range is reserved for sender_cpu use */
 6619	do {
 6620		if (unlikely(++napi_gen_id < MIN_NAPI_ID))
 6621			napi_gen_id = MIN_NAPI_ID;
 6622	} while (napi_by_id(napi_gen_id));
 6623	napi->napi_id = napi_gen_id;
 6624
 6625	hlist_add_head_rcu(&napi->napi_hash_node,
 6626			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
 6627
 6628	spin_unlock(&napi_hash_lock);
 6629}
 6630
 6631/* Warning : caller is responsible to make sure rcu grace period
 6632 * is respected before freeing memory containing @napi
 6633 */
 6634static void napi_hash_del(struct napi_struct *napi)
 6635{
 6636	spin_lock(&napi_hash_lock);
 6637
 6638	hlist_del_init_rcu(&napi->napi_hash_node);
 6639
 6640	spin_unlock(&napi_hash_lock);
 6641}
 6642
 6643static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
 6644{
 6645	struct napi_struct *napi;
 6646
 6647	napi = container_of(timer, struct napi_struct, timer);
 6648
 6649	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
 6650	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
 6651	 */
 6652	if (!napi_disable_pending(napi) &&
 6653	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
 6654		__napi_schedule_irqoff(napi);
 6655
 6656	return HRTIMER_NORESTART;
 6657}
 6658
 6659static void init_gro_hash(struct napi_struct *napi)
 6660{
 6661	int i;
 6662
 6663	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
 6664		INIT_LIST_HEAD(&napi->gro_hash[i].list);
 6665		napi->gro_hash[i].count = 0;
 6666	}
 6667	napi->gro_bitmask = 0;
 6668}
 6669
 6670void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 6671		    int (*poll)(struct napi_struct *, int), int weight)
 6672{
 6673	if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
 6674		return;
 6675
 6676	INIT_LIST_HEAD(&napi->poll_list);
 6677	INIT_HLIST_NODE(&napi->napi_hash_node);
 6678	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 6679	napi->timer.function = napi_watchdog;
 6680	init_gro_hash(napi);
 6681	napi->skb = NULL;
 6682	INIT_LIST_HEAD(&napi->rx_list);
 6683	napi->rx_count = 0;
 6684	napi->poll = poll;
 6685	if (weight > NAPI_POLL_WEIGHT)
 6686		netdev_err_once(dev, "%s() called with weight %d\n", __func__,
 6687				weight);
 6688	napi->weight = weight;
 6689	napi->dev = dev;
 6690#ifdef CONFIG_NETPOLL
 6691	napi->poll_owner = -1;
 6692#endif
 6693	set_bit(NAPI_STATE_SCHED, &napi->state);
 6694	set_bit(NAPI_STATE_NPSVC, &napi->state);
 6695	list_add_rcu(&napi->dev_list, &dev->napi_list);
 6696	napi_hash_add(napi);
 6697}
 6698EXPORT_SYMBOL(netif_napi_add);
 6699
 6700void napi_disable(struct napi_struct *n)
 6701{
 6702	might_sleep();
 6703	set_bit(NAPI_STATE_DISABLE, &n->state);
 6704
 6705	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
 6706		msleep(1);
 6707	while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
 6708		msleep(1);
 6709
 6710	hrtimer_cancel(&n->timer);
 6711
 6712	clear_bit(NAPI_STATE_DISABLE, &n->state);
 6713}
 6714EXPORT_SYMBOL(napi_disable);
 6715
 6716static void flush_gro_hash(struct napi_struct *napi)
 6717{
 6718	int i;
 6719
 6720	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
 6721		struct sk_buff *skb, *n;
 6722
 6723		list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
 6724			kfree_skb(skb);
 6725		napi->gro_hash[i].count = 0;
 6726	}
 6727}
 6728
 6729/* Must be called in process context */
 6730void __netif_napi_del(struct napi_struct *napi)
 6731{
 6732	if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
 6733		return;
 6734
 6735	napi_hash_del(napi);
 6736	list_del_rcu(&napi->dev_list);
 6737	napi_free_frags(napi);
 6738
 6739	flush_gro_hash(napi);
 6740	napi->gro_bitmask = 0;
 6741}
 6742EXPORT_SYMBOL(__netif_napi_del);
 6743
/* Run one poll round for @n on behalf of net_rx_action().
 *
 * @n:      instance to poll; it is unlinked from its current poll list first
 * @repoll: list onto which @n is appended when it used up its whole weight
 *          and still has to be polled again
 *
 * Returns the amount of work reported by n->poll(), or 0 if the instance
 * was not in NAPI_STATE_SCHED and therefore not polled.
 */
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
	void *have;
	int work, weight;

	list_del_init(&n->poll_list);

	have = netpoll_poll_lock(n);

	weight = n->weight;

	/* This NAPI_STATE_SCHED test is for avoiding a race
	 * with netpoll's poll_napi().  Only the entity which
	 * obtains the lock and sees NAPI_STATE_SCHED set will
	 * actually make the ->poll() call.  Therefore we avoid
	 * accidentally calling ->poll() when NAPI is not scheduled.
	 */
	work = 0;
	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
		work = n->poll(n, weight);
		trace_napi_poll(n, work, weight);
	}

	if (unlikely(work > weight))
		pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
			    n->poll, work, weight);

	/* Less than the full weight was used: nothing more to do here. */
	if (likely(work < weight))
		goto out_unlock;

	/* Drivers must not modify the NAPI state if they
	 * consume the entire weight.  In such cases this code
	 * still "owns" the NAPI instance and therefore can
	 * move the instance around on the list at-will.
	 */
	if (unlikely(napi_disable_pending(n))) {
		napi_complete(n);
		goto out_unlock;
	}

	if (n->gro_bitmask) {
		/* flush too old packets
		 * If HZ < 1000, flush all packets.
		 */
		napi_gro_flush(n, HZ >= 1000);
	}

	gro_normal_list(n);

	/* Some drivers may have called napi_schedule
	 * prior to exhausting their budget.
	 */
	if (unlikely(!list_empty(&n->poll_list))) {
		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
			     n->dev ? n->dev->name : "backlog");
		goto out_unlock;
	}

	/* Still scheduled and out of budget: queue for another round. */
	list_add_tail(&n->poll_list, repoll);

out_unlock:
	netpoll_poll_unlock(have);

	return work;
}
 6809
/* Softirq handler that polls the NAPI instances queued on this CPU.
 *
 * @h: softirq action descriptor (not used in the body)
 *
 * Takes over sd->poll_list, polls its entries through napi_poll() until the
 * list is empty, netdev_budget is used up or the time limit derived from
 * netdev_budget_usecs has passed, then puts unfinished entries back on
 * sd->poll_list and re-raises NET_RX_SOFTIRQ if anything is left.
 */
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
	unsigned long time_limit = jiffies +
		usecs_to_jiffies(netdev_budget_usecs);
	int budget = netdev_budget;
	LIST_HEAD(list);
	LIST_HEAD(repoll);

	/* Detach the pending entries so they can be walked with irqs on. */
	local_irq_disable();
	list_splice_init(&sd->poll_list, &list);
	local_irq_enable();

	for (;;) {
		struct napi_struct *n;

		if (list_empty(&list)) {
			/* Nothing to requeue and no IPIs to send: skip the
			 * irq-off tail below entirely.
			 */
			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
				goto out;
			break;
		}

		n = list_first_entry(&list, struct napi_struct, poll_list);
		budget -= napi_poll(n, &repoll);

		/* If softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies since which will allow
		 * an average latency of 1.5/HZ.
		 * NOTE(review): the limit used here is actually computed
		 * from netdev_budget_usecs (see time_limit above).
		 */
		if (unlikely(budget <= 0 ||
			     time_after_eq(jiffies, time_limit))) {
			sd->time_squeeze++;
			break;
		}
	}

	local_irq_disable();

	/* Rebuild sd->poll_list as: unpolled entries, entries queued in the
	 * meantime, then the entries that asked to be polled again.
	 */
	list_splice_tail_init(&sd->poll_list, &list);
	list_splice_tail(&repoll, &list);
	list_splice(&list, &sd->poll_list);
	if (!list_empty(&sd->poll_list))
		__raise_softirq_irqoff(NET_RX_SOFTIRQ);

	/* Re-enables local irqs. */
	net_rps_action_and_irq_enable(sd);
out:
	__kfree_skb_flush();
}
 6858
/* One entry of a device's adjacency list (dev->adj_list.upper or
 * dev->adj_list.lower), describing a directly linked neighbour device.
 */
struct netdev_adjacent {
	/* the neighbour device this entry refers to */
	struct net_device *dev;

	/* upper master flag, there can only be one master device per list */
	bool master;

	/* lookup ignore flag; entries with it set are skipped by the
	 * __netdev_*() walkers and depth helpers
	 */
	bool ignore;

	/* counter for the number of times this device was added to us */
	u16 ref_nr;

	/* private field for the users */
	void *private;

	/* linkage into the owning device's adj_list.upper/.lower */
	struct list_head list;
	/* NOTE(review): presumably used for RCU-deferred freeing - confirm */
	struct rcu_head rcu;
};
 6877
 6878static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
 6879						 struct list_head *adj_list)
 6880{
 6881	struct netdev_adjacent *adj;
 6882
 6883	list_for_each_entry(adj, adj_list, list) {
 6884		if (adj->dev == adj_dev)
 6885			return adj;
 6886	}
 6887	return NULL;
 6888}
 6889
 6890static int ____netdev_has_upper_dev(struct net_device *upper_dev,
 6891				    struct netdev_nested_priv *priv)
 6892{
 6893	struct net_device *dev = (struct net_device *)priv->data;
 6894
 6895	return upper_dev == dev;
 6896}
 6897
 6898/**
 6899 * netdev_has_upper_dev - Check if device is linked to an upper device
 6900 * @dev: device
 6901 * @upper_dev: upper device to check
 6902 *
 6903 * Find out if a device is linked to specified upper device and return true
 6904 * in case it is. Note that this checks only immediate upper device,
 6905 * not through a complete stack of devices. The caller must hold the RTNL lock.
 6906 */
 6907bool netdev_has_upper_dev(struct net_device *dev,
 6908			  struct net_device *upper_dev)
 6909{
 6910	struct netdev_nested_priv priv = {
 6911		.data = (void *)upper_dev,
 6912	};
 6913
 6914	ASSERT_RTNL();
 6915
 6916	return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
 6917					     &priv);
 6918}
 6919EXPORT_SYMBOL(netdev_has_upper_dev);
 6920
 6921/**
 6922 * netdev_has_upper_dev_all - Check if device is linked to an upper device
 6923 * @dev: device
 6924 * @upper_dev: upper device to check
 6925 *
 6926 * Find out if a device is linked to specified upper device and return true
 6927 * in case it is. Note that this checks the entire upper device chain.
 6928 * The caller must hold rcu lock.
 6929 */
 6930
 6931bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
 6932				  struct net_device *upper_dev)
 6933{
 6934	struct netdev_nested_priv priv = {
 6935		.data = (void *)upper_dev,
 6936	};
 6937
 6938	return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
 6939					       &priv);
 6940}
 6941EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
 6942
 6943/**
 6944 * netdev_has_any_upper_dev - Check if device is linked to some device
 6945 * @dev: device
 6946 *
 6947 * Find out if a device is linked to an upper device and return true in case
 6948 * it is. The caller must hold the RTNL lock.
 6949 */
 6950bool netdev_has_any_upper_dev(struct net_device *dev)
 6951{
 6952	ASSERT_RTNL();
 6953
 6954	return !list_empty(&dev->adj_list.upper);
 6955}
 6956EXPORT_SYMBOL(netdev_has_any_upper_dev);
 6957
 6958/**
 6959 * netdev_master_upper_dev_get - Get master upper device
 6960 * @dev: device
 6961 *
 6962 * Find a master upper device and return pointer to it or NULL in case
 6963 * it's not there. The caller must hold the RTNL lock.
 6964 */
 6965struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
 6966{
 6967	struct netdev_adjacent *upper;
 6968
 6969	ASSERT_RTNL();
 6970
 6971	if (list_empty(&dev->adj_list.upper))
 6972		return NULL;
 6973
 6974	upper = list_first_entry(&dev->adj_list.upper,
 6975				 struct netdev_adjacent, list);
 6976	if (likely(upper->master))
 6977		return upper->dev;
 6978	return NULL;
 6979}
 6980EXPORT_SYMBOL(netdev_master_upper_dev_get);
 6981
 6982static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
 6983{
 6984	struct netdev_adjacent *upper;
 6985
 6986	ASSERT_RTNL();
 6987
 6988	if (list_empty(&dev->adj_list.upper))
 6989		return NULL;
 6990
 6991	upper = list_first_entry(&dev->adj_list.upper,
 6992				 struct netdev_adjacent, list);
 6993	if (likely(upper->master) && !upper->ignore)
 6994		return upper->dev;
 6995	return NULL;
 6996}
 6997
 6998/**
 6999 * netdev_has_any_lower_dev - Check if device is linked to some device
 7000 * @dev: device
 7001 *
 7002 * Find out if a device is linked to a lower device and return true in case
 7003 * it is. The caller must hold the RTNL lock.
 7004 */
 7005static bool netdev_has_any_lower_dev(struct net_device *dev)
 7006{
 7007	ASSERT_RTNL();
 7008
 7009	return !list_empty(&dev->adj_list.lower);
 7010}
 7011
 7012void *netdev_adjacent_get_private(struct list_head *adj_list)
 7013{
 7014	struct netdev_adjacent *adj;
 7015
 7016	adj = list_entry(adj_list, struct netdev_adjacent, list);
 7017
 7018	return adj->private;
 7019}
 7020EXPORT_SYMBOL(netdev_adjacent_get_private);
 7021
 7022/**
 7023 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
 7024 * @dev: device
 7025 * @iter: list_head ** of the current position
 7026 *
 7027 * Gets the next device from the dev's upper list, starting from iter
 7028 * position. The caller must hold RCU read lock.
 7029 */
 7030struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
 7031						 struct list_head **iter)
 7032{
 7033	struct netdev_adjacent *upper;
 7034
 7035	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
 7036
 7037	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 7038
 7039	if (&upper->list == &dev->adj_list.upper)
 7040		return NULL;
 7041
 7042	*iter = &upper->list;
 7043
 7044	return upper->dev;
 7045}
 7046EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
 7047
 7048static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
 7049						  struct list_head **iter,
 7050						  bool *ignore)
 7051{
 7052	struct netdev_adjacent *upper;
 7053
 7054	upper = list_entry((*iter)->next, struct netdev_adjacent, list);
 7055
 7056	if (&upper->list == &dev->adj_list.upper)
 7057		return NULL;
 7058
 7059	*iter = &upper->list;
 7060	*ignore = upper->ignore;
 7061
 7062	return upper->dev;
 7063}
 7064
 7065static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
 7066						    struct list_head **iter)
 7067{
 7068	struct netdev_adjacent *upper;
 7069
 7070	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
 7071
 7072	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 7073
 7074	if (&upper->list == &dev->adj_list.upper)
 7075		return NULL;
 7076
 7077	*iter = &upper->list;
 7078
 7079	return upper->dev;
 7080}
 7081
 7082static int __netdev_walk_all_upper_dev(struct net_device *dev,
 7083				       int (*fn)(struct net_device *dev,
 7084					 struct netdev_nested_priv *priv),
 7085				       struct netdev_nested_priv *priv)
 7086{
 7087	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7088	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7089	int ret, cur = 0;
 7090	bool ignore;
 7091
 7092	now = dev;
 7093	iter = &dev->adj_list.upper;
 7094
 7095	while (1) {
 7096		if (now != dev) {
 7097			ret = fn(now, priv);
 7098			if (ret)
 7099				return ret;
 7100		}
 7101
 7102		next = NULL;
 7103		while (1) {
 7104			udev = __netdev_next_upper_dev(now, &iter, &ignore);
 7105			if (!udev)
 7106				break;
 7107			if (ignore)
 7108				continue;
 7109
 7110			next = udev;
 7111			niter = &udev->adj_list.upper;
 7112			dev_stack[cur] = now;
 7113			iter_stack[cur++] = iter;
 7114			break;
 7115		}
 7116
 7117		if (!next) {
 7118			if (!cur)
 7119				return 0;
 7120			next = dev_stack[--cur];
 7121			niter = iter_stack[cur];
 7122		}
 7123
 7124		now = next;
 7125		iter = niter;
 7126	}
 7127
 7128	return 0;
 7129}
 7130
 7131int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
 7132				  int (*fn)(struct net_device *dev,
 7133					    struct netdev_nested_priv *priv),
 7134				  struct netdev_nested_priv *priv)
 7135{
 7136	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7137	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7138	int ret, cur = 0;
 7139
 7140	now = dev;
 7141	iter = &dev->adj_list.upper;
 7142
 7143	while (1) {
 7144		if (now != dev) {
 7145			ret = fn(now, priv);
 7146			if (ret)
 7147				return ret;
 7148		}
 7149
 7150		next = NULL;
 7151		while (1) {
 7152			udev = netdev_next_upper_dev_rcu(now, &iter);
 7153			if (!udev)
 7154				break;
 7155
 7156			next = udev;
 7157			niter = &udev->adj_list.upper;
 7158			dev_stack[cur] = now;
 7159			iter_stack[cur++] = iter;
 7160			break;
 7161		}
 7162
 7163		if (!next) {
 7164			if (!cur)
 7165				return 0;
 7166			next = dev_stack[--cur];
 7167			niter = iter_stack[cur];
 7168		}
 7169
 7170		now = next;
 7171		iter = niter;
 7172	}
 7173
 7174	return 0;
 7175}
 7176EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
 7177
 7178static bool __netdev_has_upper_dev(struct net_device *dev,
 7179				   struct net_device *upper_dev)
 7180{
 7181	struct netdev_nested_priv priv = {
 7182		.flags = 0,
 7183		.data = (void *)upper_dev,
 7184	};
 7185
 7186	ASSERT_RTNL();
 7187
 7188	return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
 7189					   &priv);
 7190}
 7191
 7192/**
 7193 * netdev_lower_get_next_private - Get the next ->private from the
 7194 *				   lower neighbour list
 7195 * @dev: device
 7196 * @iter: list_head ** of the current position
 7197 *
 7198 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 7199 * list, starting from iter position. The caller must hold either hold the
 7200 * RTNL lock or its own locking that guarantees that the neighbour lower
 7201 * list will remain unchanged.
 7202 */
 7203void *netdev_lower_get_next_private(struct net_device *dev,
 7204				    struct list_head **iter)
 7205{
 7206	struct netdev_adjacent *lower;
 7207
 7208	lower = list_entry(*iter, struct netdev_adjacent, list);
 7209
 7210	if (&lower->list == &dev->adj_list.lower)
 7211		return NULL;
 7212
 7213	*iter = lower->list.next;
 7214
 7215	return lower->private;
 7216}
 7217EXPORT_SYMBOL(netdev_lower_get_next_private);
 7218
 7219/**
 7220 * netdev_lower_get_next_private_rcu - Get the next ->private from the
 7221 *				       lower neighbour list, RCU
 7222 *				       variant
 7223 * @dev: device
 7224 * @iter: list_head ** of the current position
 7225 *
 7226 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 7227 * list, starting from iter position. The caller must hold RCU read lock.
 7228 */
 7229void *netdev_lower_get_next_private_rcu(struct net_device *dev,
 7230					struct list_head **iter)
 7231{
 7232	struct netdev_adjacent *lower;
 7233
 7234	WARN_ON_ONCE(!rcu_read_lock_held());
 7235
 7236	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 7237
 7238	if (&lower->list == &dev->adj_list.lower)
 7239		return NULL;
 7240
 7241	*iter = &lower->list;
 7242
 7243	return lower->private;
 7244}
 7245EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
 7246
 7247/**
 7248 * netdev_lower_get_next - Get the next device from the lower neighbour
 7249 *                         list
 7250 * @dev: device
 7251 * @iter: list_head ** of the current position
 7252 *
 7253 * Gets the next netdev_adjacent from the dev's lower neighbour
 7254 * list, starting from iter position. The caller must hold RTNL lock or
 7255 * its own locking that guarantees that the neighbour lower
 7256 * list will remain unchanged.
 7257 */
 7258void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
 7259{
 7260	struct netdev_adjacent *lower;
 7261
 7262	lower = list_entry(*iter, struct netdev_adjacent, list);
 7263
 7264	if (&lower->list == &dev->adj_list.lower)
 7265		return NULL;
 7266
 7267	*iter = lower->list.next;
 7268
 7269	return lower->dev;
 7270}
 7271EXPORT_SYMBOL(netdev_lower_get_next);
 7272
 7273static struct net_device *netdev_next_lower_dev(struct net_device *dev,
 7274						struct list_head **iter)
 7275{
 7276	struct netdev_adjacent *lower;
 7277
 7278	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
 7279
 7280	if (&lower->list == &dev->adj_list.lower)
 7281		return NULL;
 7282
 7283	*iter = &lower->list;
 7284
 7285	return lower->dev;
 7286}
 7287
 7288static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
 7289						  struct list_head **iter,
 7290						  bool *ignore)
 7291{
 7292	struct netdev_adjacent *lower;
 7293
 7294	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
 7295
 7296	if (&lower->list == &dev->adj_list.lower)
 7297		return NULL;
 7298
 7299	*iter = &lower->list;
 7300	*ignore = lower->ignore;
 7301
 7302	return lower->dev;
 7303}
 7304
 7305int netdev_walk_all_lower_dev(struct net_device *dev,
 7306			      int (*fn)(struct net_device *dev,
 7307					struct netdev_nested_priv *priv),
 7308			      struct netdev_nested_priv *priv)
 7309{
 7310	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7311	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7312	int ret, cur = 0;
 7313
 7314	now = dev;
 7315	iter = &dev->adj_list.lower;
 7316
 7317	while (1) {
 7318		if (now != dev) {
 7319			ret = fn(now, priv);
 7320			if (ret)
 7321				return ret;
 7322		}
 7323
 7324		next = NULL;
 7325		while (1) {
 7326			ldev = netdev_next_lower_dev(now, &iter);
 7327			if (!ldev)
 7328				break;
 7329
 7330			next = ldev;
 7331			niter = &ldev->adj_list.lower;
 7332			dev_stack[cur] = now;
 7333			iter_stack[cur++] = iter;
 7334			break;
 7335		}
 7336
 7337		if (!next) {
 7338			if (!cur)
 7339				return 0;
 7340			next = dev_stack[--cur];
 7341			niter = iter_stack[cur];
 7342		}
 7343
 7344		now = next;
 7345		iter = niter;
 7346	}
 7347
 7348	return 0;
 7349}
 7350EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
 7351
 7352static int __netdev_walk_all_lower_dev(struct net_device *dev,
 7353				       int (*fn)(struct net_device *dev,
 7354					 struct netdev_nested_priv *priv),
 7355				       struct netdev_nested_priv *priv)
 7356{
 7357	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7358	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7359	int ret, cur = 0;
 7360	bool ignore;
 7361
 7362	now = dev;
 7363	iter = &dev->adj_list.lower;
 7364
 7365	while (1) {
 7366		if (now != dev) {
 7367			ret = fn(now, priv);
 7368			if (ret)
 7369				return ret;
 7370		}
 7371
 7372		next = NULL;
 7373		while (1) {
 7374			ldev = __netdev_next_lower_dev(now, &iter, &ignore);
 7375			if (!ldev)
 7376				break;
 7377			if (ignore)
 7378				continue;
 7379
 7380			next = ldev;
 7381			niter = &ldev->adj_list.lower;
 7382			dev_stack[cur] = now;
 7383			iter_stack[cur++] = iter;
 7384			break;
 7385		}
 7386
 7387		if (!next) {
 7388			if (!cur)
 7389				return 0;
 7390			next = dev_stack[--cur];
 7391			niter = iter_stack[cur];
 7392		}
 7393
 7394		now = next;
 7395		iter = niter;
 7396	}
 7397
 7398	return 0;
 7399}
 7400
 7401struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
 7402					     struct list_head **iter)
 7403{
 7404	struct netdev_adjacent *lower;
 7405
 7406	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
 7407	if (&lower->list == &dev->adj_list.lower)
 7408		return NULL;
 7409
 7410	*iter = &lower->list;
 7411
 7412	return lower->dev;
 7413}
 7414EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
 7415
 7416static u8 __netdev_upper_depth(struct net_device *dev)
 7417{
 7418	struct net_device *udev;
 7419	struct list_head *iter;
 7420	u8 max_depth = 0;
 7421	bool ignore;
 7422
 7423	for (iter = &dev->adj_list.upper,
 7424	     udev = __netdev_next_upper_dev(dev, &iter, &ignore);
 7425	     udev;
 7426	     udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
 7427		if (ignore)
 7428			continue;
 7429		if (max_depth < udev->upper_level)
 7430			max_depth = udev->upper_level;
 7431	}
 7432
 7433	return max_depth;
 7434}
 7435
 7436static u8 __netdev_lower_depth(struct net_device *dev)
 7437{
 7438	struct net_device *ldev;
 7439	struct list_head *iter;
 7440	u8 max_depth = 0;
 7441	bool ignore;
 7442
 7443	for (iter = &dev->adj_list.lower,
 7444	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
 7445	     ldev;
 7446	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
 7447		if (ignore)
 7448			continue;
 7449		if (max_depth < ldev->lower_level)
 7450			max_depth = ldev->lower_level;
 7451	}
 7452
 7453	return max_depth;
 7454}
 7455
 7456static int __netdev_update_upper_level(struct net_device *dev,
 7457				       struct netdev_nested_priv *__unused)
 7458{
 7459	dev->upper_level = __netdev_upper_depth(dev) + 1;
 7460	return 0;
 7461}
 7462
 7463static int __netdev_update_lower_level(struct net_device *dev,
 7464				       struct netdev_nested_priv *priv)
 7465{
 7466	dev->lower_level = __netdev_lower_depth(dev) + 1;
 7467
 7468#ifdef CONFIG_LOCKDEP
 7469	if (!priv)
 7470		return 0;
 7471
 7472	if (priv->flags & NESTED_SYNC_IMM)
 7473		dev->nested_level = dev->lower_level - 1;
 7474	if (priv->flags & NESTED_SYNC_TODO)
 7475		net_unlink_todo(dev);
 7476#endif
 7477	return 0;
 7478}
 7479
 7480int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
 7481				  int (*fn)(struct net_device *dev,
 7482					    struct netdev_nested_priv *priv),
 7483				  struct netdev_nested_priv *priv)
 7484{
 7485	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
 7486	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
 7487	int ret, cur = 0;
 7488
 7489	now = dev;
 7490	iter = &dev->adj_list.lower;
 7491
 7492	while (1) {
 7493		if (now != dev) {
 7494			ret = fn(now, priv);
 7495			if (ret)
 7496				return ret;
 7497		}
 7498
 7499		next = NULL;
 7500		while (1) {
 7501			ldev = netdev_next_lower_dev_rcu(now, &iter);
 7502			if (!ldev)
 7503				break;
 7504
 7505			next = ldev;
 7506			niter = &ldev->adj_list.lower;
 7507			dev_stack[cur] = now;
 7508			iter_stack[cur++] = iter;
 7509			break;
 7510		}
 7511
 7512		if (!next) {
 7513			if (!cur)
 7514				return 0;
 7515			next = dev_stack[--cur];
 7516			niter = iter_stack[cur];
 7517		}
 7518
 7519		now = next;
 7520		iter = niter;
 7521	}
 7522
 7523	return 0;
 7524}
 7525EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
 7526
 7527/**
 7528 * netdev_lower_get_first_private_rcu - Get the first ->private from the
 7529 *				       lower neighbour list, RCU
 7530 *				       variant
 7531 * @dev: device
 7532 *
 7533 * Gets the first netdev_adjacent->private from the dev's lower neighbour
 7534 * list. The caller must hold RCU read lock.
 7535 */
 7536void *netdev_lower_get_first_private_rcu(struct net_device *dev)
 7537{
 7538	struct netdev_adjacent *lower;
 7539
 7540	lower = list_first_or_null_rcu(&dev->adj_list.lower,
 7541			struct netdev_adjacent, list);
 7542	if (lower)
 7543		return lower->private;
 7544	return NULL;
 7545}
 7546EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
 7547
 7548/**
 7549 * netdev_master_upper_dev_get_rcu - Get master upper device
 7550 * @dev: device
 7551 *
 7552 * Find a master upper device and return pointer to it or NULL in case
 7553 * it's not there. The caller must hold the RCU read lock.
 7554 */
 7555struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
 7556{
 7557	struct netdev_adjacent *upper;
 7558
 7559	upper = list_first_or_null_rcu(&dev->adj_list.upper,
 7560				       struct netdev_adjacent, list);
 7561	if (upper && likely(upper->master))
 7562		return upper->dev;
 7563	return NULL;
 7564}
 7565EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
 7566
 7567static int netdev_adjacent_sysfs_add(struct net_device *dev,
 7568			      struct net_device *adj_dev,
 7569			      struct list_head *dev_list)
 7570{
 7571	char linkname[IFNAMSIZ+7];
 7572
 7573	sprintf(linkname, dev_list == &dev->adj_list.upper ?
 7574		"upper_%s" : "lower_%s", adj_dev->name);
 7575	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
 7576				 linkname);
 7577}
 7578static void netdev_adjacent_sysfs_del(struct net_device *dev,
 7579			       char *name,
 7580			       struct list_head *dev_list)
 7581{
 7582	char linkname[IFNAMSIZ+7];
 7583
 7584	sprintf(linkname, dev_list == &dev->adj_list.upper ?
 7585		"upper_%s" : "lower_%s", name);
 7586	sysfs_remove_link(&(dev->dev.kobj), linkname);
 7587}
 7588
 7589static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
 7590						 struct net_device *adj_dev,
 7591						 struct list_head *dev_list)
 7592{
 7593	return (dev_list == &dev->adj_list.upper ||
 7594		dev_list == &dev->adj_list.lower) &&
 7595		net_eq(dev_net(dev), dev_net(adj_dev));
 7596}
 7597
 7598static int __netdev_adjacent_dev_insert(struct net_device *dev,
 7599					struct net_device *adj_dev,
 7600					struct list_head *dev_list,
 7601					void *private, bool master)
 7602{
 7603	struct netdev_adjacent *adj;
 7604	int ret;
 7605
 7606	adj = __netdev_find_adj(adj_dev, dev_list);
 7607
 7608	if (adj) {
 7609		adj->ref_nr += 1;
 7610		pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
 7611			 dev->name, adj_dev->name, adj->ref_nr);
 7612
 7613		return 0;
 7614	}
 7615
 7616	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
 7617	if (!adj)
 7618		return -ENOMEM;
 7619
 7620	adj->dev = adj_dev;
 7621	adj->master = master;
 7622	adj->ref_nr = 1;
 7623	adj->private = private;
 7624	adj->ignore = false;
 7625	dev_hold(adj_dev);
 7626
 7627	pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
 7628		 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
 7629
 7630	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
 7631		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
 7632		if (ret)
 7633			goto free_adj;
 7634	}
 7635
 7636	/* Ensure that master link is always the first item in list. */
 7637	if (master) {
 7638		ret = sysfs_create_link(&(dev->dev.kobj),
 7639					&(adj_dev->dev.kobj), "master");
 7640		if (ret)
 7641			goto remove_symlinks;
 7642
 7643		list_add_rcu(&adj->list, dev_list);
 7644	} else {
 7645		list_add_tail_rcu(&adj->list, dev_list);
 7646	}
 7647
 7648	return 0;
 7649
 7650remove_symlinks:
 7651	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
 7652		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
 7653free_adj:
 7654	kfree(adj);
 7655	dev_put(adj_dev);
 7656
 7657	return ret;
 7658}
 7659
 7660static void __netdev_adjacent_dev_remove(struct net_device *dev,
 7661					 struct net_device *adj_dev,
 7662					 u16 ref_nr,
 7663					 struct list_head *dev_list)
 7664{
 7665	struct netdev_adjacent *adj;
 7666
 7667	pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
 7668		 dev->name, adj_dev->name, ref_nr);
 7669
 7670	adj = __netdev_find_adj(adj_dev, dev_list);
 7671
 7672	if (!adj) {
 7673		pr_err("Adjacency does not exist for device %s from %s\n",
 7674		       dev->name, adj_dev->name);
 7675		WARN_ON(1);
 7676		return;
 7677	}
 7678
 7679	if (adj->ref_nr > ref_nr) {
 7680		pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
 7681			 dev->name, adj_dev->name, ref_nr,
 7682			 adj->ref_nr - ref_nr);
 7683		adj->ref_nr -= ref_nr;
 7684		return;
 7685	}
 7686
 7687	if (adj->master)
 7688		sysfs_remove_link(&(dev->dev.kobj), "master");
 7689
 7690	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
 7691		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
 7692
 7693	list_del_rcu(&adj->list);
 7694	pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
 7695		 adj_dev->name, dev->name, adj_dev->name);
 7696	dev_put(adj_dev);
 7697	kfree_rcu(adj, rcu);
 7698}
 7699
 7700static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
 7701					    struct net_device *upper_dev,
 7702					    struct list_head *up_list,
 7703					    struct list_head *down_list,
 7704					    void *private, bool master)
 7705{
 7706	int ret;
 7707
 7708	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
 7709					   private, master);
 7710	if (ret)
 7711		return ret;
 7712
 7713	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
 7714					   private, false);
 7715	if (ret) {
 7716		__netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
 7717		return ret;
 7718	}
 7719
 7720	return 0;
 7721}
 7722
/*
 * Drop @ref_nr references from both halves of an adjacency: the entry for
 * @upper_dev on @up_list and the entry for @dev on @down_list.
 */
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
					       struct net_device *upper_dev,
					       u16 ref_nr,
					       struct list_head *up_list,
					       struct list_head *down_list)
{
	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
}
 7732
 7733static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
 7734						struct net_device *upper_dev,
 7735						void *private, bool master)
 7736{
 7737	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
 7738						&dev->adj_list.upper,
 7739						&upper_dev->adj_list.lower,
 7740						private, master);
 7741}
 7742
 7743static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
 7744						   struct net_device *upper_dev)
 7745{
 7746	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
 7747					   &dev->adj_list.upper,
 7748					   &upper_dev->adj_list.lower);
 7749}
 7750
 7751static int __netdev_upper_dev_link(struct net_device *dev,
 7752				   struct net_device *upper_dev, bool master,
 7753				   void *upper_priv, void *upper_info,
 7754				   struct netdev_nested_priv *priv,
 7755				   struct netlink_ext_ack *extack)
 7756{
 7757	struct netdev_notifier_changeupper_info changeupper_info = {
 7758		.info = {
 7759			.dev = dev,
 7760			.extack = extack,
 7761		},
 7762		.upper_dev = upper_dev,
 7763		.master = master,
 7764		.linking = true,
 7765		.upper_info = upper_info,
 7766	};
 7767	struct net_device *master_dev;
 7768	int ret = 0;
 7769
 7770	ASSERT_RTNL();
 7771
 7772	if (dev == upper_dev)
 7773		return -EBUSY;
 7774
 7775	/* To prevent loops, check if dev is not upper device to upper_dev. */
 7776	if (__netdev_has_upper_dev(upper_dev, dev))
 7777		return -EBUSY;
 7778
 7779	if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
 7780		return -EMLINK;
 7781
 7782	if (!master) {
 7783		if (__netdev_has_upper_dev(dev, upper_dev))
 7784			return -EEXIST;
 7785	} else {
 7786		master_dev = __netdev_master_upper_dev_get(dev);
 7787		if (master_dev)
 7788			return master_dev == upper_dev ? -EEXIST : -EBUSY;
 7789	}
 7790
 7791	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
 7792					    &changeupper_info.info);
 7793	ret = notifier_to_errno(ret);
 7794	if (ret)
 7795		return ret;
 7796
 7797	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
 7798						   master);
 7799	if (ret)
 7800		return ret;
 7801
 7802	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
 7803					    &changeupper_info.info);
 7804	ret = notifier_to_errno(ret);
 7805	if (ret)
 7806		goto rollback;
 7807
 7808	__netdev_update_upper_level(dev, NULL);
 7809	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
 7810
 7811	__netdev_update_lower_level(upper_dev, priv);
 7812	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
 7813				    priv);
 7814
 7815	return 0;
 7816
 7817rollback:
 7818	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 7819
 7820	return ret;
 7821}
 7822
 7823/**
 7824 * netdev_upper_dev_link - Add a link to the upper device
 7825 * @dev: device
 7826 * @upper_dev: new upper device
 7827 * @extack: netlink extended ack
 7828 *
 7829 * Adds a link to device which is upper to this one. The caller must hold
 7830 * the RTNL lock. On a failure a negative errno code is returned.
 7831 * On success the reference counts are adjusted and the function
 7832 * returns zero.
 7833 */
 7834int netdev_upper_dev_link(struct net_device *dev,
 7835			  struct net_device *upper_dev,
 7836			  struct netlink_ext_ack *extack)
 7837{
 7838	struct netdev_nested_priv priv = {
 7839		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
 7840		.data = NULL,
 7841	};
 7842
 7843	return __netdev_upper_dev_link(dev, upper_dev, false,
 7844				       NULL, NULL, &priv, extack);
 7845}
 7846EXPORT_SYMBOL(netdev_upper_dev_link);
 7847
 7848/**
 7849 * netdev_master_upper_dev_link - Add a master link to the upper device
 7850 * @dev: device
 7851 * @upper_dev: new upper device
 7852 * @upper_priv: upper device private
 7853 * @upper_info: upper info to be passed down via notifier
 7854 * @extack: netlink extended ack
 7855 *
 7856 * Adds a link to device which is upper to this one. In this case, only
 7857 * one master upper device can be linked, although other non-master devices
 7858 * might be linked as well. The caller must hold the RTNL lock.
 7859 * On a failure a negative errno code is returned. On success the reference
 7860 * counts are adjusted and the function returns zero.
 7861 */
 7862int netdev_master_upper_dev_link(struct net_device *dev,
 7863				 struct net_device *upper_dev,
 7864				 void *upper_priv, void *upper_info,
 7865				 struct netlink_ext_ack *extack)
 7866{
 7867	struct netdev_nested_priv priv = {
 7868		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
 7869		.data = NULL,
 7870	};
 7871
 7872	return __netdev_upper_dev_link(dev, upper_dev, true,
 7873				       upper_priv, upper_info, &priv, extack);
 7874}
 7875EXPORT_SYMBOL(netdev_master_upper_dev_link);
 7876
 7877static void __netdev_upper_dev_unlink(struct net_device *dev,
 7878				      struct net_device *upper_dev,
 7879				      struct netdev_nested_priv *priv)
 7880{
 7881	struct netdev_notifier_changeupper_info changeupper_info = {
 7882		.info = {
 7883			.dev = dev,
 7884		},
 7885		.upper_dev = upper_dev,
 7886		.linking = false,
 7887	};
 7888
 7889	ASSERT_RTNL();
 7890
 7891	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
 7892
 7893	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
 7894				      &changeupper_info.info);
 7895
 7896	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 7897
 7898	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
 7899				      &changeupper_info.info);
 7900
 7901	__netdev_update_upper_level(dev, NULL);
 7902	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
 7903
 7904	__netdev_update_lower_level(upper_dev, priv);
 7905	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
 7906				    priv);
 7907}
 7908
 7909/**
 7910 * netdev_upper_dev_unlink - Removes a link to upper device
 7911 * @dev: device
 7912 * @upper_dev: new upper device
 7913 *
 7914 * Removes a link to device which is upper to this one. The caller must hold
 7915 * the RTNL lock.
 7916 */
 7917void netdev_upper_dev_unlink(struct net_device *dev,
 7918			     struct net_device *upper_dev)
 7919{
 7920	struct netdev_nested_priv priv = {
 7921		.flags = NESTED_SYNC_TODO,
 7922		.data = NULL,
 7923	};
 7924
 7925	__netdev_upper_dev_unlink(dev, upper_dev, &priv);
 7926}
 7927EXPORT_SYMBOL(netdev_upper_dev_unlink);
 7928
 7929static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
 7930				      struct net_device *lower_dev,
 7931				      bool val)
 7932{
 7933	struct netdev_adjacent *adj;
 7934
 7935	adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
 7936	if (adj)
 7937		adj->ignore = val;
 7938
 7939	adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
 7940	if (adj)
 7941		adj->ignore = val;
 7942}
 7943
/*
 * Mark the adjacency between @upper_dev and @lower_dev as ignored, in both
 * directions.
 */
static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
					struct net_device *lower_dev)
{
	__netdev_adjacent_dev_set(upper_dev, lower_dev, true);
}
 7949
/*
 * Clear the ignored mark on the adjacency between @upper_dev and
 * @lower_dev, in both directions.
 */
static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
				       struct net_device *lower_dev)
{
	__netdev_adjacent_dev_set(upper_dev, lower_dev, false);
}
 7955
 7956int netdev_adjacent_change_prepare(struct net_device *old_dev,
 7957				   struct net_device *new_dev,
 7958				   struct net_device *dev,
 7959				   struct netlink_ext_ack *extack)
 7960{
 7961	struct netdev_nested_priv priv = {
 7962		.flags = 0,
 7963		.data = NULL,
 7964	};
 7965	int err;
 7966
 7967	if (!new_dev)
 7968		return 0;
 7969
 7970	if (old_dev && new_dev != old_dev)
 7971		netdev_adjacent_dev_disable(dev, old_dev);
 7972	err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
 7973				      extack);
 7974	if (err) {
 7975		if (old_dev && new_dev != old_dev)
 7976			netdev_adjacent_dev_enable(dev, old_dev);
 7977		return err;
 7978	}
 7979
 7980	return 0;
 7981}
 7982EXPORT_SYMBOL(netdev_adjacent_change_prepare);
 7983
 7984void netdev_adjacent_change_commit(struct net_device *old_dev,
 7985				   struct net_device *new_dev,
 7986				   struct net_device *dev)
 7987{
 7988	struct netdev_nested_priv priv = {
 7989		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
 7990		.data = NULL,
 7991	};
 7992
 7993	if (!new_dev || !old_dev)
 7994		return;
 7995
 7996	if (new_dev == old_dev)
 7997		return;
 7998
 7999	netdev_adjacent_dev_enable(dev, old_dev);
 8000	__netdev_upper_dev_unlink(old_dev, dev, &priv);
 8001}
 8002EXPORT_SYMBOL(netdev_adjacent_change_commit);
 8003
 8004void netdev_adjacent_change_abort(struct net_device *old_dev,
 8005				  struct net_device *new_dev,
 8006				  struct net_device *dev)
 8007{
 8008	struct netdev_nested_priv priv = {
 8009		.flags = 0,
 8010		.data = NULL,
 8011	};
 8012
 8013	if (!new_dev)
 8014		return;
 8015
 8016	if (old_dev && new_dev != old_dev)
 8017		netdev_adjacent_dev_enable(dev, old_dev);
 8018
 8019	__netdev_upper_dev_unlink(new_dev, dev, &priv);
 8020}
 8021EXPORT_SYMBOL(netdev_adjacent_change_abort);
 8022
 8023/**
 8024 * netdev_bonding_info_change - Dispatch event about slave change
 8025 * @dev: device
 8026 * @bonding_info: info to dispatch
 8027 *
 8028 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
 8029 * The caller must hold the RTNL lock.
 8030 */
 8031void netdev_bonding_info_change(struct net_device *dev,
 8032				struct netdev_bonding_info *bonding_info)
 8033{
 8034	struct netdev_notifier_bonding_info info = {
 8035		.info.dev = dev,
 8036	};
 8037
 8038	memcpy(&info.bonding_info, bonding_info,
 8039	       sizeof(struct netdev_bonding_info));
 8040	call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
 8041				      &info.info);
 8042}
 8043EXPORT_SYMBOL(netdev_bonding_info_change);
 8044
 8045/**
 8046 * netdev_get_xmit_slave - Get the xmit slave of master device
 8047 * @dev: device
 8048 * @skb: The packet
 8049 * @all_slaves: assume all the slaves are active
 8050 *
 8051 * The reference counters are not incremented so the caller must be
 8052 * careful with locks. The caller must hold RCU lock.
 8053 * %NULL is returned if no slave is found.
 8054 */
 8055
 8056struct net_device *netdev_get_xmit_slave(struct net_device *dev,
 8057					 struct sk_buff *skb,
 8058					 bool all_slaves)
 8059{
 8060	const struct net_device_ops *ops = dev->netdev_ops;
 8061
 8062	if (!ops->ndo_get_xmit_slave)
 8063		return NULL;
 8064	return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
 8065}
 8066EXPORT_SYMBOL(netdev_get_xmit_slave);
 8067
 8068static void netdev_adjacent_add_links(struct net_device *dev)
 8069{
 8070	struct netdev_adjacent *iter;
 8071
 8072	struct net *net = dev_net(dev);
 8073
 8074	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 8075		if (!net_eq(net, dev_net(iter->dev)))
 8076			continue;
 8077		netdev_adjacent_sysfs_add(iter->dev, dev,
 8078					  &iter->dev->adj_list.lower);
 8079		netdev_adjacent_sysfs_add(dev, iter->dev,
 8080					  &dev->adj_list.upper);
 8081	}
 8082
 8083	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 8084		if (!net_eq(net, dev_net(iter->dev)))
 8085			continue;
 8086		netdev_adjacent_sysfs_add(iter->dev, dev,
 8087					  &iter->dev->adj_list.upper);
 8088		netdev_adjacent_sysfs_add(dev, iter->dev,
 8089					  &dev->adj_list.lower);
 8090	}
 8091}
 8092
 8093static void netdev_adjacent_del_links(struct net_device *dev)
 8094{
 8095	struct netdev_adjacent *iter;
 8096
 8097	struct net *net = dev_net(dev);
 8098
 8099	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 8100		if (!net_eq(net, dev_net(iter->dev)))
 8101			continue;
 8102		netdev_adjacent_sysfs_del(iter->dev, dev->name,
 8103					  &iter->dev->adj_list.lower);
 8104		netdev_adjacent_sysfs_del(dev, iter->dev->name,
 8105					  &dev->adj_list.upper);
 8106	}
 8107
 8108	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 8109		if (!net_eq(net, dev_net(iter->dev)))
 8110			continue;
 8111		netdev_adjacent_sysfs_del(iter->dev, dev->name,
 8112					  &iter->dev->adj_list.upper);
 8113		netdev_adjacent_sysfs_del(dev, iter->dev->name,
 8114					  &dev->adj_list.lower);
 8115	}
 8116}
 8117
 8118void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
 8119{
 8120	struct netdev_adjacent *iter;
 8121
 8122	struct net *net = dev_net(dev);
 8123
 8124	list_for_each_entry(iter, &dev->adj_list.upper, list) {
 8125		if (!net_eq(net, dev_net(iter->dev)))
 8126			continue;
 8127		netdev_adjacent_sysfs_del(iter->dev, oldname,
 8128					  &iter->dev->adj_list.lower);
 8129		netdev_adjacent_sysfs_add(iter->dev, dev,
 8130					  &iter->dev->adj_list.lower);
 8131	}
 8132
 8133	list_for_each_entry(iter, &dev->adj_list.lower, list) {
 8134		if (!net_eq(net, dev_net(iter->dev)))
 8135			continue;
 8136		netdev_adjacent_sysfs_del(iter->dev, oldname,
 8137					  &iter->dev->adj_list.upper);
 8138		netdev_adjacent_sysfs_add(iter->dev, dev,
 8139					  &iter->dev->adj_list.upper);
 8140	}
 8141}
 8142
 8143void *netdev_lower_dev_get_private(struct net_device *dev,
 8144				   struct net_device *lower_dev)
 8145{
 8146	struct netdev_adjacent *lower;
 8147
 8148	if (!lower_dev)
 8149		return NULL;
 8150	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
 8151	if (!lower)
 8152		return NULL;
 8153
 8154	return lower->private;
 8155}
 8156EXPORT_SYMBOL(netdev_lower_dev_get_private);
 8157
 8158
 8159/**
 8160 * netdev_lower_change - Dispatch event about lower device state change
 8161 * @lower_dev: device
 8162 * @lower_state_info: state to dispatch
 8163 *
 8164 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
 8165 * The caller must hold the RTNL lock.
 8166 */
 8167void netdev_lower_state_changed(struct net_device *lower_dev,
 8168				void *lower_state_info)
 8169{
 8170	struct netdev_notifier_changelowerstate_info changelowerstate_info = {
 8171		.info.dev = lower_dev,
 8172	};
 8173
 8174	ASSERT_RTNL();
 8175	changelowerstate_info.lower_state_info = lower_state_info;
 8176	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
 8177				      &changelowerstate_info.info);
 8178}
 8179EXPORT_SYMBOL(netdev_lower_state_changed);
 8180
 8181static void dev_change_rx_flags(struct net_device *dev, int flags)
 8182{
 8183	const struct net_device_ops *ops = dev->netdev_ops;
 8184
 8185	if (ops->ndo_change_rx_flags)
 8186		ops->ndo_change_rx_flags(dev, flags);
 8187}
 8188
/*
 * Adjust the promiscuity count of @dev by @inc and keep IFF_PROMISC in
 * dev->flags in step with it.  Must be called under RTNL.
 *
 * When the flag actually changes, the transition is logged (and sent to
 * the audit subsystem if auditing is enabled) and the driver is told
 * through dev_change_rx_flags().  With @notify set, __dev_notify_flags()
 * is called as well, whether or not the flag changed.
 *
 * Returns 0 on success, or -EOVERFLOW when a non-negative @inc leaves the
 * count at zero; the count is then restored to its previous value.
 */
static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
{
	unsigned int old_flags = dev->flags;
	kuid_t uid;
	kgid_t gid;

	ASSERT_RTNL();

	/* Set the flag up front; it is cleared below if the count hits zero. */
	dev->flags |= IFF_PROMISC;
	dev->promiscuity += inc;
	if (dev->promiscuity == 0) {
		/*
		 * A zero count after a decrement means the last user is gone,
		 * so promiscuous mode is switched off.  A zero count after
		 * anything else is treated as an overflow (presumably the
		 * counter wrapped around): undo the change to the count and
		 * fail.  The flag set above is left in place on that path.
		 */
		if (inc < 0)
			dev->flags &= ~IFF_PROMISC;
		else {
			dev->promiscuity -= inc;
			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
				dev->name);
			return -EOVERFLOW;
		}
	}
	/* Only an actual flag transition is logged and pushed to the driver. */
	if (dev->flags != old_flags) {
		pr_info("device %s %s promiscuous mode\n",
			dev->name,
			dev->flags & IFF_PROMISC ? "entered" : "left");
		if (audit_enabled) {
			current_uid_gid(&uid, &gid);
			audit_log(audit_context(), GFP_ATOMIC,
				  AUDIT_ANOM_PROMISCUOUS,
				  "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
				  dev->name, (dev->flags & IFF_PROMISC),
				  (old_flags & IFF_PROMISC),
				  from_kuid(&init_user_ns, audit_get_loginuid(current)),
				  from_kuid(&init_user_ns, uid),
				  from_kgid(&init_user_ns, gid),
				  audit_get_sessionid(current));
		}

		dev_change_rx_flags(dev, IFF_PROMISC);
	}
	if (notify)
		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
	return 0;
}
 8236
 8237/**
 8238 *	dev_set_promiscuity	- update promiscuity count on a device
 8239 *	@dev: device
 8240 *	@inc: modifier
 8241 *
 8242 *	Add or remove promiscuity from a device. While the count in the device
 8243 *	remains above zero the interface remains promiscuous. Once it hits zero
 8244 *	the device reverts back to normal filtering operation. A negative inc
 8245 *	value is used to drop promiscuity on the device.
 8246 *	Return 0 if successful or a negative errno code on error.
 8247 */
 8248int dev_set_promiscuity(struct net_device *dev, int inc)
 8249{
 8250	unsigned int old_flags = dev->flags;
 8251	int err;
 8252
 8253	err = __dev_set_promiscuity(dev, inc, true);
 8254	if (err < 0)
 8255		return err;
 8256	if (dev->flags != old_flags)
 8257		dev_set_rx_mode(dev);
 8258	return err;
 8259}
 8260EXPORT_SYMBOL(dev_set_promiscuity);
 8261
 8262static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
 8263{
 8264	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
 8265
 8266	ASSERT_RTNL();
 8267
 8268	dev->flags |= IFF_ALLMULTI;
 8269	dev->allmulti += inc;
 8270	if (dev->allmulti == 0) {
 8271		/*
 8272		 * Avoid overflow.
 8273		 * If inc causes overflow, untouch allmulti and return error.
 8274		 */
 8275		if (inc < 0)
 8276			dev->flags &= ~IFF_ALLMULTI;
 8277		else {
 8278			dev->allmulti -= inc;
 8279			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
 8280				dev->name);
 8281			return -EOVERFLOW;
 8282		}
 8283	}
 8284	if (dev->flags ^ old_flags) {
 8285		dev_change_rx_flags(dev, IFF_ALLMULTI);
 8286		dev_set_rx_mode(dev);
 8287		if (notify)
 8288			__dev_notify_flags(dev, old_flags,
 8289					   dev->gflags ^ old_gflags);
 8290	}
 8291	return 0;
 8292}
 8293
/**
 *	dev_set_allmulti	- update allmulti count on a device
 *	@dev: device
 *	@inc: modifier
 *
 *	Add or remove reception of all multicast frames to a device. While the
 *	count in the device remains above zero the interface keeps receiving
 *	all multicast frames. Once it hits zero the device reverts back to
 *	normal filtering operation. A negative @inc value is used to drop the
 *	counter when releasing a resource needing all multicasts.
 *	Return 0 if successful or a negative errno code on error.
 */
int dev_set_allmulti(struct net_device *dev, int inc)
{
	return __dev_set_allmulti(dev, inc, true);
}
EXPORT_SYMBOL(dev_set_allmulti);
 8312
 8313/*
 8314 *	Upload unicast and multicast address lists to device and
 8315 *	configure RX filtering. When the device doesn't support unicast
 8316 *	filtering it is put in promiscuous mode while unicast addresses
 8317 *	are present.
 8318 */
 8319void __dev_set_rx_mode(struct net_device *dev)
 8320{
 8321	const struct net_device_ops *ops = dev->netdev_ops;
 8322
 8323	/* dev_open will call this function so the list will stay sane. */
 8324	if (!(dev->flags&IFF_UP))
 8325		return;
 8326
 8327	if (!netif_device_present(dev))
 8328		return;
 8329
 8330	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
 8331		/* Unicast addresses changes may only happen under the rtnl,
 8332		 * therefore calling __dev_set_promiscuity here is safe.
 8333		 */
 8334		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
 8335			__dev_set_promiscuity(dev, 1, false);
 8336			dev->uc_promisc = true;
 8337		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
 8338			__dev_set_promiscuity(dev, -1, false);
 8339			dev->uc_promisc = false;
 8340		}
 8341	}
 8342
 8343	if (ops->ndo_set_rx_mode)
 8344		ops->ndo_set_rx_mode(dev);
 8345}
 8346
/*
 *	Run __dev_set_rx_mode() on @dev with the device's address list lock
 *	held (taken with netif_addr_lock_bh()).
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
 8353
 8354/**
 8355 *	dev_get_flags - get flags reported to userspace
 8356 *	@dev: device
 8357 *
 8358 *	Get the combination of flag bits exported through APIs to userspace.
 8359 */
 8360unsigned int dev_get_flags(const struct net_device *dev)
 8361{
 8362	unsigned int flags;
 8363
 8364	flags = (dev->flags & ~(IFF_PROMISC |
 8365				IFF_ALLMULTI |
 8366				IFF_RUNNING |
 8367				IFF_LOWER_UP |
 8368				IFF_DORMANT)) |
 8369		(dev->gflags & (IFF_PROMISC |
 8370				IFF_ALLMULTI));
 8371
 8372	if (netif_running(dev)) {
 8373		if (netif_oper_up(dev))
 8374			flags |= IFF_RUNNING;
 8375		if (netif_carrier_ok(dev))
 8376			flags |= IFF_LOWER_UP;
 8377		if (netif_dormant(dev))
 8378			flags |= IFF_DORMANT;
 8379	}
 8380
 8381	return flags;
 8382}
 8383EXPORT_SYMBOL(dev_get_flags);
 8384
/*
 * __dev_change_flags - apply a userspace-format flag word to a device
 * @dev: device
 * @flags: requested flags
 * @extack: netlink extended ack, passed on to __dev_open()
 *
 * Updates dev->flags and dev->gflags, opens or closes the device when IFF_UP
 * differs from the current state, and adjusts promiscuity and allmulti when
 * the request differs from dev->gflags.  Caller must hold RTNL.  This
 * function does not call __dev_notify_flags() itself; dev_change_flags()
 * does so after a successful return.
 *
 * Returns 0, or the result of __dev_open() when the device was brought up.
 * The promiscuity/allmulti handling still runs when __dev_open() failed.
 */
int __dev_change_flags(struct net_device *dev, unsigned int flags,
		       struct netlink_ext_ack *extack)
{
	unsigned int old_flags = dev->flags;
	int ret;

	ASSERT_RTNL();

	/*
	 *	Set the flags on our device.
	 *	Only the first group of bits is taken from the request; IFF_UP,
	 *	IFF_VOLATILE, IFF_PROMISC and IFF_ALLMULTI keep their current
	 *	value here and are handled separately below.
	 */

	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
			       IFF_AUTOMEDIA)) |
		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
				    IFF_ALLMULTI));

	/*
	 *	Load in the correct multicast list now the flags have changed.
	 */

	if ((old_flags ^ flags) & IFF_MULTICAST)
		dev_change_rx_flags(dev, IFF_MULTICAST);

	dev_set_rx_mode(dev);

	/*
	 *	Have we downed the interface. We handle IFF_UP ourselves
	 *	according to user attempts to set it, rather than blindly
	 *	setting it.
	 */

	ret = 0;
	if ((old_flags ^ flags) & IFF_UP) {
		if (old_flags & IFF_UP)
			__dev_close(dev);
		else
			ret = __dev_open(dev, extack);
	}

	/* The user-requested promiscuous state is tracked in dev->gflags. */
	if ((flags ^ dev->gflags) & IFF_PROMISC) {
		int inc = (flags & IFF_PROMISC) ? 1 : -1;
		/* Shadows the outer old_flags: snapshot taken after open/close. */
		unsigned int old_flags = dev->flags;

		dev->gflags ^= IFF_PROMISC;

		if (__dev_set_promiscuity(dev, inc, false) >= 0)
			if (dev->flags != old_flags)
				dev_set_rx_mode(dev);
	}

	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
	 * is important. Some (broken) drivers set IFF_PROMISC, when
	 * IFF_ALLMULTI is requested not asking us and not reporting.
	 */
	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;

		dev->gflags ^= IFF_ALLMULTI;
		__dev_set_allmulti(dev, inc, false);
	}

	return ret;
}
 8450
 8451void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
 8452			unsigned int gchanges)
 8453{
 8454	unsigned int changes = dev->flags ^ old_flags;
 8455
 8456	if (gchanges)
 8457		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
 8458
 8459	if (changes & IFF_UP) {
 8460		if (dev->flags & IFF_UP)
 8461			call_netdevice_notifiers(NETDEV_UP, dev);
 8462		else
 8463			call_netdevice_notifiers(NETDEV_DOWN, dev);
 8464	}
 8465
 8466	if (dev->flags & IFF_UP &&
 8467	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
 8468		struct netdev_notifier_change_info change_info = {
 8469			.info = {
 8470				.dev = dev,
 8471			},
 8472			.flags_changed = changes,
 8473		};
 8474
 8475		call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
 8476	}
 8477}
 8478
 8479/**
 8480 *	dev_change_flags - change device settings
 8481 *	@dev: device
 8482 *	@flags: device state flags
 8483 *	@extack: netlink extended ack
 8484 *
 8485 *	Change settings on device based state flags. The flags are
 8486 *	in the userspace exported format.
 8487 */
 8488int dev_change_flags(struct net_device *dev, unsigned int flags,
 8489		     struct netlink_ext_ack *extack)
 8490{
 8491	int ret;
 8492	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
 8493
 8494	ret = __dev_change_flags(dev, flags, extack);
 8495	if (ret < 0)
 8496		return ret;
 8497
 8498	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
 8499	__dev_notify_flags(dev, old_flags, changes);
 8500	return ret;
 8501}
 8502EXPORT_SYMBOL(dev_change_flags);
 8503
 8504int __dev_set_mtu(struct net_device *dev, int new_mtu)
 8505{
 8506	const struct net_device_ops *ops = dev->netdev_ops;
 8507
 8508	if (ops->ndo_change_mtu)
 8509		return ops->ndo_change_mtu(dev, new_mtu);
 8510
 8511	/* Pairs with all the lockless reads of dev->mtu in the stack */
 8512	WRITE_ONCE(dev->mtu, new_mtu);
 8513	return 0;
 8514}
 8515EXPORT_SYMBOL(__dev_set_mtu);
 8516
 8517int dev_validate_mtu(struct net_device *dev, int new_mtu,
 8518		     struct netlink_ext_ack *extack)
 8519{
 8520	/* MTU must be positive, and in range */
 8521	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
 8522		NL_SET_ERR_MSG(extack, "mtu less than device minimum");
 8523		return -EINVAL;
 8524	}
 8525
 8526	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
 8527		NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
 8528		return -EINVAL;
 8529	}
 8530	return 0;
 8531}
 8532
 8533/**
 8534 *	dev_set_mtu_ext - Change maximum transfer unit
 8535 *	@dev: device
 8536 *	@new_mtu: new transfer unit
 8537 *	@extack: netlink extended ack
 8538 *
 8539 *	Change the maximum transfer size of the network device.
 8540 */
 8541int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
 8542		    struct netlink_ext_ack *extack)
 8543{
 8544	int err, orig_mtu;
 8545
 8546	if (new_mtu == dev->mtu)
 8547		return 0;
 8548
 8549	err = dev_validate_mtu(dev, new_mtu, extack);
 8550	if (err)
 8551		return err;
 8552
 8553	if (!netif_device_present(dev))
 8554		return -ENODEV;
 8555
 8556	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
 8557	err = notifier_to_errno(err);
 8558	if (err)
 8559		return err;
 8560
 8561	orig_mtu = dev->mtu;
 8562	err = __dev_set_mtu(dev, new_mtu);
 8563
 8564	if (!err) {
 8565		err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
 8566						   orig_mtu);
 8567		err = notifier_to_errno(err);
 8568		if (err) {
 8569			/* setting mtu back and notifying everyone again,
 8570			 * so that they have a chance to revert changes.
 8571			 */
 8572			__dev_set_mtu(dev, orig_mtu);
 8573			call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
 8574						     new_mtu);
 8575		}
 8576	}
 8577	return err;
 8578}
 8579
 8580int dev_set_mtu(struct net_device *dev, int new_mtu)
 8581{
 8582	struct netlink_ext_ack extack;
 8583	int err;
 8584
 8585	memset(&extack, 0, sizeof(extack));
 8586	err = dev_set_mtu_ext(dev, new_mtu, &extack);
 8587	if (err && extack._msg)
 8588		net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
 8589	return err;
 8590}
 8591EXPORT_SYMBOL(dev_set_mtu);
 8592
 8593/**
 8594 *	dev_change_tx_queue_len - Change TX queue length of a netdevice
 8595 *	@dev: device
 8596 *	@new_len: new tx queue length
 8597 */
 8598int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
 8599{
 8600	unsigned int orig_len = dev->tx_queue_len;
 8601	int res;
 8602
 8603	if (new_len != (unsigned int)new_len)
 8604		return -ERANGE;
 8605
 8606	if (new_len != orig_len) {
 8607		dev->tx_queue_len = new_len;
 8608		res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
 8609		res = notifier_to_errno(res);
 8610		if (res)
 8611			goto err_rollback;
 8612		res = dev_qdisc_change_tx_queue_len(dev);
 8613		if (res)
 8614			goto err_rollback;
 8615	}
 8616
 8617	return 0;
 8618
 8619err_rollback:
 8620	netdev_err(dev, "refused to change device tx_queue_len\n");
 8621	dev->tx_queue_len = orig_len;
 8622	return res;
 8623}
 8624
/**
 *	dev_set_group - Change group this device belongs to
 *	@dev: device
 *	@new_group: group this device should belong to
 *
 *	Stores @new_group in dev->group.  No validation is done and no
 *	notification is sent from here.
 */
void dev_set_group(struct net_device *dev, int new_group)
{
	dev->group = new_group;
}
EXPORT_SYMBOL(dev_set_group);
 8635
 8636/**
 8637 *	dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
 8638 *	@dev: device
 8639 *	@addr: new address
 8640 *	@extack: netlink extended ack
 8641 */
 8642int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
 8643			      struct netlink_ext_ack *extack)
 8644{
 8645	struct netdev_notifier_pre_changeaddr_info info = {
 8646		.info.dev = dev,
 8647		.info.extack = extack,
 8648		.dev_addr = addr,
 8649	};
 8650	int rc;
 8651
 8652	rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
 8653	return notifier_to_errno(rc);
 8654}
 8655EXPORT_SYMBOL(dev_pre_changeaddr_notify);
 8656
 8657/**
 8658 *	dev_set_mac_address - Change Media Access Control Address
 8659 *	@dev: device
 8660 *	@sa: new address
 8661 *	@extack: netlink extended ack
 8662 *
 8663 *	Change the hardware (MAC) address of the device
 8664 */
 8665int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
 8666			struct netlink_ext_ack *extack)
 8667{
 8668	const struct net_device_ops *ops = dev->netdev_ops;
 8669	int err;
 8670
 8671	if (!ops->ndo_set_mac_address)
 8672		return -EOPNOTSUPP;
 8673	if (sa->sa_family != dev->type)
 8674		return -EINVAL;
 8675	if (!netif_device_present(dev))
 8676		return -ENODEV;
 8677	err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
 8678	if (err)
 8679		return err;
 8680	err = ops->ndo_set_mac_address(dev, sa);
 8681	if (err)
 8682		return err;
 8683	dev->addr_assign_type = NET_ADDR_SET;
 8684	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
 8685	add_device_randomness(dev->dev_addr, dev->addr_len);
 8686	return 0;
 8687}
 8688EXPORT_SYMBOL(dev_set_mac_address);
 8689
 8690/**
 8691 *	dev_change_carrier - Change device carrier
 8692 *	@dev: device
 8693 *	@new_carrier: new value
 8694 *
 8695 *	Change device carrier
 8696 */
 8697int dev_change_carrier(struct net_device *dev, bool new_carrier)
 8698{
 8699	const struct net_device_ops *ops = dev->netdev_ops;
 8700
 8701	if (!ops->ndo_change_carrier)
 8702		return -EOPNOTSUPP;
 8703	if (!netif_device_present(dev))
 8704		return -ENODEV;
 8705	return ops->ndo_change_carrier(dev, new_carrier);
 8706}
 8707EXPORT_SYMBOL(dev_change_carrier);
 8708
 8709/**
 8710 *	dev_get_phys_port_id - Get device physical port ID
 8711 *	@dev: device
 8712 *	@ppid: port ID
 8713 *
 8714 *	Get device physical port ID
 8715 */
 8716int dev_get_phys_port_id(struct net_device *dev,
 8717			 struct netdev_phys_item_id *ppid)
 8718{
 8719	const struct net_device_ops *ops = dev->netdev_ops;
 8720
 8721	if (!ops->ndo_get_phys_port_id)
 8722		return -EOPNOTSUPP;
 8723	return ops->ndo_get_phys_port_id(dev, ppid);
 8724}
 8725EXPORT_SYMBOL(dev_get_phys_port_id);
 8726
 8727/**
 8728 *	dev_get_phys_port_name - Get device physical port name
 8729 *	@dev: device
 8730 *	@name: port name
 8731 *	@len: limit of bytes to copy to name
 8732 *
 8733 *	Get device physical port name
 8734 */
 8735int dev_get_phys_port_name(struct net_device *dev,
 8736			   char *name, size_t len)
 8737{
 8738	const struct net_device_ops *ops = dev->netdev_ops;
 8739	int err;
 8740
 8741	if (ops->ndo_get_phys_port_name) {
 8742		err = ops->ndo_get_phys_port_name(dev, name, len);
 8743		if (err != -EOPNOTSUPP)
 8744			return err;
 8745	}
 8746	return devlink_compat_phys_port_name_get(dev, name, len);
 8747}
 8748EXPORT_SYMBOL(dev_get_phys_port_name);
 8749
 8750/**
 8751 *	dev_get_port_parent_id - Get the device's port parent identifier
 8752 *	@dev: network device
 8753 *	@ppid: pointer to a storage for the port's parent identifier
 8754 *	@recurse: allow/disallow recursion to lower devices
 8755 *
 8756 *	Get the devices's port parent identifier
 8757 */
 8758int dev_get_port_parent_id(struct net_device *dev,
 8759			   struct netdev_phys_item_id *ppid,
 8760			   bool recurse)
 8761{
 8762	const struct net_device_ops *ops = dev->netdev_ops;
 8763	struct netdev_phys_item_id first = { };
 8764	struct net_device *lower_dev;
 8765	struct list_head *iter;
 8766	int err;
 8767
 8768	if (ops->ndo_get_port_parent_id) {
 8769		err = ops->ndo_get_port_parent_id(dev, ppid);
 8770		if (err != -EOPNOTSUPP)
 8771			return err;
 8772	}
 8773
 8774	err = devlink_compat_switch_id_get(dev, ppid);
 8775	if (!err || err != -EOPNOTSUPP)
 8776		return err;
 8777
 8778	if (!recurse)
 8779		return -EOPNOTSUPP;
 8780
 8781	netdev_for_each_lower_dev(dev, lower_dev, iter) {
 8782		err = dev_get_port_parent_id(lower_dev, ppid, recurse);
 8783		if (err)
 8784			break;
 8785		if (!first.id_len)
 8786			first = *ppid;
 8787		else if (memcmp(&first, ppid, sizeof(*ppid)))
 8788			return -EOPNOTSUPP;
 8789	}
 8790
 8791	return err;
 8792}
 8793EXPORT_SYMBOL(dev_get_port_parent_id);
 8794
 8795/**
 8796 *	netdev_port_same_parent_id - Indicate if two network devices have
 8797 *	the same port parent identifier
 8798 *	@a: first network device
 8799 *	@b: second network device
 8800 */
 8801bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
 8802{
 8803	struct netdev_phys_item_id a_id = { };
 8804	struct netdev_phys_item_id b_id = { };
 8805
 8806	if (dev_get_port_parent_id(a, &a_id, true) ||
 8807	    dev_get_port_parent_id(b, &b_id, true))
 8808		return false;
 8809
 8810	return netdev_phys_item_id_same(&a_id, &b_id);
 8811}
 8812EXPORT_SYMBOL(netdev_port_same_parent_id);
 8813
 8814/**
 8815 *	dev_change_proto_down - update protocol port state information
 8816 *	@dev: device
 8817 *	@proto_down: new value
 8818 *
 8819 *	This info can be used by switch drivers to set the phys state of the
 8820 *	port.
 8821 */
 8822int dev_change_proto_down(struct net_device *dev, bool proto_down)
 8823{
 8824	const struct net_device_ops *ops = dev->netdev_ops;
 8825
 8826	if (!ops->ndo_change_proto_down)
 8827		return -EOPNOTSUPP;
 8828	if (!netif_device_present(dev))
 8829		return -ENODEV;
 8830	return ops->ndo_change_proto_down(dev, proto_down);
 8831}
 8832EXPORT_SYMBOL(dev_change_proto_down);
 8833
 8834/**
 8835 *	dev_change_proto_down_generic - generic implementation for
 8836 * 	ndo_change_proto_down that sets carrier according to
 8837 * 	proto_down.
 8838 *
 8839 *	@dev: device
 8840 *	@proto_down: new value
 8841 */
 8842int dev_change_proto_down_generic(struct net_device *dev, bool proto_down)
 8843{
 8844	if (proto_down)
 8845		netif_carrier_off(dev);
 8846	else
 8847		netif_carrier_on(dev);
 8848	dev->proto_down = proto_down;
 8849	return 0;
 8850}
 8851EXPORT_SYMBOL(dev_change_proto_down_generic);
 8852
 8853/**
 8854 *	dev_change_proto_down_reason - proto down reason
 8855 *
 8856 *	@dev: device
 8857 *	@mask: proto down mask
 8858 *	@value: proto down value
 8859 */
 8860void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
 8861				  u32 value)
 8862{
 8863	int b;
 8864
 8865	if (!mask) {
 8866		dev->proto_down_reason = value;
 8867	} else {
 8868		for_each_set_bit(b, &mask, 32) {
 8869			if (value & (1 << b))
 8870				dev->proto_down_reason |= BIT(b);
 8871			else
 8872				dev->proto_down_reason &= ~BIT(b);
 8873		}
 8874	}
 8875}
 8876EXPORT_SYMBOL(dev_change_proto_down_reason);
 8877
/* BPF link for an XDP attachment: the generic link plus its target device. */
struct bpf_xdp_link {
	struct bpf_link link;
	struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
	int flags; /* set from attr->link_create.flags in bpf_xdp_link_attach() */
};
 8883
 8884static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
 8885{
 8886	if (flags & XDP_FLAGS_HW_MODE)
 8887		return XDP_MODE_HW;
 8888	if (flags & XDP_FLAGS_DRV_MODE)
 8889		return XDP_MODE_DRV;
 8890	if (flags & XDP_FLAGS_SKB_MODE)
 8891		return XDP_MODE_SKB;
 8892	return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
 8893}
 8894
 8895static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
 8896{
 8897	switch (mode) {
 8898	case XDP_MODE_SKB:
 8899		return generic_xdp_install;
 8900	case XDP_MODE_DRV:
 8901	case XDP_MODE_HW:
 8902		return dev->netdev_ops->ndo_bpf;
 8903	default:
 8904		return NULL;
 8905	};
 8906}
 8907
/* Return the BPF link recorded for @mode on @dev, or NULL if there is none. */
static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
					 enum bpf_xdp_mode mode)
{
	return dev->xdp_state[mode].link;
}
 8913
 8914static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
 8915				     enum bpf_xdp_mode mode)
 8916{
 8917	struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
 8918
 8919	if (link)
 8920		return link->link.prog;
 8921	return dev->xdp_state[mode].prog;
 8922}
 8923
 8924u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
 8925{
 8926	struct bpf_prog *prog = dev_xdp_prog(dev, mode);
 8927
 8928	return prog ? prog->aux->id : 0;
 8929}
 8930
/*
 * Record @link for @mode and clear the directly attached program slot, so a
 * mode never has both set.  Passing a NULL @link clears both slots.
 */
static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
			     struct bpf_xdp_link *link)
{
	dev->xdp_state[mode].link = link;
	dev->xdp_state[mode].prog = NULL;
}
 8937
/*
 * Record @prog as the directly attached program for @mode and clear the link
 * slot, so a mode never has both set.
 */
static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
			     struct bpf_prog *prog)
{
	dev->xdp_state[mode].link = NULL;
	dev->xdp_state[mode].prog = prog;
}
 8944
/*
 * dev_xdp_install - hand a program (or NULL) to the install callback
 * @dev: target device
 * @mode: attachment mode; selects the command sent to @bpf_op
 * @bpf_op: callback obtained from dev_xdp_bpf_op(); called unconditionally,
 *	    so it must not be NULL
 * @extack: passed through to the callback
 * @flags: passed through to the callback
 * @prog: program to install, or NULL to remove the current one
 *
 * This function does not update dev->xdp_state; callers do that afterwards.
 *
 * Returns 0 on success or the callback's error.
 */
static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
			   bpf_op_t bpf_op, struct netlink_ext_ack *extack,
			   u32 flags, struct bpf_prog *prog)
{
	struct netdev_bpf xdp;
	int err;

	memset(&xdp, 0, sizeof(xdp));
	xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
	xdp.extack = extack;
	xdp.flags = flags;
	xdp.prog = prog;

	/* Drivers assume refcnt is already incremented (i.e, prog pointer is
	 * "moved" into driver), so they don't increment it on their own, but
	 * they do decrement refcnt when program is detached or replaced.
	 * Given net_device also owns link/prog, we need to bump refcnt here
	 * to prevent drivers from underflowing it.
	 */
	if (prog)
		bpf_prog_inc(prog);
	err = bpf_op(dev, &xdp);
	if (err) {
		/* The callback did not take the program: undo the bump above. */
		if (prog)
			bpf_prog_put(prog);
		return err;
	}

	/* dev_xdp_prog() still returns the previous program at this point. */
	if (mode != XDP_MODE_HW)
		bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);

	return 0;
}
 8978
/*
 * dev_xdp_uninstall - remove every XDP program from @dev
 *
 * Walks all attachment modes, asks the install callback to remove the
 * program and clears the recorded state.  Caller must hold RTNL.
 */
static void dev_xdp_uninstall(struct net_device *dev)
{
	struct bpf_xdp_link *link;
	struct bpf_prog *prog;
	enum bpf_xdp_mode mode;
	bpf_op_t bpf_op;

	ASSERT_RTNL();

	for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
		/* Nothing attached in this mode. */
		prog = dev_xdp_prog(dev, mode);
		if (!prog)
			continue;

		bpf_op = dev_xdp_bpf_op(dev, mode);
		if (!bpf_op)
			continue;

		WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));

		/* auto-detach link from net device */
		link = dev_xdp_link(dev, mode);
		if (link)
			link->dev = NULL;
		else
			/* Directly attached program: drop the device's reference. */
			bpf_prog_put(prog);

		/* Clears both the link and the prog slot for this mode. */
		dev_xdp_set_link(dev, mode, NULL);
	}
}
 9009
/*
 * dev_xdp_attach - attach, replace or detach an XDP program or link
 * @dev: target device
 * @extack: receives a message for most rejections
 * @link: BPF link to attach, or NULL for a plain program attachment
 * @new_prog: program to attach (NULL detaches); must be NULL with @link
 * @old_prog: program expected to be attached; must be NULL with @link
 * @flags: XDP_FLAGS_* controlling mode and replace semantics
 *
 * Caller must hold RTNL.  On success the device state records @link or
 * @new_prog, and the reference on the previously recorded program is
 * dropped.
 *
 * Returns 0 on success or a negative errno.
 */
static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
			  struct bpf_xdp_link *link, struct bpf_prog *new_prog,
			  struct bpf_prog *old_prog, u32 flags)
{
	struct bpf_prog *cur_prog;
	enum bpf_xdp_mode mode;
	bpf_op_t bpf_op;
	int err;

	ASSERT_RTNL();

	/* either link or prog attachment, never both */
	if (link && (new_prog || old_prog))
		return -EINVAL;
	/* link supports only XDP mode flags */
	if (link && (flags & ~XDP_FLAGS_MODES)) {
		NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
		return -EINVAL;
	}
	/* just one XDP mode bit should be set, zero defaults to drv/skb mode
	 * (see dev_xdp_mode(): DRV if the driver has ndo_bpf, SKB otherwise)
	 */
	if (hweight32(flags & XDP_FLAGS_MODES) > 1) {
		NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
		return -EINVAL;
	}
	/* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
	if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
		NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
		return -EINVAL;
	}

	mode = dev_xdp_mode(dev, flags);
	/* can't replace attached link */
	if (dev_xdp_link(dev, mode)) {
		NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
		return -EBUSY;
	}

	cur_prog = dev_xdp_prog(dev, mode);
	/* can't replace attached prog with link */
	if (link && cur_prog) {
		NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
		return -EBUSY;
	}
	/* with XDP_FLAGS_REPLACE the caller's expectation must match */
	if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
		NL_SET_ERR_MSG(extack, "Active program does not match expected");
		return -EEXIST;
	}

	/* put effective new program into new_prog */
	if (link)
		new_prog = link->link.prog;

	/* checks that only apply when something is being attached */
	if (new_prog) {
		bool offload = mode == XDP_MODE_HW;
		enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
					       ? XDP_MODE_DRV : XDP_MODE_SKB;

		if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
			NL_SET_ERR_MSG(extack, "XDP program already attached");
			return -EBUSY;
		}
		if (!offload && dev_xdp_prog(dev, other_mode)) {
			NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
			return -EEXIST;
		}
		if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) {
			NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported");
			return -EINVAL;
		}
		if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
			NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
			return -EINVAL;
		}
		if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
			NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
			return -EINVAL;
		}
	}

	/* don't call drivers if the effective program didn't change */
	if (new_prog != cur_prog) {
		bpf_op = dev_xdp_bpf_op(dev, mode);
		if (!bpf_op) {
			NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
			return -EOPNOTSUPP;
		}

		err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
		if (err)
			return err;
	}

	/* record the new state, then release the previously recorded program */
	if (link)
		dev_xdp_set_link(dev, mode, link);
	else
		dev_xdp_set_prog(dev, mode, new_prog);
	if (cur_prog)
		bpf_prog_put(cur_prog);

	return 0;
}
 9111
/*
 * Attach @link to @dev using the flags stored in the link.  Thin wrapper
 * around dev_xdp_attach() with no plain program arguments.
 */
static int dev_xdp_attach_link(struct net_device *dev,
			       struct netlink_ext_ack *extack,
			       struct bpf_xdp_link *link)
{
	return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
}
 9118
/*
 * dev_xdp_detach_link - detach @link from @dev
 *
 * Caller must hold RTNL.  @extack is not used.
 *
 * Returns 0 on success, or -EINVAL when @link is not the link recorded on
 * @dev for the mode derived from the link's flags.
 */
static int dev_xdp_detach_link(struct net_device *dev,
			       struct netlink_ext_ack *extack,
			       struct bpf_xdp_link *link)
{
	enum bpf_xdp_mode mode;
	bpf_op_t bpf_op;

	ASSERT_RTNL();

	mode = dev_xdp_mode(dev, link->flags);
	if (dev_xdp_link(dev, mode) != link)
		return -EINVAL;

	/* Installing a NULL program removes the current one. */
	bpf_op = dev_xdp_bpf_op(dev, mode);
	WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
	dev_xdp_set_link(dev, mode, NULL);
	return 0;
}
 9137
/*
 * bpf_link_ops.release callback: detach the link from its device, if it is
 * still attached.  Takes RTNL itself.
 */
static void bpf_xdp_link_release(struct bpf_link *link)
{
	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);

	rtnl_lock();

	/* if racing with net_device's tear down, xdp_link->dev might be
	 * already NULL, in which case link was already auto-detached
	 */
	if (xdp_link->dev) {
		WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
		xdp_link->dev = NULL;
	}

	rtnl_unlock();
}
 9154
/*
 * bpf_link_ops.detach callback: same work as release.  Always returns 0,
 * including when the link was already detached.
 */
static int bpf_xdp_link_detach(struct bpf_link *link)
{
	bpf_xdp_link_release(link);
	return 0;
}
 9160
 9161static void bpf_xdp_link_dealloc(struct bpf_link *link)
 9162{
 9163	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9164
 9165	kfree(xdp_link);
 9166}
 9167
 9168static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
 9169				     struct seq_file *seq)
 9170{
 9171	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9172	u32 ifindex = 0;
 9173
 9174	rtnl_lock();
 9175	if (xdp_link->dev)
 9176		ifindex = xdp_link->dev->ifindex;
 9177	rtnl_unlock();
 9178
 9179	seq_printf(seq, "ifindex:\t%u\n", ifindex);
 9180}
 9181
 9182static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
 9183				       struct bpf_link_info *info)
 9184{
 9185	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
 9186	u32 ifindex = 0;
 9187
 9188	rtnl_lock();
 9189	if (xdp_link->dev)
 9190		ifindex = xdp_link->dev->ifindex;
 9191	rtnl_unlock();
 9192
 9193	info->xdp.ifindex = ifindex;
 9194	return 0;
 9195}
 9196
/*
 * bpf_link_ops.update_prog callback: replace the program behind an XDP link
 * @link: link to update
 * @new_prog: replacement program
 * @old_prog: program the caller expects to be attached, or NULL to skip the
 *	      check
 *
 * Takes RTNL itself.
 *
 * Returns 0 on success (including the no-op case where @new_prog is already
 * attached), -ENOLINK when the link has been detached from its device,
 * -EPERM when @old_prog does not match, or the install error.
 */
static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
			       struct bpf_prog *old_prog)
{
	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
	enum bpf_xdp_mode mode;
	bpf_op_t bpf_op;
	int err = 0;

	rtnl_lock();

	/* link might have been auto-released already, so fail */
	if (!xdp_link->dev) {
		err = -ENOLINK;
		goto out_unlock;
	}

	if (old_prog && link->prog != old_prog) {
		err = -EPERM;
		goto out_unlock;
	}
	/* from here on old_prog is the program actually attached */
	old_prog = link->prog;
	if (old_prog == new_prog) {
		/* no-op, don't disturb drivers */
		bpf_prog_put(new_prog);
		goto out_unlock;
	}

	mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
	bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
	err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
			      xdp_link->flags, new_prog);
	if (err)
		goto out_unlock;

	/* swap the link's program and drop the reference on the old one */
	old_prog = xchg(&link->prog, new_prog);
	bpf_prog_put(old_prog);

out_unlock:
	rtnl_unlock();
	return err;
}
 9238
/* Link operations installed by bpf_xdp_link_attach() for XDP links. */
static const struct bpf_link_ops bpf_xdp_link_lops = {
	.release = bpf_xdp_link_release,
	.dealloc = bpf_xdp_link_dealloc,
	.detach = bpf_xdp_link_detach,
	.show_fdinfo = bpf_xdp_link_show_fdinfo,
	.fill_link_info = bpf_xdp_link_fill_link_info,
	.update_prog = bpf_xdp_link_update,
};
 9247
 9248int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 9249{
 9250	struct net *net = current->nsproxy->net_ns;
 9251	struct bpf_link_primer link_primer;
 9252	struct bpf_xdp_link *link;
 9253	struct net_device *dev;
 9254	int err, fd;
 9255
 9256	dev = dev_get_by_index(net, attr->link_create.target_ifindex);
 9257	if (!dev)
 9258		return -EINVAL;
 9259
 9260	link = kzalloc(sizeof(*link), GFP_USER);
 9261	if (!link) {
 9262		err = -ENOMEM;
 9263		goto out_put_dev;
 9264	}
 9265
 9266	bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
 9267	link->dev = dev;
 9268	link->flags = attr->link_create.flags;
 9269
 9270	err = bpf_link_prime(&link->link, &link_primer);
 9271	if (err) {
 9272		kfree(link);
 9273		goto out_put_dev;
 9274	}
 9275
 9276	rtnl_lock();
 9277	err = dev_xdp_attach_link(dev, NULL, link);
 9278	rtnl_unlock();
 9279
 9280	if (err) {
 9281		bpf_link_cleanup(&link_primer);
 9282		goto out_put_dev;
 9283	}
 9284
 9285	fd = bpf_link_settle(&link_primer);
 9286	/* link itself doesn't hold dev's refcnt to not complicate shutdown */
 9287	dev_put(dev);
 9288	return fd;
 9289
 9290out_put_dev:
 9291	dev_put(dev);
 9292	return err;
 9293}
 9294
/**
 *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
 *	@dev: device
 *	@extack: netlink extended ack
 *	@fd: new program fd or negative value to clear
 *	@expected_fd: old program fd that userspace expects to replace or clear
 *	@flags: xdp-related flags
 *
 *	Set or clear a bpf program for a device.  Caller must hold RTNL.
 *
 *	Return: 0 on success, or a negative errno from resolving either fd
 *	or from dev_xdp_attach().
 */
int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
		      int fd, int expected_fd, u32 flags)
{
	enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
	struct bpf_prog *new_prog = NULL, *old_prog = NULL;
	int err;

	ASSERT_RTNL();

	if (fd >= 0) {
		new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
						 mode != XDP_MODE_SKB);
		if (IS_ERR(new_prog))
			return PTR_ERR(new_prog);
	}

	if (expected_fd >= 0) {
		old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
						 mode != XDP_MODE_SKB);
		if (IS_ERR(old_prog)) {
			err = PTR_ERR(old_prog);
			/* keep the cleanup below from putting an ERR_PTR */
			old_prog = NULL;
			goto err_out;
		}
	}

	err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);

err_out:
	/* on success the reference on new_prog is kept by the device */
	if (err && new_prog)
		bpf_prog_put(new_prog);
	/* old_prog was only needed for the comparison */
	if (old_prog)
		bpf_prog_put(old_prog);
	return err;
}
 9340
 9341/**
 9342 *	dev_new_index	-	allocate an ifindex
 9343 *	@net: the applicable net namespace
 9344 *
 9345 *	Returns a suitable unique value for a new device interface
 9346 *	number.  The caller must hold the rtnl semaphore or the
 9347 *	dev_base_lock to be sure it remains unique.
 9348 */
 9349static int dev_new_index(struct net *net)
 9350{
 9351	int ifindex = net->ifindex;
 9352
 9353	for (;;) {
 9354		if (++ifindex <= 0)
 9355			ifindex = 1;
 9356		if (!__dev_get_by_index(net, ifindex))
 9357			return net->ifindex = ifindex;
 9358	}
 9359}
 9360
/* Delayed registration/unregistration */
static LIST_HEAD(net_todo_list);
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
 9364
/*
 * Queue @dev at the tail of net_todo_list and bump the dev_unreg_count of
 * the namespace it belongs to.
 */
static void net_set_todo(struct net_device *dev)
{
	list_add_tail(&dev->todo_list, &net_todo_list);
	dev_net(dev)->dev_unreg_count++;
}
 9370
/*
 * rollback_registered_many - tear down a list of registered devices
 * @head: list of devices linked through their unreg_list member; devices
 *	  that were never registered are removed from it
 *
 * Closes the devices, unlinks them from the device chain, shuts down their
 * queueing disciplines and XDP programs, notifies listeners with
 * NETDEV_UNREGISTER, flushes their address lists and removes their kobjects.
 * Caller must hold RTNL, and this must not run during the boot phase.
 */
static void rollback_registered_many(struct list_head *head)
{
	struct net_device *dev, *tmp;
	LIST_HEAD(close_head);

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
		/* Some devices call without registering
		 * for initialization unwind. Remove those
		 * devices and proceed with the remaining.
		 */
		if (dev->reg_state == NETREG_UNINITIALIZED) {
			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
				 dev->name, dev);

			WARN_ON(1);
			list_del(&dev->unreg_list);
			continue;
		}
		dev->dismantle = true;
		BUG_ON(dev->reg_state != NETREG_REGISTERED);
	}

	/* If device is running, close it first. */
	list_for_each_entry(dev, head, unreg_list)
		list_add_tail(&dev->close_list, &close_head);
	dev_close_many(&close_head, true);

	list_for_each_entry(dev, head, unreg_list) {
		/* And unlink it from device chain. */
		unlist_netdevice(dev);

		dev->reg_state = NETREG_UNREGISTERING;
	}
	flush_all_backlogs();

	synchronize_net();

	list_for_each_entry(dev, head, unreg_list) {
		struct sk_buff *skb = NULL;

		/* Shutdown queueing discipline. */
		dev_shutdown(dev);

		dev_xdp_uninstall(dev);

		/* Notify protocols, that we are about to destroy
		 * this device. They should clean all the things.
		 */
		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

		/* Build the RTM_DELLINK message now; it is sent further down,
		 * after ndo_uninit.
		 */
		if (!dev->rtnl_link_ops ||
		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
						     GFP_KERNEL, NULL, 0);

		/*
		 *	Flush the unicast and multicast chains
		 */
		dev_uc_flush(dev);
		dev_mc_flush(dev);

		netdev_name_node_alt_flush(dev);
		netdev_name_node_free(dev->name_node);

		if (dev->netdev_ops->ndo_uninit)
			dev->netdev_ops->ndo_uninit(dev);

		if (skb)
			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);

		/* Notifier chain MUST detach us all upper devices. */
		WARN_ON(netdev_has_any_upper_dev(dev));
		WARN_ON(netdev_has_any_lower_dev(dev));

		/* Remove entries from kobject tree */
		netdev_unregister_kobject(dev);
#ifdef CONFIG_XPS
		/* Remove XPS queueing entries */
		netif_reset_xps_queues_gt(dev, 0);
#endif
	}

	synchronize_net();

	/* Release one reference on every device that was torn down. */
	list_for_each_entry(dev, head, unreg_list)
		dev_put(dev);
}
 9461
/*
 * Single-device form of rollback_registered_many(): puts @dev on a temporary
 * one-entry list, tears it down, and unlinks the temporary list head again.
 */
static void rollback_registered(struct net_device *dev)
{
	LIST_HEAD(single);

	list_add(&dev->unreg_list, &single);
	rollback_registered_many(&single);
	list_del(&single);
}
 9470
 9471static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
 9472	struct net_device *upper, netdev_features_t features)
 9473{
 9474	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
 9475	netdev_features_t feature;
 9476	int feature_bit;
 9477
 9478	for_each_netdev_feature(upper_disables, feature_bit) {
 9479		feature = __NETIF_F_BIT(feature_bit);
 9480		if (!(upper->wanted_features & feature)
 9481		    && (features & feature)) {
 9482			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
 9483				   &feature, upper->name);
 9484			features &= ~feature;
 9485		}
 9486	}
 9487
 9488	return features;
 9489}
 9490
 9491static void netdev_sync_lower_features(struct net_device *upper,
 9492	struct net_device *lower, netdev_features_t features)
 9493{
 9494	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
 9495	netdev_features_t feature;
 9496	int feature_bit;
 9497
 9498	for_each_netdev_feature(upper_disables, feature_bit) {
 9499		feature = __NETIF_F_BIT(feature_bit);
 9500		if (!(features & feature) && (lower->features & feature)) {
 9501			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
 9502				   &feature, lower->name);
 9503			lower->wanted_features &= ~feature;
 9504			__netdev_update_features(lower);
 9505
 9506			if (unlikely(lower->features & feature))
 9507				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
 9508					    &feature, lower->name);
 9509			else
 9510				netdev_features_change(lower);
 9511		}
 9512	}
 9513}
 9514
 9515static netdev_features_t netdev_fix_features(struct net_device *dev,
 9516	netdev_features_t features)
 9517{
 9518	/* Fix illegal checksum combinations */
 9519	if ((features & NETIF_F_HW_CSUM) &&
 9520	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
 9521		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
 9522		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
 9523	}
 9524
 9525	/* TSO requires that SG is present as well. */
 9526	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
 9527		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
 9528		features &= ~NETIF_F_ALL_TSO;
 9529	}
 9530
 9531	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
 9532					!(features & NETIF_F_IP_CSUM)) {
 9533		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
 9534		features &= ~NETIF_F_TSO;
 9535		features &= ~NETIF_F_TSO_ECN;
 9536	}
 9537
 9538	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
 9539					 !(features & NETIF_F_IPV6_CSUM)) {
 9540		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
 9541		features &= ~NETIF_F_TSO6;
 9542	}
 9543
 9544	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
 9545	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
 9546		features &= ~NETIF_F_TSO_MANGLEID;
 9547
 9548	/* TSO ECN requires that TSO is present as well. */
 9549	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
 9550		features &= ~NETIF_F_TSO_ECN;
 9551
 9552	/* Software GSO depends on SG. */
 9553	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
 9554		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
 9555		features &= ~NETIF_F_GSO;
 9556	}
 9557
 9558	/* GSO partial features require GSO partial be set */
 9559	if ((features & dev->gso_partial_features) &&
 9560	    !(features & NETIF_F_GSO_PARTIAL)) {
 9561		netdev_dbg(dev,
 9562			   "Dropping partially supported GSO features since no GSO partial.\n");
 9563		features &= ~dev->gso_partial_features;
 9564	}
 9565
 9566	if (!(features & NETIF_F_RXCSUM)) {
 9567		/* NETIF_F_GRO_HW implies doing RXCSUM since every packet
 9568		 * successfully merged by hardware must also have the
 9569		 * checksum verified by hardware.  If the user does not
 9570		 * want to enable RXCSUM, logically, we should disable GRO_HW.
 9571		 */
 9572		if (features & NETIF_F_GRO_HW) {
 9573			netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
 9574			features &= ~NETIF_F_GRO_HW;
 9575		}
 9576	}
 9577
 9578	/* LRO/HW-GRO features cannot be combined with RX-FCS */
 9579	if (features & NETIF_F_RXFCS) {
 9580		if (features & NETIF_F_LRO) {
 9581			netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
 9582			features &= ~NETIF_F_LRO;
 9583		}
 9584
 9585		if (features & NETIF_F_GRO_HW) {
 9586			netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
 9587			features &= ~NETIF_F_GRO_HW;
 9588		}
 9589	}
 9590
 9591	return features;
 9592}
 9593
/**
 *	__netdev_update_features - recompute and apply a device's feature set
 *	@dev: the device to update
 *
 *	Starts from netdev_get_wanted_features(), lets the driver's
 *	ndo_fix_features() (if any) and netdev_fix_features() adjust the
 *	result, masks it against every upper device and, if the outcome
 *	differs from dev->features, passes it to the driver's
 *	ndo_set_features(). Lower devices are synced afterwards whether or
 *	not this device changed.
 *
 *	The caller must hold RTNL (enforced by ASSERT_RTNL()).
 *
 *	Return: -1 if ndo_set_features() failed; 0 if the computed set
 *	already equals dev->features, or if a VLAN filter refresh below
 *	left err negative; 1 otherwise. netdev_update_features() treats any
 *	non-zero value as a reason to send a features-changed notification.
 */
int __netdev_update_features(struct net_device *dev)
{
	struct net_device *upper, *lower;
	netdev_features_t features;
	struct list_head *iter;
	/* stays -1 on the "nothing changed" path so that the !err block
	 * after sync_lower is skipped and 0 is returned
	 */
	int err = -1;

	ASSERT_RTNL();

	features = netdev_get_wanted_features(dev);

	if (dev->netdev_ops->ndo_fix_features)
		features = dev->netdev_ops->ndo_fix_features(dev, features);

	/* driver might be less strict about feature dependencies */
	features = netdev_fix_features(dev, features);

	/* some features can't be enabled if they're off on an upper device */
	netdev_for_each_upper_dev_rcu(dev, upper, iter)
		features = netdev_sync_upper_features(dev, upper, features);

	/* nothing to hand to the driver, but lowers still get synced */
	if (dev->features == features)
		goto sync_lower;

	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
		&dev->features, &features);

	/* A positive return from the driver skips the !err block below,
	 * so dev->features is not assigned by this function in that case.
	 * NOTE(review): presumably the driver has updated dev->features
	 * itself then - confirm against the ndo_set_features contract.
	 */
	if (dev->netdev_ops->ndo_set_features)
		err = dev->netdev_ops->ndo_set_features(dev, features);
	else
		err = 0;

	if (unlikely(err < 0)) {
		netdev_err(dev,
			"set_features() failed (%d); wanted %pNF, left %pNF\n",
			err, &features, &dev->features);
		/* return non-0 since some features might have changed and
		 * it's better to fire a spurious notification than miss it
		 */
		return -1;
	}

sync_lower:
	/* some features must be disabled on lower devices when disabled
	 * on an upper device (think: bonding master or bridge)
	 */
	netdev_for_each_lower_dev(dev, lower, iter)
		netdev_sync_lower_features(dev, lower, features);

	if (!err) {
		/* bits that differ between the new and the current set */
		netdev_features_t diff = features ^ dev->features;

		if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
			/* udp_tunnel_{get,drop}_rx_info both need
			 * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
			 * device, or they won't do anything.
			 * Thus we need to update dev->features
			 * *before* calling udp_tunnel_get_rx_info,
			 * but *after* calling udp_tunnel_drop_rx_info.
			 */
			if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
				dev->features = features;
				udp_tunnel_get_rx_info(dev);
			} else {
				udp_tunnel_drop_rx_info(dev);
			}
		}

		/* Same ordering as above for the VLAN filters: publish the
		 * new features before the "get" call, and only after the
		 * "drop" call. A "get" result is OR-ed into err and so can
		 * turn the final return value into 0.
		 */
		if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
			if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
				dev->features = features;
				err |= vlan_get_rx_ctag_filter_info(dev);
			} else {
				vlan_drop_rx_ctag_filter_info(dev);
			}
		}

		if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
			if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
				dev->features = features;
				err |= vlan_get_rx_stag_filter_info(dev);
			} else {
				vlan_drop_rx_stag_filter_info(dev);
			}
		}

		/* final assignment covers every path that has not set it yet */
		dev->features = features;
	}

	return err < 0 ? 0 : 1;
}
 9685
 9686/**
 9687 *	netdev_update_features - recalculate device features
 9688 *	@dev: the device to check
 9689 *
 9690 *	Recalculate dev->features set and send notifications if it
 9691 *	has changed. Should be called after driver or hardware dependent
 9692 *	conditions might have changed that influence the features.
 9693 */
 9694void netdev_update_features(struct net_device *dev)
 9695{
 9696	if (__netdev_update_features(dev))
 9697		netdev_features_change(dev);
 9698}
 9699EXPORT_SYMBOL(netdev_update_features);
 9700
 9701/**
 9702 *	netdev_change_features - recalculate device features
 9703 *	@dev: the device to check
 9704 *
 9705 *	Recalculate dev->features set and send notifications even
 9706 *	if they have not changed. Should be called instead of
 9707 *	netdev_update_features() if also dev->vlan_features might
 9708 *	have changed to allow the changes to be propagated to stacked
 9709 *	VLAN devices.
 9710 */
 9711void netdev_change_features(struct net_device *dev)
 9712{
 9713	__netdev_update_features(dev);
 9714	netdev_features_change(dev);
 9715}
 9716EXPORT_SYMBOL(netdev_change_features);
 9717
 9718/**
 9719 *	netif_stacked_transfer_operstate -	transfer operstate
 9720 *	@rootdev: the root or lower level device to transfer state from
 9721 *	@dev: the device to transfer operstate to
 9722 *
 9723 *	Transfer operational state from root to device. This is normally
 9724 *	called when a stacking relationship exists between the root
 9725 *	device and the device(a leaf device).
 9726 */
 9727void netif_stacked_transfer_operstate(const struct net_device *rootdev,
 9728					struct net_device *dev)
 9729{
 9730	if (rootdev->operstate == IF_OPER_DORMANT)
 9731		netif_dormant_on(dev);
 9732	else
 9733		netif_dormant_off(dev);
 9734
 9735	if (rootdev->operstate == IF_OPER_TESTING)
 9736		netif_testing_on(dev);
 9737	else
 9738		netif_testing_off(dev);
 9739
 9740	if (netif_carrier_ok(rootdev))
 9741		netif_carrier_on(dev);
 9742	else
 9743		netif_carrier_off(dev);
 9744}
 9745EXPORT_SYMBOL(netif_stacked_transfer_operstate);
 9746
 9747static int netif_alloc_rx_queues(struct net_device *dev)
 9748{
 9749	unsigned int i, count = dev->num_rx_queues;
 9750	struct netdev_rx_queue *rx;
 9751	size_t sz = count * sizeof(*rx);
 9752	int err = 0;
 9753
 9754	BUG_ON(count < 1);
 9755
 9756	rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 9757	if (!rx)
 9758		return -ENOMEM;
 9759
 9760	dev->_rx = rx;
 9761
 9762	for (i = 0; i < count; i++) {
 9763		rx[i].dev = dev;
 9764
 9765		/* XDP RX-queue setup */
 9766		err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
 9767		if (err < 0)
 9768			goto err_rxq_info;
 9769	}
 9770	return 0;
 9771
 9772err_rxq_info:
 9773	/* Rollback successful reg's and free other resources */
 9774	while (i--)
 9775		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
 9776	kvfree(dev->_rx);
 9777	dev->_rx = NULL;
 9778	return err;
 9779}
 9780
 9781static void netif_free_rx_queues(struct net_device *dev)
 9782{
 9783	unsigned int i, count = dev->num_rx_queues;
 9784
 9785	/* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
 9786	if (!dev->_rx)
 9787		return;
 9788
 9789	for (i = 0; i < count; i++)
 9790		xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
 9791
 9792	kvfree(dev->_rx);
 9793}
 9794
 9795static void netdev_init_one_queue(struct net_device *dev,
 9796				  struct netdev_queue *queue, void *_unused)
 9797{
 9798	/* Initialize queue lock */
 9799	spin_lock_init(&queue->_xmit_lock);
 9800	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
 9801	queue->xmit_lock_owner = -1;
 9802	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
 9803	queue->dev = dev;
 9804#ifdef CONFIG_BQL
 9805	dql_init(&queue->dql, HZ);
 9806#endif
 9807}
 9808
 9809static void netif_free_tx_queues(struct net_device *dev)
 9810{
 9811	kvfree(dev->_tx);
 9812}
 9813
 9814static int netif_alloc_netdev_queues(struct net_device *dev)
 9815{
 9816	unsigned int count = dev->num_tx_queues;
 9817	struct netdev_queue *tx;
 9818	size_t sz = count * sizeof(*tx);
 9819
 9820	if (count < 1 || count > 0xffff)
 9821		return -EINVAL;
 9822
 9823	tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 9824	if (!tx)
 9825		return -ENOMEM;
 9826
 9827	dev->_tx = tx;
 9828
 9829	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
 9830	spin_lock_init(&dev->tx_global_lock);
 9831
 9832	return 0;
 9833}
 9834
 9835void netif_tx_stop_all_queues(struct net_device *dev)
 9836{
 9837	unsigned int i;
 9838
 9839	for (i = 0; i < dev->num_tx_queues; i++) {
 9840		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
 9841
 9842		netif_tx_stop_queue(txq);
 9843	}
 9844}
 9845EXPORT_SYMBOL(netif_tx_stop_all_queues);
 9846
/**
 *	register_netdevice	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	On the early failure paths (before the device is listed) the
 *	driver's ndo_uninit() and priv_destructor(), if set, are invoked
 *	and the name node is freed. If a %NETDEV_REGISTER notifier vetoes
 *	the device, the registration is rolled back and the device is left
 *	in state %NETREG_UNREGISTERED.
 *
 *	Callers must hold the rtnl semaphore. You may want
 *	register_netdev() instead of this.
 *
 *	BUGS:
 *	The locking appears insufficient to guarantee two parallel registers
 *	will not get the same name.
 */

int register_netdevice(struct net_device *dev)
{
	int ret;
	struct net *net = dev_net(dev);

	/* netdev_features_t must be wide enough for every feature bit */
	BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
		     NETDEV_FEATURE_COUNT);
	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	might_sleep();

	/* When net_device's are persistent, this will be fatal. */
	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
	BUG_ON(!net);

	/* bail out before any state is touched if the ethtool ops are bad */
	ret = ethtool_check_ops(dev->ethtool_ops);
	if (ret)
		return ret;

	spin_lock_init(&dev->addr_list_lock);
	netdev_set_addr_lockdep_class(dev);

	ret = dev_get_valid_name(net, dev, dev->name);
	if (ret < 0)
		goto out;

	ret = -ENOMEM;
	dev->name_node = netdev_name_node_head_alloc(dev);
	if (!dev->name_node)
		goto out;

	/* Init, if this function is available */
	if (dev->netdev_ops->ndo_init) {
		ret = dev->netdev_ops->ndo_init(dev);
		if (ret) {
			/* normalise a positive driver return to an errno */
			if (ret > 0)
				ret = -EIO;
			goto err_free_name;
		}
	}

	/* a driver advertising CTAG filtering must provide both VID hooks */
	if (((dev->hw_features | dev->features) &
	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
		ret = -EINVAL;
		goto err_uninit;
	}

	/* Pick a fresh ifindex unless one was preset; a preset index that
	 * already exists in this namespace fails with -EBUSY.
	 */
	ret = -EBUSY;
	if (!dev->ifindex)
		dev->ifindex = dev_new_index(net);
	else if (__dev_get_by_index(net, dev->ifindex))
		goto err_uninit;

	/* Transfer changeable features to wanted_features and enable
	 * software offloads (GSO and GRO).
	 */
	dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
	dev->features |= NETIF_F_SOFT_FEATURES;

	/* drivers with a UDP tunnel hook get the RX port offload bit */
	if (dev->netdev_ops->ndo_udp_tunnel_add) {
		dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
		dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
	}

	/* wanted = currently enabled features that are also changeable */
	dev->wanted_features = dev->features & dev->hw_features;

	/* added after wanted_features was computed, so it becomes
	 * changeable without being wanted by default
	 */
	if (!(dev->flags & IFF_LOOPBACK))
		dev->hw_features |= NETIF_F_NOCACHE_COPY;

	/* If IPv4 TCP segmentation offload is supported we should also
	 * allow the device to enable segmenting the frame with the option
	 * of ignoring a static IP ID value.  This doesn't enable the
	 * feature itself but allows the user to enable it later.
	 */
	if (dev->hw_features & NETIF_F_TSO)
		dev->hw_features |= NETIF_F_TSO_MANGLEID;
	if (dev->vlan_features & NETIF_F_TSO)
		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
	if (dev->mpls_features & NETIF_F_TSO)
		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
	if (dev->hw_enc_features & NETIF_F_TSO)
		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;

	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
	 */
	dev->vlan_features |= NETIF_F_HIGHDMA;

	/* Make NETIF_F_SG inheritable to tunnel devices.
	 */
	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;

	/* Make NETIF_F_SG inheritable to MPLS.
	 */
	dev->mpls_features |= NETIF_F_SG;

	/* a POST_INIT notifier may veto the device */
	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		goto err_uninit;

	ret = netdev_register_kobject(dev);
	if (ret) {
		dev->reg_state = NETREG_UNREGISTERED;
		goto err_uninit;
	}
	dev->reg_state = NETREG_REGISTERED;

	/* return value ignored: no notification is sent from here */
	__netdev_update_features(dev);

	/*
	 *	Default initial state at registry is that the
	 *	device is present.
	 */

	set_bit(__LINK_STATE_PRESENT, &dev->state);

	linkwatch_init_dev(dev);

	dev_init_scheduler(dev);
	/* reference taken right before the device is listed */
	dev_hold(dev);
	list_netdevice(dev);
	add_device_randomness(dev->dev_addr, dev->addr_len);

	/* If the device has permanent device address, driver should
	 * set dev_addr and also addr_assign_type should be set to
	 * NET_ADDR_PERM (default value).
	 */
	if (dev->addr_assign_type == NET_ADDR_PERM)
		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);

	/* Notify protocols, that a new device appeared. */
	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
	ret = notifier_to_errno(ret);
	if (ret) {
		/* From here on the err_* labels no longer apply: the device
		 * is listed, so it goes through the full rollback path.
		 */
		rollback_registered(dev);
		rcu_barrier();

		dev->reg_state = NETREG_UNREGISTERED;
		/* We should put the kobject that hold in
		 * netdev_unregister_kobject(), otherwise
		 * the net device cannot be freed when
		 * driver calls free_netdev(), because the
		 * kobject is being hold.
		 */
		kobject_put(&dev->dev.kobj);
		/* NOTE(review): this failure path falls through to the
		 * rtmsg_ifinfo() call below instead of jumping to out.
		 * Presumably that call does nothing for a device that is
		 * no longer NETREG_REGISTERED - confirm, or add a goto.
		 */
	}
	/*
	 *	Prevent userspace races by waiting until the network
	 *	device is fully setup before sending notifications.
	 */
	if (!dev->rtnl_link_ops ||
	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);

out:
	return ret;

	/* unwind for failures after ndo_init() succeeded (or was absent) */
err_uninit:
	if (dev->netdev_ops->ndo_uninit)
		dev->netdev_ops->ndo_uninit(dev);
	if (dev->priv_destructor)
		dev->priv_destructor(dev);
	/* unwind for failures after the name node was allocated */
err_free_name:
	netdev_name_node_free(dev->name_node);
	goto out;
}
EXPORT_SYMBOL(register_netdevice);
10035
10036/**
10037 *	init_dummy_netdev	- init a dummy network device for NAPI
10038 *	@dev: device to init
10039 *
10040 *	This takes a network device structure and initialize the minimum
10041 *	amount of fields so it can be used to schedule NAPI polls without
10042 *	registering a full blown interface. This is to be used by drivers
10043 *	that need to tie several hardware interfaces to a single NAPI
10044 *	poll scheduler due to HW limitations.
10045 */
10046int init_dummy_netdev(struct net_device *dev)
10047{
10048	/* Clear everything. Note we don't initialize spinlocks
10049	 * are they aren't supposed to be taken by any of the
10050	 * NAPI code and this dummy netdev is supposed to be
10051	 * only ever used for NAPI polls
10052	 */
10053	memset(dev, 0, sizeof(struct net_device));
10054
10055	/* make sure we BUG if trying to hit standard
10056	 * register/unregister code path
10057	 */
10058	dev->reg_state = NETREG_DUMMY;
10059
10060	/* NAPI wants this */
10061	INIT_LIST_HEAD(&dev->napi_list);
10062
10063	/* a dummy interface is started by default */
10064	set_bit(__LINK_STATE_PRESENT, &dev->state);
10065	set_bit(__LINK_STATE_START, &dev->state);
10066
10067	/* napi_busy_loop stats accounting wants this */
10068	dev_net_set(dev, &init_net);
10069
10070	/* Note : We dont allocate pcpu_refcnt for dummy devices,
10071	 * because users of this 'device' dont need to change
10072	 * its refcount.
10073	 */
10074
10075	return 0;
10076}
10077EXPORT_SYMBOL_GPL(init_dummy_netdev);
10078
10079
10080/**
10081 *	register_netdev	- register a network device
10082 *	@dev: device to register
10083 *
10084 *	Take a completed network device structure and add it to the kernel
10085 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
10086 *	chain. 0 is returned on success. A negative errno code is returned
10087 *	on a failure to set up the device, or if the name is a duplicate.
10088 *
10089 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
10090 *	and expands the device name if you passed a format string to
10091 *	alloc_netdev.
10092 */
10093int register_netdev(struct net_device *dev)
10094{
10095	int err;
10096
10097	if (rtnl_lock_killable())
10098		return -EINTR;
10099	err = register_netdevice(dev);
10100	rtnl_unlock();
10101	return err;
10102}
10103EXPORT_SYMBOL(register_netdev);
10104
10105int netdev_refcnt_read(const struct net_device *dev)
10106{
10107	int i, refcnt = 0;
10108
10109	for_each_possible_cpu(i)
10110		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
10111	return refcnt;
10112}
10113EXPORT_SYMBOL(netdev_refcnt_read);
10114
/* Bounds, in milliseconds, of the sleep between two refcount polls in
 * netdev_wait_allrefs(): the sleep starts at the minimum and doubles
 * up to the maximum.
 */
#define WAIT_REFS_MIN_MSECS 1
#define WAIT_REFS_MAX_MSECS 250
/**
 * netdev_wait_allrefs - wait until all references are gone.
 * @dev: target net_device
 *
 * This is called when unregistering network devices.
 *
 * Any protocol or device that holds a reference should register
 * for netdevice notification, and cleanup and put back the
 * reference if they receive an UNREGISTER event.
 * We can get stuck here if buggy protocols don't correctly
 * call dev_put.
 *
 * While waiting, the UNREGISTER notification is rebroadcast once more
 * than a second has passed since the previous broadcast, and a message
 * is logged once more than ten seconds have passed since the previous
 * one. The function takes and drops the rtnl lock itself.
 */
static void netdev_wait_allrefs(struct net_device *dev)
{
	unsigned long rebroadcast_time, warning_time;
	/* wait == 0 marks the first pass, which uses rcu_barrier()
	 * instead of sleeping
	 */
	int wait = 0, refcnt;

	linkwatch_forget_dev(dev);

	rebroadcast_time = warning_time = jiffies;
	refcnt = netdev_refcnt_read(dev);

	while (refcnt != 0) {
		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
			rtnl_lock();

			/* Rebroadcast unregister notification */
			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

			/* rcu_barrier() runs with the rtnl lock dropped */
			__rtnl_unlock();
			rcu_barrier();
			rtnl_lock();

			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
				     &dev->state)) {
				/* We must not have linkwatch events
				 * pending on unregister. If this
				 * happens, we simply run the queue
				 * unscheduled, resulting in a noop
				 * for this device.
				 */
				linkwatch_run_queue();
			}

			__rtnl_unlock();

			rebroadcast_time = jiffies;
		}

		if (!wait) {
			rcu_barrier();
			wait = WAIT_REFS_MIN_MSECS;
		} else {
			/* exponential backoff, capped at the maximum */
			msleep(wait);
			wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
		}

		refcnt = netdev_refcnt_read(dev);

		if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) {
			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
				 dev->name, refcnt);
			warning_time = jiffies;
		}
	}
}
10183
/* The sequence is:
 *
 *	rtnl_lock();
 *	...
 *	register_netdevice(x1);
 *	register_netdevice(x2);
 *	...
 *	unregister_netdevice(y1);
 *	unregister_netdevice(y2);
 *      ...
 *	rtnl_unlock();
 *	free_netdev(y1);
 *	free_netdev(y2);
 *
 * We are invoked by rtnl_unlock().
 * This allows us to deal with problems:
 * 1) We can delete sysfs objects which invoke hotplug
 *    without deadlocking with linkwatch via keventd.
 * 2) Since we run with the RTNL semaphore not held, we can sleep
 *    safely in order to wait for the netdev refcnt to drop to zero.
 *
 * We must not return until all unregister events added during
 * the interval the lock was held have been completed.
 *
 * The function is entered with the rtnl lock held and releases it
 * via __rtnl_unlock() after snapshotting the pending lists.
 */
void netdev_run_todo(void)
{
	struct list_head list;
#ifdef CONFIG_LOCKDEP
	struct list_head unlink_list;

	/* take over the pending unlink list and recompute the lockdep
	 * nesting level of every device on it
	 */
	list_replace_init(&net_unlink_list, &unlink_list);

	while (!list_empty(&unlink_list)) {
		struct net_device *dev = list_first_entry(&unlink_list,
							  struct net_device,
							  unlink_list);
		list_del_init(&dev->unlink_list);
		dev->nested_level = dev->lower_level - 1;
	}
#endif

	/* Snapshot list, allow later requests */
	list_replace_init(&net_todo_list, &list);

	__rtnl_unlock();


	/* Wait for rcu callbacks to finish before next phase */
	if (!list_empty(&list))
		rcu_barrier();

	while (!list_empty(&list)) {
		struct net_device *dev
			= list_first_entry(&list, struct net_device, todo_list);
		list_del(&dev->todo_list);

		/* only devices in the UNREGISTERING state are processed;
		 * anything else is reported and skipped
		 */
		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
			pr_err("network todo '%s' but state %d\n",
			       dev->name, dev->reg_state);
			dump_stack();
			continue;
		}

		dev->reg_state = NETREG_UNREGISTERED;

		/* may sleep until every reference has been dropped */
		netdev_wait_allrefs(dev);

		/* paranoia */
		BUG_ON(netdev_refcnt_read(dev));
		BUG_ON(!list_empty(&dev->ptype_all));
		BUG_ON(!list_empty(&dev->ptype_specific));
		WARN_ON(rcu_access_pointer(dev->ip_ptr));
		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
#if IS_ENABLED(CONFIG_DECNET)
		WARN_ON(dev->dn_ptr);
#endif
		if (dev->priv_destructor)
			dev->priv_destructor(dev);
		if (dev->needs_free_netdev)
			free_netdev(dev);

		/* Report a network device has been unregistered */
		/* NOTE(review): dev is still dereferenced here after the
		 * optional free_netdev() above; presumably the structure
		 * stays alive until the kobject_put() below - confirm.
		 */
		rtnl_lock();
		dev_net(dev)->dev_unreg_count--;
		__rtnl_unlock();
		wake_up(&netdev_unregistering_wq);

		/* Free network device */
		kobject_put(&dev->dev.kobj);
	}
}
10275
10276/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
10277 * all the same fields in the same order as net_device_stats, with only
10278 * the type differing, but rtnl_link_stats64 may have additional fields
10279 * at the end for newer counters.
10280 */
10281void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
10282			     const struct net_device_stats *netdev_stats)
10283{
10284#if BITS_PER_LONG == 64
10285	BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
10286	memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
10287	/* zero out counters that only exist in rtnl_link_stats64 */
10288	memset((char *)stats64 + sizeof(*netdev_stats), 0,
10289	       sizeof(*stats64) - sizeof(*netdev_stats));
10290#else
10291	size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
10292	const unsigned long *src = (const unsigned long *)netdev_stats;
10293	u64 *dst = (u64 *)stats64;
10294
10295	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
10296	for (i = 0; i < n; i++)
10297		dst[i] = src[i];
10298	/* zero out counters that only exist in rtnl_link_stats64 */
10299	memset((char *)stats64 + n * sizeof(u64), 0,
10300	       sizeof(*stats64) - n * sizeof(u64));
10301#endif
10302}
10303EXPORT_SYMBOL(netdev_stats_to_stats64);
10304
10305/**
10306 *	dev_get_stats	- get network device statistics
10307 *	@dev: device to get statistics from
10308 *	@storage: place to store stats
10309 *
10310 *	Get network statistics from device. Return @storage.
10311 *	The device driver may provide its own method by setting
10312 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
10313 *	otherwise the internal statistics structure is used.
10314 */
10315struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
10316					struct rtnl_link_stats64 *storage)
10317{
10318	const struct net_device_ops *ops = dev->netdev_ops;
10319
10320	if (ops->ndo_get_stats64) {
10321		memset(storage, 0, sizeof(*storage));
10322		ops->ndo_get_stats64(dev, storage);
10323	} else if (ops->ndo_get_stats) {
10324		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
10325	} else {
10326		netdev_stats_to_stats64(storage, &dev->stats);
10327	}
10328	storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
10329	storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
10330	storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
10331	return storage;
10332}
10333EXPORT_SYMBOL(dev_get_stats);
10334
10335/**
10336 *	dev_fetch_sw_netstats - get per-cpu network device statistics
10337 *	@s: place to store stats
10338 *	@netstats: per-cpu network stats to read from
10339 *
10340 *	Read per-cpu network statistics and populate the related fields in @s.
10341 */
10342void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
10343			   const struct pcpu_sw_netstats __percpu *netstats)
10344{
10345	int cpu;
10346
10347	for_each_possible_cpu(cpu) {
10348		const struct pcpu_sw_netstats *stats;
10349		struct pcpu_sw_netstats tmp;
10350		unsigned int start;
10351
10352		stats = per_cpu_ptr(netstats, cpu);
10353		do {
10354			start = u64_stats_fetch_begin_irq(&stats->syncp);
10355			tmp.rx_packets = stats->rx_packets;
10356			tmp.rx_bytes   = stats->rx_bytes;
10357			tmp.tx_packets = stats->tx_packets;
10358			tmp.tx_bytes   = stats->tx_bytes;
10359		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
10360
10361		s->rx_packets += tmp.rx_packets;
10362		s->rx_bytes   += tmp.rx_bytes;
10363		s->tx_packets += tmp.tx_packets;
10364		s->tx_bytes   += tmp.tx_bytes;
10365	}
10366}
10367EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);
10368
/* Return the ingress queue of @dev. With CONFIG_NET_CLS_ACT a missing
 * queue is allocated, initialised with noop_qdisc and published in
 * dev->ingress_queue; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the current value is returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!queue) {
		queue = kzalloc(sizeof(*queue), GFP_KERNEL);
		if (queue) {
			netdev_init_one_queue(dev, queue, NULL);
			RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
			queue->qdisc_sleeping = &noop_qdisc;
			/* publish only after the queue is fully set up */
			rcu_assign_pointer(dev->ingress_queue, queue);
		}
	}
#endif
	return queue;
}
10386
10387static const struct ethtool_ops default_ethtool_ops;
10388
10389void netdev_set_default_ethtool_ops(struct net_device *dev,
10390				    const struct ethtool_ops *ops)
10391{
10392	if (dev->ethtool_ops == &default_ethtool_ops)
10393		dev->ethtool_ops = ops;
10394}
10395EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
10396
10397void netdev_freemem(struct net_device *dev)
10398{
10399	char *addr = (char *)dev - dev->padded;
10400
10401	kvfree(addr);
10402}
10403
/**
 * alloc_netdev_mqs - allocate network device
 * @sizeof_priv: size of private data to allocate space for
 * @name: device name format string
 * @name_assign_type: origin of device name
 * @setup: callback to initialize device
 * @txqs: the number of TX subqueues to allocate
 * @rxqs: the number of RX subqueues to allocate
 *
 * Allocates a struct net_device with private data area for driver use
 * and performs basic initialization.  Also allocates subqueue structs
 * for each queue on the device.
 *
 * Return: the new device, or NULL if @txqs or @rxqs is zero or any
 * allocation fails. @name must be shorter than the name field of
 * struct net_device, otherwise BUG_ON() triggers.
 */
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
		unsigned char name_assign_type,
		void (*setup)(struct net_device *),
		unsigned int txqs, unsigned int rxqs)
{
	struct net_device *dev;
	unsigned int alloc_size;
	struct net_device *p;

	/* the name is strcpy()'d into dev->name further down */
	BUG_ON(strlen(name) >= sizeof(dev->name));

	if (txqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
		return NULL;
	}

	if (rxqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
		return NULL;
	}

	alloc_size = sizeof(struct net_device);
	if (sizeof_priv) {
		/* ensure 32-byte alignment of private area */
		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
		alloc_size += sizeof_priv;
	}
	/* ensure 32-byte alignment of whole construct */
	alloc_size += NETDEV_ALIGN - 1;

	p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
	if (!p)
		return NULL;

	/* dev is the aligned pointer inside the allocation; padded records
	 * how far it sits from the start so the block can be freed later
	 */
	dev = PTR_ALIGN(p, NETDEV_ALIGN);
	dev->padded = (char *)dev - (char *)p;

	dev->pcpu_refcnt = alloc_percpu(int);
	if (!dev->pcpu_refcnt)
		goto free_dev;

	if (dev_addr_init(dev))
		goto free_pcpu;

	dev_mc_init(dev);
	dev_uc_init(dev);

	dev_net_set(dev, &init_net);

	dev->gso_max_size = GSO_MAX_SIZE;
	dev->gso_max_segs = GSO_MAX_SEGS;
	dev->upper_level = 1;
	dev->lower_level = 1;
#ifdef CONFIG_LOCKDEP
	dev->nested_level = 0;
	INIT_LIST_HEAD(&dev->unlink_list);
#endif

	INIT_LIST_HEAD(&dev->napi_list);
	INIT_LIST_HEAD(&dev->unreg_list);
	INIT_LIST_HEAD(&dev->close_list);
	INIT_LIST_HEAD(&dev->link_watch_list);
	INIT_LIST_HEAD(&dev->adj_list.upper);
	INIT_LIST_HEAD(&dev->adj_list.lower);
	INIT_LIST_HEAD(&dev->ptype_all);
	INIT_LIST_HEAD(&dev->ptype_specific);
	INIT_LIST_HEAD(&dev->net_notifier_list);
#ifdef CONFIG_NET_SCHED
	hash_init(dev->qdisc_hash);
#endif
	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
	/* caller-supplied initialisation runs on top of the defaults above */
	setup(dev);

	/* a queue length still at zero after setup() marks the device as
	 * IFF_NO_QUEUE; the default length is filled in regardless
	 */
	if (!dev->tx_queue_len) {
		dev->priv_flags |= IFF_NO_QUEUE;
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
	}

	dev->num_tx_queues = txqs;
	dev->real_num_tx_queues = txqs;
	if (netif_alloc_netdev_queues(dev))
		goto free_all;

	dev->num_rx_queues = rxqs;
	dev->real_num_rx_queues = rxqs;
	if (netif_alloc_rx_queues(dev))
		goto free_all;

	strcpy(dev->name, name);
	dev->name_assign_type = name_assign_type;
	dev->group = INIT_NETDEV_GROUP;
	/* keep ethtool ops installed by setup(), else use the default */
	if (!dev->ethtool_ops)
		dev->ethtool_ops = &default_ethtool_ops;

	nf_hook_ingress_init(dev);

	return dev;

	/* failures after setup() go through the full free_netdev() path */
free_all:
	free_netdev(dev);
	return NULL;

	/* early failures: only the per-cpu refcount and the block itself
	 * have been allocated so far
	 */
free_pcpu:
	free_percpu(dev->pcpu_refcnt);
free_dev:
	netdev_freemem(dev);
	return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mqs);
10526
10527/**
10528 * free_netdev - free network device
10529 * @dev: device
10530 *
10531 * This function does the last stage of destroying an allocated device
10532 * interface. The reference to the device object is released. If this
10533 * is the last reference then it will be freed.Must be called in process
10534 * context.
10535 */
10536void free_netdev(struct net_device *dev)
10537{
10538	struct napi_struct *p, *n;
10539
10540	might_sleep();
10541	netif_free_tx_queues(dev);
10542	netif_free_rx_queues(dev);
10543
10544	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
10545
10546	/* Flush device addresses */
10547	dev_addr_flush(dev);
10548
10549	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
10550		netif_napi_del(p);
10551
10552	free_percpu(dev->pcpu_refcnt);
10553	dev->pcpu_refcnt = NULL;
10554	free_percpu(dev->xdp_bulkq);
10555	dev->xdp_bulkq = NULL;
10556
10557	/*  Compatibility with error handling in drivers */
10558	if (dev->reg_state == NETREG_UNINITIALIZED) {
10559		netdev_freemem(dev);
10560		return;
10561	}
10562
10563	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
10564	dev->reg_state = NETREG_RELEASED;
10565
10566	/* will free via device release */
10567	put_device(&dev->dev);
10568}
10569EXPORT_SYMBOL(free_netdev);
10570
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	The expedited RCU grace period is used when the rtnl lock is
 *	held, the normal one otherwise. May sleep.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
10586
10587/**
10588 *	unregister_netdevice_queue - remove device from the kernel
10589 *	@dev: device
10590 *	@head: list
10591 *
10592 *	This function shuts down a device interface and removes it
10593 *	from the kernel tables.
10594 *	If head not NULL, device is queued to be unregistered later.
10595 *
10596 *	Callers must hold the rtnl semaphore.  You may want
10597 *	unregister_netdev() instead of this.
10598 */
10599
10600void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
10601{
10602	ASSERT_RTNL();
10603
10604	if (head) {
10605		list_move_tail(&dev->unreg_list, head);
10606	} else {
10607		rollback_registered(dev);
10608		/* Finish processing unregister after unlock */
10609		net_set_todo(dev);
10610	}
10611}
10612EXPORT_SYMBOL(unregister_netdevice_queue);
10613
10614/**
10615 *	unregister_netdevice_many - unregister many devices
10616 *	@head: list of devices
10617 *
10618 *  Note: As most callers use a stack allocated list_head,
10619 *  we force a list_del() to make sure stack wont be corrupted later.
10620 */
10621void unregister_netdevice_many(struct list_head *head)
10622{
10623	struct net_device *dev;
10624
10625	if (!list_empty(head)) {
10626		rollback_registered_many(head);
10627		list_for_each_entry(dev, head, unreg_list)
10628			net_set_todo(dev);
10629		list_del(head);
10630	}
10631}
10632EXPORT_SYMBOL(unregister_netdevice_many);
10633
10634/**
10635 *	unregister_netdev - remove device from the kernel
10636 *	@dev: device
10637 *
10638 *	This function shuts down a device interface and removes it
10639 *	from the kernel tables.
10640 *
10641 *	This is just a wrapper for unregister_netdevice that takes
10642 *	the rtnl semaphore.  In general you want to use this and not
10643 *	unregister_netdevice.
10644 */
10645void unregister_netdev(struct net_device *dev)
10646{
10647	rtnl_lock();
10648	unregister_netdevice(dev);
10649	rtnl_unlock();
10650}
10651EXPORT_SYMBOL(unregister_netdev);
10652
10653/**
10654 *	dev_change_net_namespace - move device to different nethost namespace
10655 *	@dev: device
10656 *	@net: network namespace
10657 *	@pat: If not NULL name pattern to try if the current device name
10658 *	      is already taken in the destination network namespace.
10659 *
10660 *	This function shuts down a device interface and moves it
10661 *	to a new network namespace. On success 0 is returned, on
10662 *	a failure a netagive errno code is returned.
10663 *
10664 *	Callers must hold the rtnl semaphore.
10665 */
10666
10667int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
10668{
10669	struct net *net_old = dev_net(dev);
10670	int err, new_nsid, new_ifindex;
10671
10672	ASSERT_RTNL();
10673
10674	/* Don't allow namespace local devices to be moved. */
10675	err = -EINVAL;
10676	if (dev->features & NETIF_F_NETNS_LOCAL)
10677		goto out;
10678
10679	/* Ensure the device has been registrered */
10680	if (dev->reg_state != NETREG_REGISTERED)
10681		goto out;
10682
10683	/* Get out if there is nothing todo */
10684	err = 0;
10685	if (net_eq(net_old, net))
10686		goto out;
10687
10688	/* Pick the destination device name, and ensure
10689	 * we can use it in the destination network namespace.
10690	 */
10691	err = -EEXIST;
10692	if (__dev_get_by_name(net, dev->name)) {
10693		/* We get here if we can't use the current device name */
10694		if (!pat)
10695			goto out;
10696		err = dev_get_valid_name(net, dev, pat);
10697		if (err < 0)
10698			goto out;
10699	}
10700
10701	/*
10702	 * And now a mini version of register_netdevice unregister_netdevice.
10703	 */
10704
10705	/* If device is running close it first. */
10706	dev_close(dev);
10707
10708	/* And unlink it from device chain */
10709	unlist_netdevice(dev);
10710
10711	synchronize_net();
10712
10713	/* Shutdown queueing discipline. */
10714	dev_shutdown(dev);
10715
10716	/* Notify protocols, that we are about to destroy
10717	 * this device. They should clean all the things.
10718	 *
10719	 * Note that dev->reg_state stays at NETREG_REGISTERED.
10720	 * This is wanted because this way 8021q and macvlan know
10721	 * the device is just moving and can keep their slaves up.
10722	 */
10723	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10724	rcu_barrier();
10725
10726	new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
10727	/* If there is an ifindex conflict assign a new one */
10728	if (__dev_get_by_index(net, dev->ifindex))
10729		new_ifindex = dev_new_index(net);
10730	else
10731		new_ifindex = dev->ifindex;
10732
10733	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
10734			    new_ifindex);
10735
10736	/*
10737	 *	Flush the unicast and multicast chains
10738	 */
10739	dev_uc_flush(dev);
10740	dev_mc_flush(dev);
10741
10742	/* Send a netdev-removed uevent to the old namespace */
10743	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
10744	netdev_adjacent_del_links(dev);
10745
10746	/* Move per-net netdevice notifiers that are following the netdevice */
10747	move_netdevice_notifiers_dev_net(dev, net);
10748
10749	/* Actually switch the network namespace */
10750	dev_net_set(dev, net);
10751	dev->ifindex = new_ifindex;
10752
10753	/* Send a netdev-add uevent to the new namespace */
10754	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
10755	netdev_adjacent_add_links(dev);
10756
10757	/* Fixup kobjects */
10758	err = device_rename(&dev->dev, dev->name);
10759	WARN_ON(err);
10760
10761	/* Adapt owner in case owning user namespace of target network
10762	 * namespace is different from the original one.
10763	 */
10764	err = netdev_change_owner(dev, net_old, net);
10765	WARN_ON(err);
10766
10767	/* Add the device back in the hashes */
10768	list_netdevice(dev);
10769
10770	/* Notify protocols, that a new device appeared. */
10771	call_netdevice_notifiers(NETDEV_REGISTER, dev);
10772
10773	/*
10774	 *	Prevent userspace races by waiting until the network
10775	 *	device is fully setup before sending notifications.
10776	 */
10777	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
10778
10779	synchronize_net();
10780	err = 0;
10781out:
10782	return err;
10783}
10784EXPORT_SYMBOL_GPL(dev_change_net_namespace);
10785
10786static int dev_cpu_dead(unsigned int oldcpu)
10787{
10788	struct sk_buff **list_skb;
10789	struct sk_buff *skb;
10790	unsigned int cpu;
10791	struct softnet_data *sd, *oldsd, *remsd = NULL;
10792
10793	local_irq_disable();
10794	cpu = smp_processor_id();
10795	sd = &per_cpu(softnet_data, cpu);
10796	oldsd = &per_cpu(softnet_data, oldcpu);
10797
10798	/* Find end of our completion_queue. */
10799	list_skb = &sd->completion_queue;
10800	while (*list_skb)
10801		list_skb = &(*list_skb)->next;
10802	/* Append completion queue from offline CPU. */
10803	*list_skb = oldsd->completion_queue;
10804	oldsd->completion_queue = NULL;
10805
10806	/* Append output queue from offline CPU. */
10807	if (oldsd->output_queue) {
10808		*sd->output_queue_tailp = oldsd->output_queue;
10809		sd->output_queue_tailp = oldsd->output_queue_tailp;
10810		oldsd->output_queue = NULL;
10811		oldsd->output_queue_tailp = &oldsd->output_queue;
10812	}
10813	/* Append NAPI poll list from offline CPU, with one exception :
10814	 * process_backlog() must be called by cpu owning percpu backlog.
10815	 * We properly handle process_queue & input_pkt_queue later.
10816	 */
10817	while (!list_empty(&oldsd->poll_list)) {
10818		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
10819							    struct napi_struct,
10820							    poll_list);
10821
10822		list_del_init(&napi->poll_list);
10823		if (napi->poll == process_backlog)
10824			napi->state = 0;
10825		else
10826			____napi_schedule(sd, napi);
10827	}
10828
10829	raise_softirq_irqoff(NET_TX_SOFTIRQ);
10830	local_irq_enable();
10831
10832#ifdef CONFIG_RPS
10833	remsd = oldsd->rps_ipi_list;
10834	oldsd->rps_ipi_list = NULL;
10835#endif
10836	/* send out pending IPI's on offline CPU */
10837	net_rps_send_ipi(remsd);
10838
10839	/* Process offline CPU's input_pkt_queue */
10840	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
10841		netif_rx_ni(skb);
10842		input_queue_head_incr(oldsd);
10843	}
10844	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
10845		netif_rx_ni(skb);
10846		input_queue_head_incr(oldsd);
10847	}
10848
10849	return 0;
10850}
10851
10852/**
10853 *	netdev_increment_features - increment feature set by one
10854 *	@all: current feature set
10855 *	@one: new feature set
10856 *	@mask: mask feature set
10857 *
10858 *	Computes a new feature set after adding a device with feature set
10859 *	@one to the master device with current feature set @all.  Will not
10860 *	enable anything that is off in @mask. Returns the new feature set.
10861 */
10862netdev_features_t netdev_increment_features(netdev_features_t all,
10863	netdev_features_t one, netdev_features_t mask)
10864{
10865	if (mask & NETIF_F_HW_CSUM)
10866		mask |= NETIF_F_CSUM_MASK;
10867	mask |= NETIF_F_VLAN_CHALLENGED;
10868
10869	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
10870	all &= one | ~NETIF_F_ALL_FOR_ALL;
10871
10872	/* If one device supports hw checksumming, set for all. */
10873	if (all & NETIF_F_HW_CSUM)
10874		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
10875
10876	return all;
10877}
10878EXPORT_SYMBOL(netdev_increment_features);
10879
10880static struct hlist_head * __net_init netdev_create_hash(void)
10881{
10882	int i;
10883	struct hlist_head *hash;
10884
10885	hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
10886	if (hash != NULL)
10887		for (i = 0; i < NETDEV_HASHENTRIES; i++)
10888			INIT_HLIST_HEAD(&hash[i]);
10889
10890	return hash;
10891}
10892
10893/* Initialize per network namespace state */
10894static int __net_init netdev_init(struct net *net)
10895{
10896	BUILD_BUG_ON(GRO_HASH_BUCKETS >
10897		     8 * sizeof_field(struct napi_struct, gro_bitmask));
10898
10899	if (net != &init_net)
10900		INIT_LIST_HEAD(&net->dev_base_head);
10901
10902	net->dev_name_head = netdev_create_hash();
10903	if (net->dev_name_head == NULL)
10904		goto err_name;
10905
10906	net->dev_index_head = netdev_create_hash();
10907	if (net->dev_index_head == NULL)
10908		goto err_idx;
10909
10910	RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
10911
10912	return 0;
10913
10914err_idx:
10915	kfree(net->dev_name_head);
10916err_name:
10917	return -ENOMEM;
10918}
10919
10920/**
10921 *	netdev_drivername - network driver for the device
10922 *	@dev: network device
10923 *
10924 *	Determine network driver for device.
10925 */
10926const char *netdev_drivername(const struct net_device *dev)
10927{
10928	const struct device_driver *driver;
10929	const struct device *parent;
10930	const char *empty = "";
10931
10932	parent = dev->dev.parent;
10933	if (!parent)
10934		return empty;
10935
10936	driver = parent->driver;
10937	if (driver && driver->name)
10938		return driver->name;
10939	return empty;
10940}
10941
10942static void __netdev_printk(const char *level, const struct net_device *dev,
10943			    struct va_format *vaf)
10944{
10945	if (dev && dev->dev.parent) {
10946		dev_printk_emit(level[1] - '0',
10947				dev->dev.parent,
10948				"%s %s %s%s: %pV",
10949				dev_driver_string(dev->dev.parent),
10950				dev_name(dev->dev.parent),
10951				netdev_name(dev), netdev_reg_state(dev),
10952				vaf);
10953	} else if (dev) {
10954		printk("%s%s%s: %pV",
10955		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
10956	} else {
10957		printk("%s(NULL net_device): %pV", level, vaf);
10958	}
10959}
10960
10961void netdev_printk(const char *level, const struct net_device *dev,
10962		   const char *format, ...)
10963{
10964	struct va_format vaf;
10965	va_list args;
10966
10967	va_start(args, format);
10968
10969	vaf.fmt = format;
10970	vaf.va = &args;
10971
10972	__netdev_printk(level, dev, &vaf);
10973
10974	va_end(args);
10975}
10976EXPORT_SYMBOL(netdev_printk);
10977
10978#define define_netdev_printk_level(func, level)			\
10979void func(const struct net_device *dev, const char *fmt, ...)	\
10980{								\
10981	struct va_format vaf;					\
10982	va_list args;						\
10983								\
10984	va_start(args, fmt);					\
10985								\
10986	vaf.fmt = fmt;						\
10987	vaf.va = &args;						\
10988								\
10989	__netdev_printk(level, dev, &vaf);			\
10990								\
10991	va_end(args);						\
10992}								\
10993EXPORT_SYMBOL(func);
10994
10995define_netdev_printk_level(netdev_emerg, KERN_EMERG);
10996define_netdev_printk_level(netdev_alert, KERN_ALERT);
10997define_netdev_printk_level(netdev_crit, KERN_CRIT);
10998define_netdev_printk_level(netdev_err, KERN_ERR);
10999define_netdev_printk_level(netdev_warn, KERN_WARNING);
11000define_netdev_printk_level(netdev_notice, KERN_NOTICE);
11001define_netdev_printk_level(netdev_info, KERN_INFO);
11002
11003static void __net_exit netdev_exit(struct net *net)
11004{
11005	kfree(net->dev_name_head);
11006	kfree(net->dev_index_head);
11007	if (net != &init_net)
11008		WARN_ON_ONCE(!list_empty(&net->dev_base_head));
11009}
11010
11011static struct pernet_operations __net_initdata netdev_net_ops = {
11012	.init = netdev_init,
11013	.exit = netdev_exit,
11014};
11015
11016static void __net_exit default_device_exit(struct net *net)
11017{
11018	struct net_device *dev, *aux;
11019	/*
11020	 * Push all migratable network devices back to the
11021	 * initial network namespace
11022	 */
11023	rtnl_lock();
11024	for_each_netdev_safe(net, dev, aux) {
11025		int err;
11026		char fb_name[IFNAMSIZ];
11027
11028		/* Ignore unmoveable devices (i.e. loopback) */
11029		if (dev->features & NETIF_F_NETNS_LOCAL)
11030			continue;
11031
11032		/* Leave virtual devices for the generic cleanup */
11033		if (dev->rtnl_link_ops)
11034			continue;
11035
11036		/* Push remaining network devices to init_net */
11037		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
11038		if (__dev_get_by_name(&init_net, fb_name))
11039			snprintf(fb_name, IFNAMSIZ, "dev%%d");
11040		err = dev_change_net_namespace(dev, &init_net, fb_name);
11041		if (err) {
11042			pr_emerg("%s: failed to move %s to init_net: %d\n",
11043				 __func__, dev->name, err);
11044			BUG();
11045		}
11046	}
11047	rtnl_unlock();
11048}
11049
11050static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
11051{
11052	/* Return with the rtnl_lock held when there are no network
11053	 * devices unregistering in any network namespace in net_list.
11054	 */
11055	struct net *net;
11056	bool unregistering;
11057	DEFINE_WAIT_FUNC(wait, woken_wake_function);
11058
11059	add_wait_queue(&netdev_unregistering_wq, &wait);
11060	for (;;) {
11061		unregistering = false;
11062		rtnl_lock();
11063		list_for_each_entry(net, net_list, exit_list) {
11064			if (net->dev_unreg_count > 0) {
11065				unregistering = true;
11066				break;
11067			}
11068		}
11069		if (!unregistering)
11070			break;
11071		__rtnl_unlock();
11072
11073		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
11074	}
11075	remove_wait_queue(&netdev_unregistering_wq, &wait);
11076}
11077
11078static void __net_exit default_device_exit_batch(struct list_head *net_list)
11079{
11080	/* At exit all network devices most be removed from a network
11081	 * namespace.  Do this in the reverse order of registration.
11082	 * Do this across as many network namespaces as possible to
11083	 * improve batching efficiency.
11084	 */
11085	struct net_device *dev;
11086	struct net *net;
11087	LIST_HEAD(dev_kill_list);
11088
11089	/* To prevent network device cleanup code from dereferencing
11090	 * loopback devices or network devices that have been freed
11091	 * wait here for all pending unregistrations to complete,
11092	 * before unregistring the loopback device and allowing the
11093	 * network namespace be freed.
11094	 *
11095	 * The netdev todo list containing all network devices
11096	 * unregistrations that happen in default_device_exit_batch
11097	 * will run in the rtnl_unlock() at the end of
11098	 * default_device_exit_batch.
11099	 */
11100	rtnl_lock_unregistering(net_list);
11101	list_for_each_entry(net, net_list, exit_list) {
11102		for_each_netdev_reverse(net, dev) {
11103			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
11104				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
11105			else
11106				unregister_netdevice_queue(dev, &dev_kill_list);
11107		}
11108	}
11109	unregister_netdevice_many(&dev_kill_list);
11110	rtnl_unlock();
11111}
11112
11113static struct pernet_operations __net_initdata default_device_ops = {
11114	.exit = default_device_exit,
11115	.exit_batch = default_device_exit_batch,
11116};
11117
11118/*
11119 *	Initialize the DEV module. At boot time this walks the device list and
11120 *	unhooks any devices that fail to initialise (normally hardware not
11121 *	present) and leaves us with a valid list of present and active devices.
11122 *
11123 */
11124
11125/*
11126 *       This is called single threaded during boot, so no need
11127 *       to take the rtnl semaphore.
11128 */
11129static int __init net_dev_init(void)
11130{
11131	int i, rc = -ENOMEM;
11132
11133	BUG_ON(!dev_boot_phase);
11134
11135	if (dev_proc_init())
11136		goto out;
11137
11138	if (netdev_kobject_init())
11139		goto out;
11140
11141	INIT_LIST_HEAD(&ptype_all);
11142	for (i = 0; i < PTYPE_HASH_SIZE; i++)
11143		INIT_LIST_HEAD(&ptype_base[i]);
11144
11145	INIT_LIST_HEAD(&offload_base);
11146
11147	if (register_pernet_subsys(&netdev_net_ops))
11148		goto out;
11149
11150	/*
11151	 *	Initialise the packet receive queues.
11152	 */
11153
11154	for_each_possible_cpu(i) {
11155		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
11156		struct softnet_data *sd = &per_cpu(softnet_data, i);
11157
11158		INIT_WORK(flush, flush_backlog);
11159
11160		skb_queue_head_init(&sd->input_pkt_queue);
11161		skb_queue_head_init(&sd->process_queue);
11162#ifdef CONFIG_XFRM_OFFLOAD
11163		skb_queue_head_init(&sd->xfrm_backlog);
11164#endif
11165		INIT_LIST_HEAD(&sd->poll_list);
11166		sd->output_queue_tailp = &sd->output_queue;
11167#ifdef CONFIG_RPS
11168		sd->csd.func = rps_trigger_softirq;
11169		sd->csd.info = sd;
11170		sd->cpu = i;
11171#endif
11172
11173		init_gro_hash(&sd->backlog);
11174		sd->backlog.poll = process_backlog;
11175		sd->backlog.weight = weight_p;
11176	}
11177
11178	dev_boot_phase = 0;
11179
11180	/* The loopback device is special if any other network devices
11181	 * is present in a network namespace the loopback device must
11182	 * be present. Since we now dynamically allocate and free the
11183	 * loopback device ensure this invariant is maintained by
11184	 * keeping the loopback device as the first device on the
11185	 * list of network devices.  Ensuring the loopback devices
11186	 * is the first device that appears and the last network device
11187	 * that disappears.
11188	 */
11189	if (register_pernet_device(&loopback_net_ops))
11190		goto out;
11191
11192	if (register_pernet_device(&default_device_ops))
11193		goto out;
11194
11195	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
11196	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
11197
11198	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
11199				       NULL, dev_cpu_dead);
11200	WARN_ON(rc < 0);
11201	rc = 0;
11202out:
11203	return rc;
11204}
11205
11206subsys_initcall(net_dev_init);