/*
 *      NET3    Protocol independent device support routines.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 *      Derived from the non IP parts of dev.c 1.0.19
 *              Authors:        Ross Biro
 *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *      Additional Authors:
 *              Florian la Roche <rzsfl@rz.uni-sb.de>
 *              Alan Cox <gw4pts@gw4pts.ampr.org>
 *              David Hinds <dahinds@users.sourceforge.net>
 *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *              Adam Sulmicki <adam@cfar.umd.edu>
 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *      Changes:
 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
 *                                      to 2 if register_netdev gets called
 *                                      before net_dev_init & also removed a
 *                                      few lines of code in the process.
 *              Alan Cox        :       device private ioctl copies fields back.
 *              Alan Cox        :       Transmit queue code does relevant
 *                                      stunts to keep the queue safe.
 *              Alan Cox        :       Fixed double lock.
 *              Alan Cox        :       Fixed promisc NULL pointer trap
 *              ????????        :       Support the full private ioctl range
 *              Alan Cox        :       Moved ioctl permission check into
 *                                      drivers
 *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
 *              Alan Cox        :       100 backlog just doesn't cut it when
 *                                      you start doing multicast video 8)
 *              Alan Cox        :       Rewrote net_bh and list manager.
 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
 *              Alan Cox        :       Took out transmit every packet pass
 *                                      Saved a few bytes in the ioctl handler
 *              Alan Cox        :       Network driver sets packet type before
 *                                      calling netif_rx. Saves a function
 *                                      call a packet.
 *              Alan Cox        :       Hashed net_bh()
 *              Richard Kooijman:       Timestamp fixes.
 *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
 *              Alan Cox        :       Device lock protection.
 *              Alan Cox        :       Fixed nasty side effect of device close
 *                                      changes.
 *              Rudi Cilibrasi  :       Pass the right thing to
 *                                      set_mac_address()
 *              Dave Miller     :       32bit quantity for the device lock to
 *                                      make it work out on a Sparc.
 *              Bjorn Ekwall    :       Added KERNELD hack.
 *              Alan Cox        :       Cleaned up the backlog initialise.
 *              Craig Metz      :       SIOCGIFCONF fix if space for under
 *                                      1 device.
 *              Thomas Bogendoerfer :   Return ENODEV for dev_open, if there
 *                                      is no device open function.
 *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
 *              Michael Chastain :      Fix signed/unsigned for SIOCGIFCONF
 *              Cyrus Durgin    :       Cleaned for KMOD
 *              Adam Sulmicki   :       Bug Fix : Network Device Unload
 *                                      A network device unload needs to purge
 *                                      the backlog queue.
 *              Paul Rusty Russell :    SIOCSIFNAME
 *              Pekka Riikonen  :       Netdev boot-time settings code
 *              Andrew Morton   :       Make unregister_netdevice wait
 *                                      indefinitely on dev->refcnt
 *              J Hadi Salim    :       - Backlog queue sampling
 *                                      - netif_rx() feedback
 */

#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <linux/bpf.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/busy_poll.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_ingress.h>
#include <linux/crash_dump.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;       /* Taps */
static struct list_head offload_base __read_mostly;

static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
                                         struct net_device *dev,
                                         struct netdev_notifier_info *info);

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock().
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates. This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See register_netdevice() and unregister_netdevice(), which must be
 * called with the rtnl semaphore held, for example usage.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
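
/*
 * Illustrative sketch (not part of the original file): a pure reader
 * following the locking rules above. It takes dev_base_lock for reading
 * while walking the device list; the helper name count_netdevs() is
 * hypothetical.
 *
 *      static int count_netdevs(struct net *net)
 *      {
 *              struct net_device *dev;
 *              int n = 0;
 *
 *              read_lock(&dev_base_lock);
 *              for_each_netdev(net, dev)
 *                      n++;
 *              read_unlock(&dev_base_lock);
 *              return n;
 *      }
 */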

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);

static seqcount_t devnet_rename_seq;

static inline void dev_base_seq_inc(struct net *net)
{
        while (++net->dev_base_seq == 0)
                ;
}

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
        unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));

        return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
        return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
        struct net *net = dev_net(dev);

        ASSERT_RTNL();

        write_lock_bh(&dev_base_lock);
        list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
        hlist_add_head_rcu(&dev->index_hlist,
                           dev_index_hash(net, dev->ifindex));
        write_unlock_bh(&dev_base_lock);

        dev_base_seq_inc(net);
}

/* Device list removal
 * caller must respect an RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
        ASSERT_RTNL();

        /* Unlink dev from the device chain */
        write_lock_bh(&dev_base_lock);
        list_del_rcu(&dev->dev_list);
        hlist_del_rcu(&dev->name_hlist);
        hlist_del_rcu(&dev->index_hlist);
        write_unlock_bh(&dev_base_lock);

        dev_base_seq_inc(dev_net(dev));
}

/*
 * Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 * Device drivers call our routines to queue packets here. We empty the
 * queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] = {
        ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
        ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
        ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
        ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
        ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
        ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
        ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
        ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
        ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
        ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
        ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
        ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
        ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
        ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
        ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] = {
        "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
        "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
        "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
        "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
        "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
        "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
        "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
        "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
        "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
        "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
        "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
        "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
        "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
        "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
        "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
                if (netdev_lock_type[i] == dev_type)
                        return i;
        /* the last key is used by default */
        return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
                                                 unsigned short dev_type)
{
        int i;

        i = netdev_lock_pos(dev_type);
        lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
                                   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
        int i;

        i = netdev_lock_pos(dev->type);
        lockdep_set_class_and_name(&dev->addr_list_lock,
                                   &netdev_addr_lock_key[i],
                                   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
                                                 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************

        Protocol management and registration routines

*******************************************************************************/

/*
 * Add a protocol ID to the list. Now that the input handler is
 * smarter we can dispense with all the messy stuff that used to be
 * here.
 *
 * BEWARE!!! Protocol handlers, mangling input packets,
 * MUST BE last in hash buckets and checking protocol handlers
 * MUST start from promiscuous ptype_all chain in net_bh.
 * It is true now, do not change it.
 * Explanation: if a protocol handler that mangles packets were first
 * on the list, it could not detect that the packet is cloned and
 * should be copied-on-write; it would modify the clone in place and
 * subsequent readers would get a broken packet.
 * --ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
        if (pt->type == htons(ETH_P_ALL))
                return pt->dev ? &pt->dev->ptype_all : &ptype_all;
        else
                return pt->dev ? &pt->dev->ptype_specific :
                                 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 * dev_add_pack - add packet handler
 * @pt: packet type declaration
 *
 * Add a protocol handler to the networking stack. The passed &packet_type
 * is linked into kernel lists and may not be freed until it has been
 * removed from the kernel lists.
 *
 * This call does not sleep and therefore cannot guarantee that all
 * CPUs in the middle of receiving packets will see the new packet
 * type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
        struct list_head *head = ptype_head(pt);

        spin_lock(&ptype_lock);
        list_add_rcu(&pt->list, head);
        spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);
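
/*
 * Illustrative sketch (not part of the original file): registering a tap
 * for all protocols with dev_add_pack(). The handler and variable names
 * are hypothetical; a module would pair this with dev_remove_pack(&my_tap)
 * on exit.
 *
 *      static int my_tap_rcv(struct sk_buff *skb, struct net_device *dev,
 *                            struct packet_type *pt,
 *                            struct net_device *orig_dev)
 *      {
 *              // consume the clone delivered to this handler
 *              kfree_skb(skb);
 *              return NET_RX_SUCCESS;
 *      }
 *
 *      static struct packet_type my_tap __read_mostly = {
 *              .type = htons(ETH_P_ALL),       // tap every device
 *              .func = my_tap_rcv,
 *      };
 *
 *      dev_add_pack(&my_tap);
 */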

/**
 * __dev_remove_pack - remove packet handler
 * @pt: packet type declaration
 *
 * Remove a protocol handler that was previously added to the kernel
 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
 * from the kernel lists and can be freed or reused once this function
 * returns.
 *
 * The packet type might still be in use by receivers
 * and must not be freed until after all the CPUs have gone
 * through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
        struct list_head *head = ptype_head(pt);
        struct packet_type *pt1;

        spin_lock(&ptype_lock);

        list_for_each_entry(pt1, head, list) {
                if (pt == pt1) {
                        list_del_rcu(&pt->list);
                        goto out;
                }
        }

        pr_warn("dev_remove_pack: %p not found\n", pt);
out:
        spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 * dev_remove_pack - remove packet handler
 * @pt: packet type declaration
 *
 * Remove a protocol handler that was previously added to the kernel
 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
 * from the kernel lists and can be freed or reused once this function
 * returns.
 *
 * This call sleeps to guarantee that no CPU is looking at the packet
 * type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
        __dev_remove_pack(pt);

        synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);


/**
 * dev_add_offload - register offload handlers
 * @po: protocol offload declaration
 *
 * Add protocol offload handlers to the networking stack. The passed
 * &proto_offload is linked into kernel lists and may not be freed until
 * it has been removed from the kernel lists.
 *
 * This call does not sleep and therefore cannot guarantee that all
 * CPUs in the middle of receiving packets will see the new offload
 * handlers (until the next received packet).
 */
void dev_add_offload(struct packet_offload *po)
{
        struct packet_offload *elem;

        spin_lock(&offload_lock);
        list_for_each_entry(elem, &offload_base, list) {
                if (po->priority < elem->priority)
                        break;
        }
        list_add_rcu(&po->list, elem->list.prev);
        spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);
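
/*
 * Illustrative sketch (not part of the original file): registering
 * offload callbacks for a protocol. The callback names are hypothetical;
 * only .type and .priority are used by dev_add_offload() above, and the
 * insertion is kept sorted by ascending priority.
 *
 *      static struct packet_offload my_offload __read_mostly = {
 *              .type = cpu_to_be16(ETH_P_IP),
 *              .priority = 10,
 *              .callbacks = {
 *                      .gso_segment = my_gso_segment,
 *                      .gro_receive = my_gro_receive,
 *                      .gro_complete = my_gro_complete,
 *              },
 *      };
 *
 *      dev_add_offload(&my_offload);
 */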

/**
 * __dev_remove_offload - remove offload handler
 * @po: packet offload declaration
 *
 * Remove a protocol offload handler that was previously added to the
 * kernel offload handlers by dev_add_offload(). The passed &offload_type
 * is removed from the kernel lists and can be freed or reused once this
 * function returns.
 *
 * The packet type might still be in use by receivers
 * and must not be freed until after all the CPUs have gone
 * through a quiescent state.
 */
static void __dev_remove_offload(struct packet_offload *po)
{
        struct list_head *head = &offload_base;
        struct packet_offload *po1;

        spin_lock(&offload_lock);

        list_for_each_entry(po1, head, list) {
                if (po == po1) {
                        list_del_rcu(&po->list);
                        goto out;
                }
        }

        pr_warn("dev_remove_offload: %p not found\n", po);
out:
        spin_unlock(&offload_lock);
}

/**
 * dev_remove_offload - remove packet offload handler
 * @po: packet offload declaration
 *
 * Remove a packet offload handler that was previously added to the kernel
 * offload handlers by dev_add_offload(). The passed &offload_type is
 * removed from the kernel lists and can be freed or reused once this
 * function returns.
 *
 * This call sleeps to guarantee that no CPU is looking at the packet
 * type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
        __dev_remove_offload(po);

        synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);

/*******************************************************************************

        Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 * netdev_boot_setup_add - add new setup entry
 * @name: name of the device
 * @map: configured settings for the device
 *
 * Adds a new setup entry to the dev_boot_setup list. The function
 * returns 0 on error and 1 on success. This is a generic routine for
 * all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
        struct netdev_boot_setup *s;
        int i;

        s = dev_boot_setup;
        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
                if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
                        memset(s[i].name, 0, sizeof(s[i].name));
                        strlcpy(s[i].name, name, IFNAMSIZ);
                        memcpy(&s[i].map, map, sizeof(s[i].map));
                        break;
                }
        }

        return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 * netdev_boot_setup_check - check boot time settings
 * @dev: the netdevice
 *
 * Check boot time settings for the device.
 * The found settings are set for the device to be used
 * later in the device probing.
 * Returns 0 if no settings are found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
        struct netdev_boot_setup *s = dev_boot_setup;
        int i;

        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
                if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
                    !strcmp(dev->name, s[i].name)) {
                        dev->irq = s[i].map.irq;
                        dev->base_addr = s[i].map.base_addr;
                        dev->mem_start = s[i].map.mem_start;
                        dev->mem_end = s[i].map.mem_end;
                        return 1;
                }
        }
        return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);


/**
 * netdev_boot_base - get address from boot time settings
 * @prefix: prefix for network device
 * @unit: id for network device
 *
 * Check boot time settings for the base address of the device.
 * The found settings are set for the device to be used
 * later in the device probing.
 * Returns 0 if no settings are found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
        const struct netdev_boot_setup *s = dev_boot_setup;
        char name[IFNAMSIZ];
        int i;

        sprintf(name, "%s%d", prefix, unit);

        /*
         * If the device is already registered, return a base of 1
         * to indicate not to probe for this interface.
         */
        if (__dev_get_by_name(&init_net, name))
                return 1;

        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
                if (!strcmp(name, s[i].name))
                        return s[i].map.base_addr;
        return 0;
}

/*
 * Saves the settings configured at boot time for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
        int ints[5];
        struct ifmap map;

        str = get_options(str, ARRAY_SIZE(ints), ints);
        if (!str || !*str)
                return 0;

        /* Save settings */
        memset(&map, 0, sizeof(map));
        if (ints[0] > 0)
                map.irq = ints[1];
        if (ints[0] > 1)
                map.base_addr = ints[2];
        if (ints[0] > 2)
                map.mem_start = ints[3];
        if (ints[0] > 3)
                map.mem_end = ints[4];

        /* Add new entry to the list */
        return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);
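
/*
 * Illustrative sketch (an assumption based on the parsing above): the
 * "netdev=" boot parameter takes up to four integers followed by the
 * device name, e.g. on the kernel command line:
 *
 *      netdev=5,0x340,0xd0000,0xd4000,eth0
 *
 * which stores irq=5, base_addr=0x340, mem_start=0xd0000 and
 * mem_end=0xd4000 for eth0, to be picked up later by
 * netdev_boot_setup_check().
 */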

/*******************************************************************************

        Device Interface Subroutines

*******************************************************************************/

/**
 * dev_get_iflink - get 'iflink' value of an interface
 * @dev: targeted interface
 *
 * Indicates the ifindex the interface is linked to.
 * Physical interfaces have the same 'ifindex' and 'iflink' values.
 */

int dev_get_iflink(const struct net_device *dev)
{
        if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
                return dev->netdev_ops->ndo_get_iflink(dev);

        return dev->ifindex;
}
EXPORT_SYMBOL(dev_get_iflink);

/**
 * dev_fill_metadata_dst - Retrieve tunnel egress information.
 * @dev: targeted interface
 * @skb: The packet.
 *
 * For better visibility of tunnel traffic OVS needs to retrieve
 * egress tunnel information for a packet. The following API allows
 * the user to get this info.
 */
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
        struct ip_tunnel_info *info;

        if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
                return -EINVAL;

        info = skb_tunnel_info_unclone(skb);
        if (!info)
                return -ENOMEM;
        if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
                return -EINVAL;

        return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
}
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);

/**
 * __dev_get_by_name - find a device by its name
 * @net: the applicable net namespace
 * @name: name to find
 *
 * Find an interface by name. Must be called under the RTNL semaphore
 * or @dev_base_lock. If the name is found a pointer to the device
 * is returned. If the name is not found then %NULL is returned. The
 * reference counters are not incremented so the caller must be
 * careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
        struct net_device *dev;
        struct hlist_head *head = dev_name_hash(net, name);

        hlist_for_each_entry(dev, head, name_hlist)
                if (!strncmp(dev->name, name, IFNAMSIZ))
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 * dev_get_by_name_rcu - find a device by its name
 * @net: the applicable net namespace
 * @name: name to find
 *
 * Find an interface by name.
 * If the name is found a pointer to the device is returned.
 * If the name is not found then %NULL is returned.
 * The reference counters are not incremented so the caller must be
 * careful with locks. The caller must hold the RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
        struct net_device *dev;
        struct hlist_head *head = dev_name_hash(net, name);

        hlist_for_each_entry_rcu(dev, head, name_hlist)
                if (!strncmp(dev->name, name, IFNAMSIZ))
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 * dev_get_by_name - find a device by its name
 * @net: the applicable net namespace
 * @name: name to find
 *
 * Find an interface by name. This can be called from any
 * context and does its own locking. The returned handle has
 * the usage count incremented and the caller must use dev_put() to
 * release it when it is no longer needed. %NULL is returned if no
 * matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
        struct net_device *dev;

        rcu_read_lock();
        dev = dev_get_by_name_rcu(net, name);
        if (dev)
                dev_hold(dev);
        rcu_read_unlock();
        return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
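
/*
 * Illustrative sketch (not part of the original file): the typical
 * lookup/release pairing for dev_get_by_name(). The body between the
 * calls is hypothetical.
 *
 *      struct net_device *dev = dev_get_by_name(net, "eth0");
 *
 *      if (dev) {
 *              // ... use dev; the held reference keeps it alive
 *              dev_put(dev);
 *      }
 */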

/**
 * __dev_get_by_index - find a device by its ifindex
 * @net: the applicable net namespace
 * @ifindex: index of device
 *
 * Search for an interface by index. Returns a pointer to the device,
 * or %NULL if it is not found. The device has not
 * had its reference counter increased so the caller must be careful
 * about locking. The caller must hold either the RTNL semaphore
 * or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
        struct net_device *dev;
        struct hlist_head *head = dev_index_hash(net, ifindex);

        hlist_for_each_entry(dev, head, index_hlist)
                if (dev->ifindex == ifindex)
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 * dev_get_by_index_rcu - find a device by its ifindex
 * @net: the applicable net namespace
 * @ifindex: index of device
 *
 * Search for an interface by index. Returns a pointer to the device,
 * or %NULL if it is not found. The device has not
 * had its reference counter increased so the caller must be careful
 * about locking. The caller must hold the RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
        struct net_device *dev;
        struct hlist_head *head = dev_index_hash(net, ifindex);

        hlist_for_each_entry_rcu(dev, head, index_hlist)
                if (dev->ifindex == ifindex)
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);


/**
 * dev_get_by_index - find a device by its ifindex
 * @net: the applicable net namespace
 * @ifindex: index of device
 *
 * Search for an interface by index. Returns a pointer to the device,
 * or NULL if it is not found. The device returned has
 * had a reference added and the pointer is safe until the user calls
 * dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
        struct net_device *dev;

        rcu_read_lock();
        dev = dev_get_by_index_rcu(net, ifindex);
        if (dev)
                dev_hold(dev);
        rcu_read_unlock();
        return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 * netdev_get_name - get a netdevice name, knowing its ifindex.
 * @net: network namespace
 * @name: a pointer to the buffer where the name will be stored.
 * @ifindex: the ifindex of the interface to get the name from.
 *
 * The use of raw_seqcount_begin() and cond_resched() before
 * retrying is required as we want to give the writers a chance
 * to complete when CONFIG_PREEMPT is not set.
 */
int netdev_get_name(struct net *net, char *name, int ifindex)
{
        struct net_device *dev;
        unsigned int seq;

retry:
        seq = raw_seqcount_begin(&devnet_rename_seq);
        rcu_read_lock();
        dev = dev_get_by_index_rcu(net, ifindex);
        if (!dev) {
                rcu_read_unlock();
                return -ENODEV;
        }

        strcpy(name, dev->name);
        rcu_read_unlock();
        if (read_seqcount_retry(&devnet_rename_seq, seq)) {
                cond_resched();
                goto retry;
        }

        return 0;
}

/**
 * dev_getbyhwaddr_rcu - find a device by its hardware address
 * @net: the applicable net namespace
 * @type: media type of device
 * @ha: hardware address
 *
 * Search for an interface by MAC address. Returns a pointer to the
 * device, or NULL if it is not found.
 * The caller must hold RCU or RTNL.
 * The returned device has not had its ref count increased
 * and the caller must therefore be careful about locking.
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
                                       const char *ha)
{
        struct net_device *dev;

        for_each_netdev_rcu(net, dev)
                if (dev->type == type &&
                    !memcmp(dev->dev_addr, ha, dev->addr_len))
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
        struct net_device *dev;

        ASSERT_RTNL();
        for_each_netdev(net, dev)
                if (dev->type == type)
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
        struct net_device *dev, *ret = NULL;

        rcu_read_lock();
        for_each_netdev_rcu(net, dev)
                if (dev->type == type) {
                        dev_hold(dev);
                        ret = dev;
                        break;
                }
        rcu_read_unlock();
        return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 * __dev_get_by_flags - find any device with given flags
 * @net: the applicable net namespace
 * @if_flags: IFF_* values
 * @mask: bitmask of bits in if_flags to check
 *
 * Search for any interface with the given flags. Returns a pointer to
 * the first matching device, or NULL if none is found. Must be called
 * inside rtnl_lock(), and the result refcount is unchanged.
 */

struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
                                      unsigned short mask)
{
        struct net_device *dev, *ret;

        ASSERT_RTNL();

        ret = NULL;
        for_each_netdev(net, dev) {
                if (((dev->flags ^ if_flags) & mask) == 0) {
                        ret = dev;
                        break;
                }
        }
        return ret;
}
EXPORT_SYMBOL(__dev_get_by_flags);

/**
 * dev_valid_name - check if name is okay for network device
 * @name: name string
 *
 * Network device names need to be valid file names to
 * allow sysfs to work. We also disallow any kind of
 * whitespace.
 */
bool dev_valid_name(const char *name)
{
        if (*name == '\0')
                return false;
        if (strlen(name) >= IFNAMSIZ)
                return false;
        if (!strcmp(name, ".") || !strcmp(name, ".."))
                return false;

        while (*name) {
                if (*name == '/' || *name == ':' || isspace(*name))
                        return false;
                name++;
        }
        return true;
}
EXPORT_SYMBOL(dev_valid_name);

/**
 * __dev_alloc_name - allocate a name for a device
 * @net: network namespace to allocate the device name in
 * @name: name format string
 * @buf: scratch buffer and result name string
 *
 * Passed a format string, e.g. "lt%d", it will try to find a suitable
 * id. It scans the list of devices to build up a free map, then chooses
 * the first empty slot. The caller must hold the dev_base or rtnl lock
 * while allocating the name and adding the device in order to avoid
 * duplicates.
 * Limited to bits_per_byte * page size devices (i.e. 32K on most platforms).
 * Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
        int i = 0;
        const char *p;
        const int max_netdevices = 8*PAGE_SIZE;
        unsigned long *inuse;
        struct net_device *d;

        p = strnchr(name, IFNAMSIZ-1, '%');
        if (p) {
                /*
                 * Verify the string as this thing may have come from
                 * the user. There must be exactly one "%d" and no other
                 * "%" characters.
                 */
                if (p[1] != 'd' || strchr(p + 2, '%'))
                        return -EINVAL;

                /* Use one page as a bit array of possible slots */
                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
                if (!inuse)
                        return -ENOMEM;

                for_each_netdev(net, d) {
                        if (!sscanf(d->name, name, &i))
                                continue;
                        if (i < 0 || i >= max_netdevices)
                                continue;

                        /* avoid cases where sscanf is not exact inverse of printf */
                        snprintf(buf, IFNAMSIZ, name, i);
                        if (!strncmp(buf, d->name, IFNAMSIZ))
                                set_bit(i, inuse);
                }

                i = find_first_zero_bit(inuse, max_netdevices);
                free_page((unsigned long) inuse);
        }

        if (buf != name)
                snprintf(buf, IFNAMSIZ, name, i);
        if (!__dev_get_by_name(net, buf))
                return i;

        /* It is possible to run out of possible slots
         * when the name is long and there isn't enough space left
         * for the digits, or if all bits are used.
         */
        return -ENFILE;
}

/**
 * dev_alloc_name - allocate a name for a device
 * @dev: device
 * @name: name format string
 *
 * Passed a format string, e.g. "lt%d", it will try to find a suitable
 * id. It scans the list of devices to build up a free map, then chooses
 * the first empty slot. The caller must hold the dev_base or rtnl lock
 * while allocating the name and adding the device in order to avoid
 * duplicates.
 * Limited to bits_per_byte * page size devices (i.e. 32K on most platforms).
 * Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
        char buf[IFNAMSIZ];
        struct net *net;
        int ret;

        BUG_ON(!dev_net(dev));
        net = dev_net(dev);
        ret = __dev_alloc_name(net, name, buf);
        if (ret >= 0)
                strlcpy(dev->name, buf, IFNAMSIZ);
        return ret;
}
EXPORT_SYMBOL(dev_alloc_name);
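
/*
 * Illustrative sketch (not part of the original file): asking for the
 * first free "eth%d" slot. With eth0 and eth1 already registered this
 * would set dev->name to "eth2" and return 2; error handling is
 * deliberately minimal.
 *
 *      int unit = dev_alloc_name(dev, "eth%d");
 *
 *      if (unit < 0)
 *              return unit;    // e.g. -EINVAL or -ENFILE
 */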

static int dev_alloc_name_ns(struct net *net,
                             struct net_device *dev,
                             const char *name)
{
        char buf[IFNAMSIZ];
        int ret;

        ret = __dev_alloc_name(net, name, buf);
        if (ret >= 0)
                strlcpy(dev->name, buf, IFNAMSIZ);
        return ret;
}

static int dev_get_valid_name(struct net *net,
                              struct net_device *dev,
                              const char *name)
{
        BUG_ON(!net);

        if (!dev_valid_name(name))
                return -EINVAL;

        if (strchr(name, '%'))
                return dev_alloc_name_ns(net, dev, name);
        else if (__dev_get_by_name(net, name))
                return -EEXIST;
        else if (dev->name != name)
                strlcpy(dev->name, name, IFNAMSIZ);

        return 0;
}

/**
 * dev_change_name - change name of a device
 * @dev: device
 * @newname: name (or format string) must be at least IFNAMSIZ
 *
 * Change the name of a device. A format string such as "eth%d" can be
 * passed for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
        unsigned char old_assign_type;
        char oldname[IFNAMSIZ];
        int err = 0;
        int ret;
        struct net *net;

        ASSERT_RTNL();
        BUG_ON(!dev_net(dev));

        net = dev_net(dev);
        if (dev->flags & IFF_UP)
                return -EBUSY;

        write_seqcount_begin(&devnet_rename_seq);

        if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
                write_seqcount_end(&devnet_rename_seq);
                return 0;
        }

        memcpy(oldname, dev->name, IFNAMSIZ);

        err = dev_get_valid_name(net, dev, newname);
        if (err < 0) {
                write_seqcount_end(&devnet_rename_seq);
                return err;
        }

        if (oldname[0] && !strchr(oldname, '%'))
                netdev_info(dev, "renamed from %s\n", oldname);

        old_assign_type = dev->name_assign_type;
        dev->name_assign_type = NET_NAME_RENAMED;

rollback:
        ret = device_rename(&dev->dev, dev->name);
        if (ret) {
                memcpy(dev->name, oldname, IFNAMSIZ);
                dev->name_assign_type = old_assign_type;
                write_seqcount_end(&devnet_rename_seq);
                return ret;
        }

        write_seqcount_end(&devnet_rename_seq);

        netdev_adjacent_rename_links(dev, oldname);

        write_lock_bh(&dev_base_lock);
        hlist_del_rcu(&dev->name_hlist);
        write_unlock_bh(&dev_base_lock);

        synchronize_rcu();

        write_lock_bh(&dev_base_lock);
        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
        write_unlock_bh(&dev_base_lock);

        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
        ret = notifier_to_errno(ret);

        if (ret) {
                /* err >= 0 after dev_alloc_name() or stores the first errno */
                if (err >= 0) {
                        err = ret;
                        write_seqcount_begin(&devnet_rename_seq);
                        memcpy(dev->name, oldname, IFNAMSIZ);
                        memcpy(oldname, newname, IFNAMSIZ);
                        dev->name_assign_type = old_assign_type;
                        old_assign_type = NET_NAME_RENAMED;
                        goto rollback;
                } else {
                        pr_err("%s: name change rollback failed: %d\n",
                               dev->name, ret);
                }
        }

        return err;
}

/**
 * dev_set_alias - change ifalias of a device
 * @dev: device
 * @alias: name up to IFALIASZ
 * @len: limit of bytes to copy from info
 *
 * Set the ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
        char *new_ifalias;

        ASSERT_RTNL();

        if (len >= IFALIASZ)
                return -EINVAL;

        if (!len) {
                kfree(dev->ifalias);
                dev->ifalias = NULL;
                return 0;
        }

        new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
        if (!new_ifalias)
                return -ENOMEM;
        dev->ifalias = new_ifalias;

        strlcpy(dev->ifalias, alias, len+1);
        return len;
}


/**
 * netdev_features_change - device changes features
 * @dev: device to cause notification
 *
 * Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 * netdev_state_change - device changes state
 * @dev: device to cause notification
 *
 * Called to indicate a device has changed state. This function calls
 * the notifier chains for netdev_chain and sends a NEWLINK message
 * to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
        if (dev->flags & IFF_UP) {
                struct netdev_notifier_change_info change_info;

                change_info.flags_changed = 0;
                call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
                                              &change_info.info);
                rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
        }
}
EXPORT_SYMBOL(netdev_state_change);

/**
 * netdev_notify_peers - notify network peers about existence of @dev
 * @dev: network device
 *
 * Generate traffic such that interested network peers are aware of
 * @dev, such as by generating a gratuitous ARP. This may be used when
 * a device wants to inform the rest of the network about some sort of
 * reconfiguration such as a failover event or virtual machine
 * migration.
 */
void netdev_notify_peers(struct net_device *dev)
{
        rtnl_lock();
        call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
        rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);

static int __dev_open(struct net_device *dev)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        int ret;

        ASSERT_RTNL();

        if (!netif_device_present(dev))
                return -ENODEV;

        /* Block netpoll from trying to do any rx path servicing.
         * If we don't do this there is a chance ndo_poll_controller
         * or ndo_poll may be running while we open the device
         */
        netpoll_poll_disable(dev);

        ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
        ret = notifier_to_errno(ret);
        if (ret)
                return ret;

        set_bit(__LINK_STATE_START, &dev->state);

        if (ops->ndo_validate_addr)
                ret = ops->ndo_validate_addr(dev);

        if (!ret && ops->ndo_open)
                ret = ops->ndo_open(dev);

        netpoll_poll_enable(dev);

        if (ret)
                clear_bit(__LINK_STATE_START, &dev->state);
        else {
                dev->flags |= IFF_UP;
                dev_set_rx_mode(dev);
                dev_activate(dev);
                add_device_randomness(dev->dev_addr, dev->addr_len);
        }

        return ret;
}

/**
 * dev_open - prepare an interface for use.
 * @dev: device to open
 *
 * Takes a device from down to up state. The device's private open
 * function is invoked and then the multicast lists are loaded. Finally
 * the device is moved into the up state and a %NETDEV_UP message is
 * sent to the netdev notifier chain.
 *
 * Calling this function on an active interface is a nop. On a failure
 * a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
        int ret;

        if (dev->flags & IFF_UP)
                return 0;

        ret = __dev_open(dev);
        if (ret < 0)
                return ret;

        rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
        call_netdevice_notifiers(NETDEV_UP, dev);

        return ret;
}
EXPORT_SYMBOL(dev_open);

static int __dev_close_many(struct list_head *head)
{
        struct net_device *dev;

        ASSERT_RTNL();
        might_sleep();

        list_for_each_entry(dev, head, close_list) {
                /* Temporarily disable netpoll until the interface is down */
                netpoll_poll_disable(dev);

                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

                clear_bit(__LINK_STATE_START, &dev->state);

                /* Synchronize to scheduled poll. We cannot touch poll list, it
                 * can be even on different cpu. So just clear netif_running().
                 *
                 * dev->stop() will invoke napi_disable() on all of its
                 * napi_struct instances on this device.
                 */
                smp_mb__after_atomic(); /* Commit netif_running(). */
        }

        dev_deactivate_many(head);

        list_for_each_entry(dev, head, close_list) {
                const struct net_device_ops *ops = dev->netdev_ops;

                /*
                 * Call the device-specific close. This cannot fail.
                 * Only if device is UP
                 *
                 * We allow it to be called even after a DETACH hot-plug
                 * event.
                 */
                if (ops->ndo_stop)
                        ops->ndo_stop(dev);

                dev->flags &= ~IFF_UP;
                netpoll_poll_enable(dev);
        }

        return 0;
}

static int __dev_close(struct net_device *dev)
{
        int retval;
        LIST_HEAD(single);

        list_add(&dev->close_list, &single);
        retval = __dev_close_many(&single);
        list_del(&single);

        return retval;
}

int dev_close_many(struct list_head *head, bool unlink)
{
        struct net_device *dev, *tmp;

        /* Remove the devices that don't need to be closed */
        list_for_each_entry_safe(dev, tmp, head, close_list)
                if (!(dev->flags & IFF_UP))
                        list_del_init(&dev->close_list);

        __dev_close_many(head);

        list_for_each_entry_safe(dev, tmp, head, close_list) {
                rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
                call_netdevice_notifiers(NETDEV_DOWN, dev);
                if (unlink)
                        list_del_init(&dev->close_list);
        }

        return 0;
}
EXPORT_SYMBOL(dev_close_many);

/**
 * dev_close - shutdown an interface.
 * @dev: device to shutdown
 *
 * This function moves an active device into down state. A
 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 * chain.
 */
int dev_close(struct net_device *dev)
{
        if (dev->flags & IFF_UP) {
                LIST_HEAD(single);

                list_add(&dev->close_list, &single);
                dev_close_many(&single, true);
                list_del(&single);
        }
        return 0;
}
EXPORT_SYMBOL(dev_close);


/**
 * dev_disable_lro - disable Large Receive Offload on a device
 * @dev: device
 *
 * Disable Large Receive Offload (LRO) on a net device. Must be
 * called under RTNL. This is needed if received packets may be
 * forwarded to another interface.
 */
void dev_disable_lro(struct net_device *dev)
{
        struct net_device *lower_dev;
        struct list_head *iter;

        dev->wanted_features &= ~NETIF_F_LRO;
        netdev_update_features(dev);

        if (unlikely(dev->features & NETIF_F_LRO))
                netdev_WARN(dev, "failed to disable LRO!\n");

        netdev_for_each_lower_dev(dev, lower_dev, iter)
                dev_disable_lro(lower_dev);
}
EXPORT_SYMBOL(dev_disable_lro);

static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
                                   struct net_device *dev)
{
        struct netdev_notifier_info info;

        netdev_notifier_info_init(&info, dev);
        return nb->notifier_call(nb, val, &info);
}

static int dev_boot_phase = 1;

/**
 * register_netdevice_notifier - register a network notifier block
 * @nb: notifier
 *
 * Register a notifier to be called when network device events occur.
 * The notifier passed is linked into the kernel structures and must
 * not be reused until it has been unregistered. A negative errno code
 * is returned on a failure.
 *
 * When registered, all registration and up events are replayed to the
 * new notifier to allow it to have a race-free view of the network
 * device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
        struct net_device *dev;
        struct net_device *last;
        struct net *net;
        int err;

        rtnl_lock();
        err = raw_notifier_chain_register(&netdev_chain, nb);
        if (err)
                goto unlock;
        if (dev_boot_phase)
                goto unlock;
        for_each_net(net) {
                for_each_netdev(net, dev) {
                        err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
                        err = notifier_to_errno(err);
                        if (err)
                                goto rollback;

                        if (!(dev->flags & IFF_UP))
                                continue;

                        call_netdevice_notifier(nb, NETDEV_UP, dev);
                }
        }

unlock:
        rtnl_unlock();
        return err;

rollback:
        last = dev;
        for_each_net(net) {
                for_each_netdev(net, dev) {
                        if (dev == last)
                                goto outroll;

                        if (dev->flags & IFF_UP) {
                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
                                                        dev);
                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
                        }
                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
                }
        }

outroll:
        raw_notifier_chain_unregister(&netdev_chain, nb);
        goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
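
/*
 * Illustrative sketch (not part of the original file): a minimal notifier
 * that logs NETDEV_UP events. The function and variable names are
 * hypothetical; unregister_netdevice_notifier(&my_netdev_nb) undoes the
 * registration.
 *
 *      static int my_netdev_event(struct notifier_block *nb,
 *                                 unsigned long event, void *ptr)
 *      {
 *              struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *              if (event == NETDEV_UP)
 *                      pr_info("%s is up\n", dev->name);
 *              return NOTIFY_DONE;
 *      }
 *
 *      static struct notifier_block my_netdev_nb = {
 *              .notifier_call = my_netdev_event,
 *      };
 *
 *      register_netdevice_notifier(&my_netdev_nb);
 */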

/**
 * unregister_netdevice_notifier - unregister a network notifier block
 * @nb: notifier
 *
 * Unregister a notifier previously registered by
 * register_netdevice_notifier(). The notifier is unlinked from the
 * kernel structures and may then be reused. A negative errno code
 * is returned on a failure.
 *
 * After unregistering, unregister and down device events are synthesized
 * for all devices on the device list to the removed notifier to remove
 * the need for special case cleanup code.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
        struct net_device *dev;
        struct net *net;
        int err;

        rtnl_lock();
        err = raw_notifier_chain_unregister(&netdev_chain, nb);
        if (err)
                goto unlock;

        for_each_net(net) {
                for_each_netdev(net, dev) {
                        if (dev->flags & IFF_UP) {
                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
                                                        dev);
                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
                        }
                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
                }
        }
unlock:
        rtnl_unlock();
        return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);

/**
 * call_netdevice_notifiers_info - call all network notifier blocks
 * @val: value passed unmodified to notifier function
 * @dev: net_device pointer passed unmodified to notifier function
 * @info: notifier information data
 *
 * Call all network notifier blocks. Parameters and return value
 * are as for raw_notifier_call_chain().
 */

static int call_netdevice_notifiers_info(unsigned long val,
                                         struct net_device *dev,
                                         struct netdev_notifier_info *info)
{
        ASSERT_RTNL();
        netdev_notifier_info_init(info, dev);
        return raw_notifier_call_chain(&netdev_chain, val, info);
}

/**
 * call_netdevice_notifiers - call all network notifier blocks
 * @val: value passed unmodified to notifier function
 * @dev: net_device pointer passed unmodified to notifier function
 *
 * Call all network notifier blocks. Parameters and return value
 * are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
        struct netdev_notifier_info info;

        return call_netdevice_notifiers_info(val, dev, &info);
}
EXPORT_SYMBOL(call_netdevice_notifiers);

#ifdef CONFIG_NET_INGRESS
static struct static_key ingress_needed __read_mostly;

void net_inc_ingress_queue(void)
{
        static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

void net_dec_ingress_queue(void)
{
        static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif

#ifdef CONFIG_NET_EGRESS
static struct static_key egress_needed __read_mostly;

void net_inc_egress_queue(void)
{
        static_key_slow_inc(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

void net_dec_egress_queue(void)
{
        static_key_slow_dec(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif

static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context.
 * If net_disable_timestamp() is called from irq context, defer the
 * static_key_slow_dec() calls.
 */
static atomic_t netstamp_needed_deferred;
#endif

void net_enable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
        int deferred = atomic_xchg(&netstamp_needed_deferred, 0);

        if (deferred) {
                while (--deferred)
                        static_key_slow_dec(&netstamp_needed);
                return;
        }
#endif
        static_key_slow_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);

void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
        if (in_interrupt()) {
                atomic_inc(&netstamp_needed_deferred);
                return;
        }
#endif
        static_key_slow_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);

static inline void net_timestamp_set(struct sk_buff *skb)
{
        skb->tstamp = 0;
        if (static_key_false(&netstamp_needed))
                __net_timestamp(skb);
}

#define net_timestamp_check(COND, SKB)                          \
        if (static_key_false(&netstamp_needed)) {               \
                if ((COND) && !(SKB)->tstamp)                   \
                        __net_timestamp(SKB);                   \
        }

bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
{
        unsigned int len;

        if (!(dev->flags & IFF_UP))
                return false;

        len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
        if (skb->len <= len)
                return true;

        /* if TSO is enabled, we don't care about the length as the packet
         * could be forwarded without being segmented beforehand
         */
        if (skb_is_gso(skb))
                return true;

        return false;
}
EXPORT_SYMBOL_GPL(is_skb_forwardable);

int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
        int ret = ____dev_forward_skb(dev, skb);

        if (likely(!ret)) {
                skb->protocol = eth_type_trans(skb, dev);
                skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
        }

        return ret;
}
EXPORT_SYMBOL_GPL(__dev_forward_skb);

/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *      NET_RX_SUCCESS  (no congestion)
 *      NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
        return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
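
/*
 * Illustrative sketch (not part of the original file): a driver's
 * ndo_start_xmit looping a frame into a peer device, as veth-style
 * drivers do. The peer lookup helper my_get_peer() is hypothetical.
 *
 *      static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
 *      {
 *              struct net_device *peer = my_get_peer(dev);
 *
 *              // dev_forward_skb() consumes skb on both success and drop
 *              if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
 *                      dev->stats.tx_dropped++;
 *              return NETDEV_TX_OK;
 *      }
 */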

static inline int deliver_skb(struct sk_buff *skb,
                              struct packet_type *pt_prev,
                              struct net_device *orig_dev)
{
        if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
                return -ENOMEM;
        atomic_inc(&skb->users);
        return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}

static inline void deliver_ptype_list_skb(struct sk_buff *skb,
                                          struct packet_type **pt,
                                          struct net_device *orig_dev,
                                          __be16 type,
                                          struct list_head *ptype_list)
{
        struct packet_type *ptype, *pt_prev = *pt;

        list_for_each_entry_rcu(ptype, ptype_list, list) {
                if (ptype->type != type)
                        continue;
                if (pt_prev)
                        deliver_skb(skb, pt_prev, orig_dev);
                pt_prev = ptype;
        }
        *pt = pt_prev;
}

static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
{
        if (!ptype->af_packet_priv || !skb->sk)
                return false;

        if (ptype->id_match)
                return ptype->id_match(ptype, skb->sk);
        else if ((struct sock *)ptype->af_packet_priv == skb->sk)
                return true;

        return false;
}

/*
 * Support routine. Sends outgoing frames to any network
 * taps currently in use.
 */

void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
        struct packet_type *ptype;
        struct sk_buff *skb2 = NULL;
        struct packet_type *pt_prev = NULL;
        struct list_head *ptype_list = &ptype_all;

        rcu_read_lock();
again:
        list_for_each_entry_rcu(ptype, ptype_list, list) {
                /* Never send packets back to the socket
                 * they originated from - MvS (miquels@drinkel.ow.org)
                 */
                if (skb_loop_sk(ptype, skb))
                        continue;

                if (pt_prev) {
                        deliver_skb(skb2, pt_prev, skb->dev);
                        pt_prev = ptype;
                        continue;
                }

                /* need to clone skb, done only once */
                skb2 = skb_clone(skb, GFP_ATOMIC);
                if (!skb2)
                        goto out_unlock;

                net_timestamp_set(skb2);

                /* skb->nh should be correctly
                 * set by sender, so that the second statement is
                 * just protection against buggy protocols.
                 */
                skb_reset_mac_header(skb2);

                if (skb_network_header(skb2) < skb2->data ||
                    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
                        net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
                                             ntohs(skb2->protocol),
                                             dev->name);
                        skb_reset_network_header(skb2);
                }

                skb2->transport_header = skb2->network_header;
                skb2->pkt_type = PACKET_OUTGOING;
                pt_prev = ptype;
        }

        if (ptype_list == &ptype_all) {
                ptype_list = &dev->ptype_all;
                goto again;
        }
out_unlock:
        if (pt_prev)
                pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);

/**
 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 * @dev: Network device
 * @txq: number of queues available
 *
 * If real_num_tx_queues is changed the tc mappings may no longer be
 * valid. To resolve this verify the tc mapping remains valid and if
 * not, null the mapping. With no priorities mapping to this
 * offset/count pair it will no longer be used. In the worst case, if
 * TC0 is invalid nothing can be done, so disable priority mappings.
 * It is expected that drivers will fix this mapping if they can before
 * calling netif_set_real_num_tx_queues.
 */
static void netif_setup_tc(struct net_device *dev, unsigned int txq)
{
        int i;
        struct netdev_tc_txq *tc = &dev->tc_to_txq[0];

        /* If TC0 is invalidated disable TC mapping */
        if (tc->offset + tc->count > txq) {
                pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
                dev->num_tc = 0;
                return;
        }

        /* Invalidated prio to tc mappings set to TC0 */
        for (i = 1; i < TC_BITMASK + 1; i++) {
                int q = netdev_get_prio_tc_map(dev, i);

                tc = &dev->tc_to_txq[q];
                if (tc->offset + tc->count > txq) {
                        pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
                                i, q);
                        netdev_set_prio_tc_map(dev, i, 0);
                }
        }
}

1946int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
1947{
1948 if (dev->num_tc) {
1949 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1950 int i;
1951
1952 for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
1953 if ((txq - tc->offset) < tc->count)
1954 return i;
1955 }
1956
1957 return -1;
1958 }
1959
1960 return 0;
1961}
1962
1963#ifdef CONFIG_XPS
1964static DEFINE_MUTEX(xps_map_mutex);
1965#define xmap_dereference(P) \
1966 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1967
1968static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
1969 int tci, u16 index)
1970{
1971 struct xps_map *map = NULL;
1972 int pos;
1973
1974 if (dev_maps)
1975 map = xmap_dereference(dev_maps->cpu_map[tci]);
1976 if (!map)
1977 return false;
1978
1979 for (pos = map->len; pos--;) {
1980 if (map->queues[pos] != index)
1981 continue;
1982
1983 if (map->len > 1) {
1984 map->queues[pos] = map->queues[--map->len];
1985 break;
1986 }
1987
1988 RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
1989 kfree_rcu(map, rcu);
1990 return false;
1991 }
1992
1993 return true;
1994}
1995
1996static bool remove_xps_queue_cpu(struct net_device *dev,
1997 struct xps_dev_maps *dev_maps,
1998 int cpu, u16 offset, u16 count)
1999{
2000 int num_tc = dev->num_tc ? : 1;
2001 bool active = false;
2002 int tci;
2003
2004 for (tci = cpu * num_tc; num_tc--; tci++) {
2005 int i, j;
2006
2007 for (i = count, j = offset; i--; j++) {
2008 if (!remove_xps_queue(dev_maps, cpu, j))
2009 break;
2010 }
2011
2012 active |= i < 0;
2013 }
2014
2015 return active;
2016}
2017
2018static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2019 u16 count)
2020{
2021 struct xps_dev_maps *dev_maps;
2022 int cpu, i;
2023 bool active = false;
2024
2025 mutex_lock(&xps_map_mutex);
2026 dev_maps = xmap_dereference(dev->xps_maps);
2027
2028 if (!dev_maps)
2029 goto out_no_maps;
2030
2031 for_each_possible_cpu(cpu)
2032 active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
2033 offset, count);
2034
2035 if (!active) {
2036 RCU_INIT_POINTER(dev->xps_maps, NULL);
2037 kfree_rcu(dev_maps, rcu);
2038 }
2039
2040 for (i = offset + (count - 1); count--; i--)
2041 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2042 NUMA_NO_NODE);
2043
2044out_no_maps:
2045 mutex_unlock(&xps_map_mutex);
2046}
2047
2048static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2049{
2050 netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2051}
2052
2053static struct xps_map *expand_xps_map(struct xps_map *map,
2054 int cpu, u16 index)
2055{
2056 struct xps_map *new_map;
2057 int alloc_len = XPS_MIN_MAP_ALLOC;
2058 int i, pos;
2059
2060 for (pos = 0; map && pos < map->len; pos++) {
2061 if (map->queues[pos] != index)
2062 continue;
2063 return map;
2064 }
2065
2066 /* Need to add queue to this CPU's existing map */
2067 if (map) {
2068 if (pos < map->alloc_len)
2069 return map;
2070
2071 alloc_len = map->alloc_len * 2;
2072 }
2073
2074 /* Need to allocate a new map to store the queue for this CPU */
2075 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2076 cpu_to_node(cpu));
2077 if (!new_map)
2078 return NULL;
2079
2080 for (i = 0; i < pos; i++)
2081 new_map->queues[i] = map->queues[i];
2082 new_map->alloc_len = alloc_len;
2083 new_map->len = pos;
2084
2085 return new_map;
2086}
2087
2088int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2089 u16 index)
2090{
2091 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2092 int i, cpu, tci, numa_node_id = -2;
2093 int maps_sz, num_tc = 1, tc = 0;
2094 struct xps_map *map, *new_map;
2095 bool active = false;
2096
2097 if (dev->num_tc) {
2098 num_tc = dev->num_tc;
2099 tc = netdev_txq_to_tc(dev, index);
2100 if (tc < 0)
2101 return -EINVAL;
2102 }
2103
2104 maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
2105 if (maps_sz < L1_CACHE_BYTES)
2106 maps_sz = L1_CACHE_BYTES;
2107
2108 mutex_lock(&xps_map_mutex);
2109
2110 dev_maps = xmap_dereference(dev->xps_maps);
2111
2112 /* allocate memory for queue storage */
2113 for_each_cpu_and(cpu, cpu_online_mask, mask) {
2114 if (!new_dev_maps)
2115 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2116 if (!new_dev_maps) {
2117 mutex_unlock(&xps_map_mutex);
2118 return -ENOMEM;
2119 }
2120
2121 tci = cpu * num_tc + tc;
2122 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
2123 NULL;
2124
2125 map = expand_xps_map(map, cpu, index);
2126 if (!map)
2127 goto error;
2128
2129 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2130 }
2131
2132 if (!new_dev_maps)
2133 goto out_no_new_maps;
2134
2135 for_each_possible_cpu(cpu) {
2136 /* copy maps belonging to foreign traffic classes */
2137 for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
2138 /* fill in the new device map from the old device map */
2139 map = xmap_dereference(dev_maps->cpu_map[tci]);
2140 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2141 }
2142
2143 /* We need to explicitly update tci as the previous loop
2144 * could break out early if dev_maps is NULL.
2145 */
2146 tci = cpu * num_tc + tc;
2147
2148 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2149 /* add queue to CPU maps */
2150 int pos = 0;
2151
2152 map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2153 while ((pos < map->len) && (map->queues[pos] != index))
2154 pos++;
2155
2156 if (pos == map->len)
2157 map->queues[map->len++] = index;
2158#ifdef CONFIG_NUMA
2159 if (numa_node_id == -2)
2160 numa_node_id = cpu_to_node(cpu);
2161 else if (numa_node_id != cpu_to_node(cpu))
2162 numa_node_id = -1;
2163#endif
2164 } else if (dev_maps) {
2165 /* fill in the new device map from the old device map */
2166 map = xmap_dereference(dev_maps->cpu_map[tci]);
2167 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2168 }
2169
2170 /* copy maps belonging to foreign traffic classes */
2171 for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
2172 /* fill in the new device map from the old device map */
2173 map = xmap_dereference(dev_maps->cpu_map[tci]);
2174 RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2175 }
2176 }
2177
2178 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2179
2180 /* Cleanup old maps */
2181 if (!dev_maps)
2182 goto out_no_old_maps;
2183
2184 for_each_possible_cpu(cpu) {
2185 for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2186 new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2187 map = xmap_dereference(dev_maps->cpu_map[tci]);
2188 if (map && map != new_map)
2189 kfree_rcu(map, rcu);
2190 }
2191 }
2192
2193 kfree_rcu(dev_maps, rcu);
2194
2195out_no_old_maps:
2196 dev_maps = new_dev_maps;
2197 active = true;
2198
2199out_no_new_maps:
2200 /* update Tx queue numa node */
2201 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2202 (numa_node_id >= 0) ? numa_node_id :
2203 NUMA_NO_NODE);
2204
2205 if (!dev_maps)
2206 goto out_no_maps;
2207
2208 /* removes queue from unused CPUs */
2209 for_each_possible_cpu(cpu) {
2210 for (i = tc, tci = cpu * num_tc; i--; tci++)
2211 active |= remove_xps_queue(dev_maps, tci, index);
2212 if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
2213 active |= remove_xps_queue(dev_maps, tci, index);
2214 for (i = num_tc - tc, tci++; --i; tci++)
2215 active |= remove_xps_queue(dev_maps, tci, index);
2216 }
2217
2218 /* free map if not active */
2219 if (!active) {
2220 RCU_INIT_POINTER(dev->xps_maps, NULL);
2221 kfree_rcu(dev_maps, rcu);
2222 }
2223
2224out_no_maps:
2225 mutex_unlock(&xps_map_mutex);
2226
2227 return 0;
2228error:
2229 /* remove any maps that we added */
2230 for_each_possible_cpu(cpu) {
2231 for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2232 new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2233 map = dev_maps ?
2234 xmap_dereference(dev_maps->cpu_map[tci]) :
2235 NULL;
2236 if (new_map && new_map != map)
2237 kfree(new_map);
2238 }
2239 }
2240
2241 mutex_unlock(&xps_map_mutex);
2242
2243 kfree(new_dev_maps);
2244 return -ENOMEM;
2245}
2246EXPORT_SYMBOL(netif_set_xps_queue);
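/*
 * Hypothetical driver snippet (an assumption, not dev.c code): pin each Tx
 * queue to one CPU via XPS so traffic generated on CPU i prefers queue i.
 * netif_set_xps_queue() builds the per-CPU maps manipulated above.
 */
static void example_setup_xps(struct net_device *dev)
{
	int cpu;

	for_each_online_cpu(cpu) {
		if (cpu >= (int)dev->real_num_tx_queues)
			break;
		netif_set_xps_queue(dev, cpumask_of(cpu), cpu);
	}
}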
2247
2248#endif
2249void netdev_reset_tc(struct net_device *dev)
2250{
2251#ifdef CONFIG_XPS
2252 netif_reset_xps_queues_gt(dev, 0);
2253#endif
2254 dev->num_tc = 0;
2255 memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2256 memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2257}
2258EXPORT_SYMBOL(netdev_reset_tc);
2259
2260int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2261{
2262 if (tc >= dev->num_tc)
2263 return -EINVAL;
2264
2265#ifdef CONFIG_XPS
2266 netif_reset_xps_queues(dev, offset, count);
2267#endif
2268 dev->tc_to_txq[tc].count = count;
2269 dev->tc_to_txq[tc].offset = offset;
2270 return 0;
2271}
2272EXPORT_SYMBOL(netdev_set_tc_queue);
2273
2274int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2275{
2276 if (num_tc > TC_MAX_QUEUE)
2277 return -EINVAL;
2278
2279#ifdef CONFIG_XPS
2280 netif_reset_xps_queues_gt(dev, 0);
2281#endif
2282 dev->num_tc = num_tc;
2283 return 0;
2284}
2285EXPORT_SYMBOL(netdev_set_num_tc);
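/*
 * Hypothetical example of the tc helpers above (assumed queue layout): two
 * traffic classes over four Tx queues, priorities 0-3 on TC0 and 4-7 on
 * TC1, roughly what an mqprio-style offload would program.
 */
static int example_setup_two_tcs(struct net_device *dev)
{
	int prio, err;

	err = netdev_set_num_tc(dev, 2);
	if (err)
		return err;
	netdev_set_tc_queue(dev, 0, 2, 0);	/* TC0 -> queues 0-1 */
	netdev_set_tc_queue(dev, 1, 2, 2);	/* TC1 -> queues 2-3 */
	for (prio = 0; prio <= TC_BITMASK; prio++)
		netdev_set_prio_tc_map(dev, prio, prio < 4 ? 0 : 1);
	return 0;
}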
2286
2287/*
2288 * Routine to help set real_num_tx_queues. Since skbs can be mapped to queues
2289 * greater than real_num_tx_queues, stale skbs on those qdiscs must be flushed.
2290 */
2291int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2292{
2293 int rc;
2294
2295 if (txq < 1 || txq > dev->num_tx_queues)
2296 return -EINVAL;
2297
2298 if (dev->reg_state == NETREG_REGISTERED ||
2299 dev->reg_state == NETREG_UNREGISTERING) {
2300 ASSERT_RTNL();
2301
2302 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2303 txq);
2304 if (rc)
2305 return rc;
2306
2307 if (dev->num_tc)
2308 netif_setup_tc(dev, txq);
2309
2310 if (txq < dev->real_num_tx_queues) {
2311 qdisc_reset_all_tx_gt(dev, txq);
2312#ifdef CONFIG_XPS
2313 netif_reset_xps_queues_gt(dev, txq);
2314#endif
2315 }
2316 }
2317
2318 dev->real_num_tx_queues = txq;
2319 return 0;
2320}
2321EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2322
2323#ifdef CONFIG_SYSFS
2324/**
2325 * netif_set_real_num_rx_queues - set actual number of RX queues used
2326 * @dev: Network device
2327 * @rxq: Actual number of RX queues
2328 *
2329 * This must be called either with the rtnl_lock held or before
2330 * registration of the net device. Returns 0 on success, or a
2331 * negative error code. If called before registration, it always
2332 * succeeds.
2333 */
2334int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2335{
2336 int rc;
2337
2338 if (rxq < 1 || rxq > dev->num_rx_queues)
2339 return -EINVAL;
2340
2341 if (dev->reg_state == NETREG_REGISTERED) {
2342 ASSERT_RTNL();
2343
2344 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2345 rxq);
2346 if (rc)
2347 return rc;
2348 }
2349
2350 dev->real_num_rx_queues = rxq;
2351 return 0;
2352}
2353EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2354#endif
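/*
 * Hypothetical probe-time sketch (assumed 8 hardware queue pairs): size the
 * active queue set before register_netdev(), bounded by the RSS default
 * helper defined below.  Called before registration, both setters succeed
 * apart from the range checks.
 */
static int example_size_queues(struct net_device *dev)
{
	unsigned int n = min_t(unsigned int, 8,
			       netif_get_num_default_rss_queues());
	int err;

	err = netif_set_real_num_tx_queues(dev, n);
	if (err)
		return err;
	return netif_set_real_num_rx_queues(dev, n);
}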
2355
2356/**
2357 * netif_get_num_default_rss_queues - default number of RSS queues
2358 *
2359 * This routine should set an upper limit on the number of RSS queues
2360 * used by default by multiqueue devices.
2361 */
2362int netif_get_num_default_rss_queues(void)
2363{
2364 return is_kdump_kernel() ?
2365 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2366}
2367EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2368
2369static void __netif_reschedule(struct Qdisc *q)
2370{
2371 struct softnet_data *sd;
2372 unsigned long flags;
2373
2374 local_irq_save(flags);
2375 sd = this_cpu_ptr(&softnet_data);
2376 q->next_sched = NULL;
2377 *sd->output_queue_tailp = q;
2378 sd->output_queue_tailp = &q->next_sched;
2379 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2380 local_irq_restore(flags);
2381}
2382
2383void __netif_schedule(struct Qdisc *q)
2384{
2385 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2386 __netif_reschedule(q);
2387}
2388EXPORT_SYMBOL(__netif_schedule);
2389
2390struct dev_kfree_skb_cb {
2391 enum skb_free_reason reason;
2392};
2393
2394static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2395{
2396 return (struct dev_kfree_skb_cb *)skb->cb;
2397}
2398
2399void netif_schedule_queue(struct netdev_queue *txq)
2400{
2401 rcu_read_lock();
2402 if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2403 struct Qdisc *q = rcu_dereference(txq->qdisc);
2404
2405 __netif_schedule(q);
2406 }
2407 rcu_read_unlock();
2408}
2409EXPORT_SYMBOL(netif_schedule_queue);
2410
2411/**
2412 * netif_wake_subqueue - allow sending packets on subqueue
2413 * @dev: network device
2414 * @queue_index: sub queue index
2415 *
2416 * Resume individual transmit queue of a device with multiple transmit queues.
2417 */
2418void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2419{
2420 struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2421
2422 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2423 struct Qdisc *q;
2424
2425 rcu_read_lock();
2426 q = rcu_dereference(txq->qdisc);
2427 __netif_schedule(q);
2428 rcu_read_unlock();
2429 }
2430}
2431EXPORT_SYMBOL(netif_wake_subqueue);
2432
2433void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2434{
2435 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2436 struct Qdisc *q;
2437
2438 rcu_read_lock();
2439 q = rcu_dereference(dev_queue->qdisc);
2440 __netif_schedule(q);
2441 rcu_read_unlock();
2442 }
2443}
2444EXPORT_SYMBOL(netif_tx_wake_queue);
2445
2446void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2447{
2448 unsigned long flags;
2449
2450 if (likely(atomic_read(&skb->users) == 1)) {
2451 smp_rmb();
2452 atomic_set(&skb->users, 0);
2453 } else if (likely(!atomic_dec_and_test(&skb->users))) {
2454 return;
2455 }
2456 get_kfree_skb_cb(skb)->reason = reason;
2457 local_irq_save(flags);
2458 skb->next = __this_cpu_read(softnet_data.completion_queue);
2459 __this_cpu_write(softnet_data.completion_queue, skb);
2460 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2461 local_irq_restore(flags);
2462}
2463EXPORT_SYMBOL(__dev_kfree_skb_irq);
2464
2465void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2466{
2467 if (in_irq() || irqs_disabled())
2468 __dev_kfree_skb_irq(skb, reason);
2469 else
2470 dev_kfree_skb(skb);
2471}
2472EXPORT_SYMBOL(__dev_kfree_skb_any);
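/*
 * Hypothetical Tx-completion fragment (driver context assumed): interrupt
 * handlers cannot use plain kfree_skb(), so completed buffers go through
 * the _any variants above, which defer the free via the completion queue
 * when IRQs are off.
 */
static void example_tx_complete(struct sk_buff *skb, bool ok)
{
	if (ok)
		dev_consume_skb_any(skb);	/* transmitted: not a drop */
	else
		dev_kfree_skb_any(skb);		/* failed: traced as a drop */
}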
2473
2474
2475/**
2476 * netif_device_detach - mark device as removed
2477 * @dev: network device
2478 *
2479 * Mark device as removed from system and therefore no longer available.
2480 */
2481void netif_device_detach(struct net_device *dev)
2482{
2483 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2484 netif_running(dev)) {
2485 netif_tx_stop_all_queues(dev);
2486 }
2487}
2488EXPORT_SYMBOL(netif_device_detach);
2489
2490/**
2491 * netif_device_attach - mark device as attached
2492 * @dev: network device
2493 *
2494 * Mark device as attached to the system and restart it if needed.
2495 */
2496void netif_device_attach(struct net_device *dev)
2497{
2498 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2499 netif_running(dev)) {
2500 netif_tx_wake_all_queues(dev);
2501 __netdev_watchdog_up(dev);
2502 }
2503}
2504EXPORT_SYMBOL(netif_device_attach);
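/*
 * Hypothetical power-management pairing (an assumption, not from this
 * file): suspend detaches the device so the stack stops its Tx queues,
 * and resume re-attaches it, restarting the queues and the watchdog.
 */
static int example_suspend(struct net_device *dev)
{
	netif_device_detach(dev);
	/* ... quiesce hardware, save state ... */
	return 0;
}

static int example_resume(struct net_device *dev)
{
	/* ... restore hardware state ... */
	netif_device_attach(dev);
	return 0;
}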
2505
2506/*
2507 * Returns a Tx hash based on the given packet descriptor and the number
2508 * of Tx queues to be used as a distribution range.
2509 */
2510u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2511 unsigned int num_tx_queues)
2512{
2513 u32 hash;
2514 u16 qoffset = 0;
2515 u16 qcount = num_tx_queues;
2516
2517 if (skb_rx_queue_recorded(skb)) {
2518 hash = skb_get_rx_queue(skb);
2519 while (unlikely(hash >= num_tx_queues))
2520 hash -= num_tx_queues;
2521 return hash;
2522 }
2523
2524 if (dev->num_tc) {
2525 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2526 qoffset = dev->tc_to_txq[tc].offset;
2527 qcount = dev->tc_to_txq[tc].count;
2528 }
2529
2530 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2531}
2532EXPORT_SYMBOL(__skb_tx_hash);
2533
2534static void skb_warn_bad_offload(const struct sk_buff *skb)
2535{
2536 static const netdev_features_t null_features;
2537 struct net_device *dev = skb->dev;
2538 const char *name = "";
2539
2540 if (!net_ratelimit())
2541 return;
2542
2543 if (dev) {
2544 if (dev->dev.parent)
2545 name = dev_driver_string(dev->dev.parent);
2546 else
2547 name = netdev_name(dev);
2548 }
2549 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2550 "gso_type=%d ip_summed=%d\n",
2551 name, dev ? &dev->features : &null_features,
2552 skb->sk ? &skb->sk->sk_route_caps : &null_features,
2553 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2554 skb_shinfo(skb)->gso_type, skb->ip_summed);
2555}
2556
2557/*
2558 * Invalidate hardware checksum when packet is to be mangled, and
2559 * complete checksum manually on outgoing path.
2560 */
2561int skb_checksum_help(struct sk_buff *skb)
2562{
2563 __wsum csum;
2564 int ret = 0, offset;
2565
2566 if (skb->ip_summed == CHECKSUM_COMPLETE)
2567 goto out_set_summed;
2568
2569 if (unlikely(skb_shinfo(skb)->gso_size)) {
2570 skb_warn_bad_offload(skb);
2571 return -EINVAL;
2572 }
2573
2574 /* Before computing a checksum, we should make sure no frag could
2575 * be modified by an external entity: the checksum could be wrong.
2576 */
2577 if (skb_has_shared_frag(skb)) {
2578 ret = __skb_linearize(skb);
2579 if (ret)
2580 goto out;
2581 }
2582
2583 offset = skb_checksum_start_offset(skb);
2584 BUG_ON(offset >= skb_headlen(skb));
2585 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2586
2587 offset += skb->csum_offset;
2588 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2589
2590 if (skb_cloned(skb) &&
2591 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2592 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2593 if (ret)
2594 goto out;
2595 }
2596
2597 *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2598out_set_summed:
2599 skb->ip_summed = CHECKSUM_NONE;
2600out:
2601 return ret;
2602}
2603EXPORT_SYMBOL(skb_checksum_help);
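/*
 * Hypothetical ndo_start_xmit fragment (hardware limits assumed): when the
 * device cannot checksum a CHECKSUM_PARTIAL skb, fall back to
 * skb_checksum_help() above before handing the frame to the hardware.
 */
static int example_tx_csum(struct sk_buff *skb, bool hw_can_csum)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL && !hw_can_csum)
		return skb_checksum_help(skb);	/* 0 on success */
	return 0;
}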
2604
2605__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2606{
2607 __be16 type = skb->protocol;
2608
2609 /* Tunnel gso handlers can set protocol to ethernet. */
2610 if (type == htons(ETH_P_TEB)) {
2611 struct ethhdr *eth;
2612
2613 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2614 return 0;
2615
2616 eth = (struct ethhdr *)skb_mac_header(skb);
2617 type = eth->h_proto;
2618 }
2619
2620 return __vlan_get_protocol(skb, type, depth);
2621}
2622
2623/**
2624 * skb_mac_gso_segment - mac layer segmentation handler.
2625 * @skb: buffer to segment
2626 * @features: features for the output path (see dev->features)
2627 */
2628struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2629 netdev_features_t features)
2630{
2631 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2632 struct packet_offload *ptype;
2633 int vlan_depth = skb->mac_len;
2634 __be16 type = skb_network_protocol(skb, &vlan_depth);
2635
2636 if (unlikely(!type))
2637 return ERR_PTR(-EINVAL);
2638
2639 __skb_pull(skb, vlan_depth);
2640
2641 rcu_read_lock();
2642 list_for_each_entry_rcu(ptype, &offload_base, list) {
2643 if (ptype->type == type && ptype->callbacks.gso_segment) {
2644 segs = ptype->callbacks.gso_segment(skb, features);
2645 break;
2646 }
2647 }
2648 rcu_read_unlock();
2649
2650 __skb_push(skb, skb->data - skb_mac_header(skb));
2651
2652 return segs;
2653}
2654EXPORT_SYMBOL(skb_mac_gso_segment);
2655
2656
2657/* Open vSwitch calls this on the rx path, so we need a different check.
2658 */
2659static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2660{
2661 if (tx_path)
2662 return skb->ip_summed != CHECKSUM_PARTIAL;
2663 else
2664 return skb->ip_summed == CHECKSUM_NONE;
2665}
2666
2667/**
2668 * __skb_gso_segment - Perform segmentation on skb.
2669 * @skb: buffer to segment
2670 * @features: features for the output path (see dev->features)
2671 * @tx_path: whether it is called in TX path
2672 *
2673 * This function segments the given skb and returns a list of segments.
2674 *
2675 * It may return NULL if the skb requires no segmentation. This is
2676 * only possible when GSO is used for verifying header integrity.
2677 *
2678 * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2679 */
2680struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2681 netdev_features_t features, bool tx_path)
2682{
2683 if (unlikely(skb_needs_check(skb, tx_path))) {
2684 int err;
2685
2686 skb_warn_bad_offload(skb);
2687
2688 err = skb_cow_head(skb, 0);
2689 if (err < 0)
2690 return ERR_PTR(err);
2691 }
2692
2693 /* Only report GSO partial support if it will enable us to
2694 * support segmentation on this frame without needing additional
2695 * work.
2696 */
2697 if (features & NETIF_F_GSO_PARTIAL) {
2698 netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2699 struct net_device *dev = skb->dev;
2700
2701 partial_features |= dev->features & dev->gso_partial_features;
2702 if (!skb_gso_ok(skb, features | partial_features))
2703 features &= ~NETIF_F_GSO_PARTIAL;
2704 }
2705
2706 BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2707 sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2708
2709 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2710 SKB_GSO_CB(skb)->encap_level = 0;
2711
2712 skb_reset_mac_header(skb);
2713 skb_reset_mac_len(skb);
2714
2715 return skb_mac_gso_segment(skb, features);
2716}
2717EXPORT_SYMBOL(__skb_gso_segment);
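/*
 * Hypothetical software-GSO fallback (helper names assumed): strip the GSO
 * feature bits so skb_gso_segment() fully segments the skb, then transmit
 * each segment individually.  A NULL return means no segmentation was
 * needed, per the kernel-doc above.
 */
static int example_sw_gso(struct sk_buff *skb, struct net_device *dev,
			  int (*xmit_seg)(struct sk_buff *, struct net_device *))
{
	netdev_features_t features = netif_skb_features(skb) & ~NETIF_F_GSO_MASK;
	struct sk_buff *segs, *next;

	segs = skb_gso_segment(skb, features);
	if (IS_ERR(segs))
		return PTR_ERR(segs);
	if (!segs)
		return xmit_seg(skb, dev);

	consume_skb(skb);
	while (segs) {
		next = segs->next;
		segs->next = NULL;
		xmit_seg(segs, dev);
		segs = next;
	}
	return 0;
}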
2718
2719/* Take action when hardware reception checksum errors are detected. */
2720#ifdef CONFIG_BUG
2721void netdev_rx_csum_fault(struct net_device *dev)
2722{
2723 if (net_ratelimit()) {
2724 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2725 dump_stack();
2726 }
2727}
2728EXPORT_SYMBOL(netdev_rx_csum_fault);
2729#endif
2730
2731/* Actually, we should eliminate this check as soon as we know that:
2732 * 1. An IOMMU is present and can map all the memory.
2733 * 2. No high memory really exists on this machine.
2734 */
2735
2736static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2737{
2738#ifdef CONFIG_HIGHMEM
2739 int i;
2740 if (!(dev->features & NETIF_F_HIGHDMA)) {
2741 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2742 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2743 if (PageHighMem(skb_frag_page(frag)))
2744 return 1;
2745 }
2746 }
2747
2748 if (PCI_DMA_BUS_IS_PHYS) {
2749 struct device *pdev = dev->dev.parent;
2750
2751 if (!pdev)
2752 return 0;
2753 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2754 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2755 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2756 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2757 return 1;
2758 }
2759 }
2760#endif
2761 return 0;
2762}
2763
2764/* If MPLS offload request, verify we are testing hardware MPLS features
2765 * instead of standard features for the netdev.
2766 */
2767#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2768static netdev_features_t net_mpls_features(struct sk_buff *skb,
2769 netdev_features_t features,
2770 __be16 type)
2771{
2772 if (eth_p_mpls(type))
2773 features &= skb->dev->mpls_features;
2774
2775 return features;
2776}
2777#else
2778static netdev_features_t net_mpls_features(struct sk_buff *skb,
2779 netdev_features_t features,
2780 __be16 type)
2781{
2782 return features;
2783}
2784#endif
2785
2786static netdev_features_t harmonize_features(struct sk_buff *skb,
2787 netdev_features_t features)
2788{
2789 int tmp;
2790 __be16 type;
2791
2792 type = skb_network_protocol(skb, &tmp);
2793 features = net_mpls_features(skb, features, type);
2794
2795 if (skb->ip_summed != CHECKSUM_NONE &&
2796 !can_checksum_protocol(features, type)) {
2797 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2798 }
2799 if (illegal_highdma(skb->dev, skb))
2800 features &= ~NETIF_F_SG;
2801
2802 return features;
2803}
2804
2805netdev_features_t passthru_features_check(struct sk_buff *skb,
2806 struct net_device *dev,
2807 netdev_features_t features)
2808{
2809 return features;
2810}
2811EXPORT_SYMBOL(passthru_features_check);
2812
2813static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2814 struct net_device *dev,
2815 netdev_features_t features)
2816{
2817 return vlan_features_check(skb, features);
2818}
2819
2820static netdev_features_t gso_features_check(const struct sk_buff *skb,
2821 struct net_device *dev,
2822 netdev_features_t features)
2823{
2824 u16 gso_segs = skb_shinfo(skb)->gso_segs;
2825
2826 if (gso_segs > dev->gso_max_segs)
2827 return features & ~NETIF_F_GSO_MASK;
2828
2829 /* Support for GSO partial features requires software
2830 * intervention before we can actually process the packets,
2831 * so strip support for any partial features now; they can
2832 * be pulled back in after the frame has been partially
2833 * segmented.
2834 */
2835 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2836 features &= ~dev->gso_partial_features;
2837
2838 /* Make sure to clear the IPv4 ID mangling feature if the
2839 * IPv4 header has the potential to be fragmented.
2840 */
2841 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2842 struct iphdr *iph = skb->encapsulation ?
2843 inner_ip_hdr(skb) : ip_hdr(skb);
2844
2845 if (!(iph->frag_off & htons(IP_DF)))
2846 features &= ~NETIF_F_TSO_MANGLEID;
2847 }
2848
2849 return features;
2850}
2851
2852netdev_features_t netif_skb_features(struct sk_buff *skb)
2853{
2854 struct net_device *dev = skb->dev;
2855 netdev_features_t features = dev->features;
2856
2857 if (skb_is_gso(skb))
2858 features = gso_features_check(skb, dev, features);
2859
2860 /* If encapsulation offload request, verify we are testing
2861 * hardware encapsulation features instead of standard
2862 * features for the netdev
2863 */
2864 if (skb->encapsulation)
2865 features &= dev->hw_enc_features;
2866
2867 if (skb_vlan_tagged(skb))
2868 features = netdev_intersect_features(features,
2869 dev->vlan_features |
2870 NETIF_F_HW_VLAN_CTAG_TX |
2871 NETIF_F_HW_VLAN_STAG_TX);
2872
2873 if (dev->netdev_ops->ndo_features_check)
2874 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2875 features);
2876 else
2877 features &= dflt_features_check(skb, dev, features);
2878
2879 return harmonize_features(skb, features);
2880}
2881EXPORT_SYMBOL(netif_skb_features);
2882
2883static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2884 struct netdev_queue *txq, bool more)
2885{
2886 unsigned int len;
2887 int rc;
2888
2889 if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2890 dev_queue_xmit_nit(skb, dev);
2891
2892 len = skb->len;
2893 trace_net_dev_start_xmit(skb, dev);
2894 rc = netdev_start_xmit(skb, dev, txq, more);
2895 trace_net_dev_xmit(skb, rc, dev, len);
2896
2897 return rc;
2898}
2899
2900struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2901 struct netdev_queue *txq, int *ret)
2902{
2903 struct sk_buff *skb = first;
2904 int rc = NETDEV_TX_OK;
2905
2906 while (skb) {
2907 struct sk_buff *next = skb->next;
2908
2909 skb->next = NULL;
2910 rc = xmit_one(skb, dev, txq, next != NULL);
2911 if (unlikely(!dev_xmit_complete(rc))) {
2912 skb->next = next;
2913 goto out;
2914 }
2915
2916 skb = next;
2917 if (netif_xmit_stopped(txq) && skb) {
2918 rc = NETDEV_TX_BUSY;
2919 break;
2920 }
2921 }
2922
2923out:
2924 *ret = rc;
2925 return skb;
2926}
2927
2928static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2929 netdev_features_t features)
2930{
2931 if (skb_vlan_tag_present(skb) &&
2932 !vlan_hw_offload_capable(features, skb->vlan_proto))
2933 skb = __vlan_hwaccel_push_inside(skb);
2934 return skb;
2935}
2936
2937static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2938{
2939 netdev_features_t features;
2940
2941 features = netif_skb_features(skb);
2942 skb = validate_xmit_vlan(skb, features);
2943 if (unlikely(!skb))
2944 goto out_null;
2945
2946 if (netif_needs_gso(skb, features)) {
2947 struct sk_buff *segs;
2948
2949 segs = skb_gso_segment(skb, features);
2950 if (IS_ERR(segs)) {
2951 goto out_kfree_skb;
2952 } else if (segs) {
2953 consume_skb(skb);
2954 skb = segs;
2955 }
2956 } else {
2957 if (skb_needs_linearize(skb, features) &&
2958 __skb_linearize(skb))
2959 goto out_kfree_skb;
2960
2961 /* If packet is not checksummed and device does not
2962 * support checksumming for this protocol, complete
2963 * checksumming here.
2964 */
2965 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2966 if (skb->encapsulation)
2967 skb_set_inner_transport_header(skb,
2968 skb_checksum_start_offset(skb));
2969 else
2970 skb_set_transport_header(skb,
2971 skb_checksum_start_offset(skb));
2972 if (!(features & NETIF_F_CSUM_MASK) &&
2973 skb_checksum_help(skb))
2974 goto out_kfree_skb;
2975 }
2976 }
2977
2978 return skb;
2979
2980out_kfree_skb:
2981 kfree_skb(skb);
2982out_null:
2983 atomic_long_inc(&dev->tx_dropped);
2984 return NULL;
2985}
2986
2987struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2988{
2989 struct sk_buff *next, *head = NULL, *tail;
2990
2991 for (; skb != NULL; skb = next) {
2992 next = skb->next;
2993 skb->next = NULL;
2994
2995 /* in case skb won't be segmented, point it to itself */
2996 skb->prev = skb;
2997
2998 skb = validate_xmit_skb(skb, dev);
2999 if (!skb)
3000 continue;
3001
3002 if (!head)
3003 head = skb;
3004 else
3005 tail->next = skb;
3006 /* If skb was segmented, skb->prev points to
3007 * the last segment. If not, it still contains skb.
3008 */
3009 tail = skb->prev;
3010 }
3011 return head;
3012}
3013EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3014
3015static void qdisc_pkt_len_init(struct sk_buff *skb)
3016{
3017 const struct skb_shared_info *shinfo = skb_shinfo(skb);
3018
3019 qdisc_skb_cb(skb)->pkt_len = skb->len;
3020
3021 /* To get a more precise estimate of the bytes sent on the wire,
3022 * we add the header size of all segments to pkt_len.
3023 */
3024 if (shinfo->gso_size) {
3025 unsigned int hdr_len;
3026 u16 gso_segs = shinfo->gso_segs;
3027
3028 /* mac layer + network layer */
3029 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3030
3031 /* + transport layer */
3032 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
3033 hdr_len += tcp_hdrlen(skb);
3034 else
3035 hdr_len += sizeof(struct udphdr);
3036
3037 if (shinfo->gso_type & SKB_GSO_DODGY)
3038 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3039 shinfo->gso_size);
3040
3041 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3042 }
3043}
3044
3045static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3046 struct net_device *dev,
3047 struct netdev_queue *txq)
3048{
3049 spinlock_t *root_lock = qdisc_lock(q);
3050 struct sk_buff *to_free = NULL;
3051 bool contended;
3052 int rc;
3053
3054 qdisc_calculate_pkt_len(skb, q);
3055 /*
3056 * Heuristic to force contended enqueues to serialize on a
3057 * separate lock before trying to get the qdisc main lock.
3058 * This permits the qdisc->running owner to get the lock more
3059 * often and dequeue packets faster.
3060 */
3061 contended = qdisc_is_running(q);
3062 if (unlikely(contended))
3063 spin_lock(&q->busylock);
3064
3065 spin_lock(root_lock);
3066 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3067 __qdisc_drop(skb, &to_free);
3068 rc = NET_XMIT_DROP;
3069 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3070 qdisc_run_begin(q)) {
3071 /*
3072 * This is a work-conserving queue; there are no old skbs
3073 * waiting to be sent out; and the qdisc is not running -
3074 * xmit the skb directly.
3075 */
3076
3077 qdisc_bstats_update(q, skb);
3078
3079 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3080 if (unlikely(contended)) {
3081 spin_unlock(&q->busylock);
3082 contended = false;
3083 }
3084 __qdisc_run(q);
3085 } else
3086 qdisc_run_end(q);
3087
3088 rc = NET_XMIT_SUCCESS;
3089 } else {
3090 rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3091 if (qdisc_run_begin(q)) {
3092 if (unlikely(contended)) {
3093 spin_unlock(&q->busylock);
3094 contended = false;
3095 }
3096 __qdisc_run(q);
3097 }
3098 }
3099 spin_unlock(root_lock);
3100 if (unlikely(to_free))
3101 kfree_skb_list(to_free);
3102 if (unlikely(contended))
3103 spin_unlock(&q->busylock);
3104 return rc;
3105}
3106
3107#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3108static void skb_update_prio(struct sk_buff *skb)
3109{
3110 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
3111
3112 if (!skb->priority && skb->sk && map) {
3113 unsigned int prioidx =
3114 sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
3115
3116 if (prioidx < map->priomap_len)
3117 skb->priority = map->priomap[prioidx];
3118 }
3119}
3120#else
3121#define skb_update_prio(skb)
3122#endif
3123
3124DEFINE_PER_CPU(int, xmit_recursion);
3125EXPORT_SYMBOL(xmit_recursion);
3126
3127/**
3128 * dev_loopback_xmit - loop back @skb
3129 * @net: network namespace this loopback is happening in
3130 * @sk: sk needed to be a netfilter okfn
3131 * @skb: buffer to transmit
3132 */
3133int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3134{
3135 skb_reset_mac_header(skb);
3136 __skb_pull(skb, skb_network_offset(skb));
3137 skb->pkt_type = PACKET_LOOPBACK;
3138 skb->ip_summed = CHECKSUM_UNNECESSARY;
3139 WARN_ON(!skb_dst(skb));
3140 skb_dst_force(skb);
3141 netif_rx_ni(skb);
3142 return 0;
3143}
3144EXPORT_SYMBOL(dev_loopback_xmit);
3145
3146#ifdef CONFIG_NET_EGRESS
3147static struct sk_buff *
3148sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3149{
3150 struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
3151 struct tcf_result cl_res;
3152
3153 if (!cl)
3154 return skb;
3155
3156 /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
3157 * earlier by the caller.
3158 */
3159 qdisc_bstats_cpu_update(cl->q, skb);
3160
3161 switch (tc_classify(skb, cl, &cl_res, false)) {
3162 case TC_ACT_OK:
3163 case TC_ACT_RECLASSIFY:
3164 skb->tc_index = TC_H_MIN(cl_res.classid);
3165 break;
3166 case TC_ACT_SHOT:
3167 qdisc_qstats_cpu_drop(cl->q);
3168 *ret = NET_XMIT_DROP;
3169 kfree_skb(skb);
3170 return NULL;
3171 case TC_ACT_STOLEN:
3172 case TC_ACT_QUEUED:
3173 *ret = NET_XMIT_SUCCESS;
3174 consume_skb(skb);
3175 return NULL;
3176 case TC_ACT_REDIRECT:
3177 /* No need to push/pop skb's mac_header here on egress! */
3178 skb_do_redirect(skb);
3179 *ret = NET_XMIT_SUCCESS;
3180 return NULL;
3181 default:
3182 break;
3183 }
3184
3185 return skb;
3186}
3187#endif /* CONFIG_NET_EGRESS */
3188
3189static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3190{
3191#ifdef CONFIG_XPS
3192 struct xps_dev_maps *dev_maps;
3193 struct xps_map *map;
3194 int queue_index = -1;
3195
3196 rcu_read_lock();
3197 dev_maps = rcu_dereference(dev->xps_maps);
3198 if (dev_maps) {
3199 unsigned int tci = skb->sender_cpu - 1;
3200
3201 if (dev->num_tc) {
3202 tci *= dev->num_tc;
3203 tci += netdev_get_prio_tc_map(dev, skb->priority);
3204 }
3205
3206 map = rcu_dereference(dev_maps->cpu_map[tci]);
3207 if (map) {
3208 if (map->len == 1)
3209 queue_index = map->queues[0];
3210 else
3211 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
3212 map->len)];
3213 if (unlikely(queue_index >= dev->real_num_tx_queues))
3214 queue_index = -1;
3215 }
3216 }
3217 rcu_read_unlock();
3218
3219 return queue_index;
3220#else
3221 return -1;
3222#endif
3223}
3224
3225static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3226{
3227 struct sock *sk = skb->sk;
3228 int queue_index = sk_tx_queue_get(sk);
3229
3230 if (queue_index < 0 || skb->ooo_okay ||
3231 queue_index >= dev->real_num_tx_queues) {
3232 int new_index = get_xps_queue(dev, skb);
3233 if (new_index < 0)
3234 new_index = skb_tx_hash(dev, skb);
3235
3236 if (queue_index != new_index && sk &&
3237 sk_fullsock(sk) &&
3238 rcu_access_pointer(sk->sk_dst_cache))
3239 sk_tx_queue_set(sk, new_index);
3240
3241 queue_index = new_index;
3242 }
3243
3244 return queue_index;
3245}
3246
3247struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3248 struct sk_buff *skb,
3249 void *accel_priv)
3250{
3251 int queue_index = 0;
3252
3253#ifdef CONFIG_XPS
3254 u32 sender_cpu = skb->sender_cpu - 1;
3255
3256 if (sender_cpu >= (u32)NR_CPUS)
3257 skb->sender_cpu = raw_smp_processor_id() + 1;
3258#endif
3259
3260 if (dev->real_num_tx_queues != 1) {
3261 const struct net_device_ops *ops = dev->netdev_ops;
3262 if (ops->ndo_select_queue)
3263 queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3264 __netdev_pick_tx);
3265 else
3266 queue_index = __netdev_pick_tx(dev, skb);
3267
3268 if (!accel_priv)
3269 queue_index = netdev_cap_txqueue(dev, queue_index);
3270 }
3271
3272 skb_set_queue_mapping(skb, queue_index);
3273 return netdev_get_tx_queue(dev, queue_index);
3274}
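/*
 * Hypothetical ndo_select_queue (policy assumed): reserve queue 0 for
 * control traffic and defer to the provided fallback, which resolves to
 * __netdev_pick_tx() above, for everything else.
 */
static u16 example_select_queue(struct net_device *dev, struct sk_buff *skb,
				void *accel_priv,
				select_queue_fallback_t fallback)
{
	if (skb->priority == TC_PRIO_CONTROL)
		return 0;
	return fallback(dev, skb);
}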
3275
3276/**
3277 * __dev_queue_xmit - transmit a buffer
3278 * @skb: buffer to transmit
3279 * @accel_priv: private data used for L2 forwarding offload
3280 *
3281 * Queue a buffer for transmission to a network device. The caller must
3282 * have set the device and priority and built the buffer before calling
3283 * this function. The function can be called from an interrupt.
3284 *
3285 * A negative errno code is returned on a failure. A success does not
3286 * guarantee the frame will be transmitted as it may be dropped due
3287 * to congestion or traffic shaping.
3288 *
3289 * -----------------------------------------------------------------------------------
3290 * I notice this method can also return errors from the queue disciplines,
3291 * including NET_XMIT_DROP, which is a positive value. So, errors can also
3292 * be positive.
3293 *
3294 * Regardless of the return value, the skb is consumed, so it is currently
3295 * difficult to retry a send to this method. (You can bump the ref count
3296 * before sending to hold a reference for retry if you are careful.)
3297 *
3298 * When calling this method, interrupts MUST be enabled. This is because
3299 * the BH enable code must have IRQs enabled so that it will not deadlock.
3300 * --BLG
3301 */
3302static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3303{
3304 struct net_device *dev = skb->dev;
3305 struct netdev_queue *txq;
3306 struct Qdisc *q;
3307 int rc = -ENOMEM;
3308
3309 skb_reset_mac_header(skb);
3310
3311 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3312 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3313
3314 /* Disable soft irqs for various locks below. Also
3315 * stops preemption for RCU.
3316 */
3317 rcu_read_lock_bh();
3318
3319 skb_update_prio(skb);
3320
3321 qdisc_pkt_len_init(skb);
3322#ifdef CONFIG_NET_CLS_ACT
3323 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3324# ifdef CONFIG_NET_EGRESS
3325 if (static_key_false(&egress_needed)) {
3326 skb = sch_handle_egress(skb, &rc, dev);
3327 if (!skb)
3328 goto out;
3329 }
3330# endif
3331#endif
3332 /* If the device/qdisc don't need skb->dst, release it right now while
3333 * it's hot in this CPU's cache.
3334 */
3335 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3336 skb_dst_drop(skb);
3337 else
3338 skb_dst_force(skb);
3339
3340 txq = netdev_pick_tx(dev, skb, accel_priv);
3341 q = rcu_dereference_bh(txq->qdisc);
3342
3343 trace_net_dev_queue(skb);
3344 if (q->enqueue) {
3345 rc = __dev_xmit_skb(skb, q, dev, txq);
3346 goto out;
3347 }
3348
3349 /* The device has no queue. Common case for software devices:
3350 loopback and all sorts of tunnels...
3351
3352 Really, it is unlikely that netif_tx_lock protection is necessary
3353 here. (e.g. loopback and IP tunnels are clean, ignoring statistics
3354 counters.)
3355 However, it is possible that they rely on the protection
3356 made by us here.
3357
3358 Check this and take the lock. It is not prone to deadlocks.
3359 Either way, taking the noqueue path is even simpler 8)
3360 */
3361 if (dev->flags & IFF_UP) {
3362 int cpu = smp_processor_id(); /* ok because BHs are off */
3363
3364 if (txq->xmit_lock_owner != cpu) {
3365 if (unlikely(__this_cpu_read(xmit_recursion) >
3366 XMIT_RECURSION_LIMIT))
3367 goto recursion_alert;
3368
3369 skb = validate_xmit_skb(skb, dev);
3370 if (!skb)
3371 goto out;
3372
3373 HARD_TX_LOCK(dev, txq, cpu);
3374
3375 if (!netif_xmit_stopped(txq)) {
3376 __this_cpu_inc(xmit_recursion);
3377 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3378 __this_cpu_dec(xmit_recursion);
3379 if (dev_xmit_complete(rc)) {
3380 HARD_TX_UNLOCK(dev, txq);
3381 goto out;
3382 }
3383 }
3384 HARD_TX_UNLOCK(dev, txq);
3385 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3386 dev->name);
3387 } else {
3388 /* Recursion is detected! It is possible,
3389 * unfortunately.
3390 */
3391recursion_alert:
3392 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3393 dev->name);
3394 }
3395 }
3396
3397 rc = -ENETDOWN;
3398 rcu_read_unlock_bh();
3399
3400 atomic_long_inc(&dev->tx_dropped);
3401 kfree_skb_list(skb);
3402 return rc;
3403out:
3404 rcu_read_unlock_bh();
3405 return rc;
3406}
3407
3408int dev_queue_xmit(struct sk_buff *skb)
3409{
3410 return __dev_queue_xmit(skb, NULL);
3411}
3412EXPORT_SYMBOL(dev_queue_xmit);
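/*
 * Hypothetical sender (frame layout assumed): wrap a complete L2 frame in
 * an skb and queue it through the transmit path above.  The skb is
 * consumed regardless of the return value, per the comment on
 * __dev_queue_xmit().
 */
static int example_send_frame(struct net_device *dev,
			      const void *frame, unsigned int len)
{
	struct sk_buff *skb;

	if (len < ETH_HLEN)
		return -EINVAL;

	skb = alloc_skb(len, GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;

	memcpy(skb_put(skb, len), frame, len);
	skb->dev = dev;
	skb->protocol = ((const struct ethhdr *)frame)->h_proto;

	return dev_queue_xmit(skb);
}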
3413
3414int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3415{
3416 return __dev_queue_xmit(skb, accel_priv);
3417}
3418EXPORT_SYMBOL(dev_queue_xmit_accel);
3419
3420
3421/*=======================================================================
3422 Receiver routines
3423 =======================================================================*/
3424
3425int netdev_max_backlog __read_mostly = 1000;
3426EXPORT_SYMBOL(netdev_max_backlog);
3427
3428int netdev_tstamp_prequeue __read_mostly = 1;
3429int netdev_budget __read_mostly = 300;
3430int weight_p __read_mostly = 64; /* old backlog weight */
3431
3432/* Called with irq disabled */
3433static inline void ____napi_schedule(struct softnet_data *sd,
3434 struct napi_struct *napi)
3435{
3436 list_add_tail(&napi->poll_list, &sd->poll_list);
3437 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3438}
3439
3440#ifdef CONFIG_RPS
3441
3442/* One global table that all flow-based protocols share. */
3443struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3444EXPORT_SYMBOL(rps_sock_flow_table);
3445u32 rps_cpu_mask __read_mostly;
3446EXPORT_SYMBOL(rps_cpu_mask);
3447
3448struct static_key rps_needed __read_mostly;
3449EXPORT_SYMBOL(rps_needed);
3450struct static_key rfs_needed __read_mostly;
3451EXPORT_SYMBOL(rfs_needed);
3452
3453static struct rps_dev_flow *
3454set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3455 struct rps_dev_flow *rflow, u16 next_cpu)
3456{
3457 if (next_cpu < nr_cpu_ids) {
3458#ifdef CONFIG_RFS_ACCEL
3459 struct netdev_rx_queue *rxqueue;
3460 struct rps_dev_flow_table *flow_table;
3461 struct rps_dev_flow *old_rflow;
3462 u32 flow_id;
3463 u16 rxq_index;
3464 int rc;
3465
3466 /* Should we steer this flow to a different hardware queue? */
3467 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3468 !(dev->features & NETIF_F_NTUPLE))
3469 goto out;
3470 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3471 if (rxq_index == skb_get_rx_queue(skb))
3472 goto out;
3473
3474 rxqueue = dev->_rx + rxq_index;
3475 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3476 if (!flow_table)
3477 goto out;
3478 flow_id = skb_get_hash(skb) & flow_table->mask;
3479 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3480 rxq_index, flow_id);
3481 if (rc < 0)
3482 goto out;
3483 old_rflow = rflow;
3484 rflow = &flow_table->flows[flow_id];
3485 rflow->filter = rc;
3486 if (old_rflow->filter == rflow->filter)
3487 old_rflow->filter = RPS_NO_FILTER;
3488 out:
3489#endif
3490 rflow->last_qtail =
3491 per_cpu(softnet_data, next_cpu).input_queue_head;
3492 }
3493
3494 rflow->cpu = next_cpu;
3495 return rflow;
3496}
3497
3498/*
3499 * get_rps_cpu is called from netif_receive_skb and returns the target
3500 * CPU from the RPS map of the receiving queue for a given skb.
3501 * rcu_read_lock must be held on entry.
3502 */
3503static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3504 struct rps_dev_flow **rflowp)
3505{
3506 const struct rps_sock_flow_table *sock_flow_table;
3507 struct netdev_rx_queue *rxqueue = dev->_rx;
3508 struct rps_dev_flow_table *flow_table;
3509 struct rps_map *map;
3510 int cpu = -1;
3511 u32 tcpu;
3512 u32 hash;
3513
3514 if (skb_rx_queue_recorded(skb)) {
3515 u16 index = skb_get_rx_queue(skb);
3516
3517 if (unlikely(index >= dev->real_num_rx_queues)) {
3518 WARN_ONCE(dev->real_num_rx_queues > 1,
3519 "%s received packet on queue %u, but number "
3520 "of RX queues is %u\n",
3521 dev->name, index, dev->real_num_rx_queues);
3522 goto done;
3523 }
3524 rxqueue += index;
3525 }
3526
3527 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3528
3529 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3530 map = rcu_dereference(rxqueue->rps_map);
3531 if (!flow_table && !map)
3532 goto done;
3533
3534 skb_reset_network_header(skb);
3535 hash = skb_get_hash(skb);
3536 if (!hash)
3537 goto done;
3538
3539 sock_flow_table = rcu_dereference(rps_sock_flow_table);
3540 if (flow_table && sock_flow_table) {
3541 struct rps_dev_flow *rflow;
3542 u32 next_cpu;
3543 u32 ident;
3544
3545 /* First check into global flow table if there is a match */
3546 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3547 if ((ident ^ hash) & ~rps_cpu_mask)
3548 goto try_rps;
3549
3550 next_cpu = ident & rps_cpu_mask;
3551
3552 /* OK, now we know there is a match,
3553 * we can look at the local (per receive queue) flow table
3554 */
3555 rflow = &flow_table->flows[hash & flow_table->mask];
3556 tcpu = rflow->cpu;
3557
3558 /*
3559 * If the desired CPU (where last recvmsg was done) is
3560 * different from current CPU (one in the rx-queue flow
3561 * table entry), switch if one of the following holds:
3562 * - Current CPU is unset (>= nr_cpu_ids).
3563 * - Current CPU is offline.
3564 * - The current CPU's queue tail has advanced beyond the
3565 * last packet that was enqueued using this table entry.
3566 * This guarantees that all previous packets for the flow
3567 * have been dequeued, thus preserving in order delivery.
3568 */
3569 if (unlikely(tcpu != next_cpu) &&
3570 (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3571 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3572 rflow->last_qtail)) >= 0)) {
3573 tcpu = next_cpu;
3574 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3575 }
3576
3577 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3578 *rflowp = rflow;
3579 cpu = tcpu;
3580 goto done;
3581 }
3582 }
3583
3584try_rps:
3585
3586 if (map) {
3587 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3588 if (cpu_online(tcpu)) {
3589 cpu = tcpu;
3590 goto done;
3591 }
3592 }
3593
3594done:
3595 return cpu;
3596}
3597
3598#ifdef CONFIG_RFS_ACCEL
3599
3600/**
3601 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3602 * @dev: Device on which the filter was set
3603 * @rxq_index: RX queue index
3604 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3605 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3606 *
3607 * Drivers that implement ndo_rx_flow_steer() should periodically call
3608 * this function for each installed filter and remove the filters for
3609 * which it returns %true.
3610 */
3611bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3612 u32 flow_id, u16 filter_id)
3613{
3614 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3615 struct rps_dev_flow_table *flow_table;
3616 struct rps_dev_flow *rflow;
3617 bool expire = true;
3618 unsigned int cpu;
3619
3620 rcu_read_lock();
3621 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3622 if (flow_table && flow_id <= flow_table->mask) {
3623 rflow = &flow_table->flows[flow_id];
3624 cpu = ACCESS_ONCE(rflow->cpu);
3625 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3626 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3627 rflow->last_qtail) <
3628 (int)(10 * flow_table->mask)))
3629 expire = false;
3630 }
3631 rcu_read_unlock();
3632 return expire;
3633}
3634EXPORT_SYMBOL(rps_may_expire_flow);
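/*
 * Hypothetical expiry scan (driver bookkeeping assumed): a driver that
 * implements ndo_rx_flow_steer() periodically walks its installed filters
 * and removes those rps_may_expire_flow() reports as stale.
 */
struct example_filter {
	u16 rxq_index;
	u32 flow_id;
	u16 filter_id;
	bool active;
};

static void example_expire_filters(struct net_device *dev,
				   struct example_filter *f, int n)
{
	int i;

	for (i = 0; i < n; i++, f++) {
		if (!f->active)
			continue;
		if (rps_may_expire_flow(dev, f->rxq_index,
					f->flow_id, f->filter_id)) {
			/* ... remove the hardware filter here ... */
			f->active = false;
		}
	}
}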
3635
3636#endif /* CONFIG_RFS_ACCEL */
3637
3638/* Called from hardirq (IPI) context */
3639static void rps_trigger_softirq(void *data)
3640{
3641 struct softnet_data *sd = data;
3642
3643 ____napi_schedule(sd, &sd->backlog);
3644 sd->received_rps++;
3645}
3646
3647#endif /* CONFIG_RPS */
3648
3649/*
3650 * Check if this softnet_data structure belongs to another CPU.
3651 * If yes, queue it to our IPI list and return 1.
3652 * If no, return 0.
3653 */
3654static int rps_ipi_queued(struct softnet_data *sd)
3655{
3656#ifdef CONFIG_RPS
3657 struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3658
3659 if (sd != mysd) {
3660 sd->rps_ipi_next = mysd->rps_ipi_list;
3661 mysd->rps_ipi_list = sd;
3662
3663 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3664 return 1;
3665 }
3666#endif /* CONFIG_RPS */
3667 return 0;
3668}
3669
3670#ifdef CONFIG_NET_FLOW_LIMIT
3671int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3672#endif
3673
3674static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3675{
3676#ifdef CONFIG_NET_FLOW_LIMIT
3677 struct sd_flow_limit *fl;
3678 struct softnet_data *sd;
3679 unsigned int old_flow, new_flow;
3680
3681 if (qlen < (netdev_max_backlog >> 1))
3682 return false;
3683
3684 sd = this_cpu_ptr(&softnet_data);
3685
3686 rcu_read_lock();
3687 fl = rcu_dereference(sd->flow_limit);
3688 if (fl) {
3689 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3690 old_flow = fl->history[fl->history_head];
3691 fl->history[fl->history_head] = new_flow;
3692
3693 fl->history_head++;
3694 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3695
3696 if (likely(fl->buckets[old_flow]))
3697 fl->buckets[old_flow]--;
3698
3699 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3700 fl->count++;
3701 rcu_read_unlock();
3702 return true;
3703 }
3704 }
3705 rcu_read_unlock();
3706#endif
3707 return false;
3708}
3709
3710/*
3711 * enqueue_to_backlog is called to queue an skb to a per-CPU backlog
3712 * queue (may be a remote CPU queue).
3713 */
3714static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3715 unsigned int *qtail)
3716{
3717 struct softnet_data *sd;
3718 unsigned long flags;
3719 unsigned int qlen;
3720
3721 sd = &per_cpu(softnet_data, cpu);
3722
3723 local_irq_save(flags);
3724
3725 rps_lock(sd);
3726 if (!netif_running(skb->dev))
3727 goto drop;
3728 qlen = skb_queue_len(&sd->input_pkt_queue);
3729 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3730 if (qlen) {
3731enqueue:
3732 __skb_queue_tail(&sd->input_pkt_queue, skb);
3733 input_queue_tail_incr_save(sd, qtail);
3734 rps_unlock(sd);
3735 local_irq_restore(flags);
3736 return NET_RX_SUCCESS;
3737 }
3738
3739 /* Schedule NAPI for the backlog device.
3740 * We can use a non-atomic operation since we own the queue lock.
3741 */
3742 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3743 if (!rps_ipi_queued(sd))
3744 ____napi_schedule(sd, &sd->backlog);
3745 }
3746 goto enqueue;
3747 }
3748
3749drop:
3750 sd->dropped++;
3751 rps_unlock(sd);
3752
3753 local_irq_restore(flags);
3754
3755 atomic_long_inc(&skb->dev->rx_dropped);
3756 kfree_skb(skb);
3757 return NET_RX_DROP;
3758}
3759
3760static int netif_rx_internal(struct sk_buff *skb)
3761{
3762 int ret;
3763
3764 net_timestamp_check(netdev_tstamp_prequeue, skb);
3765
3766 trace_netif_rx(skb);
3767#ifdef CONFIG_RPS
3768 if (static_key_false(&rps_needed)) {
3769 struct rps_dev_flow voidflow, *rflow = &voidflow;
3770 int cpu;
3771
3772 preempt_disable();
3773 rcu_read_lock();
3774
3775 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3776 if (cpu < 0)
3777 cpu = smp_processor_id();
3778
3779 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3780
3781 rcu_read_unlock();
3782 preempt_enable();
3783 } else
3784#endif
3785 {
3786 unsigned int qtail;
3787 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3788 put_cpu();
3789 }
3790 return ret;
3791}
3792
3793/**
3794 * netif_rx - post buffer to the network code
3795 * @skb: buffer to post
3796 *
3797 * This function receives a packet from a device driver and queues it for
3798 * the upper (protocol) levels to process. It always succeeds. The buffer
3799 * may be dropped during processing for congestion control or by the
3800 * protocol layers.
3801 *
3802 * return values:
3803 * NET_RX_SUCCESS (no congestion)
3804 * NET_RX_DROP (packet was dropped)
3805 *
3806 */
3807
3808int netif_rx(struct sk_buff *skb)
3809{
3810 trace_netif_rx_entry(skb);
3811
3812 return netif_rx_internal(skb);
3813}
3814EXPORT_SYMBOL(netif_rx);
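/*
 * Hypothetical RX interrupt fragment (descriptor layout assumed): copy a
 * received frame into a fresh skb and post it with netif_rx(), which runs
 * the backlog path above.
 */
static void example_rx_frame(struct net_device *dev,
			     const void *buf, unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb(dev, len);
	if (!skb) {
		atomic_long_inc(&dev->rx_dropped);
		return;
	}
	memcpy(skb_put(skb, len), buf, len);
	skb->protocol = eth_type_trans(skb, dev);	/* also sets pkt_type */
	netif_rx(skb);
}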
3815
3816int netif_rx_ni(struct sk_buff *skb)
3817{
3818 int err;
3819
3820 trace_netif_rx_ni_entry(skb);
3821
3822 preempt_disable();
3823 err = netif_rx_internal(skb);
3824 if (local_softirq_pending())
3825 do_softirq();
3826 preempt_enable();
3827
3828 return err;
3829}
3830EXPORT_SYMBOL(netif_rx_ni);
3831
3832static __latent_entropy void net_tx_action(struct softirq_action *h)
3833{
3834 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3835
3836 if (sd->completion_queue) {
3837 struct sk_buff *clist;
3838
3839 local_irq_disable();
3840 clist = sd->completion_queue;
3841 sd->completion_queue = NULL;
3842 local_irq_enable();
3843
3844 while (clist) {
3845 struct sk_buff *skb = clist;
3846 clist = clist->next;
3847
3848 WARN_ON(atomic_read(&skb->users));
3849 if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3850 trace_consume_skb(skb);
3851 else
3852 trace_kfree_skb(skb, net_tx_action);
3853
3854 if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3855 __kfree_skb(skb);
3856 else
3857 __kfree_skb_defer(skb);
3858 }
3859
3860 __kfree_skb_flush();
3861 }
3862
3863 if (sd->output_queue) {
3864 struct Qdisc *head;
3865
3866 local_irq_disable();
3867 head = sd->output_queue;
3868 sd->output_queue = NULL;
3869 sd->output_queue_tailp = &sd->output_queue;
3870 local_irq_enable();
3871
3872 while (head) {
3873 struct Qdisc *q = head;
3874 spinlock_t *root_lock;
3875
3876 head = head->next_sched;
3877
3878 root_lock = qdisc_lock(q);
3879 spin_lock(root_lock);
3880 /* We need to make sure head->next_sched is read
3881 * before clearing __QDISC_STATE_SCHED
3882 */
3883 smp_mb__before_atomic();
3884 clear_bit(__QDISC_STATE_SCHED, &q->state);
3885 qdisc_run(q);
3886 spin_unlock(root_lock);
3887 }
3888 }
3889}
3890
3891#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
3892/* This hook is defined here for ATM LANE */
3893int (*br_fdb_test_addr_hook)(struct net_device *dev,
3894 unsigned char *addr) __read_mostly;
3895EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3896#endif
3897
3898static inline struct sk_buff *
3899sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
3900 struct net_device *orig_dev)
3901{
3902#ifdef CONFIG_NET_CLS_ACT
3903 struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3904 struct tcf_result cl_res;
3905
3906 /* If there's at least one ingress present somewhere (so
3907 * we get here via the enabled static key), remaining devices
3908 * that are not configured with an ingress qdisc will bail
3909 * out here.
3910 */
3911 if (!cl)
3912 return skb;
3913 if (*pt_prev) {
3914 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3915 *pt_prev = NULL;
3916 }
3917
3918 qdisc_skb_cb(skb)->pkt_len = skb->len;
3919 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3920 qdisc_bstats_cpu_update(cl->q, skb);
3921
3922 switch (tc_classify(skb, cl, &cl_res, false)) {
3923 case TC_ACT_OK:
3924 case TC_ACT_RECLASSIFY:
3925 skb->tc_index = TC_H_MIN(cl_res.classid);
3926 break;
3927 case TC_ACT_SHOT:
3928 qdisc_qstats_cpu_drop(cl->q);
3929 kfree_skb(skb);
3930 return NULL;
3931 case TC_ACT_STOLEN:
3932 case TC_ACT_QUEUED:
3933 consume_skb(skb);
3934 return NULL;
3935 case TC_ACT_REDIRECT:
3936 /* skb_mac_header check was done by cls/act_bpf, so
3937 * we can safely push the L2 header back before
3938 * redirecting to another netdev
3939 */
3940 __skb_push(skb, skb->mac_len);
3941 skb_do_redirect(skb);
3942 return NULL;
3943 default:
3944 break;
3945 }
3946#endif /* CONFIG_NET_CLS_ACT */
3947 return skb;
3948}
3949
3950/**
3951 * netdev_is_rx_handler_busy - check if receive handler is registered
3952 * @dev: device to check
3953 *
3954 * Check if a receive handler is already registered for a given device.
3955 * Return true if there is one.
3956 *
3957 * The caller must hold the rtnl_mutex.
3958 */
3959bool netdev_is_rx_handler_busy(struct net_device *dev)
3960{
3961 ASSERT_RTNL();
3962 return dev && rtnl_dereference(dev->rx_handler);
3963}
3964EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
3965
3966/**
3967 * netdev_rx_handler_register - register receive handler
3968 * @dev: device to register a handler for
3969 * @rx_handler: receive handler to register
3970 * @rx_handler_data: data pointer that is used by rx handler
3971 *
3972 * Register a receive handler for a device. This handler will then be
3973 * called from __netif_receive_skb. A negative errno code is returned
3974 * on a failure.
3975 *
3976 * The caller must hold the rtnl_mutex.
3977 *
3978 * For a general description of rx_handler, see enum rx_handler_result.
3979 */
3980int netdev_rx_handler_register(struct net_device *dev,
3981 rx_handler_func_t *rx_handler,
3982 void *rx_handler_data)
3983{
3984 ASSERT_RTNL();
3985
3986 if (dev->rx_handler)
3987 return -EBUSY;
3988
3989 /* Note: rx_handler_data must be set before rx_handler */
3990 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3991 rcu_assign_pointer(dev->rx_handler, rx_handler);
3992
3993 return 0;
3994}
3995EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3996
3997/**
3998 * netdev_rx_handler_unregister - unregister receive handler
3999 * @dev: device to unregister a handler from
4000 *
4001 * Unregister a receive handler from a device.
4002 *
4003 * The caller must hold the rtnl_mutex.
4004 */
4005void netdev_rx_handler_unregister(struct net_device *dev)
4006{
4007
4008 ASSERT_RTNL();
4009 RCU_INIT_POINTER(dev->rx_handler, NULL);
4010 /* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
4011 * section is guaranteed to see a non-NULL rx_handler_data
4012 * as well.
4013 */
4014 synchronize_net();
4015 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4016}
4017EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
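/*
 * Editor's example -- an illustrative sketch, not part of the original
 * file, hence guarded out.  It shows how a bridge/bonding-style driver
 * would use the rx_handler hooks above; struct example_port and the
 * example_*() names are hypothetical.
 */
#if 0
struct example_port {
	struct net_device *master;	/* aggregate device we steer frames to */
};

static rx_handler_result_t example_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct example_port *port = rcu_dereference(skb->dev->rx_handler_data);

	if (!port->master)
		return RX_HANDLER_PASS;		/* normal processing continues */

	skb->dev = port->master;
	return RX_HANDLER_ANOTHER;		/* rerun another_round for master */
}

static int example_enslave(struct net_device *slave, struct example_port *port)
{
	ASSERT_RTNL();

	if (netdev_is_rx_handler_busy(slave))
		return -EBUSY;			/* e.g. already a bridge port */
	return netdev_rx_handler_register(slave, example_handle_frame, port);
}

static void example_release(struct net_device *slave)
{
	ASSERT_RTNL();
	netdev_rx_handler_unregister(slave);	/* includes synchronize_net() */
}
#endif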
4018
4019/*
4020 * Limit the use of PFMEMALLOC reserves to those protocols that implement
4021 * the special handling of PFMEMALLOC skbs.
4022 */
4023static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4024{
4025 switch (skb->protocol) {
4026 case htons(ETH_P_ARP):
4027 case htons(ETH_P_IP):
4028 case htons(ETH_P_IPV6):
4029 case htons(ETH_P_8021Q):
4030 case htons(ETH_P_8021AD):
4031 return true;
4032 default:
4033 return false;
4034 }
4035}
4036
4037static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
4038 int *ret, struct net_device *orig_dev)
4039{
4040#ifdef CONFIG_NETFILTER_INGRESS
4041 if (nf_hook_ingress_active(skb)) {
4042 int ingress_retval;
4043
4044 if (*pt_prev) {
4045 *ret = deliver_skb(skb, *pt_prev, orig_dev);
4046 *pt_prev = NULL;
4047 }
4048
4049 rcu_read_lock();
4050 ingress_retval = nf_hook_ingress(skb);
4051 rcu_read_unlock();
4052 return ingress_retval;
4053 }
4054#endif /* CONFIG_NETFILTER_INGRESS */
4055 return 0;
4056}
4057
4058static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
4059{
4060 struct packet_type *ptype, *pt_prev;
4061 rx_handler_func_t *rx_handler;
4062 struct net_device *orig_dev;
4063 bool deliver_exact = false;
4064 int ret = NET_RX_DROP;
4065 __be16 type;
4066
4067 net_timestamp_check(!netdev_tstamp_prequeue, skb);
4068
4069 trace_netif_receive_skb(skb);
4070
4071 orig_dev = skb->dev;
4072
4073 skb_reset_network_header(skb);
4074 if (!skb_transport_header_was_set(skb))
4075 skb_reset_transport_header(skb);
4076 skb_reset_mac_len(skb);
4077
4078 pt_prev = NULL;
4079
4080another_round:
4081 skb->skb_iif = skb->dev->ifindex;
4082
4083 __this_cpu_inc(softnet_data.processed);
4084
4085 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4086 skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4087 skb = skb_vlan_untag(skb);
4088 if (unlikely(!skb))
4089 goto out;
4090 }
4091
4092#ifdef CONFIG_NET_CLS_ACT
4093 if (skb->tc_verd & TC_NCLS) {
4094 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
4095 goto ncls;
4096 }
4097#endif
4098
4099 if (pfmemalloc)
4100 goto skip_taps;
4101
4102 list_for_each_entry_rcu(ptype, &ptype_all, list) {
4103 if (pt_prev)
4104 ret = deliver_skb(skb, pt_prev, orig_dev);
4105 pt_prev = ptype;
4106 }
4107
4108 list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4109 if (pt_prev)
4110 ret = deliver_skb(skb, pt_prev, orig_dev);
4111 pt_prev = ptype;
4112 }
4113
4114skip_taps:
4115#ifdef CONFIG_NET_INGRESS
4116 if (static_key_false(&ingress_needed)) {
4117 skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4118 if (!skb)
4119 goto out;
4120
4121 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4122 goto out;
4123 }
4124#endif
4125#ifdef CONFIG_NET_CLS_ACT
4126 skb->tc_verd = 0;
4127ncls:
4128#endif
4129 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4130 goto drop;
4131
4132 if (skb_vlan_tag_present(skb)) {
4133 if (pt_prev) {
4134 ret = deliver_skb(skb, pt_prev, orig_dev);
4135 pt_prev = NULL;
4136 }
4137 if (vlan_do_receive(&skb))
4138 goto another_round;
4139 else if (unlikely(!skb))
4140 goto out;
4141 }
4142
4143 rx_handler = rcu_dereference(skb->dev->rx_handler);
4144 if (rx_handler) {
4145 if (pt_prev) {
4146 ret = deliver_skb(skb, pt_prev, orig_dev);
4147 pt_prev = NULL;
4148 }
4149 switch (rx_handler(&skb)) {
4150 case RX_HANDLER_CONSUMED:
4151 ret = NET_RX_SUCCESS;
4152 goto out;
4153 case RX_HANDLER_ANOTHER:
4154 goto another_round;
4155 case RX_HANDLER_EXACT:
4156 deliver_exact = true; /* fall through */
4157 case RX_HANDLER_PASS:
4158 break;
4159 default:
4160 BUG();
4161 }
4162 }
4163
4164 if (unlikely(skb_vlan_tag_present(skb))) {
4165 if (skb_vlan_tag_get_id(skb))
4166 skb->pkt_type = PACKET_OTHERHOST;
4167 /* Note: we might in the future use prio bits
4168 * and set skb->priority like in vlan_do_receive().
4169 * For the time being, just ignore the Priority Code Point.
4170 */
4171 skb->vlan_tci = 0;
4172 }
4173
4174 type = skb->protocol;
4175
4176 /* deliver only exact match when indicated */
4177 if (likely(!deliver_exact)) {
4178 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4179 &ptype_base[ntohs(type) &
4180 PTYPE_HASH_MASK]);
4181 }
4182
4183 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4184 &orig_dev->ptype_specific);
4185
4186 if (unlikely(skb->dev != orig_dev)) {
4187 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4188 &skb->dev->ptype_specific);
4189 }
4190
4191 if (pt_prev) {
4192 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4193 goto drop;
4194 else
4195 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4196 } else {
4197drop:
4198 if (!deliver_exact)
4199 atomic_long_inc(&skb->dev->rx_dropped);
4200 else
4201 atomic_long_inc(&skb->dev->rx_nohandler);
4202 kfree_skb(skb);
4203 /* Jamal, now you will not be able to escape explaining
4204 * to me how you were going to use this. :-)
4205 */
4206 ret = NET_RX_DROP;
4207 }
4208
4209out:
4210 return ret;
4211}
4212
4213static int __netif_receive_skb(struct sk_buff *skb)
4214{
4215 int ret;
4216
4217 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4218 unsigned long pflags = current->flags;
4219
4220 /*
4221 * PFMEMALLOC skbs are special, they should
4222 * - be delivered to SOCK_MEMALLOC sockets only
4223 * - stay away from userspace
4224 * - have bounded memory usage
4225 *
4226 * Use PF_MEMALLOC as this saves us from propagating the allocation
4227 * context down to all allocation sites.
4228 */
4229 current->flags |= PF_MEMALLOC;
4230 ret = __netif_receive_skb_core(skb, true);
4231 tsk_restore_flags(current, pflags, PF_MEMALLOC);
4232 } else
4233 ret = __netif_receive_skb_core(skb, false);
4234
4235 return ret;
4236}
4237
4238static int netif_receive_skb_internal(struct sk_buff *skb)
4239{
4240 int ret;
4241
4242 net_timestamp_check(netdev_tstamp_prequeue, skb);
4243
4244 if (skb_defer_rx_timestamp(skb))
4245 return NET_RX_SUCCESS;
4246
4247 rcu_read_lock();
4248
4249#ifdef CONFIG_RPS
4250 if (static_key_false(&rps_needed)) {
4251 struct rps_dev_flow voidflow, *rflow = &voidflow;
4252 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4253
4254 if (cpu >= 0) {
4255 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4256 rcu_read_unlock();
4257 return ret;
4258 }
4259 }
4260#endif
4261 ret = __netif_receive_skb(skb);
4262 rcu_read_unlock();
4263 return ret;
4264}
4265
4266/**
4267 * netif_receive_skb - process receive buffer from network
4268 * @skb: buffer to process
4269 *
4270 * netif_receive_skb() is the main receive data processing function.
4271 * It always succeeds. The buffer may be dropped during processing
4272 * for congestion control or by the protocol layers.
4273 *
4274 * This function may only be called from softirq context and interrupts
4275 * should be enabled.
4276 *
4277 * Return values (usually ignored):
4278 * NET_RX_SUCCESS: no congestion
4279 * NET_RX_DROP: packet was dropped
4280 */
4281int netif_receive_skb(struct sk_buff *skb)
4282{
4283 trace_netif_receive_skb_entry(skb);
4284
4285 return netif_receive_skb_internal(skb);
4286}
4287EXPORT_SYMBOL(netif_receive_skb);
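/*
 * Editor's example -- an illustrative sketch, not part of the original
 * file, hence guarded out.  A minimal NAPI poll handler delivering
 * frames via netif_receive_skb(); example_next_rx_skb() is a
 * hypothetical device-specific helper that dequeues a completed frame.
 */
#if 0
static struct sk_buff *example_next_rx_skb(struct net_device *dev);	/* hypothetical */

static int example_poll(struct napi_struct *napi, int budget)
{
	int done = 0;
	struct sk_buff *skb;

	while (done < budget && (skb = example_next_rx_skb(napi->dev))) {
		skb->protocol = eth_type_trans(skb, napi->dev);
		netif_receive_skb(skb);	/* return value is advisory only */
		done++;
	}
	if (done < budget)
		napi_complete_done(napi, done);
	return done;
}
#endif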
4288
4289DEFINE_PER_CPU(struct work_struct, flush_works);
4290
4291/* Network device is going away, flush any packets still pending */
4292static void flush_backlog(struct work_struct *work)
4293{
4294 struct sk_buff *skb, *tmp;
4295 struct softnet_data *sd;
4296
4297 local_bh_disable();
4298 sd = this_cpu_ptr(&softnet_data);
4299
4300 local_irq_disable();
4301 rps_lock(sd);
4302 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4303 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4304 __skb_unlink(skb, &sd->input_pkt_queue);
4305 kfree_skb(skb);
4306 input_queue_head_incr(sd);
4307 }
4308 }
4309 rps_unlock(sd);
4310 local_irq_enable();
4311
4312 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4313 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4314 __skb_unlink(skb, &sd->process_queue);
4315 kfree_skb(skb);
4316 input_queue_head_incr(sd);
4317 }
4318 }
4319 local_bh_enable();
4320}
4321
4322static void flush_all_backlogs(void)
4323{
4324 unsigned int cpu;
4325
4326 get_online_cpus();
4327
4328 for_each_online_cpu(cpu)
4329 queue_work_on(cpu, system_highpri_wq,
4330 per_cpu_ptr(&flush_works, cpu));
4331
4332 for_each_online_cpu(cpu)
4333 flush_work(per_cpu_ptr(&flush_works, cpu));
4334
4335 put_online_cpus();
4336}
4337
4338static int napi_gro_complete(struct sk_buff *skb)
4339{
4340 struct packet_offload *ptype;
4341 __be16 type = skb->protocol;
4342 struct list_head *head = &offload_base;
4343 int err = -ENOENT;
4344
4345 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4346
4347 if (NAPI_GRO_CB(skb)->count == 1) {
4348 skb_shinfo(skb)->gso_size = 0;
4349 goto out;
4350 }
4351
4352 rcu_read_lock();
4353 list_for_each_entry_rcu(ptype, head, list) {
4354 if (ptype->type != type || !ptype->callbacks.gro_complete)
4355 continue;
4356
4357 err = ptype->callbacks.gro_complete(skb, 0);
4358 break;
4359 }
4360 rcu_read_unlock();
4361
4362 if (err) {
4363 WARN_ON(&ptype->list == head);
4364 kfree_skb(skb);
4365 return NET_RX_SUCCESS;
4366 }
4367
4368out:
4369 return netif_receive_skb_internal(skb);
4370}
4371
4372 /* napi->gro_list contains packets ordered by age;
4373 * the youngest packets are at its head.
4374 * Complete skbs in reverse order to reduce latencies.
4375 */
4376void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4377{
4378 struct sk_buff *skb, *prev = NULL;
4379
4380 /* scan list and build reverse chain */
4381 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4382 skb->prev = prev;
4383 prev = skb;
4384 }
4385
4386 for (skb = prev; skb; skb = prev) {
4387 skb->next = NULL;
4388
4389 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4390 return;
4391
4392 prev = skb->prev;
4393 napi_gro_complete(skb);
4394 napi->gro_count--;
4395 }
4396
4397 napi->gro_list = NULL;
4398}
4399EXPORT_SYMBOL(napi_gro_flush);
4400
4401static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4402{
4403 struct sk_buff *p;
4404 unsigned int maclen = skb->dev->hard_header_len;
4405 u32 hash = skb_get_hash_raw(skb);
4406
4407 for (p = napi->gro_list; p; p = p->next) {
4408 unsigned long diffs;
4409
4410 NAPI_GRO_CB(p)->flush = 0;
4411
4412 if (hash != skb_get_hash_raw(p)) {
4413 NAPI_GRO_CB(p)->same_flow = 0;
4414 continue;
4415 }
4416
4417 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4418 diffs |= p->vlan_tci ^ skb->vlan_tci;
4419 diffs |= skb_metadata_dst_cmp(p, skb);
4420 if (maclen == ETH_HLEN)
4421 diffs |= compare_ether_header(skb_mac_header(p),
4422 skb_mac_header(skb));
4423 else if (!diffs)
4424 diffs = memcmp(skb_mac_header(p),
4425 skb_mac_header(skb),
4426 maclen);
4427 NAPI_GRO_CB(p)->same_flow = !diffs;
4428 }
4429}
4430
4431static void skb_gro_reset_offset(struct sk_buff *skb)
4432{
4433 const struct skb_shared_info *pinfo = skb_shinfo(skb);
4434 const skb_frag_t *frag0 = &pinfo->frags[0];
4435
4436 NAPI_GRO_CB(skb)->data_offset = 0;
4437 NAPI_GRO_CB(skb)->frag0 = NULL;
4438 NAPI_GRO_CB(skb)->frag0_len = 0;
4439
4440 if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4441 pinfo->nr_frags &&
4442 !PageHighMem(skb_frag_page(frag0))) {
4443 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4444 NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
4445 skb_frag_size(frag0),
4446 skb->end - skb->tail);
4447 }
4448}
4449
4450static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4451{
4452 struct skb_shared_info *pinfo = skb_shinfo(skb);
4453
4454 BUG_ON(skb->end - skb->tail < grow);
4455
4456 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4457
4458 skb->data_len -= grow;
4459 skb->tail += grow;
4460
4461 pinfo->frags[0].page_offset += grow;
4462 skb_frag_size_sub(&pinfo->frags[0], grow);
4463
4464 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4465 skb_frag_unref(skb, 0);
4466 memmove(pinfo->frags, pinfo->frags + 1,
4467 --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4468 }
4469}
4470
4471static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4472{
4473 struct sk_buff **pp = NULL;
4474 struct packet_offload *ptype;
4475 __be16 type = skb->protocol;
4476 struct list_head *head = &offload_base;
4477 int same_flow;
4478 enum gro_result ret;
4479 int grow;
4480
4481 if (!(skb->dev->features & NETIF_F_GRO))
4482 goto normal;
4483
4484 if (skb->csum_bad)
4485 goto normal;
4486
4487 gro_list_prepare(napi, skb);
4488
4489 rcu_read_lock();
4490 list_for_each_entry_rcu(ptype, head, list) {
4491 if (ptype->type != type || !ptype->callbacks.gro_receive)
4492 continue;
4493
4494 skb_set_network_header(skb, skb_gro_offset(skb));
4495 skb_reset_mac_len(skb);
4496 NAPI_GRO_CB(skb)->same_flow = 0;
4497 NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
4498 NAPI_GRO_CB(skb)->free = 0;
4499 NAPI_GRO_CB(skb)->encap_mark = 0;
4500 NAPI_GRO_CB(skb)->recursion_counter = 0;
4501 NAPI_GRO_CB(skb)->is_fou = 0;
4502 NAPI_GRO_CB(skb)->is_atomic = 1;
4503 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4504
4505 /* Setup for GRO checksum validation */
4506 switch (skb->ip_summed) {
4507 case CHECKSUM_COMPLETE:
4508 NAPI_GRO_CB(skb)->csum = skb->csum;
4509 NAPI_GRO_CB(skb)->csum_valid = 1;
4510 NAPI_GRO_CB(skb)->csum_cnt = 0;
4511 break;
4512 case CHECKSUM_UNNECESSARY:
4513 NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4514 NAPI_GRO_CB(skb)->csum_valid = 0;
4515 break;
4516 default:
4517 NAPI_GRO_CB(skb)->csum_cnt = 0;
4518 NAPI_GRO_CB(skb)->csum_valid = 0;
4519 }
4520
4521 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4522 break;
4523 }
4524 rcu_read_unlock();
4525
4526 if (&ptype->list == head)
4527 goto normal;
4528
4529 same_flow = NAPI_GRO_CB(skb)->same_flow;
4530 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4531
4532 if (pp) {
4533 struct sk_buff *nskb = *pp;
4534
4535 *pp = nskb->next;
4536 nskb->next = NULL;
4537 napi_gro_complete(nskb);
4538 napi->gro_count--;
4539 }
4540
4541 if (same_flow)
4542 goto ok;
4543
4544 if (NAPI_GRO_CB(skb)->flush)
4545 goto normal;
4546
4547 if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4548 struct sk_buff *nskb = napi->gro_list;
4549
4550 /* locate the end of the list to select the 'oldest' flow */
4551 while (nskb->next) {
4552 pp = &nskb->next;
4553 nskb = *pp;
4554 }
4555 *pp = NULL;
4556 nskb->next = NULL;
4557 napi_gro_complete(nskb);
4558 } else {
4559 napi->gro_count++;
4560 }
4561 NAPI_GRO_CB(skb)->count = 1;
4562 NAPI_GRO_CB(skb)->age = jiffies;
4563 NAPI_GRO_CB(skb)->last = skb;
4564 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4565 skb->next = napi->gro_list;
4566 napi->gro_list = skb;
4567 ret = GRO_HELD;
4568
4569pull:
4570 grow = skb_gro_offset(skb) - skb_headlen(skb);
4571 if (grow > 0)
4572 gro_pull_from_frag0(skb, grow);
4573ok:
4574 return ret;
4575
4576normal:
4577 ret = GRO_NORMAL;
4578 goto pull;
4579}
4580
4581struct packet_offload *gro_find_receive_by_type(__be16 type)
4582{
4583 struct list_head *offload_head = &offload_base;
4584 struct packet_offload *ptype;
4585
4586 list_for_each_entry_rcu(ptype, offload_head, list) {
4587 if (ptype->type != type || !ptype->callbacks.gro_receive)
4588 continue;
4589 return ptype;
4590 }
4591 return NULL;
4592}
4593EXPORT_SYMBOL(gro_find_receive_by_type);
4594
4595struct packet_offload *gro_find_complete_by_type(__be16 type)
4596{
4597 struct list_head *offload_head = &offload_base;
4598 struct packet_offload *ptype;
4599
4600 list_for_each_entry_rcu(ptype, offload_head, list) {
4601 if (ptype->type != type || !ptype->callbacks.gro_complete)
4602 continue;
4603 return ptype;
4604 }
4605 return NULL;
4606}
4607EXPORT_SYMBOL(gro_find_complete_by_type);
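/*
 * Editor's example -- an illustrative sketch, not part of the original
 * file, hence guarded out.  Encapsulation GRO handlers (geneve/vxlan
 * style) use the lookup helpers above to chain into the inner
 * protocol's offload; the function below is hypothetical.
 */
#if 0
static struct sk_buff **example_encap_gro_receive(struct sk_buff **head,
						  struct sk_buff *skb,
						  __be16 inner_type,
						  unsigned int hdr_len)
{
	struct sk_buff **pp = NULL;
	struct packet_offload *ptype;

	rcu_read_lock();
	ptype = gro_find_receive_by_type(inner_type);
	if (ptype) {
		skb_gro_pull(skb, hdr_len);	/* skip the encap header */
		pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb);
	}
	rcu_read_unlock();
	return pp;
}
#endif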
4608
4609static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4610{
4611 switch (ret) {
4612 case GRO_NORMAL:
4613 if (netif_receive_skb_internal(skb))
4614 ret = GRO_DROP;
4615 break;
4616
4617 case GRO_DROP:
4618 kfree_skb(skb);
4619 break;
4620
4621 case GRO_MERGED_FREE:
4622 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4623 skb_dst_drop(skb);
4624 kmem_cache_free(skbuff_head_cache, skb);
4625 } else {
4626 __kfree_skb(skb);
4627 }
4628 break;
4629
4630 case GRO_HELD:
4631 case GRO_MERGED:
4632 break;
4633 }
4634
4635 return ret;
4636}
4637
4638gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4639{
4640 skb_mark_napi_id(skb, napi);
4641 trace_napi_gro_receive_entry(skb);
4642
4643 skb_gro_reset_offset(skb);
4644
4645 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4646}
4647EXPORT_SYMBOL(napi_gro_receive);
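/*
 * Editor's example -- an illustrative sketch, not part of the original
 * file, hence guarded out.  The GRO variant of the poll handler
 * sketched after netif_receive_skb() above: the only change is the
 * delivery call, letting dev_gro_receive() merge consecutive segments
 * of the same flow before they hit the stack.
 */
#if 0
static struct sk_buff *example_next_rx_skb(struct net_device *dev);	/* hypothetical */

static int example_gro_poll(struct napi_struct *napi, int budget)
{
	int done = 0;
	struct sk_buff *skb;

	while (done < budget && (skb = example_next_rx_skb(napi->dev))) {
		skb->protocol = eth_type_trans(skb, napi->dev);
		napi_gro_receive(napi, skb);	/* may merge, hold or deliver */
		done++;
	}
	if (done < budget)
		napi_complete_done(napi, done);
	return done;
}
#endif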
4648
4649static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4650{
4651 if (unlikely(skb->pfmemalloc)) {
4652 consume_skb(skb);
4653 return;
4654 }
4655 __skb_pull(skb, skb_headlen(skb));
4656 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4657 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4658 skb->vlan_tci = 0;
4659 skb->dev = napi->dev;
4660 skb->skb_iif = 0;
4661 skb->encapsulation = 0;
4662 skb_shinfo(skb)->gso_type = 0;
4663 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4664
4665 napi->skb = skb;
4666}
4667
4668struct sk_buff *napi_get_frags(struct napi_struct *napi)
4669{
4670 struct sk_buff *skb = napi->skb;
4671
4672 if (!skb) {
4673 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4674 if (skb) {
4675 napi->skb = skb;
4676 skb_mark_napi_id(skb, napi);
4677 }
4678 }
4679 return skb;
4680}
4681EXPORT_SYMBOL(napi_get_frags);
4682
4683static gro_result_t napi_frags_finish(struct napi_struct *napi,
4684 struct sk_buff *skb,
4685 gro_result_t ret)
4686{
4687 switch (ret) {
4688 case GRO_NORMAL:
4689 case GRO_HELD:
4690 __skb_push(skb, ETH_HLEN);
4691 skb->protocol = eth_type_trans(skb, skb->dev);
4692 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4693 ret = GRO_DROP;
4694 break;
4695
4696 case GRO_DROP:
4697 case GRO_MERGED_FREE:
4698 napi_reuse_skb(napi, skb);
4699 break;
4700
4701 case GRO_MERGED:
4702 break;
4703 }
4704
4705 return ret;
4706}
4707
4708 /* The upper GRO stack assumes the network header starts at gro_offset=0.
4709 * Drivers could call both napi_gro_frags() and napi_gro_receive(), so
4710 * we copy the ethernet header into skb->data to have a common layout.
4711 */
4712static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4713{
4714 struct sk_buff *skb = napi->skb;
4715 const struct ethhdr *eth;
4716 unsigned int hlen = sizeof(*eth);
4717
4718 napi->skb = NULL;
4719
4720 skb_reset_mac_header(skb);
4721 skb_gro_reset_offset(skb);
4722
4723 eth = skb_gro_header_fast(skb, 0);
4724 if (unlikely(skb_gro_header_hard(skb, hlen))) {
4725 eth = skb_gro_header_slow(skb, hlen, 0);
4726 if (unlikely(!eth)) {
4727 net_warn_ratelimited("%s: dropping impossible skb from %s\n",
4728 __func__, napi->dev->name);
4729 napi_reuse_skb(napi, skb);
4730 return NULL;
4731 }
4732 } else {
4733 gro_pull_from_frag0(skb, hlen);
4734 NAPI_GRO_CB(skb)->frag0 += hlen;
4735 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4736 }
4737 __skb_pull(skb, hlen);
4738
4739 /*
4740 * This works because the only protocols we care about don't require
4741 * special handling.
4742 * We'll fix it up properly in napi_frags_finish()
4743 */
4744 skb->protocol = eth->h_proto;
4745
4746 return skb;
4747}
4748
4749gro_result_t napi_gro_frags(struct napi_struct *napi)
4750{
4751 struct sk_buff *skb = napi_frags_skb(napi);
4752
4753 if (!skb)
4754 return GRO_DROP;
4755
4756 trace_napi_gro_frags_entry(skb);
4757
4758 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4759}
4760EXPORT_SYMBOL(napi_gro_frags);
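/*
 * Editor's example -- an illustrative sketch, not part of the original
 * file, hence guarded out.  Frag-based receive as done by mlx4-style
 * drivers: page fragments are attached to the skb recycled by
 * napi_get_frags(), and napi_frags_skb() later parses the ethernet
 * header out of frag0 itself.
 */
#if 0
static void example_rx_frags(struct napi_struct *napi, struct page *page,
			     unsigned int offset, unsigned int len)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (unlikely(!skb))
		return;				/* drop: allocation failed */

	skb_fill_page_desc(skb, 0, page, offset, len);
	skb->len += len;
	skb->data_len += len;
	skb->truesize += PAGE_SIZE;

	napi_gro_frags(napi);	/* consumes or recycles napi->skb */
}
#endif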
4761
4762/* Compute the checksum from gro_offset and return the folded value
4763 * after adding in any pseudo checksum.
4764 */
4765__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4766{
4767 __wsum wsum;
4768 __sum16 sum;
4769
4770 wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4771
4772 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4773 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4774 if (likely(!sum)) {
4775 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4776 !skb->csum_complete_sw)
4777 netdev_rx_csum_fault(skb->dev);
4778 }
4779
4780 NAPI_GRO_CB(skb)->csum = wsum;
4781 NAPI_GRO_CB(skb)->csum_valid = 1;
4782
4783 return sum;
4784}
4785EXPORT_SYMBOL(__skb_gro_checksum_complete);
4786
4787/*
4788 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
4789 * Note: called with local irq disabled, but exits with local irq enabled.
4790 */
4791static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4792{
4793#ifdef CONFIG_RPS
4794 struct softnet_data *remsd = sd->rps_ipi_list;
4795
4796 if (remsd) {
4797 sd->rps_ipi_list = NULL;
4798
4799 local_irq_enable();
4800
4801 /* Send pending IPIs to kick RPS processing on remote CPUs. */
4802 while (remsd) {
4803 struct softnet_data *next = remsd->rps_ipi_next;
4804
4805 if (cpu_online(remsd->cpu))
4806 smp_call_function_single_async(remsd->cpu,
4807 &remsd->csd);
4808 remsd = next;
4809 }
4810 } else
4811#endif
4812 local_irq_enable();
4813}
4814
4815static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4816{
4817#ifdef CONFIG_RPS
4818 return sd->rps_ipi_list != NULL;
4819#else
4820 return false;
4821#endif
4822}
4823
4824static int process_backlog(struct napi_struct *napi, int quota)
4825{
4826 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4827 bool again = true;
4828 int work = 0;
4829
4830 /* Check if we have pending IPIs; it's better to send them now
4831 * than to wait for net_rx_action() to end.
4832 */
4833 if (sd_has_rps_ipi_waiting(sd)) {
4834 local_irq_disable();
4835 net_rps_action_and_irq_enable(sd);
4836 }
4837
4838 napi->weight = weight_p;
4839 while (again) {
4840 struct sk_buff *skb;
4841
4842 while ((skb = __skb_dequeue(&sd->process_queue))) {
4843 rcu_read_lock();
4844 __netif_receive_skb(skb);
4845 rcu_read_unlock();
4846 input_queue_head_incr(sd);
4847 if (++work >= quota)
4848 return work;
4849
4850 }
4851
4852 local_irq_disable();
4853 rps_lock(sd);
4854 if (skb_queue_empty(&sd->input_pkt_queue)) {
4855 /*
4856 * Inline a custom version of __napi_complete().
4857 * Only the current cpu owns and manipulates this napi,
4858 * and NAPI_STATE_SCHED is the only possible flag set
4859 * on the backlog.
4860 * We can use a plain write instead of clear_bit(),
4861 * and we don't need an smp_mb() memory barrier.
4862 */
4863 napi->state = 0;
4864 again = false;
4865 } else {
4866 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4867 &sd->process_queue);
4868 }
4869 rps_unlock(sd);
4870 local_irq_enable();
4871 }
4872
4873 return work;
4874}
4875
4876/**
4877 * __napi_schedule - schedule for receive
4878 * @n: entry to schedule
4879 *
4880 * The entry's receive function will be scheduled to run.
4881 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4882 */
4883void __napi_schedule(struct napi_struct *n)
4884{
4885 unsigned long flags;
4886
4887 local_irq_save(flags);
4888 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4889 local_irq_restore(flags);
4890}
4891EXPORT_SYMBOL(__napi_schedule);
4892
4893/**
4894 * __napi_schedule_irqoff - schedule for receive
4895 * @n: entry to schedule
4896 *
4897 * Variant of __napi_schedule() assuming hard irqs are masked
4898 */
4899void __napi_schedule_irqoff(struct napi_struct *n)
4900{
4901 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4902}
4903EXPORT_SYMBOL(__napi_schedule_irqoff);
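/*
 * Editor's example -- an illustrative sketch, not part of the original
 * file, hence guarded out.  The canonical interrupt-handler half of
 * NAPI: mask the device rx interrupt and defer the work to softirq.
 * struct example_priv and example_mask_rx_irq() are hypothetical.
 */
#if 0
static irqreturn_t example_isr(int irq, void *data)
{
	struct example_priv *priv = data;

	if (napi_schedule_prep(&priv->napi)) {
		example_mask_rx_irq(priv);
		/* hard irqs are already masked in an irq handler */
		__napi_schedule_irqoff(&priv->napi);
	}
	return IRQ_HANDLED;
}
#endif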
4904
4905bool __napi_complete(struct napi_struct *n)
4906{
4907 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4908
4909 /* Some drivers call us directly, instead of calling
4910 * napi_complete_done().
4911 */
4912 if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state)))
4913 return false;
4914
4915 list_del_init(&n->poll_list);
4916 smp_mb__before_atomic();
4917 clear_bit(NAPI_STATE_SCHED, &n->state);
4918 return true;
4919}
4920EXPORT_SYMBOL(__napi_complete);
4921
4922bool napi_complete_done(struct napi_struct *n, int work_done)
4923{
4924 unsigned long flags;
4925
4926 /*
4927 * 1) Don't let napi dequeue from the cpu poll list
4928 * just in case it's running on a different cpu.
4929 * 2) If we are busy polling, do nothing here, we have
4930 * the guarantee we will be called later.
4931 */
4932 if (unlikely(n->state & (NAPIF_STATE_NPSVC |
4933 NAPIF_STATE_IN_BUSY_POLL)))
4934 return false;
4935
4936 if (n->gro_list) {
4937 unsigned long timeout = 0;
4938
4939 if (work_done)
4940 timeout = n->dev->gro_flush_timeout;
4941
4942 if (timeout)
4943 hrtimer_start(&n->timer, ns_to_ktime(timeout),
4944 HRTIMER_MODE_REL_PINNED);
4945 else
4946 napi_gro_flush(n, false);
4947 }
4948 if (likely(list_empty(&n->poll_list))) {
4949 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4950 } else {
4951 /* If n->poll_list is not empty, we need to mask irqs */
4952 local_irq_save(flags);
4953 __napi_complete(n);
4954 local_irq_restore(flags);
4955 }
4956 return true;
4957}
4958EXPORT_SYMBOL(napi_complete_done);
4959
4960 /* must be called under rcu_read_lock(), as we don't take a reference */
4961static struct napi_struct *napi_by_id(unsigned int napi_id)
4962{
4963 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4964 struct napi_struct *napi;
4965
4966 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4967 if (napi->napi_id == napi_id)
4968 return napi;
4969
4970 return NULL;
4971}
4972
4973#if defined(CONFIG_NET_RX_BUSY_POLL)
4974
4975#define BUSY_POLL_BUDGET 8
4976
4977static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
4978{
4979 int rc;
4980
4981 clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
4982
4983 local_bh_disable();
4984
4985 /* All we really want here is to re-enable device interrupts.
4986 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
4987 */
4988 rc = napi->poll(napi, BUSY_POLL_BUDGET);
4989 netpoll_poll_unlock(have_poll_lock);
4990 if (rc == BUSY_POLL_BUDGET)
4991 __napi_schedule(napi);
4992 local_bh_enable();
4993 if (local_softirq_pending())
4994 do_softirq();
4995}
4996
4997bool sk_busy_loop(struct sock *sk, int nonblock)
4998{
4999 unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
5000 int (*napi_poll)(struct napi_struct *napi, int budget);
5001 int (*busy_poll)(struct napi_struct *dev);
5002 void *have_poll_lock = NULL;
5003 struct napi_struct *napi;
5004 int rc;
5005
5006restart:
5007 rc = false;
5008 napi_poll = NULL;
5009
5010 rcu_read_lock();
5011
5012 napi = napi_by_id(sk->sk_napi_id);
5013 if (!napi)
5014 goto out;
5015
5016 /* Note: ndo_busy_poll method is optional in linux-4.5 */
5017 busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
5018
5019 preempt_disable();
5020 for (;;) {
5021 rc = 0;
5022 local_bh_disable();
5023 if (busy_poll) {
5024 rc = busy_poll(napi);
5025 goto count;
5026 }
5027 if (!napi_poll) {
5028 unsigned long val = READ_ONCE(napi->state);
5029
5030 /* If multiple threads are competing for this napi,
5031 * we avoid dirtying napi->state as much as we can.
5032 */
5033 if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
5034 NAPIF_STATE_IN_BUSY_POLL))
5035 goto count;
5036 if (cmpxchg(&napi->state, val,
5037 val | NAPIF_STATE_IN_BUSY_POLL |
5038 NAPIF_STATE_SCHED) != val)
5039 goto count;
5040 have_poll_lock = netpoll_poll_lock(napi);
5041 napi_poll = napi->poll;
5042 }
5043 rc = napi_poll(napi, BUSY_POLL_BUDGET);
5044 trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
5045count:
5046 if (rc > 0)
5047 __NET_ADD_STATS(sock_net(sk),
5048 LINUX_MIB_BUSYPOLLRXPACKETS, rc);
5049 local_bh_enable();
5050
5051 if (rc == LL_FLUSH_FAILED)
5052 break; /* permanent failure */
5053
5054 if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
5055 busy_loop_timeout(end_time))
5056 break;
5057
5058 if (unlikely(need_resched())) {
5059 if (napi_poll)
5060 busy_poll_stop(napi, have_poll_lock);
5061 preempt_enable();
5062 rcu_read_unlock();
5063 cond_resched();
5064 rc = !skb_queue_empty(&sk->sk_receive_queue);
5065 if (rc || busy_loop_timeout(end_time))
5066 return rc;
5067 goto restart;
5068 }
5069 cpu_relax();
5070 }
5071 if (napi_poll)
5072 busy_poll_stop(napi, have_poll_lock);
5073 preempt_enable();
5074 rc = !skb_queue_empty(&sk->sk_receive_queue);
5075out:
5076 rcu_read_unlock();
5077 return rc;
5078}
5079EXPORT_SYMBOL(sk_busy_loop);
5080
5081#endif /* CONFIG_NET_RX_BUSY_POLL */
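/*
 * Editor's example -- an illustrative userspace sketch, not part of the
 * original file, hence guarded out.  sk_busy_loop() above is driven by
 * the SO_BUSY_POLL socket option: a blocking receive will busy-poll the
 * device for up to the given number of microseconds before sleeping.
 */
#if 0
#include <sys/socket.h>

static int example_enable_busy_poll(int fd)
{
	int usecs = 50;	/* 50us of busy polling per blocking receive */

	return setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, &usecs, sizeof(usecs));
}
#endif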
5082
5083static void napi_hash_add(struct napi_struct *napi)
5084{
5085 if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5086 test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
5087 return;
5088
5089 spin_lock(&napi_hash_lock);
5090
5091 /* 0..NR_CPUS+1 range is reserved for sender_cpu use */
5092 do {
5093 if (unlikely(++napi_gen_id < NR_CPUS + 1))
5094 napi_gen_id = NR_CPUS + 1;
5095 } while (napi_by_id(napi_gen_id));
5096 napi->napi_id = napi_gen_id;
5097
5098 hlist_add_head_rcu(&napi->napi_hash_node,
5099 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5100
5101 spin_unlock(&napi_hash_lock);
5102}
5103
5104 /* Warning: the caller is responsible for making sure an RCU grace
5105 * period has elapsed before freeing the memory containing @napi.
5106 */
5107bool napi_hash_del(struct napi_struct *napi)
5108{
5109 bool rcu_sync_needed = false;
5110
5111 spin_lock(&napi_hash_lock);
5112
5113 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5114 rcu_sync_needed = true;
5115 hlist_del_rcu(&napi->napi_hash_node);
5116 }
5117 spin_unlock(&napi_hash_lock);
5118 return rcu_sync_needed;
5119}
5120EXPORT_SYMBOL_GPL(napi_hash_del);
5121
5122static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5123{
5124 struct napi_struct *napi;
5125
5126 napi = container_of(timer, struct napi_struct, timer);
5127 if (napi->gro_list)
5128 napi_schedule(napi);
5129
5130 return HRTIMER_NORESTART;
5131}
5132
5133void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5134 int (*poll)(struct napi_struct *, int), int weight)
5135{
5136 INIT_LIST_HEAD(&napi->poll_list);
5137 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5138 napi->timer.function = napi_watchdog;
5139 napi->gro_count = 0;
5140 napi->gro_list = NULL;
5141 napi->skb = NULL;
5142 napi->poll = poll;
5143 if (weight > NAPI_POLL_WEIGHT)
5144 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5145 weight, dev->name);
5146 napi->weight = weight;
5147 list_add(&napi->dev_list, &dev->napi_list);
5148 napi->dev = dev;
5149#ifdef CONFIG_NETPOLL
5150 napi->poll_owner = -1;
5151#endif
5152 set_bit(NAPI_STATE_SCHED, &napi->state);
5153 napi_hash_add(napi);
5154}
5155EXPORT_SYMBOL(netif_napi_add);
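/*
 * Editor's example -- an illustrative sketch, not part of the original
 * file, hence guarded out.  Driver-side NAPI setup: register the poll
 * handler at probe time, enable it in ndo_open.  struct example_priv,
 * example_poll() and example_unmask_rx_irq() are hypothetical.
 */
#if 0
static void example_setup_napi(struct net_device *dev,
			       struct example_priv *priv)
{
	netif_napi_add(dev, &priv->napi, example_poll, NAPI_POLL_WEIGHT);
}

static int example_open(struct net_device *dev)
{
	struct example_priv *priv = netdev_priv(dev);

	napi_enable(&priv->napi);
	example_unmask_rx_irq(priv);
	return 0;
}
#endif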
5156
5157void napi_disable(struct napi_struct *n)
5158{
5159 might_sleep();
5160 set_bit(NAPI_STATE_DISABLE, &n->state);
5161
5162 while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5163 msleep(1);
5164 while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5165 msleep(1);
5166
5167 hrtimer_cancel(&n->timer);
5168
5169 clear_bit(NAPI_STATE_DISABLE, &n->state);
5170}
5171EXPORT_SYMBOL(napi_disable);
5172
5173/* Must be called in process context */
5174void netif_napi_del(struct napi_struct *napi)
5175{
5176 might_sleep();
5177 if (napi_hash_del(napi))
5178 synchronize_net();
5179 list_del_init(&napi->dev_list);
5180 napi_free_frags(napi);
5181
5182 kfree_skb_list(napi->gro_list);
5183 napi->gro_list = NULL;
5184 napi->gro_count = 0;
5185}
5186EXPORT_SYMBOL(netif_napi_del);
5187
5188static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5189{
5190 void *have;
5191 int work, weight;
5192
5193 list_del_init(&n->poll_list);
5194
5195 have = netpoll_poll_lock(n);
5196
5197 weight = n->weight;
5198
5199 /* This NAPI_STATE_SCHED test is for avoiding a race
5200 * with netpoll's poll_napi(). Only the entity which
5201 * obtains the lock and sees NAPI_STATE_SCHED set will
5202 * actually make the ->poll() call. Therefore we avoid
5203 * accidentally calling ->poll() when NAPI is not scheduled.
5204 */
5205 work = 0;
5206 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5207 work = n->poll(n, weight);
5208 trace_napi_poll(n, work, weight);
5209 }
5210
5211 WARN_ON_ONCE(work > weight);
5212
5213 if (likely(work < weight))
5214 goto out_unlock;
5215
5216 /* Drivers must not modify the NAPI state if they
5217 * consume the entire weight. In such cases this code
5218 * still "owns" the NAPI instance and therefore can
5219 * move the instance around on the list at-will.
5220 */
5221 if (unlikely(napi_disable_pending(n))) {
5222 napi_complete(n);
5223 goto out_unlock;
5224 }
5225
5226 if (n->gro_list) {
5227 /* Flush packets that are too old.
5228 * If HZ < 1000, flush all packets.
5229 */
5230 napi_gro_flush(n, HZ >= 1000);
5231 }
5232
5233 /* Some drivers may have called napi_schedule
5234 * prior to exhausting their budget.
5235 */
5236 if (unlikely(!list_empty(&n->poll_list))) {
5237 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5238 n->dev ? n->dev->name : "backlog");
5239 goto out_unlock;
5240 }
5241
5242 list_add_tail(&n->poll_list, repoll);
5243
5244out_unlock:
5245 netpoll_poll_unlock(have);
5246
5247 return work;
5248}
5249
5250static __latent_entropy void net_rx_action(struct softirq_action *h)
5251{
5252 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5253 unsigned long time_limit = jiffies + 2;
5254 int budget = netdev_budget;
5255 LIST_HEAD(list);
5256 LIST_HEAD(repoll);
5257
5258 local_irq_disable();
5259 list_splice_init(&sd->poll_list, &list);
5260 local_irq_enable();
5261
5262 for (;;) {
5263 struct napi_struct *n;
5264
5265 if (list_empty(&list)) {
5266 if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5267 goto out;
5268 break;
5269 }
5270
5271 n = list_first_entry(&list, struct napi_struct, poll_list);
5272 budget -= napi_poll(n, &repoll);
5273
5274 /* If the softirq window is exhausted then punt.
5275 * Allow this to run for 2 jiffies, which allows
5276 * an average latency of 1.5/HZ.
5277 */
5278 if (unlikely(budget <= 0 ||
5279 time_after_eq(jiffies, time_limit))) {
5280 sd->time_squeeze++;
5281 break;
5282 }
5283 }
5284
5285 local_irq_disable();
5286
5287 list_splice_tail_init(&sd->poll_list, &list);
5288 list_splice_tail(&repoll, &list);
5289 list_splice(&list, &sd->poll_list);
5290 if (!list_empty(&sd->poll_list))
5291 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
5292
5293 net_rps_action_and_irq_enable(sd);
5294out:
5295 __kfree_skb_flush();
5296}
5297
5298struct netdev_adjacent {
5299 struct net_device *dev;
5300
5301 /* upper master flag; there can only be one master device per list */
5302 bool master;
5303
5304 /* counter for the number of times this device was added to us */
5305 u16 ref_nr;
5306
5307 /* private field for the users */
5308 void *private;
5309
5310 struct list_head list;
5311 struct rcu_head rcu;
5312};
5313
5314static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5315 struct list_head *adj_list)
5316{
5317 struct netdev_adjacent *adj;
5318
5319 list_for_each_entry(adj, adj_list, list) {
5320 if (adj->dev == adj_dev)
5321 return adj;
5322 }
5323 return NULL;
5324}
5325
5326static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
5327{
5328 struct net_device *dev = data;
5329
5330 return upper_dev == dev;
5331}
5332
5333/**
5334 * netdev_has_upper_dev - Check if device is linked to an upper device
5335 * @dev: device
5336 * @upper_dev: upper device to check
5337 *
5338 * Find out if a device is linked to the specified upper device and return
5339 * true in case it is. Note that this checks only the immediate upper device,
5340 * not through a complete stack of devices. The caller must hold the RTNL lock.
5341 */
5342bool netdev_has_upper_dev(struct net_device *dev,
5343 struct net_device *upper_dev)
5344{
5345 ASSERT_RTNL();
5346
5347 return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5348 upper_dev);
5349}
5350EXPORT_SYMBOL(netdev_has_upper_dev);
5351
5352/**
5353 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
5354 * @dev: device
5355 * @upper_dev: upper device to check
5356 *
5357 * Find out if a device is linked to the specified upper device and return
5358 * true in case it is. Note that this checks the entire upper device chain.
5359 * The caller must hold the RCU read lock.
5360 */
5361
5362bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
5363 struct net_device *upper_dev)
5364{
5365 return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5366 upper_dev);
5367}
5368EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
5369
5370/**
5371 * netdev_has_any_upper_dev - Check if device is linked to some device
5372 * @dev: device
5373 *
5374 * Find out if a device is linked to an upper device and return true in case
5375 * it is. The caller must hold the RTNL lock.
5376 */
5377static bool netdev_has_any_upper_dev(struct net_device *dev)
5378{
5379 ASSERT_RTNL();
5380
5381 return !list_empty(&dev->adj_list.upper);
5382}
5383
5384/**
5385 * netdev_master_upper_dev_get - Get master upper device
5386 * @dev: device
5387 *
5388 * Find a master upper device and return pointer to it or NULL in case
5389 * it's not there. The caller must hold the RTNL lock.
5390 */
5391struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5392{
5393 struct netdev_adjacent *upper;
5394
5395 ASSERT_RTNL();
5396
5397 if (list_empty(&dev->adj_list.upper))
5398 return NULL;
5399
5400 upper = list_first_entry(&dev->adj_list.upper,
5401 struct netdev_adjacent, list);
5402 if (likely(upper->master))
5403 return upper->dev;
5404 return NULL;
5405}
5406EXPORT_SYMBOL(netdev_master_upper_dev_get);
5407
5408/**
5409 * netdev_has_any_lower_dev - Check if device is linked to some device
5410 * @dev: device
5411 *
5412 * Find out if a device is linked to a lower device and return true in case
5413 * it is. The caller must hold the RTNL lock.
5414 */
5415static bool netdev_has_any_lower_dev(struct net_device *dev)
5416{
5417 ASSERT_RTNL();
5418
5419 return !list_empty(&dev->adj_list.lower);
5420}
5421
5422void *netdev_adjacent_get_private(struct list_head *adj_list)
5423{
5424 struct netdev_adjacent *adj;
5425
5426 adj = list_entry(adj_list, struct netdev_adjacent, list);
5427
5428 return adj->private;
5429}
5430EXPORT_SYMBOL(netdev_adjacent_get_private);
5431
5432/**
5433 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5434 * @dev: device
5435 * @iter: list_head ** of the current position
5436 *
5437 * Gets the next device from the dev's upper list, starting from iter
5438 * position. The caller must hold RCU read lock.
5439 */
5440struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5441 struct list_head **iter)
5442{
5443 struct netdev_adjacent *upper;
5444
5445 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5446
5447 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5448
5449 if (&upper->list == &dev->adj_list.upper)
5450 return NULL;
5451
5452 *iter = &upper->list;
5453
5454 return upper->dev;
5455}
5456EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5457
5458static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
5459 struct list_head **iter)
5460{
5461 struct netdev_adjacent *upper;
5462
5463 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5464
5465 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5466
5467 if (&upper->list == &dev->adj_list.upper)
5468 return NULL;
5469
5470 *iter = &upper->list;
5471
5472 return upper->dev;
5473}
5474
5475int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
5476 int (*fn)(struct net_device *dev,
5477 void *data),
5478 void *data)
5479{
5480 struct net_device *udev;
5481 struct list_head *iter;
5482 int ret;
5483
5484 for (iter = &dev->adj_list.upper,
5485 udev = netdev_next_upper_dev_rcu(dev, &iter);
5486 udev;
5487 udev = netdev_next_upper_dev_rcu(dev, &iter)) {
5488 /* first is the upper device itself */
5489 ret = fn(udev, data);
5490 if (ret)
5491 return ret;
5492
5493 /* then look at all of its upper devices */
5494 ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
5495 if (ret)
5496 return ret;
5497 }
5498
5499 return 0;
5500}
5501EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
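/*
 * Editor's example -- an illustrative sketch, not part of the original
 * file, hence guarded out.  A walker callback is invoked once per
 * visit; a non-zero return stops the walk and is propagated.  Note
 * that shared uppers in a complex topology may be visited more than
 * once, so this counts visits rather than unique devices.
 */
#if 0
static int example_count_upper(struct net_device *upper, void *data)
{
	int *count = data;

	(*count)++;
	return 0;
}

static int example_upper_visits(struct net_device *dev)
{
	int count = 0;

	rcu_read_lock();
	netdev_walk_all_upper_dev_rcu(dev, example_count_upper, &count);
	rcu_read_unlock();
	return count;
}
#endif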
5502
5503/**
5504 * netdev_lower_get_next_private - Get the next ->private from the
5505 * lower neighbour list
5506 * @dev: device
5507 * @iter: list_head ** of the current position
5508 *
5509 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5510 * list, starting from iter position. The caller must either hold the
5511 * RTNL lock or use its own locking that guarantees that the neighbour lower
5512 * list will remain unchanged.
5513 */
5514void *netdev_lower_get_next_private(struct net_device *dev,
5515 struct list_head **iter)
5516{
5517 struct netdev_adjacent *lower;
5518
5519 lower = list_entry(*iter, struct netdev_adjacent, list);
5520
5521 if (&lower->list == &dev->adj_list.lower)
5522 return NULL;
5523
5524 *iter = lower->list.next;
5525
5526 return lower->private;
5527}
5528EXPORT_SYMBOL(netdev_lower_get_next_private);
5529
5530/**
5531 * netdev_lower_get_next_private_rcu - Get the next ->private from the
5532 * lower neighbour list, RCU
5533 * variant
5534 * @dev: device
5535 * @iter: list_head ** of the current position
5536 *
5537 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5538 * list, starting from iter position. The caller must hold RCU read lock.
5539 */
5540void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5541 struct list_head **iter)
5542{
5543 struct netdev_adjacent *lower;
5544
5545 WARN_ON_ONCE(!rcu_read_lock_held());
5546
5547 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5548
5549 if (&lower->list == &dev->adj_list.lower)
5550 return NULL;
5551
5552 *iter = &lower->list;
5553
5554 return lower->private;
5555}
5556EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5557
5558/**
5559 * netdev_lower_get_next - Get the next device from the lower neighbour
5560 * list
5561 * @dev: device
5562 * @iter: list_head ** of the current position
5563 *
5564 * Gets the next device from the dev's lower neighbour
5565 * list, starting from iter position. The caller must hold the RTNL lock or
5566 * its own locking that guarantees that the neighbour lower
5567 * list will remain unchanged.
5568 */
5569void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5570{
5571 struct netdev_adjacent *lower;
5572
5573 lower = list_entry(*iter, struct netdev_adjacent, list);
5574
5575 if (&lower->list == &dev->adj_list.lower)
5576 return NULL;
5577
5578 *iter = lower->list.next;
5579
5580 return lower->dev;
5581}
5582EXPORT_SYMBOL(netdev_lower_get_next);
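/*
 * Editor's example -- an illustrative sketch, not part of the original
 * file, hence guarded out.  netdev_lower_get_next() is normally used
 * through the netdev_for_each_lower_dev() iterator, under RTNL:
 */
#if 0
static void example_dump_lowers(struct net_device *dev)
{
	struct net_device *lower;
	struct list_head *iter;

	ASSERT_RTNL();
	netdev_for_each_lower_dev(dev, lower, iter)
		pr_debug("%s: lower %s\n", dev->name, lower->name);
}
#endif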
5583
5584static struct net_device *netdev_next_lower_dev(struct net_device *dev,
5585 struct list_head **iter)
5586{
5587 struct netdev_adjacent *lower;
5588
5589 lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5590
5591 if (&lower->list == &dev->adj_list.lower)
5592 return NULL;
5593
5594 *iter = &lower->list;
5595
5596 return lower->dev;
5597}
5598
5599int netdev_walk_all_lower_dev(struct net_device *dev,
5600 int (*fn)(struct net_device *dev,
5601 void *data),
5602 void *data)
5603{
5604 struct net_device *ldev;
5605 struct list_head *iter;
5606 int ret;
5607
5608 for (iter = &dev->adj_list.lower,
5609 ldev = netdev_next_lower_dev(dev, &iter);
5610 ldev;
5611 ldev = netdev_next_lower_dev(dev, &iter)) {
5612 /* first is the lower device itself */
5613 ret = fn(ldev, data);
5614 if (ret)
5615 return ret;
5616
5617 /* then look at all of its lower devices */
5618 ret = netdev_walk_all_lower_dev(ldev, fn, data);
5619 if (ret)
5620 return ret;
5621 }
5622
5623 return 0;
5624}
5625EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
5626
5627static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
5628 struct list_head **iter)
5629{
5630 struct netdev_adjacent *lower;
5631
5632 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5633 if (&lower->list == &dev->adj_list.lower)
5634 return NULL;
5635
5636 *iter = &lower->list;
5637
5638 return lower->dev;
5639}
5640
5641int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
5642 int (*fn)(struct net_device *dev,
5643 void *data),
5644 void *data)
5645{
5646 struct net_device *ldev;
5647 struct list_head *iter;
5648 int ret;
5649
5650 for (iter = &dev->adj_list.lower,
5651 ldev = netdev_next_lower_dev_rcu(dev, &iter);
5652 ldev;
5653 ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
5654 /* first is the lower device itself */
5655 ret = fn(ldev, data);
5656 if (ret)
5657 return ret;
5658
5659 /* then look at all of its lower devices */
5660 ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
5661 if (ret)
5662 return ret;
5663 }
5664
5665 return 0;
5666}
5667EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
5668
5669/**
5670 * netdev_lower_get_first_private_rcu - Get the first ->private from the
5671 * lower neighbour list, RCU
5672 * variant
5673 * @dev: device
5674 *
5675 * Gets the first netdev_adjacent->private from the dev's lower neighbour
5676 * list. The caller must hold RCU read lock.
5677 */
5678void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5679{
5680 struct netdev_adjacent *lower;
5681
5682 lower = list_first_or_null_rcu(&dev->adj_list.lower,
5683 struct netdev_adjacent, list);
5684 if (lower)
5685 return lower->private;
5686 return NULL;
5687}
5688EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5689
5690/**
5691 * netdev_master_upper_dev_get_rcu - Get master upper device
5692 * @dev: device
5693 *
5694 * Find a master upper device and return pointer to it or NULL in case
5695 * it's not there. The caller must hold the RCU read lock.
5696 */
5697struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5698{
5699 struct netdev_adjacent *upper;
5700
5701 upper = list_first_or_null_rcu(&dev->adj_list.upper,
5702 struct netdev_adjacent, list);
5703 if (upper && likely(upper->master))
5704 return upper->dev;
5705 return NULL;
5706}
5707EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5708
5709static int netdev_adjacent_sysfs_add(struct net_device *dev,
5710 struct net_device *adj_dev,
5711 struct list_head *dev_list)
5712{
5713 char linkname[IFNAMSIZ+7];
5714 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5715 "upper_%s" : "lower_%s", adj_dev->name);
5716 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5717 linkname);
5718}
5719static void netdev_adjacent_sysfs_del(struct net_device *dev,
5720 char *name,
5721 struct list_head *dev_list)
5722{
5723 char linkname[IFNAMSIZ+7];
5724 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5725 "upper_%s" : "lower_%s", name);
5726 sysfs_remove_link(&(dev->dev.kobj), linkname);
5727}
5728
5729static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5730 struct net_device *adj_dev,
5731 struct list_head *dev_list)
5732{
5733 return (dev_list == &dev->adj_list.upper ||
5734 dev_list == &dev->adj_list.lower) &&
5735 net_eq(dev_net(dev), dev_net(adj_dev));
5736}
5737
5738static int __netdev_adjacent_dev_insert(struct net_device *dev,
5739 struct net_device *adj_dev,
5740 struct list_head *dev_list,
5741 void *private, bool master)
5742{
5743 struct netdev_adjacent *adj;
5744 int ret;
5745
5746 adj = __netdev_find_adj(adj_dev, dev_list);
5747
5748 if (adj) {
5749 adj->ref_nr += 1;
5750 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
5751 dev->name, adj_dev->name, adj->ref_nr);
5752
5753 return 0;
5754 }
5755
5756 adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5757 if (!adj)
5758 return -ENOMEM;
5759
5760 adj->dev = adj_dev;
5761 adj->master = master;
5762 adj->ref_nr = 1;
5763 adj->private = private;
5764 dev_hold(adj_dev);
5765
5766 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
5767 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
5768
5769 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5770 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5771 if (ret)
5772 goto free_adj;
5773 }
5774
5775 /* Ensure that master link is always the first item in list. */
5776 if (master) {
5777 ret = sysfs_create_link(&(dev->dev.kobj),
5778 &(adj_dev->dev.kobj), "master");
5779 if (ret)
5780 goto remove_symlinks;
5781
5782 list_add_rcu(&adj->list, dev_list);
5783 } else {
5784 list_add_tail_rcu(&adj->list, dev_list);
5785 }
5786
5787 return 0;
5788
5789remove_symlinks:
5790 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5791 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5792free_adj:
5793 kfree(adj);
5794 dev_put(adj_dev);
5795
5796 return ret;
5797}
5798
5799static void __netdev_adjacent_dev_remove(struct net_device *dev,
5800 struct net_device *adj_dev,
5801 u16 ref_nr,
5802 struct list_head *dev_list)
5803{
5804 struct netdev_adjacent *adj;
5805
5806 pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
5807 dev->name, adj_dev->name, ref_nr);
5808
5809 adj = __netdev_find_adj(adj_dev, dev_list);
5810
5811 if (!adj) {
5812 pr_err("Adjacency does not exist for device %s from %s\n",
5813 dev->name, adj_dev->name);
5814 WARN_ON(1);
5815 return;
5816 }
5817
5818 if (adj->ref_nr > ref_nr) {
5819 pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
5820 dev->name, adj_dev->name, ref_nr,
5821 adj->ref_nr - ref_nr);
5822 adj->ref_nr -= ref_nr;
5823 return;
5824 }
5825
5826 if (adj->master)
5827 sysfs_remove_link(&(dev->dev.kobj), "master");
5828
5829 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5830 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5831
5832 list_del_rcu(&adj->list);
5833 pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
5834 adj_dev->name, dev->name, adj_dev->name);
5835 dev_put(adj_dev);
5836 kfree_rcu(adj, rcu);
5837}
5838
5839static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5840 struct net_device *upper_dev,
5841 struct list_head *up_list,
5842 struct list_head *down_list,
5843 void *private, bool master)
5844{
5845 int ret;
5846
5847 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
5848 private, master);
5849 if (ret)
5850 return ret;
5851
5852 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
5853 private, false);
5854 if (ret) {
5855 __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
5856 return ret;
5857 }
5858
5859 return 0;
5860}
5861
5862static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5863 struct net_device *upper_dev,
5864 u16 ref_nr,
5865 struct list_head *up_list,
5866 struct list_head *down_list)
5867{
5868 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5869 __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
5870}
5871
5872static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5873 struct net_device *upper_dev,
5874 void *private, bool master)
5875{
5876 return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5877 &dev->adj_list.upper,
5878 &upper_dev->adj_list.lower,
5879 private, master);
5880}
5881
5882static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5883 struct net_device *upper_dev)
5884{
5885 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
5886 &dev->adj_list.upper,
5887 &upper_dev->adj_list.lower);
5888}
5889
5890static int __netdev_upper_dev_link(struct net_device *dev,
5891 struct net_device *upper_dev, bool master,
5892 void *upper_priv, void *upper_info)
5893{
5894 struct netdev_notifier_changeupper_info changeupper_info;
5895 int ret = 0;
5896
5897 ASSERT_RTNL();
5898
5899 if (dev == upper_dev)
5900 return -EBUSY;
5901
5902 /* To prevent loops, check that dev is not already an upper device of upper_dev. */
5903 if (netdev_has_upper_dev(upper_dev, dev))
5904 return -EBUSY;
5905
5906 if (netdev_has_upper_dev(dev, upper_dev))
5907 return -EEXIST;
5908
5909 if (master && netdev_master_upper_dev_get(dev))
5910 return -EBUSY;
5911
5912 changeupper_info.upper_dev = upper_dev;
5913 changeupper_info.master = master;
5914 changeupper_info.linking = true;
5915 changeupper_info.upper_info = upper_info;
5916
5917 ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5918 &changeupper_info.info);
5919 ret = notifier_to_errno(ret);
5920 if (ret)
5921 return ret;
5922
5923 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
5924 master);
5925 if (ret)
5926 return ret;
5927
5928 ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5929 &changeupper_info.info);
5930 ret = notifier_to_errno(ret);
5931 if (ret)
5932 goto rollback;
5933
5934 return 0;
5935
5936rollback:
5937 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5938
5939 return ret;
5940}
5941
5942/**
5943 * netdev_upper_dev_link - Add a link to the upper device
5944 * @dev: device
5945 * @upper_dev: new upper device
5946 *
5947 * Adds a link to a device which is upper to this one. The caller must hold
5948 * the RTNL lock. On a failure a negative errno code is returned.
5949 * On success the reference counts are adjusted and the function
5950 * returns zero.
5951 */
5952int netdev_upper_dev_link(struct net_device *dev,
5953 struct net_device *upper_dev)
5954{
5955 return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
5956}
5957EXPORT_SYMBOL(netdev_upper_dev_link);
5958
5959/**
5960 * netdev_master_upper_dev_link - Add a master link to the upper device
5961 * @dev: device
5962 * @upper_dev: new upper device
5963 * @upper_priv: upper device private
5964 * @upper_info: upper info to be passed down via notifier
5965 *
5966 * Adds a link to a device which is upper to this one. In this case, only
5967 * one master upper device can be linked, although other non-master devices
5968 * might be linked as well. The caller must hold the RTNL lock.
5969 * On a failure a negative errno code is returned. On success the reference
5970 * counts are adjusted and the function returns zero.
5971 */
5972int netdev_master_upper_dev_link(struct net_device *dev,
5973 struct net_device *upper_dev,
5974 void *upper_priv, void *upper_info)
5975{
5976 return __netdev_upper_dev_link(dev, upper_dev, true,
5977 upper_priv, upper_info);
5978}
5979EXPORT_SYMBOL(netdev_master_upper_dev_link);
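/*
 * Editor's example -- an illustrative sketch, not part of the original
 * file, hence guarded out.  A bonding/team-style driver links a slave
 * below its master under RTNL and undoes it on detach; struct
 * example_slave_info is hypothetical.
 */
#if 0
static int example_attach_slave(struct net_device *master,
				struct net_device *slave,
				struct example_slave_info *info)
{
	ASSERT_RTNL();
	/* dev is the lower (slave) device, upper_dev the master */
	return netdev_master_upper_dev_link(slave, master, NULL, info);
}

static void example_detach_slave(struct net_device *master,
				 struct net_device *slave)
{
	ASSERT_RTNL();
	netdev_upper_dev_unlink(slave, master);
}
#endif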
5980
5981/**
5982 * netdev_upper_dev_unlink - Removes a link to upper device
5983 * @dev: device
5984 * @upper_dev: upper device to unlink
5985 *
5986 * Removes a link to a device which is upper to this one. The caller must hold
5987 * the RTNL lock.
5988 */
5989void netdev_upper_dev_unlink(struct net_device *dev,
5990 struct net_device *upper_dev)
5991{
5992 struct netdev_notifier_changeupper_info changeupper_info;
5993 ASSERT_RTNL();
5994
5995 changeupper_info.upper_dev = upper_dev;
5996 changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
5997 changeupper_info.linking = false;
5998
5999 call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
6000 &changeupper_info.info);
6001
6002 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
6003
6004 call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
6005 &changeupper_info.info);
6006}
6007EXPORT_SYMBOL(netdev_upper_dev_unlink);
6008
6009/**
6010 * netdev_bonding_info_change - Dispatch event about slave change
6011 * @dev: device
6012 * @bonding_info: info to dispatch
6013 *
6014 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
6015 * The caller must hold the RTNL lock.
6016 */
6017void netdev_bonding_info_change(struct net_device *dev,
6018 struct netdev_bonding_info *bonding_info)
6019{
6020 struct netdev_notifier_bonding_info info;
6021
6022 memcpy(&info.bonding_info, bonding_info,
6023 sizeof(struct netdev_bonding_info));
6024 call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
6025 &info.info);
6026}
6027EXPORT_SYMBOL(netdev_bonding_info_change);
6028
6029static void netdev_adjacent_add_links(struct net_device *dev)
6030{
6031 struct netdev_adjacent *iter;
6032
6033 struct net *net = dev_net(dev);
6034
6035 list_for_each_entry(iter, &dev->adj_list.upper, list) {
6036 if (!net_eq(net, dev_net(iter->dev)))
6037 continue;
6038 netdev_adjacent_sysfs_add(iter->dev, dev,
6039 &iter->dev->adj_list.lower);
6040 netdev_adjacent_sysfs_add(dev, iter->dev,
6041 &dev->adj_list.upper);
6042 }
6043
6044 list_for_each_entry(iter, &dev->adj_list.lower, list) {
6045 if (!net_eq(net, dev_net(iter->dev)))
6046 continue;
6047 netdev_adjacent_sysfs_add(iter->dev, dev,
6048 &iter->dev->adj_list.upper);
6049 netdev_adjacent_sysfs_add(dev, iter->dev,
6050 &dev->adj_list.lower);
6051 }
6052}
6053
6054static void netdev_adjacent_del_links(struct net_device *dev)
6055{
6056 struct netdev_adjacent *iter;
6057
6058 struct net *net = dev_net(dev);
6059
6060 list_for_each_entry(iter, &dev->adj_list.upper, list) {
6061 if (!net_eq(net, dev_net(iter->dev)))
6062 continue;
6063 netdev_adjacent_sysfs_del(iter->dev, dev->name,
6064 &iter->dev->adj_list.lower);
6065 netdev_adjacent_sysfs_del(dev, iter->dev->name,
6066 &dev->adj_list.upper);
6067 }
6068
6069 list_for_each_entry(iter, &dev->adj_list.lower, list) {
6070 if (!net_eq(net, dev_net(iter->dev)))
6071 continue;
6072 netdev_adjacent_sysfs_del(iter->dev, dev->name,
6073 &iter->dev->adj_list.upper);
6074 netdev_adjacent_sysfs_del(dev, iter->dev->name,
6075 &dev->adj_list.lower);
6076 }
6077}
6078
6079void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
6080{
6081 struct netdev_adjacent *iter;
6082
6083 struct net *net = dev_net(dev);
6084
6085 list_for_each_entry(iter, &dev->adj_list.upper, list) {
6086 if (!net_eq(net, dev_net(iter->dev)))
6087 continue;
6088 netdev_adjacent_sysfs_del(iter->dev, oldname,
6089 &iter->dev->adj_list.lower);
6090 netdev_adjacent_sysfs_add(iter->dev, dev,
6091 &iter->dev->adj_list.lower);
6092 }
6093
6094 list_for_each_entry(iter, &dev->adj_list.lower, list) {
6095 if (!net_eq(net, dev_net(iter->dev)))
6096 continue;
6097 netdev_adjacent_sysfs_del(iter->dev, oldname,
6098 &iter->dev->adj_list.upper);
6099 netdev_adjacent_sysfs_add(iter->dev, dev,
6100 &iter->dev->adj_list.upper);
6101 }
6102}
6103
6104void *netdev_lower_dev_get_private(struct net_device *dev,
6105 struct net_device *lower_dev)
6106{
6107 struct netdev_adjacent *lower;
6108
6109 if (!lower_dev)
6110 return NULL;
6111 lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
6112 if (!lower)
6113 return NULL;
6114
6115 return lower->private;
6116}
6117EXPORT_SYMBOL(netdev_lower_dev_get_private);
6118
6120int dev_get_nest_level(struct net_device *dev)
6121{
6122 struct net_device *lower = NULL;
6123 struct list_head *iter;
6124 int max_nest = -1;
6125 int nest;
6126
6127 ASSERT_RTNL();
6128
6129 netdev_for_each_lower_dev(dev, lower, iter) {
6130 nest = dev_get_nest_level(lower);
6131 if (max_nest < nest)
6132 max_nest = nest;
6133 }
6134
6135 return max_nest + 1;
6136}
6137EXPORT_SYMBOL(dev_get_nest_level);
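
/*
 * Editor's sketch (an assumption about typical usage, not from this
 * file): consumers such as stacked virtual devices feed the nest level
 * into a lockdep subclass, so that taking the same lock class at several
 * stacking depths (e.g. macvlan on a vlan) is not reported as a
 * self-deadlock.
 */
static void demo_set_addr_lock_class(struct net_device *dev,
                                     struct lock_class_key *key)
{
        lockdep_set_class_and_subclass(&dev->addr_list_lock, key,
                                       dev_get_nest_level(dev));
}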
6138
6139/**
6140 * netdev_lower_state_changed - Dispatch event about lower device state change
6141 * @lower_dev: device
6142 * @lower_state_info: state to dispatch
6143 *
6144 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
6145 * The caller must hold the RTNL lock.
6146 */
6147void netdev_lower_state_changed(struct net_device *lower_dev,
6148 void *lower_state_info)
6149{
6150 struct netdev_notifier_changelowerstate_info changelowerstate_info;
6151
6152 ASSERT_RTNL();
6153 changelowerstate_info.lower_state_info = lower_state_info;
6154 call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
6155 &changelowerstate_info.info);
6156}
6157EXPORT_SYMBOL(netdev_lower_state_changed);
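
/*
 * Editor's sketch: a hypothetical master driver propagating per-slave
 * state to interested listeners. The state struct is illustrative; the
 * kernel only passes the pointer through the notifier info.
 */
struct demo_slave_state {
        bool link_up;
        bool tx_enabled;
};

static void demo_publish_slave_state(struct net_device *slave, bool up)
{
        struct demo_slave_state state = {
                .link_up    = up,
                .tx_enabled = up,
        };

        /* RTNL is held by the caller, as the helper above asserts */
        netdev_lower_state_changed(slave, &state);
}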
6158
6159int netdev_default_l2upper_neigh_construct(struct net_device *dev,
6160 struct neighbour *n)
6161{
6162 struct net_device *lower_dev, *stop_dev;
6163 struct list_head *iter;
6164 int err;
6165
6166 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6167 if (!lower_dev->netdev_ops->ndo_neigh_construct)
6168 continue;
6169 err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
6170 if (err) {
6171 stop_dev = lower_dev;
6172 goto rollback;
6173 }
6174 }
6175 return 0;
6176
6177rollback:
6178 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6179 if (lower_dev == stop_dev)
6180 break;
6181 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6182 continue;
6183 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6184 }
6185 return err;
6186}
6187EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
6188
6189void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
6190 struct neighbour *n)
6191{
6192 struct net_device *lower_dev;
6193 struct list_head *iter;
6194
6195 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6196 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6197 continue;
6198 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6199 }
6200}
6201EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
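
/*
 * Editor's sketch: an L2 upper driver (bridge-like) can delegate
 * neighbour setup and teardown to all of its lower devices simply by
 * wiring the two defaults above into its ops table; "demo" names are
 * illustrative.
 */
static const struct net_device_ops demo_l2upper_ops = {
        .ndo_neigh_construct    = netdev_default_l2upper_neigh_construct,
        .ndo_neigh_destroy      = netdev_default_l2upper_neigh_destroy,
};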
6202
6203static void dev_change_rx_flags(struct net_device *dev, int flags)
6204{
6205 const struct net_device_ops *ops = dev->netdev_ops;
6206
6207 if (ops->ndo_change_rx_flags)
6208 ops->ndo_change_rx_flags(dev, flags);
6209}
6210
6211static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6212{
6213 unsigned int old_flags = dev->flags;
6214 kuid_t uid;
6215 kgid_t gid;
6216
6217 ASSERT_RTNL();
6218
6219 dev->flags |= IFF_PROMISC;
6220 dev->promiscuity += inc;
6221 if (dev->promiscuity == 0) {
6222 /*
6223 * Avoid overflow.
6224 * If inc causes overflow, leave promiscuity untouched and return an error.
6225 */
6226 if (inc < 0)
6227 dev->flags &= ~IFF_PROMISC;
6228 else {
6229 dev->promiscuity -= inc;
6230 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6231 dev->name);
6232 return -EOVERFLOW;
6233 }
6234 }
6235 if (dev->flags != old_flags) {
6236 pr_info("device %s %s promiscuous mode\n",
6237 dev->name,
6238 dev->flags & IFF_PROMISC ? "entered" : "left");
6239 if (audit_enabled) {
6240 current_uid_gid(&uid, &gid);
6241 audit_log(current->audit_context, GFP_ATOMIC,
6242 AUDIT_ANOM_PROMISCUOUS,
6243 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6244 dev->name, (dev->flags & IFF_PROMISC),
6245 (old_flags & IFF_PROMISC),
6246 from_kuid(&init_user_ns, audit_get_loginuid(current)),
6247 from_kuid(&init_user_ns, uid),
6248 from_kgid(&init_user_ns, gid),
6249 audit_get_sessionid(current));
6250 }
6251
6252 dev_change_rx_flags(dev, IFF_PROMISC);
6253 }
6254 if (notify)
6255 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
6256 return 0;
6257}
6258
6259/**
6260 * dev_set_promiscuity - update promiscuity count on a device
6261 * @dev: device
6262 * @inc: modifier
6263 *
6264 * Add or remove promiscuity from a device. While the count in the device
6265 * remains above zero the interface remains promiscuous. Once it hits zero
6266 * the device reverts back to normal filtering operation. A negative inc
6267 * value is used to drop promiscuity on the device.
6268 * Return 0 if successful or a negative errno code on error.
6269 */
6270int dev_set_promiscuity(struct net_device *dev, int inc)
6271{
6272 unsigned int old_flags = dev->flags;
6273 int err;
6274
6275 err = __dev_set_promiscuity(dev, inc, true);
6276 if (err < 0)
6277 return err;
6278 if (dev->flags != old_flags)
6279 dev_set_rx_mode(dev);
6280 return err;
6281}
6282EXPORT_SYMBOL(dev_set_promiscuity);
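
/*
 * Editor's sketch: because promiscuity is a counter, independent
 * consumers can stack. A hypothetical packet tap takes one reference on
 * attach and releases it on detach; the device leaves promiscuous mode
 * only when the last reference is dropped.
 */
static int demo_tap_attach(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_set_promiscuity(dev, 1);      /* count 0 -> 1 enters promisc */
        rtnl_unlock();
        return err;
}

static void demo_tap_detach(struct net_device *dev)
{
        rtnl_lock();
        dev_set_promiscuity(dev, -1);           /* count 1 -> 0 leaves promisc */
        rtnl_unlock();
}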
6283
6284static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6285{
6286 unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6287
6288 ASSERT_RTNL();
6289
6290 dev->flags |= IFF_ALLMULTI;
6291 dev->allmulti += inc;
6292 if (dev->allmulti == 0) {
6293 /*
6294 * Avoid overflow.
6295 * If inc causes overflow, leave allmulti untouched and return an error.
6296 */
6297 if (inc < 0)
6298 dev->flags &= ~IFF_ALLMULTI;
6299 else {
6300 dev->allmulti -= inc;
6301 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6302 dev->name);
6303 return -EOVERFLOW;
6304 }
6305 }
6306 if (dev->flags ^ old_flags) {
6307 dev_change_rx_flags(dev, IFF_ALLMULTI);
6308 dev_set_rx_mode(dev);
6309 if (notify)
6310 __dev_notify_flags(dev, old_flags,
6311 dev->gflags ^ old_gflags);
6312 }
6313 return 0;
6314}
6315
6316/**
6317 * dev_set_allmulti - update allmulti count on a device
6318 * @dev: device
6319 * @inc: modifier
6320 *
6321 * Add or remove reception of all multicast frames to a device. While the
6322 * count in the device remains above zero the interface remains listening
6323 * to all multicast frames. Once it hits zero the device reverts back to normal
6324 * filtering operation. A negative @inc value is used to drop the counter
6325 * when releasing a resource needing all multicasts.
6326 * Return 0 if successful or a negative errno code on error.
6327 */
6328
6329int dev_set_allmulti(struct net_device *dev, int inc)
6330{
6331 return __dev_set_allmulti(dev, inc, true);
6332}
6333EXPORT_SYMBOL(dev_set_allmulti);
6334
6335/*
6336 * Upload unicast and multicast address lists to device and
6337 * configure RX filtering. When the device doesn't support unicast
6338 * filtering it is put in promiscuous mode while unicast addresses
6339 * are present.
6340 */
6341void __dev_set_rx_mode(struct net_device *dev)
6342{
6343 const struct net_device_ops *ops = dev->netdev_ops;
6344
6345 /* dev_open will call this function so the list will stay sane. */
6346 if (!(dev->flags&IFF_UP))
6347 return;
6348
6349 if (!netif_device_present(dev))
6350 return;
6351
6352 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6353 /* Unicast address changes may only happen under the rtnl,
6354 * therefore calling __dev_set_promiscuity here is safe.
6355 */
6356 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6357 __dev_set_promiscuity(dev, 1, false);
6358 dev->uc_promisc = true;
6359 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6360 __dev_set_promiscuity(dev, -1, false);
6361 dev->uc_promisc = false;
6362 }
6363 }
6364
6365 if (ops->ndo_set_rx_mode)
6366 ops->ndo_set_rx_mode(dev);
6367}
6368
6369void dev_set_rx_mode(struct net_device *dev)
6370{
6371 netif_addr_lock_bh(dev);
6372 __dev_set_rx_mode(dev);
6373 netif_addr_unlock_bh(dev);
6374}
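
/*
 * Editor's sketch: the driver half of the contract above. ndo_set_rx_mode
 * runs with the address-list lock held and pushes the current UC/MC lists
 * plus the IFF_PROMISC/IFF_ALLMULTI state into hardware filters. The
 * demo_hw_*() helpers are hypothetical driver primitives, declared here
 * only so the sketch is self-contained.
 */
void demo_hw_set_promisc(struct net_device *dev, bool on);
void demo_hw_set_allmulti(struct net_device *dev, bool on);
void demo_hw_clear_filters(struct net_device *dev);
void demo_hw_add_filter(struct net_device *dev, const unsigned char *addr);

static void demo_set_rx_mode(struct net_device *dev)
{
        struct netdev_hw_addr *ha;

        demo_hw_set_promisc(dev, !!(dev->flags & IFF_PROMISC));
        demo_hw_set_allmulti(dev, !!(dev->flags & IFF_ALLMULTI));

        demo_hw_clear_filters(dev);
        netdev_for_each_uc_addr(ha, dev)
                demo_hw_add_filter(dev, ha->addr);
        netdev_for_each_mc_addr(ha, dev)
                demo_hw_add_filter(dev, ha->addr);
}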
6375
6376/**
6377 * dev_get_flags - get flags reported to userspace
6378 * @dev: device
6379 *
6380 * Get the combination of flag bits exported through APIs to userspace.
6381 */
6382unsigned int dev_get_flags(const struct net_device *dev)
6383{
6384 unsigned int flags;
6385
6386 flags = (dev->flags & ~(IFF_PROMISC |
6387 IFF_ALLMULTI |
6388 IFF_RUNNING |
6389 IFF_LOWER_UP |
6390 IFF_DORMANT)) |
6391 (dev->gflags & (IFF_PROMISC |
6392 IFF_ALLMULTI));
6393
6394 if (netif_running(dev)) {
6395 if (netif_oper_up(dev))
6396 flags |= IFF_RUNNING;
6397 if (netif_carrier_ok(dev))
6398 flags |= IFF_LOWER_UP;
6399 if (netif_dormant(dev))
6400 flags |= IFF_DORMANT;
6401 }
6402
6403 return flags;
6404}
6405EXPORT_SYMBOL(dev_get_flags);
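
/*
 * Editor's sketch: dev_get_flags() is what gives SIOCGIFFLAGS its view,
 * so a test like "is the interface administratively and operationally
 * up" should go through it rather than reading dev->flags directly.
 */
static bool demo_iface_is_up_and_running(const struct net_device *dev)
{
        unsigned int flags = dev_get_flags(dev);

        return (flags & (IFF_UP | IFF_RUNNING)) == (IFF_UP | IFF_RUNNING);
}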
6406
6407int __dev_change_flags(struct net_device *dev, unsigned int flags)
6408{
6409 unsigned int old_flags = dev->flags;
6410 int ret;
6411
6412 ASSERT_RTNL();
6413
6414 /*
6415 * Set the flags on our device.
6416 */
6417
6418 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6419 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6420 IFF_AUTOMEDIA)) |
6421 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6422 IFF_ALLMULTI));
6423
6424 /*
6425 * Load in the correct multicast list now the flags have changed.
6426 */
6427
6428 if ((old_flags ^ flags) & IFF_MULTICAST)
6429 dev_change_rx_flags(dev, IFF_MULTICAST);
6430
6431 dev_set_rx_mode(dev);
6432
6433 /*
6434 * Have we downed the interface? We handle IFF_UP ourselves
6435 * according to user attempts to set it, rather than blindly
6436 * setting it.
6437 */
6438
6439 ret = 0;
6440 if ((old_flags ^ flags) & IFF_UP)
6441 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6442
6443 if ((flags ^ dev->gflags) & IFF_PROMISC) {
6444 int inc = (flags & IFF_PROMISC) ? 1 : -1;
6445 unsigned int old_flags = dev->flags;
6446
6447 dev->gflags ^= IFF_PROMISC;
6448
6449 if (__dev_set_promiscuity(dev, inc, false) >= 0)
6450 if (dev->flags != old_flags)
6451 dev_set_rx_mode(dev);
6452 }
6453
6454 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6455 is important. Some (broken) drivers set IFF_PROMISC when
6456 IFF_ALLMULTI is requested, without asking us and without reporting it.
6457 */
6458 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6459 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6460
6461 dev->gflags ^= IFF_ALLMULTI;
6462 __dev_set_allmulti(dev, inc, false);
6463 }
6464
6465 return ret;
6466}
6467
6468void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6469 unsigned int gchanges)
6470{
6471 unsigned int changes = dev->flags ^ old_flags;
6472
6473 if (gchanges)
6474 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6475
6476 if (changes & IFF_UP) {
6477 if (dev->flags & IFF_UP)
6478 call_netdevice_notifiers(NETDEV_UP, dev);
6479 else
6480 call_netdevice_notifiers(NETDEV_DOWN, dev);
6481 }
6482
6483 if (dev->flags & IFF_UP &&
6484 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6485 struct netdev_notifier_change_info change_info;
6486
6487 change_info.flags_changed = changes;
6488 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6489 &change_info.info);
6490 }
6491}
6492
6493/**
6494 * dev_change_flags - change device settings
6495 * @dev: device
6496 * @flags: device state flags
6497 *
6498 * Change settings on a device based on the given state flags. The flags
6499 * are in the format exported to userspace.
6500 */
6501int dev_change_flags(struct net_device *dev, unsigned int flags)
6502{
6503 int ret;
6504 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6505
6506 ret = __dev_change_flags(dev, flags);
6507 if (ret < 0)
6508 return ret;
6509
6510 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6511 __dev_notify_flags(dev, old_flags, changes);
6512 return ret;
6513}
6514EXPORT_SYMBOL(dev_change_flags);
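
/*
 * Editor's sketch: the classic read-modify-write that ioctl and netlink
 * paths use to flip a single flag; dev_change_flags() then diffs against
 * the old state and acts only on what changed. RTNL must be held.
 */
static int demo_iface_up(struct net_device *dev)
{
        return dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
}

static int demo_iface_down(struct net_device *dev)
{
        return dev_change_flags(dev, dev_get_flags(dev) & ~IFF_UP);
}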
6515
6516static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6517{
6518 const struct net_device_ops *ops = dev->netdev_ops;
6519
6520 if (ops->ndo_change_mtu)
6521 return ops->ndo_change_mtu(dev, new_mtu);
6522
6523 dev->mtu = new_mtu;
6524 return 0;
6525}
6526
6527/**
6528 * dev_set_mtu - Change maximum transmission unit
6529 * @dev: device
6530 * @new_mtu: new maximum transmission unit
6531 *
6532 * Change the maximum transmission unit (MTU) of the network device.
6533 */
6534int dev_set_mtu(struct net_device *dev, int new_mtu)
6535{
6536 int err, orig_mtu;
6537
6538 if (new_mtu == dev->mtu)
6539 return 0;
6540
6541 /* MTU must be positive, and in range */
6542 if (new_mtu < 0 || new_mtu < dev->min_mtu) {
6543 net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
6544 dev->name, new_mtu, dev->min_mtu);
6545 return -EINVAL;
6546 }
6547
6548 if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
6549 net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
6550 dev->name, new_mtu, dev->max_mtu);
6551 return -EINVAL;
6552 }
6553
6554 if (!netif_device_present(dev))
6555 return -ENODEV;
6556
6557 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6558 err = notifier_to_errno(err);
6559 if (err)
6560 return err;
6561
6562 orig_mtu = dev->mtu;
6563 err = __dev_set_mtu(dev, new_mtu);
6564
6565 if (!err) {
6566 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6567 err = notifier_to_errno(err);
6568 if (err) {
6569 /* setting mtu back and notifying everyone again,
6570 * so that they have a chance to revert changes.
6571 */
6572 __dev_set_mtu(dev, orig_mtu);
6573 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6574 }
6575 }
6576 return err;
6577}
6578EXPORT_SYMBOL(dev_set_mtu);
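
/*
 * Editor's sketch: a hypothetical consumer vetoing an MTU change from a
 * netdevice notifier. Returning an error from NETDEV_CHANGEMTU makes
 * dev_set_mtu() above restore orig_mtu and re-notify, so listeners can
 * trust the final value.
 */
static int demo_mtu_cap_event(struct notifier_block *nb,
                              unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        if (event == NETDEV_CHANGEMTU && dev->mtu > 9000)
                return notifier_from_errno(-ERANGE);
        return NOTIFY_DONE;
}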
6579
6580/**
6581 * dev_set_group - Change group this device belongs to
6582 * @dev: device
6583 * @new_group: group this device should belong to
6584 */
6585void dev_set_group(struct net_device *dev, int new_group)
6586{
6587 dev->group = new_group;
6588}
6589EXPORT_SYMBOL(dev_set_group);
6590
6591/**
6592 * dev_set_mac_address - Change Media Access Control Address
6593 * @dev: device
6594 * @sa: new address
6595 *
6596 * Change the hardware (MAC) address of the device
6597 */
6598int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6599{
6600 const struct net_device_ops *ops = dev->netdev_ops;
6601 int err;
6602
6603 if (!ops->ndo_set_mac_address)
6604 return -EOPNOTSUPP;
6605 if (sa->sa_family != dev->type)
6606 return -EINVAL;
6607 if (!netif_device_present(dev))
6608 return -ENODEV;
6609 err = ops->ndo_set_mac_address(dev, sa);
6610 if (err)
6611 return err;
6612 dev->addr_assign_type = NET_ADDR_SET;
6613 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6614 add_device_randomness(dev->dev_addr, dev->addr_len);
6615 return 0;
6616}
6617EXPORT_SYMBOL(dev_set_mac_address);
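
/*
 * Editor's sketch: building the sockaddr this helper expects from a raw
 * byte array. sa_family must match dev->type (e.g. ARPHRD_ETHER), which
 * is what the -EINVAL check above enforces. Caller holds RTNL.
 */
static int demo_set_mac_bytes(struct net_device *dev, const u8 *mac)
{
        struct sockaddr sa;

        sa.sa_family = dev->type;
        memcpy(sa.sa_data, mac, dev->addr_len);
        return dev_set_mac_address(dev, &sa);
}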
6618
6619/**
6620 * dev_change_carrier - Change device carrier
6621 * @dev: device
6622 * @new_carrier: new value
6623 *
6624 * Change device carrier
6625 */
6626int dev_change_carrier(struct net_device *dev, bool new_carrier)
6627{
6628 const struct net_device_ops *ops = dev->netdev_ops;
6629
6630 if (!ops->ndo_change_carrier)
6631 return -EOPNOTSUPP;
6632 if (!netif_device_present(dev))
6633 return -ENODEV;
6634 return ops->ndo_change_carrier(dev, new_carrier);
6635}
6636EXPORT_SYMBOL(dev_change_carrier);
6637
6638/**
6639 * dev_get_phys_port_id - Get device physical port ID
6640 * @dev: device
6641 * @ppid: port ID
6642 *
6643 * Get device physical port ID
6644 */
6645int dev_get_phys_port_id(struct net_device *dev,
6646 struct netdev_phys_item_id *ppid)
6647{
6648 const struct net_device_ops *ops = dev->netdev_ops;
6649
6650 if (!ops->ndo_get_phys_port_id)
6651 return -EOPNOTSUPP;
6652 return ops->ndo_get_phys_port_id(dev, ppid);
6653}
6654EXPORT_SYMBOL(dev_get_phys_port_id);
6655
6656/**
6657 * dev_get_phys_port_name - Get device physical port name
6658 * @dev: device
6659 * @name: port name
6660 * @len: limit of bytes to copy to name
6661 *
6662 * Get device physical port name
6663 */
6664int dev_get_phys_port_name(struct net_device *dev,
6665 char *name, size_t len)
6666{
6667 const struct net_device_ops *ops = dev->netdev_ops;
6668
6669 if (!ops->ndo_get_phys_port_name)
6670 return -EOPNOTSUPP;
6671 return ops->ndo_get_phys_port_name(dev, name, len);
6672}
6673EXPORT_SYMBOL(dev_get_phys_port_name);
6674
6675/**
6676 * dev_change_proto_down - update protocol port state information
6677 * @dev: device
6678 * @proto_down: new value
6679 *
6680 * This info can be used by switch drivers to set the phys state of the
6681 * port.
6682 */
6683int dev_change_proto_down(struct net_device *dev, bool proto_down)
6684{
6685 const struct net_device_ops *ops = dev->netdev_ops;
6686
6687 if (!ops->ndo_change_proto_down)
6688 return -EOPNOTSUPP;
6689 if (!netif_device_present(dev))
6690 return -ENODEV;
6691 return ops->ndo_change_proto_down(dev, proto_down);
6692}
6693EXPORT_SYMBOL(dev_change_proto_down);
6694
6695/**
6696 * dev_change_xdp_fd - set or clear a bpf program for a device rx path
6697 * @dev: device
6698 * @fd: new program fd or negative value to clear
6699 * @flags: xdp-related flags
6700 *
6701 * Set or clear a bpf program for a device
6702 */
6703int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags)
6704{
6705 const struct net_device_ops *ops = dev->netdev_ops;
6706 struct bpf_prog *prog = NULL;
6707 struct netdev_xdp xdp;
6708 int err;
6709
6710 ASSERT_RTNL();
6711
6712 if (!ops->ndo_xdp)
6713 return -EOPNOTSUPP;
6714 if (fd >= 0) {
6715 if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) {
6716 memset(&xdp, 0, sizeof(xdp));
6717 xdp.command = XDP_QUERY_PROG;
6718
6719 err = ops->ndo_xdp(dev, &xdp);
6720 if (err < 0)
6721 return err;
6722 if (xdp.prog_attached)
6723 return -EBUSY;
6724 }
6725
6726 prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
6727 if (IS_ERR(prog))
6728 return PTR_ERR(prog);
6729 }
6730
6731 memset(&xdp, 0, sizeof(xdp));
6732 xdp.command = XDP_SETUP_PROG;
6733 xdp.prog = prog;
6734
6735 err = ops->ndo_xdp(dev, &xdp);
6736 if (err < 0 && prog)
6737 bpf_prog_put(prog);
6738
6739 return err;
6740}
6741EXPORT_SYMBOL(dev_change_xdp_fd);
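
/*
 * Editor's sketch: attach-or-fail semantics. With
 * XDP_FLAGS_UPDATE_IF_NOEXIST the query above turns an already attached
 * program into -EBUSY instead of silently replacing it; a negative fd
 * detaches whatever is installed.
 */
static int demo_xdp_attach_exclusive(struct net_device *dev, int prog_fd)
{
        int err;

        rtnl_lock();
        err = dev_change_xdp_fd(dev, prog_fd, XDP_FLAGS_UPDATE_IF_NOEXIST);
        rtnl_unlock();
        return err;
}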
6742
6743/**
6744 * dev_new_index - allocate an ifindex
6745 * @net: the applicable net namespace
6746 *
6747 * Returns a suitable unique value for a new device interface
6748 * number. The caller must hold the rtnl semaphore or the
6749 * dev_base_lock to be sure it remains unique.
6750 */
6751static int dev_new_index(struct net *net)
6752{
6753 int ifindex = net->ifindex;
6754 for (;;) {
6755 if (++ifindex <= 0)
6756 ifindex = 1;
6757 if (!__dev_get_by_index(net, ifindex))
6758 return net->ifindex = ifindex;
6759 }
6760}
6761
6762/* Delayed registration/unregistration */
6763static LIST_HEAD(net_todo_list);
6764DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6765
6766static void net_set_todo(struct net_device *dev)
6767{
6768 list_add_tail(&dev->todo_list, &net_todo_list);
6769 dev_net(dev)->dev_unreg_count++;
6770}
6771
6772static void rollback_registered_many(struct list_head *head)
6773{
6774 struct net_device *dev, *tmp;
6775 LIST_HEAD(close_head);
6776
6777 BUG_ON(dev_boot_phase);
6778 ASSERT_RTNL();
6779
6780 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6781 /* Some devices call this without ever having been
6782 * registered, as part of initialization unwind. Remove those
6783 * devices and proceed with the remaining.
6784 */
6785 if (dev->reg_state == NETREG_UNINITIALIZED) {
6786 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6787 dev->name, dev);
6788
6789 WARN_ON(1);
6790 list_del(&dev->unreg_list);
6791 continue;
6792 }
6793 dev->dismantle = true;
6794 BUG_ON(dev->reg_state != NETREG_REGISTERED);
6795 }
6796
6797 /* If device is running, close it first. */
6798 list_for_each_entry(dev, head, unreg_list)
6799 list_add_tail(&dev->close_list, &close_head);
6800 dev_close_many(&close_head, true);
6801
6802 list_for_each_entry(dev, head, unreg_list) {
6803 /* And unlink it from device chain. */
6804 unlist_netdevice(dev);
6805
6806 dev->reg_state = NETREG_UNREGISTERING;
6807 }
6808 flush_all_backlogs();
6809
6810 synchronize_net();
6811
6812 list_for_each_entry(dev, head, unreg_list) {
6813 struct sk_buff *skb = NULL;
6814
6815 /* Shutdown queueing discipline. */
6816 dev_shutdown(dev);
6817
6819 /* Notify protocols that we are about to destroy
6820 this device. They should clean up all their state.
6821 */
6822 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6823
6824 if (!dev->rtnl_link_ops ||
6825 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6826 skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6827 GFP_KERNEL);
6828
6829 /*
6830 * Flush the unicast and multicast chains
6831 */
6832 dev_uc_flush(dev);
6833 dev_mc_flush(dev);
6834
6835 if (dev->netdev_ops->ndo_uninit)
6836 dev->netdev_ops->ndo_uninit(dev);
6837
6838 if (skb)
6839 rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6840
6841 /* Notifier chain MUST detach us all upper devices. */
6842 WARN_ON(netdev_has_any_upper_dev(dev));
6843 WARN_ON(netdev_has_any_lower_dev(dev));
6844
6845 /* Remove entries from kobject tree */
6846 netdev_unregister_kobject(dev);
6847#ifdef CONFIG_XPS
6848 /* Remove XPS queueing entries */
6849 netif_reset_xps_queues_gt(dev, 0);
6850#endif
6851 }
6852
6853 synchronize_net();
6854
6855 list_for_each_entry(dev, head, unreg_list)
6856 dev_put(dev);
6857}
6858
6859static void rollback_registered(struct net_device *dev)
6860{
6861 LIST_HEAD(single);
6862
6863 list_add(&dev->unreg_list, &single);
6864 rollback_registered_many(&single);
6865 list_del(&single);
6866}
6867
6868static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6869 struct net_device *upper, netdev_features_t features)
6870{
6871 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6872 netdev_features_t feature;
6873 int feature_bit;
6874
6875 for_each_netdev_feature(&upper_disables, feature_bit) {
6876 feature = __NETIF_F_BIT(feature_bit);
6877 if (!(upper->wanted_features & feature)
6878 && (features & feature)) {
6879 netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6880 &feature, upper->name);
6881 features &= ~feature;
6882 }
6883 }
6884
6885 return features;
6886}
6887
6888static void netdev_sync_lower_features(struct net_device *upper,
6889 struct net_device *lower, netdev_features_t features)
6890{
6891 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6892 netdev_features_t feature;
6893 int feature_bit;
6894
6895 for_each_netdev_feature(&upper_disables, feature_bit) {
6896 feature = __NETIF_F_BIT(feature_bit);
6897 if (!(features & feature) && (lower->features & feature)) {
6898 netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6899 &feature, lower->name);
6900 lower->wanted_features &= ~feature;
6901 netdev_update_features(lower);
6902
6903 if (unlikely(lower->features & feature))
6904 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6905 &feature, lower->name);
6906 }
6907 }
6908}
6909
6910static netdev_features_t netdev_fix_features(struct net_device *dev,
6911 netdev_features_t features)
6912{
6913 /* Fix illegal checksum combinations */
6914 if ((features & NETIF_F_HW_CSUM) &&
6915 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6916 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6917 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6918 }
6919
6920 /* TSO requires that SG is present as well. */
6921 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6922 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6923 features &= ~NETIF_F_ALL_TSO;
6924 }
6925
6926 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6927 !(features & NETIF_F_IP_CSUM)) {
6928 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6929 features &= ~NETIF_F_TSO;
6930 features &= ~NETIF_F_TSO_ECN;
6931 }
6932
6933 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6934 !(features & NETIF_F_IPV6_CSUM)) {
6935 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6936 features &= ~NETIF_F_TSO6;
6937 }
6938
6939 /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
6940 if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
6941 features &= ~NETIF_F_TSO_MANGLEID;
6942
6943 /* TSO ECN requires that TSO is present as well. */
6944 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6945 features &= ~NETIF_F_TSO_ECN;
6946
6947 /* Software GSO depends on SG. */
6948 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6949 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6950 features &= ~NETIF_F_GSO;
6951 }
6952
6953 /* UFO needs SG and checksumming */
6954 if (features & NETIF_F_UFO) {
6955 /* maybe split UFO into V4 and V6? */
6956 if (!(features & NETIF_F_HW_CSUM) &&
6957 ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
6958 (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
6959 netdev_dbg(dev,
6960 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6961 features &= ~NETIF_F_UFO;
6962 }
6963
6964 if (!(features & NETIF_F_SG)) {
6965 netdev_dbg(dev,
6966 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6967 features &= ~NETIF_F_UFO;
6968 }
6969 }
6970
6971 /* GSO partial features require GSO partial be set */
6972 if ((features & dev->gso_partial_features) &&
6973 !(features & NETIF_F_GSO_PARTIAL)) {
6974 netdev_dbg(dev,
6975 "Dropping partially supported GSO features since no GSO partial.\n");
6976 features &= ~dev->gso_partial_features;
6977 }
6978
6979#ifdef CONFIG_NET_RX_BUSY_POLL
6980 if (dev->netdev_ops->ndo_busy_poll)
6981 features |= NETIF_F_BUSY_POLL;
6982 else
6983#endif
6984 features &= ~NETIF_F_BUSY_POLL;
6985
6986 return features;
6987}
6988
6989int __netdev_update_features(struct net_device *dev)
6990{
6991 struct net_device *upper, *lower;
6992 netdev_features_t features;
6993 struct list_head *iter;
6994 int err = -1;
6995
6996 ASSERT_RTNL();
6997
6998 features = netdev_get_wanted_features(dev);
6999
7000 if (dev->netdev_ops->ndo_fix_features)
7001 features = dev->netdev_ops->ndo_fix_features(dev, features);
7002
7003 /* driver might be less strict about feature dependencies */
7004 features = netdev_fix_features(dev, features);
7005
7006 /* some features can't be enabled if they're off on an upper device */
7007 netdev_for_each_upper_dev_rcu(dev, upper, iter)
7008 features = netdev_sync_upper_features(dev, upper, features);
7009
7010 if (dev->features == features)
7011 goto sync_lower;
7012
7013 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
7014 &dev->features, &features);
7015
7016 if (dev->netdev_ops->ndo_set_features)
7017 err = dev->netdev_ops->ndo_set_features(dev, features);
7018 else
7019 err = 0;
7020
7021 if (unlikely(err < 0)) {
7022 netdev_err(dev,
7023 "set_features() failed (%d); wanted %pNF, left %pNF\n",
7024 err, &features, &dev->features);
7025 /* return non-0 since some features might have changed and
7026 * it's better to fire a spurious notification than miss it
7027 */
7028 return -1;
7029 }
7030
7031sync_lower:
7032 /* some features must be disabled on lower devices when disabled
7033 * on an upper device (think: bonding master or bridge)
7034 */
7035 netdev_for_each_lower_dev(dev, lower, iter)
7036 netdev_sync_lower_features(dev, lower, features);
7037
7038 if (!err)
7039 dev->features = features;
7040
7041 return err < 0 ? 0 : 1;
7042}
7043
7044/**
7045 * netdev_update_features - recalculate device features
7046 * @dev: the device to check
7047 *
7048 * Recalculate dev->features set and send notifications if it
7049 * has changed. Should be called after driver or hardware dependent
7050 * conditions might have changed that influence the features.
7051 */
7052void netdev_update_features(struct net_device *dev)
7053{
7054 if (__netdev_update_features(dev))
7055 netdev_features_change(dev);
7056}
7057EXPORT_SYMBOL(netdev_update_features);
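
/*
 * Editor's sketch: after a hardware reconfiguration changes what the
 * fixup/sync pipeline above will allow (here: a hypothetical mode that
 * forbids TSO), the driver re-runs it; the notification fires only if
 * the effective feature set actually changed. Must run under RTNL.
 */
static void demo_enter_no_tso_mode(struct net_device *dev)
{
        ASSERT_RTNL();
        dev->hw_features &= ~NETIF_F_ALL_TSO;
        netdev_update_features(dev);
}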
7058
7059/**
7060 * netdev_change_features - recalculate device features
7061 * @dev: the device to check
7062 *
7063 * Recalculate dev->features set and send notifications even
7064 * if they have not changed. Should be called instead of
7065 * netdev_update_features() if also dev->vlan_features might
7066 * have changed to allow the changes to be propagated to stacked
7067 * VLAN devices.
7068 */
7069void netdev_change_features(struct net_device *dev)
7070{
7071 __netdev_update_features(dev);
7072 netdev_features_change(dev);
7073}
7074EXPORT_SYMBOL(netdev_change_features);
7075
7076/**
7077 * netif_stacked_transfer_operstate - transfer operstate
7078 * @rootdev: the root or lower level device to transfer state from
7079 * @dev: the device to transfer operstate to
7080 *
7081 * Transfer operational state from root to device. This is normally
7082 * called when a stacking relationship exists between the root
7083 * device and the device (a leaf device).
7084 */
7085void netif_stacked_transfer_operstate(const struct net_device *rootdev,
7086 struct net_device *dev)
7087{
7088 if (rootdev->operstate == IF_OPER_DORMANT)
7089 netif_dormant_on(dev);
7090 else
7091 netif_dormant_off(dev);
7092
7093 if (netif_carrier_ok(rootdev)) {
7094 if (!netif_carrier_ok(dev))
7095 netif_carrier_on(dev);
7096 } else {
7097 if (netif_carrier_ok(dev))
7098 netif_carrier_off(dev);
7099 }
7100}
7101EXPORT_SYMBOL(netif_stacked_transfer_operstate);
7102
7103#ifdef CONFIG_SYSFS
7104static int netif_alloc_rx_queues(struct net_device *dev)
7105{
7106 unsigned int i, count = dev->num_rx_queues;
7107 struct netdev_rx_queue *rx;
7108 size_t sz = count * sizeof(*rx);
7109
7110 BUG_ON(count < 1);
7111
7112 rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7113 if (!rx) {
7114 rx = vzalloc(sz);
7115 if (!rx)
7116 return -ENOMEM;
7117 }
7118 dev->_rx = rx;
7119
7120 for (i = 0; i < count; i++)
7121 rx[i].dev = dev;
7122 return 0;
7123}
7124#endif
7125
7126static void netdev_init_one_queue(struct net_device *dev,
7127 struct netdev_queue *queue, void *_unused)
7128{
7129 /* Initialize queue lock */
7130 spin_lock_init(&queue->_xmit_lock);
7131 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
7132 queue->xmit_lock_owner = -1;
7133 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
7134 queue->dev = dev;
7135#ifdef CONFIG_BQL
7136 dql_init(&queue->dql, HZ);
7137#endif
7138}
7139
7140static void netif_free_tx_queues(struct net_device *dev)
7141{
7142 kvfree(dev->_tx);
7143}
7144
7145static int netif_alloc_netdev_queues(struct net_device *dev)
7146{
7147 unsigned int count = dev->num_tx_queues;
7148 struct netdev_queue *tx;
7149 size_t sz = count * sizeof(*tx);
7150
7151 if (count < 1 || count > 0xffff)
7152 return -EINVAL;
7153
7154 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7155 if (!tx) {
7156 tx = vzalloc(sz);
7157 if (!tx)
7158 return -ENOMEM;
7159 }
7160 dev->_tx = tx;
7161
7162 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
7163 spin_lock_init(&dev->tx_global_lock);
7164
7165 return 0;
7166}
7167
7168void netif_tx_stop_all_queues(struct net_device *dev)
7169{
7170 unsigned int i;
7171
7172 for (i = 0; i < dev->num_tx_queues; i++) {
7173 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
7174 netif_tx_stop_queue(txq);
7175 }
7176}
7177EXPORT_SYMBOL(netif_tx_stop_all_queues);
7178
7179/**
7180 * register_netdevice - register a network device
7181 * @dev: device to register
7182 *
7183 * Take a completed network device structure and add it to the kernel
7184 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7185 * chain. 0 is returned on success. A negative errno code is returned
7186 * on a failure to set up the device, or if the name is a duplicate.
7187 *
7188 * Callers must hold the rtnl semaphore. You may want
7189 * register_netdev() instead of this.
7190 *
7191 * BUGS:
7192 * The locking appears insufficient to guarantee two parallel registers
7193 * will not get the same name.
7194 */
7195
7196int register_netdevice(struct net_device *dev)
7197{
7198 int ret;
7199 struct net *net = dev_net(dev);
7200
7201 BUG_ON(dev_boot_phase);
7202 ASSERT_RTNL();
7203
7204 might_sleep();
7205
7206 /* When net_device's are persistent, this will be fatal. */
7207 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
7208 BUG_ON(!net);
7209
7210 spin_lock_init(&dev->addr_list_lock);
7211 netdev_set_addr_lockdep_class(dev);
7212
7213 ret = dev_get_valid_name(net, dev, dev->name);
7214 if (ret < 0)
7215 goto out;
7216
7217 /* Init, if this function is available */
7218 if (dev->netdev_ops->ndo_init) {
7219 ret = dev->netdev_ops->ndo_init(dev);
7220 if (ret) {
7221 if (ret > 0)
7222 ret = -EIO;
7223 goto out;
7224 }
7225 }
7226
7227 if (((dev->hw_features | dev->features) &
7228 NETIF_F_HW_VLAN_CTAG_FILTER) &&
7229 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7230 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7231 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7232 ret = -EINVAL;
7233 goto err_uninit;
7234 }
7235
7236 ret = -EBUSY;
7237 if (!dev->ifindex)
7238 dev->ifindex = dev_new_index(net);
7239 else if (__dev_get_by_index(net, dev->ifindex))
7240 goto err_uninit;
7241
7242 /* Transfer changeable features to wanted_features and enable
7243 * software offloads (GSO and GRO).
7244 */
7245 dev->hw_features |= NETIF_F_SOFT_FEATURES;
7246 dev->features |= NETIF_F_SOFT_FEATURES;
7247 dev->wanted_features = dev->features & dev->hw_features;
7248
7249 if (!(dev->flags & IFF_LOOPBACK))
7250 dev->hw_features |= NETIF_F_NOCACHE_COPY;
7251
7252 /* If IPv4 TCP segmentation offload is supported we should also
7253 * allow the device to enable segmenting the frame with the option
7254 * of ignoring a static IP ID value. This doesn't enable the
7255 * feature itself but allows the user to enable it later.
7256 */
7257 if (dev->hw_features & NETIF_F_TSO)
7258 dev->hw_features |= NETIF_F_TSO_MANGLEID;
7259 if (dev->vlan_features & NETIF_F_TSO)
7260 dev->vlan_features |= NETIF_F_TSO_MANGLEID;
7261 if (dev->mpls_features & NETIF_F_TSO)
7262 dev->mpls_features |= NETIF_F_TSO_MANGLEID;
7263 if (dev->hw_enc_features & NETIF_F_TSO)
7264 dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
7265
7266 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
7267 */
7268 dev->vlan_features |= NETIF_F_HIGHDMA;
7269
7270 /* Make NETIF_F_SG inheritable to tunnel devices.
7271 */
7272 dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
7273
7274 /* Make NETIF_F_SG inheritable to MPLS.
7275 */
7276 dev->mpls_features |= NETIF_F_SG;
7277
7278 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7279 ret = notifier_to_errno(ret);
7280 if (ret)
7281 goto err_uninit;
7282
7283 ret = netdev_register_kobject(dev);
7284 if (ret)
7285 goto err_uninit;
7286 dev->reg_state = NETREG_REGISTERED;
7287
7288 __netdev_update_features(dev);
7289
7290 /*
7291 * Default initial state at registration is that the
7292 * device is present.
7293 */
7294
7295 set_bit(__LINK_STATE_PRESENT, &dev->state);
7296
7297 linkwatch_init_dev(dev);
7298
7299 dev_init_scheduler(dev);
7300 dev_hold(dev);
7301 list_netdevice(dev);
7302 add_device_randomness(dev->dev_addr, dev->addr_len);
7303
7304 /* If the device has a permanent device address, the driver should
7305 * set dev_addr and addr_assign_type should be set to
7306 * NET_ADDR_PERM (default value).
7307 */
7308 if (dev->addr_assign_type == NET_ADDR_PERM)
7309 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7310
7311 /* Notify protocols, that a new device appeared. */
7312 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7313 ret = notifier_to_errno(ret);
7314 if (ret) {
7315 rollback_registered(dev);
7316 dev->reg_state = NETREG_UNREGISTERED;
7317 }
7318 /*
7319 * Prevent userspace races by waiting until the network
7320 * device is fully set up before sending notifications.
7321 */
7322 if (!dev->rtnl_link_ops ||
7323 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7324 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7325
7326out:
7327 return ret;
7328
7329err_uninit:
7330 if (dev->netdev_ops->ndo_uninit)
7331 dev->netdev_ops->ndo_uninit(dev);
7332 goto out;
7333}
7334EXPORT_SYMBOL(register_netdevice);
7335
7336/**
7337 * init_dummy_netdev - init a dummy network device for NAPI
7338 * @dev: device to init
7339 *
7340 * This takes a network device structure and initializes the minimum
7341 * amount of fields so it can be used to schedule NAPI polls without
7342 * registering a full blown interface. This is to be used by drivers
7343 * that need to tie several hardware interfaces to a single NAPI
7344 * poll scheduler due to HW limitations.
7345 */
7346int init_dummy_netdev(struct net_device *dev)
7347{
7348 /* Clear everything. Note we don't initialize spinlocks
7349 * as they aren't supposed to be taken by any of the
7350 * NAPI code and this dummy netdev is supposed to be
7351 * only ever used for NAPI polls
7352 */
7353 memset(dev, 0, sizeof(struct net_device));
7354
7355 /* make sure we BUG if trying to hit standard
7356 * register/unregister code path
7357 */
7358 dev->reg_state = NETREG_DUMMY;
7359
7360 /* NAPI wants this */
7361 INIT_LIST_HEAD(&dev->napi_list);
7362
7363 /* a dummy interface is started by default */
7364 set_bit(__LINK_STATE_PRESENT, &dev->state);
7365 set_bit(__LINK_STATE_START, &dev->state);
7366
7367 /* Note : We don't allocate pcpu_refcnt for dummy devices,
7368 * because users of this 'device' don't need to change
7369 * its refcount.
7370 */
7371
7372 return 0;
7373}
7374EXPORT_SYMBOL_GPL(init_dummy_netdev);
7375
7377/**
7378 * register_netdev - register a network device
7379 * @dev: device to register
7380 *
7381 * Take a completed network device structure and add it to the kernel
7382 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7383 * chain. 0 is returned on success. A negative errno code is returned
7384 * on a failure to set up the device, or if the name is a duplicate.
7385 *
7386 * This is a wrapper around register_netdevice that takes the rtnl semaphore
7387 * and expands the device name if you passed a format string to
7388 * alloc_netdev.
7389 */
7390int register_netdev(struct net_device *dev)
7391{
7392 int err;
7393
7394 rtnl_lock();
7395 err = register_netdevice(dev);
7396 rtnl_unlock();
7397 return err;
7398}
7399EXPORT_SYMBOL(register_netdev);
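
/*
 * Editor's sketch: the canonical allocate/register/unregister/free life
 * cycle for a minimal Ethernet-style device. alloc_etherdev() is a thin
 * wrapper around alloc_netdev_mqs(); a real driver would also install a
 * netdev_ops table before registering. free_netdev() on the error path
 * is safe because the device never left NETREG_UNINITIALIZED.
 */
static struct net_device *demo_create(void)
{
        struct net_device *dev = alloc_etherdev(0);

        if (!dev)
                return NULL;

        eth_hw_addr_random(dev);        /* valid random unicast MAC */
        if (register_netdev(dev)) {
                free_netdev(dev);
                return NULL;
        }
        return dev;
}

static void demo_destroy(struct net_device *dev)
{
        unregister_netdev(dev);         /* returns once fully unregistered */
        free_netdev(dev);
}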
7400
7401int netdev_refcnt_read(const struct net_device *dev)
7402{
7403 int i, refcnt = 0;
7404
7405 for_each_possible_cpu(i)
7406 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7407 return refcnt;
7408}
7409EXPORT_SYMBOL(netdev_refcnt_read);
7410
7411/**
7412 * netdev_wait_allrefs - wait until all references are gone.
7413 * @dev: target net_device
7414 *
7415 * This is called when unregistering network devices.
7416 *
7417 * Any protocol or device that holds a reference should register
7418 * for netdevice notification, and clean up and put back the
7419 * reference if they receive an UNREGISTER event.
7420 * We can get stuck here if buggy protocols don't correctly
7421 * call dev_put.
7422 */
7423static void netdev_wait_allrefs(struct net_device *dev)
7424{
7425 unsigned long rebroadcast_time, warning_time;
7426 int refcnt;
7427
7428 linkwatch_forget_dev(dev);
7429
7430 rebroadcast_time = warning_time = jiffies;
7431 refcnt = netdev_refcnt_read(dev);
7432
7433 while (refcnt != 0) {
7434 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
7435 rtnl_lock();
7436
7437 /* Rebroadcast unregister notification */
7438 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7439
7440 __rtnl_unlock();
7441 rcu_barrier();
7442 rtnl_lock();
7443
7444 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7445 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
7446 &dev->state)) {
7447 /* We must not have linkwatch events
7448 * pending on unregister. If this
7449 * happens, we simply run the queue
7450 * unscheduled, resulting in a noop
7451 * for this device.
7452 */
7453 linkwatch_run_queue();
7454 }
7455
7456 __rtnl_unlock();
7457
7458 rebroadcast_time = jiffies;
7459 }
7460
7461 msleep(250);
7462
7463 refcnt = netdev_refcnt_read(dev);
7464
7465 if (time_after(jiffies, warning_time + 10 * HZ)) {
7466 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7467 dev->name, refcnt);
7468 warning_time = jiffies;
7469 }
7470 }
7471}
7472
7473/* The sequence is:
7474 *
7475 * rtnl_lock();
7476 * ...
7477 * register_netdevice(x1);
7478 * register_netdevice(x2);
7479 * ...
7480 * unregister_netdevice(y1);
7481 * unregister_netdevice(y2);
7482 * ...
7483 * rtnl_unlock();
7484 * free_netdev(y1);
7485 * free_netdev(y2);
7486 *
7487 * We are invoked by rtnl_unlock().
7488 * This allows us to deal with problems:
7489 * 1) We can delete sysfs objects which invoke hotplug
7490 * without deadlocking with linkwatch via keventd.
7491 * 2) Since we run with the RTNL semaphore not held, we can sleep
7492 * safely in order to wait for the netdev refcnt to drop to zero.
7493 *
7494 * We must not return until all unregister events added during
7495 * the interval the lock was held have been completed.
7496 */
7497void netdev_run_todo(void)
7498{
7499 struct list_head list;
7500
7501 /* Snapshot list, allow later requests */
7502 list_replace_init(&net_todo_list, &list);
7503
7504 __rtnl_unlock();
7505
7507 /* Wait for rcu callbacks to finish before next phase */
7508 if (!list_empty(&list))
7509 rcu_barrier();
7510
7511 while (!list_empty(&list)) {
7512 struct net_device *dev
7513 = list_first_entry(&list, struct net_device, todo_list);
7514 list_del(&dev->todo_list);
7515
7516 rtnl_lock();
7517 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7518 __rtnl_unlock();
7519
7520 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7521 pr_err("network todo '%s' but state %d\n",
7522 dev->name, dev->reg_state);
7523 dump_stack();
7524 continue;
7525 }
7526
7527 dev->reg_state = NETREG_UNREGISTERED;
7528
7529 netdev_wait_allrefs(dev);
7530
7531 /* paranoia */
7532 BUG_ON(netdev_refcnt_read(dev));
7533 BUG_ON(!list_empty(&dev->ptype_all));
7534 BUG_ON(!list_empty(&dev->ptype_specific));
7535 WARN_ON(rcu_access_pointer(dev->ip_ptr));
7536 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7537 WARN_ON(dev->dn_ptr);
7538
7539 if (dev->destructor)
7540 dev->destructor(dev);
7541
7542 /* Report a network device has been unregistered */
7543 rtnl_lock();
7544 dev_net(dev)->dev_unreg_count--;
7545 __rtnl_unlock();
7546 wake_up(&netdev_unregistering_wq);
7547
7548 /* Free network device */
7549 kobject_put(&dev->dev.kobj);
7550 }
7551}
7552
7553/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7554 * all the same fields in the same order as net_device_stats, with only
7555 * the type differing, but rtnl_link_stats64 may have additional fields
7556 * at the end for newer counters.
7557 */
7558void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7559 const struct net_device_stats *netdev_stats)
7560{
7561#if BITS_PER_LONG == 64
7562 BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
7563 memcpy(stats64, netdev_stats, sizeof(*stats64));
7564 /* zero out counters that only exist in rtnl_link_stats64 */
7565 memset((char *)stats64 + sizeof(*netdev_stats), 0,
7566 sizeof(*stats64) - sizeof(*netdev_stats));
7567#else
7568 size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
7569 const unsigned long *src = (const unsigned long *)netdev_stats;
7570 u64 *dst = (u64 *)stats64;
7571
7572 BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
7573 for (i = 0; i < n; i++)
7574 dst[i] = src[i];
7575 /* zero out counters that only exist in rtnl_link_stats64 */
7576 memset((char *)stats64 + n * sizeof(u64), 0,
7577 sizeof(*stats64) - n * sizeof(u64));
7578#endif
7579}
7580EXPORT_SYMBOL(netdev_stats_to_stats64);
7581
7582/**
7583 * dev_get_stats - get network device statistics
7584 * @dev: device to get statistics from
7585 * @storage: place to store stats
7586 *
7587 * Get network statistics from device. Return @storage.
7588 * The device driver may provide its own method by setting
7589 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7590 * otherwise the internal statistics structure is used.
7591 */
7592struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7593 struct rtnl_link_stats64 *storage)
7594{
7595 const struct net_device_ops *ops = dev->netdev_ops;
7596
7597 if (ops->ndo_get_stats64) {
7598 memset(storage, 0, sizeof(*storage));
7599 ops->ndo_get_stats64(dev, storage);
7600 } else if (ops->ndo_get_stats) {
7601 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7602 } else {
7603 netdev_stats_to_stats64(storage, &dev->stats);
7604 }
7605 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7606 storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7607 storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
7608 return storage;
7609}
7610EXPORT_SYMBOL(dev_get_stats);
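
/*
 * Editor's sketch: a driver that keeps only the legacy unsigned-long
 * counters can service the 64-bit interface by forwarding through
 * netdev_stats_to_stats64(); assumes this kernel's void-returning
 * ndo_get_stats64 signature.
 */
static void demo_get_stats64(struct net_device *dev,
                             struct rtnl_link_stats64 *storage)
{
        netdev_stats_to_stats64(storage, &dev->stats);
}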
7611
7612struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
7613{
7614 struct netdev_queue *queue = dev_ingress_queue(dev);
7615
7616#ifdef CONFIG_NET_CLS_ACT
7617 if (queue)
7618 return queue;
7619 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
7620 if (!queue)
7621 return NULL;
7622 netdev_init_one_queue(dev, queue, NULL);
7623 RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
7624 queue->qdisc_sleeping = &noop_qdisc;
7625 rcu_assign_pointer(dev->ingress_queue, queue);
7626#endif
7627 return queue;
7628}
7629
7630static const struct ethtool_ops default_ethtool_ops;
7631
7632void netdev_set_default_ethtool_ops(struct net_device *dev,
7633 const struct ethtool_ops *ops)
7634{
7635 if (dev->ethtool_ops == &default_ethtool_ops)
7636 dev->ethtool_ops = ops;
7637}
7638EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7639
7640void netdev_freemem(struct net_device *dev)
7641{
7642 char *addr = (char *)dev - dev->padded;
7643
7644 kvfree(addr);
7645}
7646
7647/**
7648 * alloc_netdev_mqs - allocate network device
7649 * @sizeof_priv: size of private data to allocate space for
7650 * @name: device name format string
7651 * @name_assign_type: origin of device name
7652 * @setup: callback to initialize device
7653 * @txqs: the number of TX subqueues to allocate
7654 * @rxqs: the number of RX subqueues to allocate
7655 *
7656 * Allocates a struct net_device with private data area for driver use
7657 * and performs basic initialization. Also allocates subqueue structs
7658 * for each queue on the device.
7659 */
7660struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7661 unsigned char name_assign_type,
7662 void (*setup)(struct net_device *),
7663 unsigned int txqs, unsigned int rxqs)
7664{
7665 struct net_device *dev;
7666 size_t alloc_size;
7667 struct net_device *p;
7668
7669 BUG_ON(strlen(name) >= sizeof(dev->name));
7670
7671 if (txqs < 1) {
7672 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7673 return NULL;
7674 }
7675
7676#ifdef CONFIG_SYSFS
7677 if (rxqs < 1) {
7678 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7679 return NULL;
7680 }
7681#endif
7682
7683 alloc_size = sizeof(struct net_device);
7684 if (sizeof_priv) {
7685 /* ensure 32-byte alignment of private area */
7686 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7687 alloc_size += sizeof_priv;
7688 }
7689 /* ensure 32-byte alignment of whole construct */
7690 alloc_size += NETDEV_ALIGN - 1;
7691
7692 p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7693 if (!p)
7694 p = vzalloc(alloc_size);
7695 if (!p)
7696 return NULL;
7697
7698 dev = PTR_ALIGN(p, NETDEV_ALIGN);
7699 dev->padded = (char *)dev - (char *)p;
7700
7701 dev->pcpu_refcnt = alloc_percpu(int);
7702 if (!dev->pcpu_refcnt)
7703 goto free_dev;
7704
7705 if (dev_addr_init(dev))
7706 goto free_pcpu;
7707
7708 dev_mc_init(dev);
7709 dev_uc_init(dev);
7710
7711 dev_net_set(dev, &init_net);
7712
7713 dev->gso_max_size = GSO_MAX_SIZE;
7714 dev->gso_max_segs = GSO_MAX_SEGS;
7715
7716 INIT_LIST_HEAD(&dev->napi_list);
7717 INIT_LIST_HEAD(&dev->unreg_list);
7718 INIT_LIST_HEAD(&dev->close_list);
7719 INIT_LIST_HEAD(&dev->link_watch_list);
7720 INIT_LIST_HEAD(&dev->adj_list.upper);
7721 INIT_LIST_HEAD(&dev->adj_list.lower);
7722 INIT_LIST_HEAD(&dev->ptype_all);
7723 INIT_LIST_HEAD(&dev->ptype_specific);
7724#ifdef CONFIG_NET_SCHED
7725 hash_init(dev->qdisc_hash);
7726#endif
7727 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7728 setup(dev);
7729
7730 if (!dev->tx_queue_len) {
7731 dev->priv_flags |= IFF_NO_QUEUE;
7732 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
7733 }
7734
7735 dev->num_tx_queues = txqs;
7736 dev->real_num_tx_queues = txqs;
7737 if (netif_alloc_netdev_queues(dev))
7738 goto free_all;
7739
7740#ifdef CONFIG_SYSFS
7741 dev->num_rx_queues = rxqs;
7742 dev->real_num_rx_queues = rxqs;
7743 if (netif_alloc_rx_queues(dev))
7744 goto free_all;
7745#endif
7746
7747 strcpy(dev->name, name);
7748 dev->name_assign_type = name_assign_type;
7749 dev->group = INIT_NETDEV_GROUP;
7750 if (!dev->ethtool_ops)
7751 dev->ethtool_ops = &default_ethtool_ops;
7752
7753 nf_hook_ingress_init(dev);
7754
7755 return dev;
7756
7757free_all:
7758 free_netdev(dev);
7759 return NULL;
7760
7761free_pcpu:
7762 free_percpu(dev->pcpu_refcnt);
7763free_dev:
7764 netdev_freemem(dev);
7765 return NULL;
7766}
7767EXPORT_SYMBOL(alloc_netdev_mqs);
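
/*
 * Editor's sketch: a 4-TX/4-RX-queue device with private data and a
 * setup callback. The setup callback runs after the core defaults above
 * are applied, so it may override flags and ops; "demo" names are
 * illustrative.
 */
struct demo_priv {
        spinlock_t lock;
};

static void demo_setup(struct net_device *dev)
{
        ether_setup(dev);               /* Ethernet defaults */
        dev->flags |= IFF_NOARP;
}

static struct net_device *demo_alloc(void)
{
        return alloc_netdev_mqs(sizeof(struct demo_priv), "demo%d",
                                NET_NAME_UNKNOWN, demo_setup, 4, 4);
}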
7768
7769/**
7770 * free_netdev - free network device
7771 * @dev: device
7772 *
7773 * This function does the last stage of destroying an allocated device
7774 * interface. The reference to the device object is released.
7775 * If this is the last reference then it will be freed.
7776 * Must be called in process context.
7777 */
7778void free_netdev(struct net_device *dev)
7779{
7780 struct napi_struct *p, *n;
7781
7782 might_sleep();
7783 netif_free_tx_queues(dev);
7784#ifdef CONFIG_SYSFS
7785 kvfree(dev->_rx);
7786#endif
7787
7788 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7789
7790 /* Flush device addresses */
7791 dev_addr_flush(dev);
7792
7793 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7794 netif_napi_del(p);
7795
7796 free_percpu(dev->pcpu_refcnt);
7797 dev->pcpu_refcnt = NULL;
7798
7799 /* Compatibility with error handling in drivers */
7800 if (dev->reg_state == NETREG_UNINITIALIZED) {
7801 netdev_freemem(dev);
7802 return;
7803 }
7804
7805 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7806 dev->reg_state = NETREG_RELEASED;
7807
7808 /* will free via device release */
7809 put_device(&dev->dev);
7810}
7811EXPORT_SYMBOL(free_netdev);
7812
7813/**
7814 * synchronize_net - Synchronize with packet receive processing
7815 *
7816 * Wait for packets currently being received to be done.
7817 * Does not block later packets from starting.
7818 */
7819void synchronize_net(void)
7820{
7821 might_sleep();
7822 if (rtnl_is_locked())
7823 synchronize_rcu_expedited();
7824 else
7825 synchronize_rcu();
7826}
7827EXPORT_SYMBOL(synchronize_net);
7828
7829/**
7830 * unregister_netdevice_queue - remove device from the kernel
7831 * @dev: device
7832 * @head: list
7833 *
7834 * This function shuts down a device interface and removes it
7835 * from the kernel tables.
7836 * If head is not NULL, the device is queued to be unregistered later.
7837 *
7838 * Callers must hold the rtnl semaphore. You may want
7839 * unregister_netdev() instead of this.
7840 */
7841
7842void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7843{
7844 ASSERT_RTNL();
7845
7846 if (head) {
7847 list_move_tail(&dev->unreg_list, head);
7848 } else {
7849 rollback_registered(dev);
7850 /* Finish processing unregister after unlock */
7851 net_set_todo(dev);
7852 }
7853}
7854EXPORT_SYMBOL(unregister_netdevice_queue);
7855
7856/**
7857 * unregister_netdevice_many - unregister many devices
7858 * @head: list of devices
7859 *
7860 * Note: As most callers use a stack allocated list_head,
7861 * we force a list_del() to make sure the stack won't be corrupted later.
7862 */
7863void unregister_netdevice_many(struct list_head *head)
7864{
7865 struct net_device *dev;
7866
7867 if (!list_empty(head)) {
7868 rollback_registered_many(head);
7869 list_for_each_entry(dev, head, unreg_list)
7870 net_set_todo(dev);
7871 list_del(head);
7872 }
7873}
7874EXPORT_SYMBOL(unregister_netdevice_many);
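
/*
 * Editor's sketch: batching teardown of many devices under one RTNL
 * hold. The batch path amortizes the synchronize_net() and notifier
 * round-trips in rollback_registered_many() across the whole list.
 */
static void demo_destroy_many(struct net_device *devs[], int n)
{
        LIST_HEAD(kill_list);
        int i;

        rtnl_lock();
        for (i = 0; i < n; i++)
                unregister_netdevice_queue(devs[i], &kill_list);
        unregister_netdevice_many(&kill_list);
        rtnl_unlock();
}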
7875
7876/**
7877 * unregister_netdev - remove device from the kernel
7878 * @dev: device
7879 *
7880 * This function shuts down a device interface and removes it
7881 * from the kernel tables.
7882 *
7883 * This is just a wrapper for unregister_netdevice that takes
7884 * the rtnl semaphore. In general you want to use this and not
7885 * unregister_netdevice.
7886 */
7887void unregister_netdev(struct net_device *dev)
7888{
7889 rtnl_lock();
7890 unregister_netdevice(dev);
7891 rtnl_unlock();
7892}
7893EXPORT_SYMBOL(unregister_netdev);
7894
7895/**
7896 * dev_change_net_namespace - move device to a different network namespace
7897 * @dev: device
7898 * @net: network namespace
7899 * @pat: If not NULL name pattern to try if the current device name
7900 * is already taken in the destination network namespace.
7901 *
7902 * This function shuts down a device interface and moves it
7903 * to a new network namespace. On success 0 is returned, on
7904 * a failure a negative errno code is returned.
7905 *
7906 * Callers must hold the rtnl semaphore.
7907 */
7908
7909int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7910{
7911 int err;
7912
7913 ASSERT_RTNL();
7914
7915 /* Don't allow namespace local devices to be moved. */
7916 err = -EINVAL;
7917 if (dev->features & NETIF_F_NETNS_LOCAL)
7918 goto out;
7919
7920 /* Ensure the device has been registered */
7921 if (dev->reg_state != NETREG_REGISTERED)
7922 goto out;
7923
7924 /* Get out if there is nothing to do */
7925 err = 0;
7926 if (net_eq(dev_net(dev), net))
7927 goto out;
7928
7929 /* Pick the destination device name, and ensure
7930 * we can use it in the destination network namespace.
7931 */
7932 err = -EEXIST;
7933 if (__dev_get_by_name(net, dev->name)) {
7934 /* We get here if we can't use the current device name */
7935 if (!pat)
7936 goto out;
7937 if (dev_get_valid_name(net, dev, pat) < 0)
7938 goto out;
7939 }
7940
7941 /*
7942 * And now a mini version of register_netdevice/unregister_netdevice.
7943 */
7944
7945 /* If device is running close it first. */
7946 dev_close(dev);
7947
7948 /* And unlink it from device chain */
7949 err = -ENODEV;
7950 unlist_netdevice(dev);
7951
7952 synchronize_net();
7953
7954 /* Shutdown queueing discipline. */
7955 dev_shutdown(dev);
7956
7957 /* Notify protocols, that we are about to destroy
7958 this device. They should clean all the things.
7959
7960 Note that dev->reg_state stays at NETREG_REGISTERED.
7961 This is wanted because this way 8021q and macvlan know
7962 the device is just moving and can keep their slaves up.
7963 */
7964 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7965 rcu_barrier();
7966 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7967 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7968
7969 /*
7970 * Flush the unicast and multicast chains
7971 */
7972 dev_uc_flush(dev);
7973 dev_mc_flush(dev);
7974
7975 /* Send a netdev-removed uevent to the old namespace */
7976 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7977 netdev_adjacent_del_links(dev);
7978
7979 /* Actually switch the network namespace */
7980 dev_net_set(dev, net);
7981
7982 /* If there is an ifindex conflict assign a new one */
7983 if (__dev_get_by_index(net, dev->ifindex))
7984 dev->ifindex = dev_new_index(net);
7985
7986 /* Send a netdev-add uevent to the new namespace */
7987 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7988 netdev_adjacent_add_links(dev);
7989
7990 /* Fixup kobjects */
7991 err = device_rename(&dev->dev, dev->name);
7992 WARN_ON(err);
7993
7994 /* Add the device back in the hashes */
7995 list_netdevice(dev);
7996
7997 /* Notify protocols, that a new device appeared. */
7998 call_netdevice_notifiers(NETDEV_REGISTER, dev);
7999
8000 /*
8001 * Prevent userspace races by waiting until the network
8002 * device is fully setup before sending notifications.
8003 */
8004 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
8005
8006 synchronize_net();
8007 err = 0;
8008out:
8009 return err;
8010}
8011EXPORT_SYMBOL_GPL(dev_change_net_namespace);
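
/*
 * Illustrative usage sketch (hypothetical "net", e.g. obtained via
 * get_net_ns_by_pid()): move a device under RTNL, falling back to a
 * "dev%d" pattern if its name is taken in the destination namespace.
 *
 *	rtnl_lock();
 *	err = dev_change_net_namespace(dev, net, "dev%d");
 *	rtnl_unlock();
 */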
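
/* CPU-hotplug "dead" callback: migrate the offline CPU's per-CPU softnet
 * state (completion queue, output queue, NAPI poll list and backlog
 * queues) onto the current CPU.  Hooked up in net_dev_init() below via
 * cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, ...).
 */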
static int dev_cpu_dead(unsigned int oldcpu)
{
	struct sk_buff **list_skb;
	struct sk_buff *skb;
	unsigned int cpu;
	struct softnet_data *sd, *oldsd;

	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Append output queue from offline CPU. */
	if (oldsd->output_queue) {
		*sd->output_queue_tailp = oldsd->output_queue;
		sd->output_queue_tailp = oldsd->output_queue_tailp;
		oldsd->output_queue = NULL;
		oldsd->output_queue_tailp = &oldsd->output_queue;
	}
	/* Append NAPI poll list from offline CPU, with one exception:
	 * process_backlog() must be called by the CPU owning the per-CPU
	 * backlog.  We properly handle process_queue & input_pkt_queue later.
	 */
	while (!list_empty(&oldsd->poll_list)) {
		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
							    struct napi_struct,
							    poll_list);

		list_del_init(&napi->poll_list);
		if (napi->poll == process_backlog)
			napi->state = 0;
		else
			____napi_schedule(sd, napi);
	}

	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

	/* Process offline CPU's input_pkt_queue */
	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
		netif_rx_ni(skb);
		input_queue_head_incr(oldsd);
	}
	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
		netif_rx_ni(skb);
		input_queue_head_incr(oldsd);
	}

	return 0;
}

/**
 * netdev_increment_features - increment feature set by one
 * @all: current feature set
 * @one: new feature set
 * @mask: mask feature set
 *
 * Computes a new feature set after adding a device with feature set
 * @one to the master device with current feature set @all.  Will not
 * enable anything that is off in @mask.  Returns the new feature set.
 */
netdev_features_t netdev_increment_features(netdev_features_t all,
	netdev_features_t one, netdev_features_t mask)
{
	if (mask & NETIF_F_HW_CSUM)
		mask |= NETIF_F_CSUM_MASK;
	mask |= NETIF_F_VLAN_CHALLENGED;

	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
	all &= one | ~NETIF_F_ALL_FOR_ALL;

	/* If one device supports hw checksumming, set for all. */
	if (all & NETIF_F_HW_CSUM)
		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);

	return all;
}
EXPORT_SYMBOL(netdev_increment_features);
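
/*
 * Illustrative sketch of how an aggregating driver might fold a new
 * slave's features into its master's set; bonding and team do something
 * along these lines ("bond_dev"/"slave_dev" and the mask choice here are
 * hypothetical).
 *
 *	netdev_features_t features = bond_dev->features;
 *
 *	features = netdev_increment_features(features,
 *					     slave_dev->features,
 *					     bond_dev->hw_features);
 */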

static struct hlist_head * __net_init netdev_create_hash(void)
{
	int i;
	struct hlist_head *hash;

	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
	if (hash != NULL)
		for (i = 0; i < NETDEV_HASHENTRIES; i++)
			INIT_HLIST_HEAD(&hash[i]);

	return hash;
}

/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
	if (net != &init_net)
		INIT_LIST_HEAD(&net->dev_base_head);

	net->dev_name_head = netdev_create_hash();
	if (net->dev_name_head == NULL)
		goto err_name;

	net->dev_index_head = netdev_create_hash();
	if (net->dev_index_head == NULL)
		goto err_idx;

	return 0;

err_idx:
	kfree(net->dev_name_head);
err_name:
	return -ENOMEM;
}

/**
 * netdev_drivername - network driver for the device
 * @dev: network device
 *
 * Determine network driver for device.
 */
const char *netdev_drivername(const struct net_device *dev)
{
	const struct device_driver *driver;
	const struct device *parent;
	const char *empty = "";

	parent = dev->dev.parent;
	if (!parent)
		return empty;

	driver = parent->driver;
	if (driver && driver->name)
		return driver->name;
	return empty;
}

static void __netdev_printk(const char *level, const struct net_device *dev,
			    struct va_format *vaf)
{
	if (dev && dev->dev.parent) {
		dev_printk_emit(level[1] - '0',
				dev->dev.parent,
				"%s %s %s%s: %pV",
				dev_driver_string(dev->dev.parent),
				dev_name(dev->dev.parent),
				netdev_name(dev), netdev_reg_state(dev),
				vaf);
	} else if (dev) {
		printk("%s%s%s: %pV",
		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
	} else {
		printk("%s(NULL net_device): %pV", level, vaf);
	}
}

void netdev_printk(const char *level, const struct net_device *dev,
		   const char *format, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, format);

	vaf.fmt = format;
	vaf.va = &args;

	__netdev_printk(level, dev, &vaf);

	va_end(args);
}
EXPORT_SYMBOL(netdev_printk);

#define define_netdev_printk_level(func, level)			\
void func(const struct net_device *dev, const char *fmt, ...)	\
{								\
	struct va_format vaf;					\
	va_list args;						\
								\
	va_start(args, fmt);					\
								\
	vaf.fmt = fmt;						\
	vaf.va = &args;						\
								\
	__netdev_printk(level, dev, &vaf);			\
								\
	va_end(args);						\
}								\
EXPORT_SYMBOL(func);

define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);
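
/*
 * Illustrative usage sketch (hypothetical "status"/"oldname"): the
 * generated helpers read like dev_err()/dev_info(), prefixing the driver,
 * bus device and interface name to the message.
 *
 *	netdev_err(dev, "link down, resetting (status %#x)\n", status);
 *	netdev_info(dev, "renamed from %s\n", oldname);
 */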

static void __net_exit netdev_exit(struct net *net)
{
	kfree(net->dev_name_head);
	kfree(net->dev_index_head);
}

static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};

static void __net_exit default_device_exit(struct net *net)
{
	struct net_device *dev, *aux;
	/*
	 * Push all migratable network devices back to the
	 * initial network namespace
	 */
	rtnl_lock();
	for_each_netdev_safe(net, dev, aux) {
		int err;
		char fb_name[IFNAMSIZ];

		/* Ignore unmovable devices (e.g. loopback) */
		if (dev->features & NETIF_F_NETNS_LOCAL)
			continue;

		/* Leave virtual devices for the generic cleanup */
		if (dev->rtnl_link_ops)
			continue;

		/* Push remaining network devices to init_net */
		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
		err = dev_change_net_namespace(dev, &init_net, fb_name);
		if (err) {
			pr_emerg("%s: failed to move %s to init_net: %d\n",
				 __func__, dev->name, err);
			BUG();
		}
	}
	rtnl_unlock();
}

static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
{
	/* Return with the rtnl_lock held when there are no network
	 * devices unregistering in any network namespace in net_list.
	 */
	struct net *net;
	bool unregistering;
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	add_wait_queue(&netdev_unregistering_wq, &wait);
	for (;;) {
		unregistering = false;
		rtnl_lock();
		list_for_each_entry(net, net_list, exit_list) {
			if (net->dev_unreg_count > 0) {
				unregistering = true;
				break;
			}
		}
		if (!unregistering)
			break;
		__rtnl_unlock();

		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
	}
	remove_wait_queue(&netdev_unregistering_wq, &wait);
}

static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
	/* At exit all network devices must be removed from a network
	 * namespace.  Do this in the reverse order of registration.
	 * Do this across as many network namespaces as possible to
	 * improve batching efficiency.
	 */
	struct net_device *dev;
	struct net *net;
	LIST_HEAD(dev_kill_list);

	/* To prevent network device cleanup code from dereferencing
	 * loopback devices or network devices that have been freed,
	 * wait here for all pending unregistrations to complete
	 * before unregistering the loopback device and allowing the
	 * network namespace to be freed.
	 *
	 * The netdev todo list containing all network device
	 * unregistrations that happen in default_device_exit_batch
	 * will run in the rtnl_unlock() at the end of
	 * default_device_exit_batch.
	 */
	rtnl_lock_unregistering(net_list);
	list_for_each_entry(net, net_list, exit_list) {
		for_each_netdev_reverse(net, dev) {
			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
			else
				unregister_netdevice_queue(dev, &dev_kill_list);
		}
	}
	unregister_netdevice_many(&dev_kill_list);
	rtnl_unlock();
}

static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};

/*
 * Initialize the DEV module.  At boot time this walks the device list and
 * unhooks any devices that fail to initialise (normally hardware not
 * present) and leaves us with a valid list of present and active devices.
 */

/*
 * This is called single threaded during boot, so no need
 * to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	INIT_LIST_HEAD(&offload_base);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 * Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		INIT_WORK(flush, flush_backlog);

		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
		INIT_LIST_HEAD(&sd->poll_list);
		sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
		sd->csd.func = rps_trigger_softirq;
		sd->csd.info = sd;
		sd->cpu = i;
#endif

		sd->backlog.poll = process_backlog;
		sd->backlog.weight = weight_p;
	}

	dev_boot_phase = 0;

	/* The loopback device is special: if any other network device
	 * is present in a network namespace, the loopback device must
	 * be present too.  Since we now dynamically allocate and free
	 * the loopback device, ensure this invariant is maintained by
	 * keeping the loopback device as the first device on the list
	 * of network devices.  Thus the loopback device is the first
	 * device that appears and the last network device that
	 * disappears.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
				       NULL, dev_cpu_dead);
	WARN_ON(rc < 0);
	dst_subsys_init();
	rc = 0;
out:
	return rc;
}

subsys_initcall(net_dev_init);