net/core/dev.c at v4.10 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / core / dev.c
at v4.10 214 kB view raw
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <linux/uaccess.h>
  76#include <linux/bitops.h>
  77#include <linux/capability.h>
  78#include <linux/cpu.h>
  79#include <linux/types.h>
  80#include <linux/kernel.h>
  81#include <linux/hash.h>
  82#include <linux/slab.h>
  83#include <linux/sched.h>
  84#include <linux/mutex.h>
  85#include <linux/string.h>
  86#include <linux/mm.h>
  87#include <linux/socket.h>
  88#include <linux/sockios.h>
  89#include <linux/errno.h>
  90#include <linux/interrupt.h>
  91#include <linux/if_ether.h>
  92#include <linux/netdevice.h>
  93#include <linux/etherdevice.h>
  94#include <linux/ethtool.h>
  95#include <linux/notifier.h>
  96#include <linux/skbuff.h>
  97#include <linux/bpf.h>
  98#include <net/net_namespace.h>
  99#include <net/sock.h>
 100#include <net/busy_poll.h>
 101#include <linux/rtnetlink.h>
 102#include <linux/stat.h>
 103#include <net/dst.h>
 104#include <net/dst_metadata.h>
 105#include <net/pkt_sched.h>
 106#include <net/checksum.h>
 107#include <net/xfrm.h>
 108#include <linux/highmem.h>
 109#include <linux/init.h>
 110#include <linux/module.h>
 111#include <linux/netpoll.h>
 112#include <linux/rcupdate.h>
 113#include <linux/delay.h>
 114#include <net/iw_handler.h>
 115#include <asm/current.h>
 116#include <linux/audit.h>
 117#include <linux/dmaengine.h>
 118#include <linux/err.h>
 119#include <linux/ctype.h>
 120#include <linux/if_arp.h>
 121#include <linux/if_vlan.h>
 122#include <linux/ip.h>
 123#include <net/ip.h>
 124#include <net/mpls.h>
 125#include <linux/ipv6.h>
 126#include <linux/in.h>
 127#include <linux/jhash.h>
 128#include <linux/random.h>
 129#include <trace/events/napi.h>
 130#include <trace/events/net.h>
 131#include <trace/events/skb.h>
 132#include <linux/pci.h>
 133#include <linux/inetdevice.h>
 134#include <linux/cpu_rmap.h>
 135#include <linux/static_key.h>
 136#include <linux/hashtable.h>
 137#include <linux/vmalloc.h>
 138#include <linux/if_macvlan.h>
 139#include <linux/errqueue.h>
 140#include <linux/hrtimer.h>
 141#include <linux/netfilter_ingress.h>
 142#include <linux/crash_dump.h>
 143
 144#include "net-sysfs.h"
 145
 146/* Instead of increasing this, you should create a hash table. */
 147#define MAX_GRO_SKBS 8
 148
 149/* This should be increased if a protocol with a bigger head is added. */
 150#define GRO_MAX_HEAD (MAX_HEADER + 128)
 151
 152static DEFINE_SPINLOCK(ptype_lock);
 153static DEFINE_SPINLOCK(offload_lock);
 154struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 155struct list_head ptype_all __read_mostly;	/* Taps */
 156static struct list_head offload_base __read_mostly;
 157
 158static int netif_rx_internal(struct sk_buff *skb);
 159static int call_netdevice_notifiers_info(unsigned long val,
 160					 struct net_device *dev,
 161					 struct netdev_notifier_info *info);
 162
 163/*
 164 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 165 * semaphore.
 166 *
 167 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 168 *
 169 * Writers must hold the rtnl semaphore while they loop through the
 170 * dev_base_head list, and hold dev_base_lock for writing when they do the
 171 * actual updates.  This allows pure readers to access the list even
 172 * while a writer is preparing to update it.
 173 *
 174 * To put it another way, dev_base_lock is held for writing only to
 175 * protect against pure readers; the rtnl semaphore provides the
 176 * protection against other writers.
 177 *
 * For example usages, see register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 181 */
 182DEFINE_RWLOCK(dev_base_lock);
 183EXPORT_SYMBOL(dev_base_lock);
 184
 185/* protects napi_hash addition/deletion and napi_gen_id */
 186static DEFINE_SPINLOCK(napi_hash_lock);
 187
 188static unsigned int napi_gen_id = NR_CPUS;
 189static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
 190
 191static seqcount_t devnet_rename_seq;
 192
 193static inline void dev_base_seq_inc(struct net *net)
 194{
 195	while (++net->dev_base_seq == 0);
 196}
 197
 198static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 199{
 200	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
 201
 202	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 203}
 204
 205static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 206{
 207	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 208}
 209
 210static inline void rps_lock(struct softnet_data *sd)
 211{
 212#ifdef CONFIG_RPS
 213	spin_lock(&sd->input_pkt_queue.lock);
 214#endif
 215}
 216
 217static inline void rps_unlock(struct softnet_data *sd)
 218{
 219#ifdef CONFIG_RPS
 220	spin_unlock(&sd->input_pkt_queue.lock);
 221#endif
 222}
 223
 224/* Device list insertion */
 225static void list_netdevice(struct net_device *dev)
 226{
 227	struct net *net = dev_net(dev);
 228
 229	ASSERT_RTNL();
 230
 231	write_lock_bh(&dev_base_lock);
 232	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 233	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 234	hlist_add_head_rcu(&dev->index_hlist,
 235			   dev_index_hash(net, dev->ifindex));
 236	write_unlock_bh(&dev_base_lock);
 237
 238	dev_base_seq_inc(net);
 239}
 240
 241/* Device list removal
 242 * caller must respect a RCU grace period before freeing/reusing dev
 243 */
 244static void unlist_netdevice(struct net_device *dev)
 245{
 246	ASSERT_RTNL();
 247
 248	/* Unlink dev from the device chain */
 249	write_lock_bh(&dev_base_lock);
 250	list_del_rcu(&dev->dev_list);
 251	hlist_del_rcu(&dev->name_hlist);
 252	hlist_del_rcu(&dev->index_hlist);
 253	write_unlock_bh(&dev_base_lock);
 254
 255	dev_base_seq_inc(dev_net(dev));
 256}
 257
 258/*
 259 *	Our notifier list
 260 */
 261
 262static RAW_NOTIFIER_HEAD(netdev_chain);
 263
 264/*
 265 *	Device drivers call our routines to queue packets here. We empty the
 266 *	queue in the local softnet handler.
 267 */
 268
 269DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 270EXPORT_PER_CPU_SYMBOL(softnet_data);
 271
 272#ifdef CONFIG_LOCKDEP
 273/*
 274 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 275 * according to dev->type
 276 */
 277static const unsigned short netdev_lock_type[] =
 278	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 279	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 280	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 281	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 282	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 283	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 284	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 285	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 286	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 287	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 288	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 289	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 290	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 291	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 292	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 293
 294static const char *const netdev_lock_name[] =
 295	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 296	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 297	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 298	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 299	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 300	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 301	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 302	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 303	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 304	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 305	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 306	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 307	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 308	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 309	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 310
 311static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 312static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 313
 314static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 315{
 316	int i;
 317
 318	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 319		if (netdev_lock_type[i] == dev_type)
 320			return i;
 321	/* the last key is used by default */
 322	return ARRAY_SIZE(netdev_lock_type) - 1;
 323}
 324
 325static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 326						 unsigned short dev_type)
 327{
 328	int i;
 329
 330	i = netdev_lock_pos(dev_type);
 331	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 332				   netdev_lock_name[i]);
 333}
 334
 335static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 336{
 337	int i;
 338
 339	i = netdev_lock_pos(dev->type);
 340	lockdep_set_class_and_name(&dev->addr_list_lock,
 341				   &netdev_addr_lock_key[i],
 342				   netdev_lock_name[i]);
 343}
 344#else
 345static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 346						 unsigned short dev_type)
 347{
 348}
 349static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 350{
 351}
 352#endif
 353
 354/*******************************************************************************
 355
 356		Protocol management and registration routines
 357
 358*******************************************************************************/
 359
 360/*
 361 *	Add a protocol ID to the list. Now that the input handler is
 362 *	smarter we can dispense with all the messy stuff that used to be
 363 *	here.
 364 *
 *	BEWARE!!! Protocol handlers that mangle input packets
 *	MUST BE last in their hash buckets, and the search for protocol
 *	handlers MUST start from the promiscuous ptype_all chain in net_bh.
 *	This is true now; do not change it.
 *	Explanation: if a protocol handler that mangles the packet were
 *	first on the list, it could not tell that the packet is cloned
 *	and should be copied-on-write, so it would modify the packet in
 *	place and subsequent readers would get a broken packet.
 *							--ANK (980803)
 374 */
 375
 376static inline struct list_head *ptype_head(const struct packet_type *pt)
 377{
 378	if (pt->type == htons(ETH_P_ALL))
 379		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 380	else
 381		return pt->dev ? &pt->dev->ptype_specific :
 382				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 383}
 384
 385/**
 386 *	dev_add_pack - add packet handler
 387 *	@pt: packet type declaration
 388 *
 389 *	Add a protocol handler to the networking stack. The passed &packet_type
 390 *	is linked into kernel lists and may not be freed until it has been
 391 *	removed from the kernel lists.
 392 *
 393 *	This call does not sleep therefore it can not
 394 *	guarantee all CPU's that are in middle of receiving packets
 395 *	will see the new packet type (until the next received packet).
 396 */
 397
 398void dev_add_pack(struct packet_type *pt)
 399{
 400	struct list_head *head = ptype_head(pt);
 401
 402	spin_lock(&ptype_lock);
 403	list_add_rcu(&pt->list, head);
 404	spin_unlock(&ptype_lock);
 405}
 406EXPORT_SYMBOL(dev_add_pack);
 407
 408/**
 409 *	__dev_remove_pack	 - remove packet handler
 410 *	@pt: packet type declaration
 411 *
 412 *	Remove a protocol handler that was previously added to the kernel
 413 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 414 *	from the kernel lists and can be freed or reused once this function
 415 *	returns.
 416 *
 417 *      The packet type might still be in use by receivers
 418 *	and must not be freed until after all the CPU's have gone
 419 *	through a quiescent state.
 420 */
 421void __dev_remove_pack(struct packet_type *pt)
 422{
 423	struct list_head *head = ptype_head(pt);
 424	struct packet_type *pt1;
 425
 426	spin_lock(&ptype_lock);
 427
 428	list_for_each_entry(pt1, head, list) {
 429		if (pt == pt1) {
 430			list_del_rcu(&pt->list);
 431			goto out;
 432		}
 433	}
 434
 435	pr_warn("dev_remove_pack: %p not found\n", pt);
 436out:
 437	spin_unlock(&ptype_lock);
 438}
 439EXPORT_SYMBOL(__dev_remove_pack);
 440
 441/**
 442 *	dev_remove_pack	 - remove packet handler
 443 *	@pt: packet type declaration
 444 *
 445 *	Remove a protocol handler that was previously added to the kernel
 446 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 447 *	from the kernel lists and can be freed or reused once this function
 448 *	returns.
 449 *
 450 *	This call sleeps to guarantee that no CPU is looking at the packet
 451 *	type after return.
 452 */
 453void dev_remove_pack(struct packet_type *pt)
 454{
 455	__dev_remove_pack(pt);
 456
 457	synchronize_net();
 458}
 459EXPORT_SYMBOL(dev_remove_pack);
 460
 461
 462/**
 463 *	dev_add_offload - register offload handlers
 464 *	@po: protocol offload declaration
 465 *
 466 *	Add protocol offload handlers to the networking stack. The passed
 467 *	&proto_offload is linked into kernel lists and may not be freed until
 468 *	it has been removed from the kernel lists.
 469 *
 470 *	This call does not sleep therefore it can not
 471 *	guarantee all CPU's that are in middle of receiving packets
 472 *	will see the new offload handlers (until the next received packet).
 473 */
 474void dev_add_offload(struct packet_offload *po)
 475{
 476	struct packet_offload *elem;
 477
 478	spin_lock(&offload_lock);
 479	list_for_each_entry(elem, &offload_base, list) {
 480		if (po->priority < elem->priority)
 481			break;
 482	}
 483	list_add_rcu(&po->list, elem->list.prev);
 484	spin_unlock(&offload_lock);
 485}
 486EXPORT_SYMBOL(dev_add_offload);
 487
 488/**
 489 *	__dev_remove_offload	 - remove offload handler
 490 *	@po: packet offload declaration
 491 *
 492 *	Remove a protocol offload handler that was previously added to the
 493 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 494 *	is removed from the kernel lists and can be freed or reused once this
 495 *	function returns.
 496 *
 497 *      The packet type might still be in use by receivers
 498 *	and must not be freed until after all the CPU's have gone
 499 *	through a quiescent state.
 500 */
 501static void __dev_remove_offload(struct packet_offload *po)
 502{
 503	struct list_head *head = &offload_base;
 504	struct packet_offload *po1;
 505
 506	spin_lock(&offload_lock);
 507
 508	list_for_each_entry(po1, head, list) {
 509		if (po == po1) {
 510			list_del_rcu(&po->list);
 511			goto out;
 512		}
 513	}
 514
 515	pr_warn("dev_remove_offload: %p not found\n", po);
 516out:
 517	spin_unlock(&offload_lock);
 518}
 519
 520/**
 521 *	dev_remove_offload	 - remove packet offload handler
 522 *	@po: packet offload declaration
 523 *
 524 *	Remove a packet offload handler that was previously added to the kernel
 525 *	offload handlers by dev_add_offload(). The passed &offload_type is
 526 *	removed from the kernel lists and can be freed or reused once this
 527 *	function returns.
 528 *
 529 *	This call sleeps to guarantee that no CPU is looking at the packet
 530 *	type after return.
 531 */
 532void dev_remove_offload(struct packet_offload *po)
 533{
 534	__dev_remove_offload(po);
 535
 536	synchronize_net();
 537}
 538EXPORT_SYMBOL(dev_remove_offload);
 539
 540/******************************************************************************
 541
 542		      Device Boot-time Settings Routines
 543
 544*******************************************************************************/
 545
 546/* Boot time configuration table */
 547static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 548
 549/**
 550 *	netdev_boot_setup_add	- add new setup entry
 551 *	@name: name of the device
 552 *	@map: configured settings for the device
 553 *
 554 *	Adds new setup entry to the dev_boot_setup list.  The function
 555 *	returns 0 on error and 1 on success.  This is a generic routine to
 556 *	all netdevices.
 557 */
 558static int netdev_boot_setup_add(char *name, struct ifmap *map)
 559{
 560	struct netdev_boot_setup *s;
 561	int i;
 562
 563	s = dev_boot_setup;
 564	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 565		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 566			memset(s[i].name, 0, sizeof(s[i].name));
 567			strlcpy(s[i].name, name, IFNAMSIZ);
 568			memcpy(&s[i].map, map, sizeof(s[i].map));
 569			break;
 570		}
 571	}
 572
 573	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 574}
 575
 576/**
 577 *	netdev_boot_setup_check	- check boot time settings
 578 *	@dev: the netdevice
 579 *
 580 * 	Check boot time settings for the device.
 581 *	The found settings are set for the device to be used
 582 *	later in the device probing.
 583 *	Returns 0 if no settings found, 1 if they are.
 584 */
 585int netdev_boot_setup_check(struct net_device *dev)
 586{
 587	struct netdev_boot_setup *s = dev_boot_setup;
 588	int i;
 589
 590	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 591		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 592		    !strcmp(dev->name, s[i].name)) {
 593			dev->irq 	= s[i].map.irq;
 594			dev->base_addr 	= s[i].map.base_addr;
 595			dev->mem_start 	= s[i].map.mem_start;
 596			dev->mem_end 	= s[i].map.mem_end;
 597			return 1;
 598		}
 599	}
 600	return 0;
 601}
 602EXPORT_SYMBOL(netdev_boot_setup_check);
 603
 604
 605/**
 606 *	netdev_boot_base	- get address from boot time settings
 607 *	@prefix: prefix for network device
 608 *	@unit: id for network device
 609 *
 610 * 	Check boot time settings for the base address of device.
 611 *	The found settings are set for the device to be used
 612 *	later in the device probing.
 613 *	Returns 0 if no settings found.
 614 */
 615unsigned long netdev_boot_base(const char *prefix, int unit)
 616{
 617	const struct netdev_boot_setup *s = dev_boot_setup;
 618	char name[IFNAMSIZ];
 619	int i;
 620
 621	sprintf(name, "%s%d", prefix, unit);
 622
 623	/*
 624	 * If device already registered then return base of 1
 625	 * to indicate not to probe for this interface
 626	 */
 627	if (__dev_get_by_name(&init_net, name))
 628		return 1;
 629
 630	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 631		if (!strcmp(name, s[i].name))
 632			return s[i].map.base_addr;
 633	return 0;
 634}
 635
 636/*
 637 * Saves at boot time configured settings for any netdevice.
 638 */
 639int __init netdev_boot_setup(char *str)
 640{
 641	int ints[5];
 642	struct ifmap map;
 643
 644	str = get_options(str, ARRAY_SIZE(ints), ints);
 645	if (!str || !*str)
 646		return 0;
 647
 648	/* Save settings */
 649	memset(&map, 0, sizeof(map));
 650	if (ints[0] > 0)
 651		map.irq = ints[1];
 652	if (ints[0] > 1)
 653		map.base_addr = ints[2];
 654	if (ints[0] > 2)
 655		map.mem_start = ints[3];
 656	if (ints[0] > 3)
 657		map.mem_end = ints[4];
 658
 659	/* Add new entry to the list */
 660	return netdev_boot_setup_add(str, &map);
 661}
 662
 663__setup("netdev=", netdev_boot_setup);
 664
 665/*******************************************************************************
 666
 667			    Device Interface Subroutines
 668
 669*******************************************************************************/
 670
 671/**
 672 *	dev_get_iflink	- get 'iflink' value of a interface
 673 *	@dev: targeted interface
 674 *
 675 *	Indicates the ifindex the interface is linked to.
 676 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
 677 */
 678
 679int dev_get_iflink(const struct net_device *dev)
 680{
 681	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 682		return dev->netdev_ops->ndo_get_iflink(dev);
 683
 684	return dev->ifindex;
 685}
 686EXPORT_SYMBOL(dev_get_iflink);
 687
 688/**
 689 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
 690 *	@dev: targeted interface
 691 *	@skb: The packet.
 692 *
 693 *	For better visibility of tunnel traffic OVS needs to retrieve
 694 *	egress tunnel information for a packet. Following API allows
 695 *	user to get this info.
 696 */
 697int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 698{
 699	struct ip_tunnel_info *info;
 700
 701	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
 702		return -EINVAL;
 703
 704	info = skb_tunnel_info_unclone(skb);
 705	if (!info)
 706		return -ENOMEM;
 707	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
 708		return -EINVAL;
 709
 710	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
 711}
 712EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
 713
 714/**
 715 *	__dev_get_by_name	- find a device by its name
 716 *	@net: the applicable net namespace
 717 *	@name: name to find
 718 *
 719 *	Find an interface by name. Must be called under RTNL semaphore
 720 *	or @dev_base_lock. If the name is found a pointer to the device
 721 *	is returned. If the name is not found then %NULL is returned. The
 722 *	reference counters are not incremented so the caller must be
 723 *	careful with locks.
 724 */
 725
 726struct net_device *__dev_get_by_name(struct net *net, const char *name)
 727{
 728	struct net_device *dev;
 729	struct hlist_head *head = dev_name_hash(net, name);
 730
 731	hlist_for_each_entry(dev, head, name_hlist)
 732		if (!strncmp(dev->name, name, IFNAMSIZ))
 733			return dev;
 734
 735	return NULL;
 736}
 737EXPORT_SYMBOL(__dev_get_by_name);
 738
 739/**
 740 *	dev_get_by_name_rcu	- find a device by its name
 741 *	@net: the applicable net namespace
 742 *	@name: name to find
 743 *
 744 *	Find an interface by name.
 745 *	If the name is found a pointer to the device is returned.
 746 * 	If the name is not found then %NULL is returned.
 747 *	The reference counters are not incremented so the caller must be
 748 *	careful with locks. The caller must hold RCU lock.
 749 */
 750
 751struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 752{
 753	struct net_device *dev;
 754	struct hlist_head *head = dev_name_hash(net, name);
 755
 756	hlist_for_each_entry_rcu(dev, head, name_hlist)
 757		if (!strncmp(dev->name, name, IFNAMSIZ))
 758			return dev;
 759
 760	return NULL;
 761}
 762EXPORT_SYMBOL(dev_get_by_name_rcu);
 763
 764/**
 765 *	dev_get_by_name		- find a device by its name
 766 *	@net: the applicable net namespace
 767 *	@name: name to find
 768 *
 769 *	Find an interface by name. This can be called from any
 770 *	context and does its own locking. The returned handle has
 771 *	the usage count incremented and the caller must use dev_put() to
 772 *	release it when it is no longer needed. %NULL is returned if no
 773 *	matching device is found.
 774 */
 775
 776struct net_device *dev_get_by_name(struct net *net, const char *name)
 777{
 778	struct net_device *dev;
 779
 780	rcu_read_lock();
 781	dev = dev_get_by_name_rcu(net, name);
 782	if (dev)
 783		dev_hold(dev);
 784	rcu_read_unlock();
 785	return dev;
 786}
 787EXPORT_SYMBOL(dev_get_by_name);
 788
 789/**
 790 *	__dev_get_by_index - find a device by its ifindex
 791 *	@net: the applicable net namespace
 792 *	@ifindex: index of device
 793 *
 794 *	Search for an interface by index. Returns %NULL if the device
 795 *	is not found or a pointer to the device. The device has not
 796 *	had its reference counter increased so the caller must be careful
 797 *	about locking. The caller must hold either the RTNL semaphore
 798 *	or @dev_base_lock.
 799 */
 800
 801struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 802{
 803	struct net_device *dev;
 804	struct hlist_head *head = dev_index_hash(net, ifindex);
 805
 806	hlist_for_each_entry(dev, head, index_hlist)
 807		if (dev->ifindex == ifindex)
 808			return dev;
 809
 810	return NULL;
 811}
 812EXPORT_SYMBOL(__dev_get_by_index);
 813
 814/**
 815 *	dev_get_by_index_rcu - find a device by its ifindex
 816 *	@net: the applicable net namespace
 817 *	@ifindex: index of device
 818 *
 819 *	Search for an interface by index. Returns %NULL if the device
 820 *	is not found or a pointer to the device. The device has not
 821 *	had its reference counter increased so the caller must be careful
 822 *	about locking. The caller must hold RCU lock.
 823 */
 824
 825struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 826{
 827	struct net_device *dev;
 828	struct hlist_head *head = dev_index_hash(net, ifindex);
 829
 830	hlist_for_each_entry_rcu(dev, head, index_hlist)
 831		if (dev->ifindex == ifindex)
 832			return dev;
 833
 834	return NULL;
 835}
 836EXPORT_SYMBOL(dev_get_by_index_rcu);
 837
 838
 839/**
 840 *	dev_get_by_index - find a device by its ifindex
 841 *	@net: the applicable net namespace
 842 *	@ifindex: index of device
 843 *
 844 *	Search for an interface by index. Returns NULL if the device
 845 *	is not found or a pointer to the device. The device returned has
 846 *	had a reference added and the pointer is safe until the user calls
 847 *	dev_put to indicate they have finished with it.
 848 */
 849
 850struct net_device *dev_get_by_index(struct net *net, int ifindex)
 851{
 852	struct net_device *dev;
 853
 854	rcu_read_lock();
 855	dev = dev_get_by_index_rcu(net, ifindex);
 856	if (dev)
 857		dev_hold(dev);
 858	rcu_read_unlock();
 859	return dev;
 860}
 861EXPORT_SYMBOL(dev_get_by_index);
 862
 863/**
 864 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 865 *	@net: network namespace
 866 *	@name: a pointer to the buffer where the name will be stored.
 867 *	@ifindex: the ifindex of the interface to get the name from.
 868 *
 869 *	The use of raw_seqcount_begin() and cond_resched() before
 870 *	retrying is required as we want to give the writers a chance
 871 *	to complete when CONFIG_PREEMPT is not set.
 872 */
 873int netdev_get_name(struct net *net, char *name, int ifindex)
 874{
 875	struct net_device *dev;
 876	unsigned int seq;
 877
 878retry:
 879	seq = raw_seqcount_begin(&devnet_rename_seq);
 880	rcu_read_lock();
 881	dev = dev_get_by_index_rcu(net, ifindex);
 882	if (!dev) {
 883		rcu_read_unlock();
 884		return -ENODEV;
 885	}
 886
 887	strcpy(name, dev->name);
 888	rcu_read_unlock();
 889	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 890		cond_resched();
 891		goto retry;
 892	}
 893
 894	return 0;
 895}
 896
 897/**
 898 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 899 *	@net: the applicable net namespace
 900 *	@type: media type of device
 901 *	@ha: hardware address
 902 *
 903 *	Search for an interface by MAC address. Returns NULL if the device
 904 *	is not found or a pointer to the device.
 905 *	The caller must hold RCU or RTNL.
 906 *	The returned device has not had its ref count increased
 907 *	and the caller must therefore be careful about locking
 908 *
 909 */
 910
 911struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 912				       const char *ha)
 913{
 914	struct net_device *dev;
 915
 916	for_each_netdev_rcu(net, dev)
 917		if (dev->type == type &&
 918		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 919			return dev;
 920
 921	return NULL;
 922}
 923EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 924
 925struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 926{
 927	struct net_device *dev;
 928
 929	ASSERT_RTNL();
 930	for_each_netdev(net, dev)
 931		if (dev->type == type)
 932			return dev;
 933
 934	return NULL;
 935}
 936EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 937
 938struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 939{
 940	struct net_device *dev, *ret = NULL;
 941
 942	rcu_read_lock();
 943	for_each_netdev_rcu(net, dev)
 944		if (dev->type == type) {
 945			dev_hold(dev);
 946			ret = dev;
 947			break;
 948		}
 949	rcu_read_unlock();
 950	return ret;
 951}
 952EXPORT_SYMBOL(dev_getfirstbyhwtype);
 953
 954/**
 955 *	__dev_get_by_flags - find any device with given flags
 956 *	@net: the applicable net namespace
 957 *	@if_flags: IFF_* values
 958 *	@mask: bitmask of bits in if_flags to check
 959 *
 960 *	Search for any interface with the given flags. Returns NULL if a device
 961 *	is not found or a pointer to the device. Must be called inside
 962 *	rtnl_lock(), and result refcount is unchanged.
 963 */
 964
 965struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 966				      unsigned short mask)
 967{
 968	struct net_device *dev, *ret;
 969
 970	ASSERT_RTNL();
 971
 972	ret = NULL;
 973	for_each_netdev(net, dev) {
 974		if (((dev->flags ^ if_flags) & mask) == 0) {
 975			ret = dev;
 976			break;
 977		}
 978	}
 979	return ret;
 980}
 981EXPORT_SYMBOL(__dev_get_by_flags);
 982
 983/**
 984 *	dev_valid_name - check if name is okay for network device
 985 *	@name: name string
 986 *
 987 *	Network device names need to be valid file names to
 988 *	to allow sysfs to work.  We also disallow any kind of
 989 *	whitespace.
 990 */
 991bool dev_valid_name(const char *name)
 992{
 993	if (*name == '\0')
 994		return false;
 995	if (strlen(name) >= IFNAMSIZ)
 996		return false;
 997	if (!strcmp(name, ".") || !strcmp(name, ".."))
 998		return false;
 999
1000	while (*name) {
1001		if (*name == '/' || *name == ':' || isspace(*name))
1002			return false;
1003		name++;
1004	}
1005	return true;
1006}
1007EXPORT_SYMBOL(dev_valid_name);
1008
1009/**
1010 *	__dev_alloc_name - allocate a name for a device
1011 *	@net: network namespace to allocate the device name in
1012 *	@name: name format string
1013 *	@buf:  scratch buffer and result name string
1014 *
1015 *	Passed a format string - eg "lt%d" it will try and find a suitable
1016 *	id. It scans list of devices to build up a free map, then chooses
1017 *	the first empty slot. The caller must hold the dev_base or rtnl lock
1018 *	while allocating the name and adding the device in order to avoid
1019 *	duplicates.
1020 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1021 *	Returns the number of the unit assigned or a negative errno code.
1022 */
1023
1024static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1025{
1026	int i = 0;
1027	const char *p;
1028	const int max_netdevices = 8*PAGE_SIZE;
1029	unsigned long *inuse;
1030	struct net_device *d;
1031
1032	p = strnchr(name, IFNAMSIZ-1, '%');
1033	if (p) {
1034		/*
1035		 * Verify the string as this thing may have come from
1036		 * the user.  There must be either one "%d" and no other "%"
1037		 * characters.
1038		 */
1039		if (p[1] != 'd' || strchr(p + 2, '%'))
1040			return -EINVAL;
1041
1042		/* Use one page as a bit array of possible slots */
1043		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1044		if (!inuse)
1045			return -ENOMEM;
1046
1047		for_each_netdev(net, d) {
1048			if (!sscanf(d->name, name, &i))
1049				continue;
1050			if (i < 0 || i >= max_netdevices)
1051				continue;
1052
1053			/*  avoid cases where sscanf is not exact inverse of printf */
1054			snprintf(buf, IFNAMSIZ, name, i);
1055			if (!strncmp(buf, d->name, IFNAMSIZ))
1056				set_bit(i, inuse);
1057		}
1058
1059		i = find_first_zero_bit(inuse, max_netdevices);
1060		free_page((unsigned long) inuse);
1061	}
1062
1063	if (buf != name)
1064		snprintf(buf, IFNAMSIZ, name, i);
1065	if (!__dev_get_by_name(net, buf))
1066		return i;
1067
1068	/* It is possible to run out of possible slots
1069	 * when the name is long and there isn't enough space left
1070	 * for the digits, or if all bits are used.
1071	 */
1072	return -ENFILE;
1073}
1074
1075/**
1076 *	dev_alloc_name - allocate a name for a device
1077 *	@dev: device
1078 *	@name: name format string
1079 *
1080 *	Passed a format string - eg "lt%d" it will try and find a suitable
1081 *	id. It scans list of devices to build up a free map, then chooses
1082 *	the first empty slot. The caller must hold the dev_base or rtnl lock
1083 *	while allocating the name and adding the device in order to avoid
1084 *	duplicates.
1085 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1086 *	Returns the number of the unit assigned or a negative errno code.
1087 */
1088
1089int dev_alloc_name(struct net_device *dev, const char *name)
1090{
1091	char buf[IFNAMSIZ];
1092	struct net *net;
1093	int ret;
1094
1095	BUG_ON(!dev_net(dev));
1096	net = dev_net(dev);
1097	ret = __dev_alloc_name(net, name, buf);
1098	if (ret >= 0)
1099		strlcpy(dev->name, buf, IFNAMSIZ);
1100	return ret;
1101}
1102EXPORT_SYMBOL(dev_alloc_name);
1103
1104static int dev_alloc_name_ns(struct net *net,
1105			     struct net_device *dev,
1106			     const char *name)
1107{
1108	char buf[IFNAMSIZ];
1109	int ret;
1110
1111	ret = __dev_alloc_name(net, name, buf);
1112	if (ret >= 0)
1113		strlcpy(dev->name, buf, IFNAMSIZ);
1114	return ret;
1115}
1116
1117static int dev_get_valid_name(struct net *net,
1118			      struct net_device *dev,
1119			      const char *name)
1120{
1121	BUG_ON(!net);
1122
1123	if (!dev_valid_name(name))
1124		return -EINVAL;
1125
1126	if (strchr(name, '%'))
1127		return dev_alloc_name_ns(net, dev, name);
1128	else if (__dev_get_by_name(net, name))
1129		return -EEXIST;
1130	else if (dev->name != name)
1131		strlcpy(dev->name, name, IFNAMSIZ);
1132
1133	return 0;
1134}
1135
1136/**
1137 *	dev_change_name - change name of a device
1138 *	@dev: device
1139 *	@newname: name (or format string) must be at least IFNAMSIZ
1140 *
1141 *	Change name of a device, can pass format strings "eth%d".
1142 *	for wildcarding.
1143 */
1144int dev_change_name(struct net_device *dev, const char *newname)
1145{
1146	unsigned char old_assign_type;
1147	char oldname[IFNAMSIZ];
1148	int err = 0;
1149	int ret;
1150	struct net *net;
1151
1152	ASSERT_RTNL();
1153	BUG_ON(!dev_net(dev));
1154
1155	net = dev_net(dev);
1156	if (dev->flags & IFF_UP)
1157		return -EBUSY;
1158
1159	write_seqcount_begin(&devnet_rename_seq);
1160
1161	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1162		write_seqcount_end(&devnet_rename_seq);
1163		return 0;
1164	}
1165
1166	memcpy(oldname, dev->name, IFNAMSIZ);
1167
1168	err = dev_get_valid_name(net, dev, newname);
1169	if (err < 0) {
1170		write_seqcount_end(&devnet_rename_seq);
1171		return err;
1172	}
1173
1174	if (oldname[0] && !strchr(oldname, '%'))
1175		netdev_info(dev, "renamed from %s\n", oldname);
1176
1177	old_assign_type = dev->name_assign_type;
1178	dev->name_assign_type = NET_NAME_RENAMED;
1179
1180rollback:
1181	ret = device_rename(&dev->dev, dev->name);
1182	if (ret) {
1183		memcpy(dev->name, oldname, IFNAMSIZ);
1184		dev->name_assign_type = old_assign_type;
1185		write_seqcount_end(&devnet_rename_seq);
1186		return ret;
1187	}
1188
1189	write_seqcount_end(&devnet_rename_seq);
1190
1191	netdev_adjacent_rename_links(dev, oldname);
1192
1193	write_lock_bh(&dev_base_lock);
1194	hlist_del_rcu(&dev->name_hlist);
1195	write_unlock_bh(&dev_base_lock);
1196
1197	synchronize_rcu();
1198
1199	write_lock_bh(&dev_base_lock);
1200	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1201	write_unlock_bh(&dev_base_lock);
1202
1203	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1204	ret = notifier_to_errno(ret);
1205
1206	if (ret) {
1207		/* err >= 0 after dev_alloc_name() or stores the first errno */
1208		if (err >= 0) {
1209			err = ret;
1210			write_seqcount_begin(&devnet_rename_seq);
1211			memcpy(dev->name, oldname, IFNAMSIZ);
1212			memcpy(oldname, newname, IFNAMSIZ);
1213			dev->name_assign_type = old_assign_type;
1214			old_assign_type = NET_NAME_RENAMED;
1215			goto rollback;
1216		} else {
1217			pr_err("%s: name change rollback failed: %d\n",
1218			       dev->name, ret);
1219		}
1220	}
1221
1222	return err;
1223}
1224
1225/**
1226 *	dev_set_alias - change ifalias of a device
1227 *	@dev: device
1228 *	@alias: name up to IFALIASZ
1229 *	@len: limit of bytes to copy from info
1230 *
1231 *	Set ifalias for a device,
1232 */
1233int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1234{
1235	char *new_ifalias;
1236
1237	ASSERT_RTNL();
1238
1239	if (len >= IFALIASZ)
1240		return -EINVAL;
1241
1242	if (!len) {
1243		kfree(dev->ifalias);
1244		dev->ifalias = NULL;
1245		return 0;
1246	}
1247
1248	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1249	if (!new_ifalias)
1250		return -ENOMEM;
1251	dev->ifalias = new_ifalias;
1252
1253	strlcpy(dev->ifalias, alias, len+1);
1254	return len;
1255}
1256
1257
/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.  Sends a
 *	%NETDEV_FEAT_CHANGE event for @dev down the netdev notifier chain.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);
1269
1270/**
1271 *	netdev_state_change - device changes state
1272 *	@dev: device to cause notification
1273 *
1274 *	Called to indicate a device has changed state. This function calls
1275 *	the notifier chains for netdev_chain and sends a NEWLINK message
1276 *	to the routing socket.
1277 */
1278void netdev_state_change(struct net_device *dev)
1279{
1280	if (dev->flags & IFF_UP) {
1281		struct netdev_notifier_change_info change_info;
1282
1283		change_info.flags_changed = 0;
1284		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1285					      &change_info.info);
1286		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1287	}
1288}
1289EXPORT_SYMBOL(netdev_state_change);
1290
/**
 * 	netdev_notify_peers - notify network peers about existence of @dev
 * 	@dev: network device
 *
 * Generate traffic such that interested network peers are aware of
 * @dev, such as by generating a gratuitous ARP. This may be used when
 * a device wants to inform the rest of the network about some sort of
 * reconfiguration such as a failover event or virtual machine
 * migration.
 *
 * Takes and releases the RTNL lock itself around the
 * %NETDEV_NOTIFY_PEERS notification.
 */
void netdev_notify_peers(struct net_device *dev)
{
	rtnl_lock();
	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);
1308
1309static int __dev_open(struct net_device *dev)
1310{
1311	const struct net_device_ops *ops = dev->netdev_ops;
1312	int ret;
1313
1314	ASSERT_RTNL();
1315
1316	if (!netif_device_present(dev))
1317		return -ENODEV;
1318
1319	/* Block netpoll from trying to do any rx path servicing.
1320	 * If we don't do this there is a chance ndo_poll_controller
1321	 * or ndo_poll may be running while we open the device
1322	 */
1323	netpoll_poll_disable(dev);
1324
1325	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1326	ret = notifier_to_errno(ret);
1327	if (ret)
1328		return ret;
1329
1330	set_bit(__LINK_STATE_START, &dev->state);
1331
1332	if (ops->ndo_validate_addr)
1333		ret = ops->ndo_validate_addr(dev);
1334
1335	if (!ret && ops->ndo_open)
1336		ret = ops->ndo_open(dev);
1337
1338	netpoll_poll_enable(dev);
1339
1340	if (ret)
1341		clear_bit(__LINK_STATE_START, &dev->state);
1342	else {
1343		dev->flags |= IFF_UP;
1344		dev_set_rx_mode(dev);
1345		dev_activate(dev);
1346		add_device_randomness(dev->dev_addr, dev->addr_len);
1347	}
1348
1349	return ret;
1350}
1351
1352/**
1353 *	dev_open	- prepare an interface for use.
1354 *	@dev:	device to open
1355 *
1356 *	Takes a device from down to up state. The device's private open
1357 *	function is invoked and then the multicast lists are loaded. Finally
1358 *	the device is moved into the up state and a %NETDEV_UP message is
1359 *	sent to the netdev notifier chain.
1360 *
1361 *	Calling this function on an active interface is a nop. On a failure
1362 *	a negative errno code is returned.
1363 */
1364int dev_open(struct net_device *dev)
1365{
1366	int ret;
1367
1368	if (dev->flags & IFF_UP)
1369		return 0;
1370
1371	ret = __dev_open(dev);
1372	if (ret < 0)
1373		return ret;
1374
1375	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1376	call_netdevice_notifiers(NETDEV_UP, dev);
1377
1378	return ret;
1379}
1380EXPORT_SYMBOL(dev_open);
1381
1382static int __dev_close_many(struct list_head *head)
1383{
1384	struct net_device *dev;
1385
1386	ASSERT_RTNL();
1387	might_sleep();
1388
1389	list_for_each_entry(dev, head, close_list) {
1390		/* Temporarily disable netpoll until the interface is down */
1391		netpoll_poll_disable(dev);
1392
1393		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1394
1395		clear_bit(__LINK_STATE_START, &dev->state);
1396
1397		/* Synchronize to scheduled poll. We cannot touch poll list, it
1398		 * can be even on different cpu. So just clear netif_running().
1399		 *
1400		 * dev->stop() will invoke napi_disable() on all of it's
1401		 * napi_struct instances on this device.
1402		 */
1403		smp_mb__after_atomic(); /* Commit netif_running(). */
1404	}
1405
1406	dev_deactivate_many(head);
1407
1408	list_for_each_entry(dev, head, close_list) {
1409		const struct net_device_ops *ops = dev->netdev_ops;
1410
1411		/*
1412		 *	Call the device specific close. This cannot fail.
1413		 *	Only if device is UP
1414		 *
1415		 *	We allow it to be called even after a DETACH hot-plug
1416		 *	event.
1417		 */
1418		if (ops->ndo_stop)
1419			ops->ndo_stop(dev);
1420
1421		dev->flags &= ~IFF_UP;
1422		netpoll_poll_enable(dev);
1423	}
1424
1425	return 0;
1426}
1427
1428static int __dev_close(struct net_device *dev)
1429{
1430	int retval;
1431	LIST_HEAD(single);
1432
1433	list_add(&dev->close_list, &single);
1434	retval = __dev_close_many(&single);
1435	list_del(&single);
1436
1437	return retval;
1438}
1439
1440int dev_close_many(struct list_head *head, bool unlink)
1441{
1442	struct net_device *dev, *tmp;
1443
1444	/* Remove the devices that don't need to be closed */
1445	list_for_each_entry_safe(dev, tmp, head, close_list)
1446		if (!(dev->flags & IFF_UP))
1447			list_del_init(&dev->close_list);
1448
1449	__dev_close_many(head);
1450
1451	list_for_each_entry_safe(dev, tmp, head, close_list) {
1452		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1453		call_netdevice_notifiers(NETDEV_DOWN, dev);
1454		if (unlink)
1455			list_del_init(&dev->close_list);
1456	}
1457
1458	return 0;
1459}
1460EXPORT_SYMBOL(dev_close_many);
1461
1462/**
1463 *	dev_close - shutdown an interface.
1464 *	@dev: device to shutdown
1465 *
1466 *	This function moves an active device into down state. A
1467 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1468 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1469 *	chain.
1470 */
1471int dev_close(struct net_device *dev)
1472{
1473	if (dev->flags & IFF_UP) {
1474		LIST_HEAD(single);
1475
1476		list_add(&dev->close_list, &single);
1477		dev_close_many(&single, true);
1478		list_del(&single);
1479	}
1480	return 0;
1481}
1482EXPORT_SYMBOL(dev_close);
1483
1484
1485/**
1486 *	dev_disable_lro - disable Large Receive Offload on a device
1487 *	@dev: device
1488 *
1489 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1490 *	called under RTNL.  This is needed if received packets may be
1491 *	forwarded to another interface.
1492 */
1493void dev_disable_lro(struct net_device *dev)
1494{
1495	struct net_device *lower_dev;
1496	struct list_head *iter;
1497
1498	dev->wanted_features &= ~NETIF_F_LRO;
1499	netdev_update_features(dev);
1500
1501	if (unlikely(dev->features & NETIF_F_LRO))
1502		netdev_WARN(dev, "failed to disable LRO!\n");
1503
1504	netdev_for_each_lower_dev(dev, lower_dev, iter)
1505		dev_disable_lro(lower_dev);
1506}
1507EXPORT_SYMBOL(dev_disable_lro);
1508
/* Deliver event @val about @dev to the single notifier block @nb
 * (not to the whole chain), using an on-stack info structure
 * initialised for @dev.  Returns the callback's return value.
 */
static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
				   struct net_device *dev)
{
	struct netdev_notifier_info info;

	netdev_notifier_info_init(&info, dev);
	return nb->notifier_call(nb, val, &info);
}
1517
/* Non-zero while in the boot phase; register_netdevice_notifier() skips
 * its event replay while this is set.  NOTE(review): where it is cleared
 * is outside this part of the file.
 */
static int dev_boot_phase = 1;
1519
/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered. A negative errno code
 *	is returned on a failure.
 *
 * 	When registered all registration and up events are replayed
 *	to the new notifier to allow it to have a race free
 *	view of the network device list.  If the notifier rejects one of
 *	the replayed %NETDEV_REGISTER events, the devices already
 *	announced are withdrawn again and the notifier is unregistered.
 *	The replay is skipped while dev_boot_phase is set.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	if (dev_boot_phase)
		goto unlock;
	/* Replay REGISTER (and UP, for running devices) to the new notifier */
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			call_netdevice_notifier(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	/* @dev is the device whose REGISTER event was rejected: undo the
	 * replay for every device visited before it, in the same order.
	 */
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				goto outroll;

			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}

outroll:
	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
1586
1587/**
1588 *	unregister_netdevice_notifier - unregister a network notifier block
1589 *	@nb: notifier
1590 *
1591 *	Unregister a notifier previously registered by
1592 *	register_netdevice_notifier(). The notifier is unlinked into the
1593 *	kernel structures and may then be reused. A negative errno code
1594 *	is returned on a failure.
1595 *
1596 * 	After unregistering unregister and down device events are synthesized
1597 *	for all devices on the device list to the removed notifier to remove
1598 *	the need for special case cleanup code.
1599 */
1600
1601int unregister_netdevice_notifier(struct notifier_block *nb)
1602{
1603	struct net_device *dev;
1604	struct net *net;
1605	int err;
1606
1607	rtnl_lock();
1608	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1609	if (err)
1610		goto unlock;
1611
1612	for_each_net(net) {
1613		for_each_netdev(net, dev) {
1614			if (dev->flags & IFF_UP) {
1615				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1616							dev);
1617				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1618			}
1619			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1620		}
1621	}
1622unlock:
1623	rtnl_unlock();
1624	return err;
1625}
1626EXPORT_SYMBOL(unregister_netdevice_notifier);
1627
/**
 *	call_netdevice_notifiers_info - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *	@info: notifier information data
 *
 *	Initialises @info for @dev and calls all network notifier blocks
 *	with it.  RTNL must be held (asserted).  Parameters and return
 *	value are as for raw_notifier_call_chain().
 */

static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info)
{
	ASSERT_RTNL();
	netdev_notifier_info_init(info, dev);
	return raw_notifier_call_chain(&netdev_chain, val, info);
}
1646
/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *      @val: value passed unmodified to notifier function
 *      @dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks using a plain, on-stack
 *	notifier info structure.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	struct netdev_notifier_info info;

	return call_netdevice_notifiers_info(val, dev, &info);
}
EXPORT_SYMBOL(call_netdevice_notifiers);
1663
#ifdef CONFIG_NET_INGRESS
/* Static key counting the users of the ingress hook */
static struct static_key ingress_needed __read_mostly;

/* Add one user of the ingress static key */
void net_inc_ingress_queue(void)
{
	static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

/* Drop one user of the ingress static key */
void net_dec_ingress_queue(void)
{
	static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif
1679
#ifdef CONFIG_NET_EGRESS
/* Static key counting the users of the egress hook */
static struct static_key egress_needed __read_mostly;

/* Add one user of the egress static key */
void net_inc_egress_queue(void)
{
	static_key_slow_inc(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

/* Drop one user of the egress static key */
void net_dec_egress_queue(void)
{
	static_key_slow_dec(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
1695
1696static struct static_key netstamp_needed __read_mostly;
1697#ifdef HAVE_JUMP_LABEL
1698static atomic_t netstamp_needed_deferred;
1699static void netstamp_clear(struct work_struct *work)
1700{
1701	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1702
1703	while (deferred--)
1704		static_key_slow_dec(&netstamp_needed);
1705}
1706static DECLARE_WORK(netstamp_work, netstamp_clear);
1707#endif
1708
/* Register one more user of packet timestamps by incrementing the
 * netstamp_needed static key.
 */
void net_enable_timestamp(void)
{
	static_key_slow_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);
1714
/* Drop one user of packet timestamps.  With jump labels the decrement
 * of the static key is not done here: it is counted in
 * netstamp_needed_deferred and applied later by the netstamp_work item.
 * Without jump labels the key is decremented directly.
 */
void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	/* net_disable_timestamp() can be called from non process context */
	atomic_inc(&netstamp_needed_deferred);
	schedule_work(&netstamp_work);
#else
	static_key_slow_dec(&netstamp_needed);
#endif
}
EXPORT_SYMBOL(net_disable_timestamp);
1726
/* Reset @skb's timestamp to zero, then stamp it via __net_timestamp()
 * if the netstamp_needed static key is enabled.
 */
static inline void net_timestamp_set(struct sk_buff *skb)
{
	skb->tstamp = 0;
	if (static_key_false(&netstamp_needed))
		__net_timestamp(skb);
}
1733
/* Timestamp SKB if timestamping is enabled (netstamp_needed), COND holds
 * and SKB carries no timestamp yet.  Wrapped in do { } while (0) so the
 * macro expands to a single statement and cannot capture a following
 * "else"; invoke it with a trailing semicolon.
 */
#define net_timestamp_check(COND, SKB)				\
	do {							\
		if (static_key_false(&netstamp_needed)) {	\
			if ((COND) && !(SKB)->tstamp)		\
				__net_timestamp(SKB);		\
		}						\
	} while (0)
1739
1740bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1741{
1742	unsigned int len;
1743
1744	if (!(dev->flags & IFF_UP))
1745		return false;
1746
1747	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1748	if (skb->len <= len)
1749		return true;
1750
1751	/* if TSO is enabled, we don't care about the length as the packet
1752	 * could be forwarded without being segmented before
1753	 */
1754	if (skb_is_gso(skb))
1755		return true;
1756
1757	return false;
1758}
1759EXPORT_SYMBOL_GPL(is_skb_forwardable);
1760
1761int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1762{
1763	int ret = ____dev_forward_skb(dev, skb);
1764
1765	if (likely(!ret)) {
1766		skb->protocol = eth_type_trans(skb, dev);
1767		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1768	}
1769
1770	return ret;
1771}
1772EXPORT_SYMBOL_GPL(__dev_forward_skb);
1773
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int err = __dev_forward_skb(dev, skb);

	/* Only queue the packet when the preparation step succeeded */
	if (err)
		return err;

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
1797
/* Hand @skb to the handler of @pt_prev while the caller keeps its own
 * reference: the user count is raised before the handler is called.
 * Returns -ENOMEM if skb_orphan_frags() fails, otherwise the handler's
 * return value.
 */
static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
		return -ENOMEM;
	atomic_inc(&skb->users);
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
1807
1808static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1809					  struct packet_type **pt,
1810					  struct net_device *orig_dev,
1811					  __be16 type,
1812					  struct list_head *ptype_list)
1813{
1814	struct packet_type *ptype, *pt_prev = *pt;
1815
1816	list_for_each_entry_rcu(ptype, ptype_list, list) {
1817		if (ptype->type != type)
1818			continue;
1819		if (pt_prev)
1820			deliver_skb(skb, pt_prev, orig_dev);
1821		pt_prev = ptype;
1822	}
1823	*pt = pt_prev;
1824}
1825
1826static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1827{
1828	if (!ptype->af_packet_priv || !skb->sk)
1829		return false;
1830
1831	if (ptype->id_match)
1832		return ptype->id_match(ptype, skb->sk);
1833	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1834		return true;
1835
1836	return false;
1837}
1838
/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 *
 *	Two handler lists are walked: the global ptype_all list first,
 *	then @dev's own ptype_all list.  @skb itself is never handed to
 *	a tap; a clone is made when the first eligible tap is found and
 *	shared by all taps.
 */

void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;
	struct sk_buff *skb2 = NULL;
	struct packet_type *pt_prev = NULL;
	struct list_head *ptype_list = &ptype_all;

	rcu_read_lock();
again:
	list_for_each_entry_rcu(ptype, ptype_list, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if (skb_loop_sk(ptype, skb))
			continue;

		/* Delivery runs one tap behind: hand the clone to the
		 * previous tap and remember this one.
		 */
		if (pt_prev) {
			deliver_skb(skb2, pt_prev, skb->dev);
			pt_prev = ptype;
			continue;
		}

		/* need to clone skb, done only once */
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (!skb2)
			goto out_unlock;

		net_timestamp_set(skb2);

		/* The network header should have been set correctly by
		 * the sender, so the range check below is just protection
		 * against buggy protocols.
		 */
		skb_reset_mac_header(skb2);

		if (skb_network_header(skb2) < skb2->data ||
		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
					     ntohs(skb2->protocol),
					     dev->name);
			skb_reset_network_header(skb2);
		}

		skb2->transport_header = skb2->network_header;
		skb2->pkt_type = PACKET_OUTGOING;
		pt_prev = ptype;
	}

	/* Global taps done: repeat once for the per-device taps */
	if (ptype_list == &ptype_all) {
		ptype_list = &dev->ptype_all;
		goto again;
	}
out_unlock:
	/* The last tap is called directly, without the extra reference
	 * deliver_skb() takes.
	 */
	if (pt_prev)
		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
1902
1903/**
1904 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1905 * @dev: Network device
1906 * @txq: number of queues available
1907 *
1908 * If real_num_tx_queues is changed the tc mappings may no longer be
1909 * valid. To resolve this verify the tc mapping remains valid and if
1910 * not NULL the mapping. With no priorities mapping to this
1911 * offset/count pair it will no longer be used. In the worst case TC0
1912 * is invalid nothing can be done so disable priority mappings. If is
1913 * expected that drivers will fix this mapping if they can before
1914 * calling netif_set_real_num_tx_queues.
1915 */
1916static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1917{
1918	int i;
1919	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1920
1921	/* If TC0 is invalidated disable TC mapping */
1922	if (tc->offset + tc->count > txq) {
1923		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1924		dev->num_tc = 0;
1925		return;
1926	}
1927
1928	/* Invalidated prio to tc mappings set to TC0 */
1929	for (i = 1; i < TC_BITMASK + 1; i++) {
1930		int q = netdev_get_prio_tc_map(dev, i);
1931
1932		tc = &dev->tc_to_txq[q];
1933		if (tc->offset + tc->count > txq) {
1934			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1935				i, q);
1936			netdev_set_prio_tc_map(dev, i, 0);
1937		}
1938	}
1939}
1940
1941int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
1942{
1943	if (dev->num_tc) {
1944		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1945		int i;
1946
1947		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
1948			if ((txq - tc->offset) < tc->count)
1949				return i;
1950		}
1951
1952		return -1;
1953	}
1954
1955	return 0;
1956}
1957
1958#ifdef CONFIG_XPS
/* Taken by the XPS map update paths below (netif_reset_xps_queues(),
 * netif_set_xps_queue())
 */
static DEFINE_MUTEX(xps_map_mutex);
/* Dereference an XPS map pointer on the update side, i.e. with
 * xps_map_mutex held.
 */
#define xmap_dereference(P)		\
	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1962
1963static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
1964			     int tci, u16 index)
1965{
1966	struct xps_map *map = NULL;
1967	int pos;
1968
1969	if (dev_maps)
1970		map = xmap_dereference(dev_maps->cpu_map[tci]);
1971	if (!map)
1972		return false;
1973
1974	for (pos = map->len; pos--;) {
1975		if (map->queues[pos] != index)
1976			continue;
1977
1978		if (map->len > 1) {
1979			map->queues[pos] = map->queues[--map->len];
1980			break;
1981		}
1982
1983		RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
1984		kfree_rcu(map, rcu);
1985		return false;
1986	}
1987
1988	return true;
1989}
1990
1991static bool remove_xps_queue_cpu(struct net_device *dev,
1992				 struct xps_dev_maps *dev_maps,
1993				 int cpu, u16 offset, u16 count)
1994{
1995	int num_tc = dev->num_tc ? : 1;
1996	bool active = false;
1997	int tci;
1998
1999	for (tci = cpu * num_tc; num_tc--; tci++) {
2000		int i, j;
2001
2002		for (i = count, j = offset; i--; j++) {
2003			if (!remove_xps_queue(dev_maps, cpu, j))
2004				break;
2005		}
2006
2007		active |= i < 0;
2008	}
2009
2010	return active;
2011}
2012
2013static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2014				   u16 count)
2015{
2016	struct xps_dev_maps *dev_maps;
2017	int cpu, i;
2018	bool active = false;
2019
2020	mutex_lock(&xps_map_mutex);
2021	dev_maps = xmap_dereference(dev->xps_maps);
2022
2023	if (!dev_maps)
2024		goto out_no_maps;
2025
2026	for_each_possible_cpu(cpu)
2027		active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
2028					       offset, count);
2029
2030	if (!active) {
2031		RCU_INIT_POINTER(dev->xps_maps, NULL);
2032		kfree_rcu(dev_maps, rcu);
2033	}
2034
2035	for (i = offset + (count - 1); count--; i--)
2036		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2037					     NUMA_NO_NODE);
2038
2039out_no_maps:
2040	mutex_unlock(&xps_map_mutex);
2041}
2042
/* Reset the XPS state of every tx queue from @index up to the last
 * allocated tx queue of @dev (num_tx_queues - 1).
 */
static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
{
	netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
}
2047
2048static struct xps_map *expand_xps_map(struct xps_map *map,
2049				      int cpu, u16 index)
2050{
2051	struct xps_map *new_map;
2052	int alloc_len = XPS_MIN_MAP_ALLOC;
2053	int i, pos;
2054
2055	for (pos = 0; map && pos < map->len; pos++) {
2056		if (map->queues[pos] != index)
2057			continue;
2058		return map;
2059	}
2060
2061	/* Need to add queue to this CPU's existing map */
2062	if (map) {
2063		if (pos < map->alloc_len)
2064			return map;
2065
2066		alloc_len = map->alloc_len * 2;
2067	}
2068
2069	/* Need to allocate new map to store queue on this CPU's map */
2070	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2071			       cpu_to_node(cpu));
2072	if (!new_map)
2073		return NULL;
2074
2075	for (i = 0; i < pos; i++)
2076		new_map->queues[i] = map->queues[i];
2077	new_map->alloc_len = alloc_len;
2078	new_map->len = pos;
2079
2080	return new_map;
2081}
2082
/**
 *	netif_set_xps_queue - bind a Tx queue to a set of CPUs for XPS
 *	@dev: network device
 *	@mask: CPUs that should use the queue; only CPUs that are also
 *	       online are acted upon
 *	@index: Tx queue index
 *
 *	Builds a new xps_dev_maps in which queue @index is listed for every
 *	online CPU in @mask (in the slot of the traffic class the queue
 *	belongs to when dev->num_tc is set), publishes it through
 *	dev->xps_maps and then strips @index from every other CPU/traffic
 *	class slot.  The whole operation runs under xps_map_mutex.
 *
 *	Returns 0 on success, -EINVAL if netdev_txq_to_tc() fails for
 *	@index, or -ENOMEM if a map cannot be allocated.
 */
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
			u16 index)
{
	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
	/* numa_node_id: -2 = no CPU seen yet, -1 = CPUs span several nodes */
	int i, cpu, tci, numa_node_id = -2;
	int maps_sz, num_tc = 1, tc = 0;
	struct xps_map *map, *new_map;
	bool active = false;

	if (dev->num_tc) {
		num_tc = dev->num_tc;
		tc = netdev_txq_to_tc(dev, index);
		if (tc < 0)
			return -EINVAL;
	}

	/* never allocate less than one cache line for the device map */
	maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
	if (maps_sz < L1_CACHE_BYTES)
		maps_sz = L1_CACHE_BYTES;

	mutex_lock(&xps_map_mutex);

	dev_maps = xmap_dereference(dev->xps_maps);

	/* allocate memory for queue storage */
	for_each_cpu_and(cpu, cpu_online_mask, mask) {
		/* the device map is allocated lazily, on the first CPU hit */
		if (!new_dev_maps)
			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
		if (!new_dev_maps) {
			mutex_unlock(&xps_map_mutex);
			return -ENOMEM;
		}

		/* slot index: one group of num_tc slots per CPU */
		tci = cpu * num_tc + tc;
		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
				 NULL;

		/* may hand back the old map or a freshly allocated one */
		map = expand_xps_map(map, cpu, index);
		if (!map)
			goto error;

		RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
	}

	/* no online CPU in @mask: only removal work is left */
	if (!new_dev_maps)
		goto out_no_new_maps;

	for_each_possible_cpu(cpu) {
		/* copy maps belonging to foreign traffic classes */
		for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->cpu_map[tci]);
			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
		}

		/* We need to explicitly update tci as previous loop
		 * could break out early if dev_maps is NULL.
		 */
		tci = cpu * num_tc + tc;

		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
			/* add queue to CPU maps */
			int pos = 0;

			map = xmap_dereference(new_dev_maps->cpu_map[tci]);
			while ((pos < map->len) && (map->queues[pos] != index))
				pos++;

			/* not listed yet: append (room was made above) */
			if (pos == map->len)
				map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
			if (numa_node_id == -2)
				numa_node_id = cpu_to_node(cpu);
			else if (numa_node_id != cpu_to_node(cpu))
				numa_node_id = -1;
#endif
		} else if (dev_maps) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->cpu_map[tci]);
			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
		}

		/* copy maps belonging to foreign traffic classes */
		for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->cpu_map[tci]);
			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
		}
	}

	/* publish the fully built map */
	rcu_assign_pointer(dev->xps_maps, new_dev_maps);

	/* Cleanup old maps */
	if (!dev_maps)
		goto out_no_old_maps;

	for_each_possible_cpu(cpu) {
		for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
			new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
			map = xmap_dereference(dev_maps->cpu_map[tci]);
			/* free only per-CPU maps that were replaced */
			if (map && map != new_map)
				kfree_rcu(map, rcu);
		}
	}

	kfree_rcu(dev_maps, rcu);

out_no_old_maps:
	dev_maps = new_dev_maps;
	active = true;

out_no_new_maps:
	/* update Tx queue numa node */
	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
				     (numa_node_id >= 0) ? numa_node_id :
				     NUMA_NO_NODE);

	if (!dev_maps)
		goto out_no_maps;

	/* removes queue from unused CPUs */
	for_each_possible_cpu(cpu) {
		/* traffic classes below @tc */
		for (i = tc, tci = cpu * num_tc; i--; tci++)
			active |= remove_xps_queue(dev_maps, tci, index);
		/* the queue's own class, unless this CPU was just configured */
		if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
			active |= remove_xps_queue(dev_maps, tci, index);
		/* traffic classes above @tc */
		for (i = num_tc - tc, tci++; --i; tci++)
			active |= remove_xps_queue(dev_maps, tci, index);
	}

	/* free map if not active */
	if (!active) {
		RCU_INIT_POINTER(dev->xps_maps, NULL);
		kfree_rcu(dev_maps, rcu);
	}

out_no_maps:
	mutex_unlock(&xps_map_mutex);

	return 0;
error:
	/* remove any maps that we added */
	for_each_possible_cpu(cpu) {
		for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
			new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
			map = dev_maps ?
			      xmap_dereference(dev_maps->cpu_map[tci]) :
			      NULL;
			/* maps shared with the old device map must survive */
			if (new_map && new_map != map)
				kfree(new_map);
		}
	}

	mutex_unlock(&xps_map_mutex);

	kfree(new_dev_maps);
	return -ENOMEM;
}
EXPORT_SYMBOL(netif_set_xps_queue);
2242
2243#endif
2244void netdev_reset_tc(struct net_device *dev)
2245{
2246#ifdef CONFIG_XPS
2247	netif_reset_xps_queues_gt(dev, 0);
2248#endif
2249	dev->num_tc = 0;
2250	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2251	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2252}
2253EXPORT_SYMBOL(netdev_reset_tc);
2254
2255int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2256{
2257	if (tc >= dev->num_tc)
2258		return -EINVAL;
2259
2260#ifdef CONFIG_XPS
2261	netif_reset_xps_queues(dev, offset, count);
2262#endif
2263	dev->tc_to_txq[tc].count = count;
2264	dev->tc_to_txq[tc].offset = offset;
2265	return 0;
2266}
2267EXPORT_SYMBOL(netdev_set_tc_queue);
2268
2269int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2270{
2271	if (num_tc > TC_MAX_QUEUE)
2272		return -EINVAL;
2273
2274#ifdef CONFIG_XPS
2275	netif_reset_xps_queues_gt(dev, 0);
2276#endif
2277	dev->num_tc = num_tc;
2278	return 0;
2279}
2280EXPORT_SYMBOL(netdev_set_num_tc);
2281
2282/*
2283 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2284 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2285 */
2286int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2287{
2288	int rc;
2289
2290	if (txq < 1 || txq > dev->num_tx_queues)
2291		return -EINVAL;
2292
2293	if (dev->reg_state == NETREG_REGISTERED ||
2294	    dev->reg_state == NETREG_UNREGISTERING) {
2295		ASSERT_RTNL();
2296
2297		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2298						  txq);
2299		if (rc)
2300			return rc;
2301
2302		if (dev->num_tc)
2303			netif_setup_tc(dev, txq);
2304
2305		if (txq < dev->real_num_tx_queues) {
2306			qdisc_reset_all_tx_gt(dev, txq);
2307#ifdef CONFIG_XPS
2308			netif_reset_xps_queues_gt(dev, txq);
2309#endif
2310		}
2311	}
2312
2313	dev->real_num_tx_queues = txq;
2314	return 0;
2315}
2316EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2317
2318#ifdef CONFIG_SYSFS
2319/**
2320 *	netif_set_real_num_rx_queues - set actual number of RX queues used
2321 *	@dev: Network device
2322 *	@rxq: Actual number of RX queues
2323 *
2324 *	This must be called either with the rtnl_lock held or before
2325 *	registration of the net device.  Returns 0 on success, or a
2326 *	negative error code.  If called before registration, it always
2327 *	succeeds.
2328 */
2329int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2330{
2331	int rc;
2332
2333	if (rxq < 1 || rxq > dev->num_rx_queues)
2334		return -EINVAL;
2335
2336	if (dev->reg_state == NETREG_REGISTERED) {
2337		ASSERT_RTNL();
2338
2339		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2340						  rxq);
2341		if (rc)
2342			return rc;
2343	}
2344
2345	dev->real_num_rx_queues = rxq;
2346	return 0;
2347}
2348EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2349#endif
2350
2351/**
2352 * netif_get_num_default_rss_queues - default number of RSS queues
2353 *
2354 * This routine should set an upper limit on the number of RSS queues
2355 * used by default by multiqueue devices.
2356 */
2357int netif_get_num_default_rss_queues(void)
2358{
2359	return is_kdump_kernel() ?
2360		1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2361}
2362EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2363
2364static void __netif_reschedule(struct Qdisc *q)
2365{
2366	struct softnet_data *sd;
2367	unsigned long flags;
2368
2369	local_irq_save(flags);
2370	sd = this_cpu_ptr(&softnet_data);
2371	q->next_sched = NULL;
2372	*sd->output_queue_tailp = q;
2373	sd->output_queue_tailp = &q->next_sched;
2374	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2375	local_irq_restore(flags);
2376}
2377
2378void __netif_schedule(struct Qdisc *q)
2379{
2380	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2381		__netif_reschedule(q);
2382}
2383EXPORT_SYMBOL(__netif_schedule);
2384
/* Layout overlaid on skb->cb (see get_kfree_skb_cb()) to remember why an
 * skb handed to __dev_kfree_skb_irq() is being freed.
 */
struct dev_kfree_skb_cb {
	enum skb_free_reason reason;	/* stored by __dev_kfree_skb_irq() */
};
2388
2389static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2390{
2391	return (struct dev_kfree_skb_cb *)skb->cb;
2392}
2393
2394void netif_schedule_queue(struct netdev_queue *txq)
2395{
2396	rcu_read_lock();
2397	if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2398		struct Qdisc *q = rcu_dereference(txq->qdisc);
2399
2400		__netif_schedule(q);
2401	}
2402	rcu_read_unlock();
2403}
2404EXPORT_SYMBOL(netif_schedule_queue);
2405
2406/**
2407 *	netif_wake_subqueue - allow sending packets on subqueue
2408 *	@dev: network device
2409 *	@queue_index: sub queue index
2410 *
2411 * Resume individual transmit queue of a device with multiple transmit queues.
2412 */
2413void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2414{
2415	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2416
2417	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2418		struct Qdisc *q;
2419
2420		rcu_read_lock();
2421		q = rcu_dereference(txq->qdisc);
2422		__netif_schedule(q);
2423		rcu_read_unlock();
2424	}
2425}
2426EXPORT_SYMBOL(netif_wake_subqueue);
2427
2428void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2429{
2430	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2431		struct Qdisc *q;
2432
2433		rcu_read_lock();
2434		q = rcu_dereference(dev_queue->qdisc);
2435		__netif_schedule(q);
2436		rcu_read_unlock();
2437	}
2438}
2439EXPORT_SYMBOL(netif_tx_wake_queue);
2440
2441void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2442{
2443	unsigned long flags;
2444
2445	if (likely(atomic_read(&skb->users) == 1)) {
2446		smp_rmb();
2447		atomic_set(&skb->users, 0);
2448	} else if (likely(!atomic_dec_and_test(&skb->users))) {
2449		return;
2450	}
2451	get_kfree_skb_cb(skb)->reason = reason;
2452	local_irq_save(flags);
2453	skb->next = __this_cpu_read(softnet_data.completion_queue);
2454	__this_cpu_write(softnet_data.completion_queue, skb);
2455	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2456	local_irq_restore(flags);
2457}
2458EXPORT_SYMBOL(__dev_kfree_skb_irq);
2459
2460void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2461{
2462	if (in_irq() || irqs_disabled())
2463		__dev_kfree_skb_irq(skb, reason);
2464	else
2465		dev_kfree_skb(skb);
2466}
2467EXPORT_SYMBOL(__dev_kfree_skb_any);
2468
2469
2470/**
2471 * netif_device_detach - mark device as removed
2472 * @dev: network device
2473 *
2474 * Mark device as removed from system and therefore no longer available.
2475 */
2476void netif_device_detach(struct net_device *dev)
2477{
2478	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2479	    netif_running(dev)) {
2480		netif_tx_stop_all_queues(dev);
2481	}
2482}
2483EXPORT_SYMBOL(netif_device_detach);
2484
2485/**
2486 * netif_device_attach - mark device as attached
2487 * @dev: network device
2488 *
2489 * Mark device as attached from system and restart if needed.
2490 */
2491void netif_device_attach(struct net_device *dev)
2492{
2493	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2494	    netif_running(dev)) {
2495		netif_tx_wake_all_queues(dev);
2496		__netdev_watchdog_up(dev);
2497	}
2498}
2499EXPORT_SYMBOL(netif_device_attach);
2500
2501/*
2502 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2503 * to be used as a distribution range.
2504 */
2505u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2506		  unsigned int num_tx_queues)
2507{
2508	u32 hash;
2509	u16 qoffset = 0;
2510	u16 qcount = num_tx_queues;
2511
2512	if (skb_rx_queue_recorded(skb)) {
2513		hash = skb_get_rx_queue(skb);
2514		while (unlikely(hash >= num_tx_queues))
2515			hash -= num_tx_queues;
2516		return hash;
2517	}
2518
2519	if (dev->num_tc) {
2520		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2521		qoffset = dev->tc_to_txq[tc].offset;
2522		qcount = dev->tc_to_txq[tc].count;
2523	}
2524
2525	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2526}
2527EXPORT_SYMBOL(__skb_tx_hash);
2528
2529static void skb_warn_bad_offload(const struct sk_buff *skb)
2530{
2531	static const netdev_features_t null_features;
2532	struct net_device *dev = skb->dev;
2533	const char *name = "";
2534
2535	if (!net_ratelimit())
2536		return;
2537
2538	if (dev) {
2539		if (dev->dev.parent)
2540			name = dev_driver_string(dev->dev.parent);
2541		else
2542			name = netdev_name(dev);
2543	}
2544	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2545	     "gso_type=%d ip_summed=%d\n",
2546	     name, dev ? &dev->features : &null_features,
2547	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
2548	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2549	     skb_shinfo(skb)->gso_type, skb->ip_summed);
2550}
2551
2552/*
2553 * Invalidate hardware checksum when packet is to be mangled, and
2554 * complete checksum manually on outgoing path.
2555 */
2556int skb_checksum_help(struct sk_buff *skb)
2557{
2558	__wsum csum;
2559	int ret = 0, offset;
2560
2561	if (skb->ip_summed == CHECKSUM_COMPLETE)
2562		goto out_set_summed;
2563
2564	if (unlikely(skb_shinfo(skb)->gso_size)) {
2565		skb_warn_bad_offload(skb);
2566		return -EINVAL;
2567	}
2568
2569	/* Before computing a checksum, we should make sure no frag could
2570	 * be modified by an external entity : checksum could be wrong.
2571	 */
2572	if (skb_has_shared_frag(skb)) {
2573		ret = __skb_linearize(skb);
2574		if (ret)
2575			goto out;
2576	}
2577
2578	offset = skb_checksum_start_offset(skb);
2579	BUG_ON(offset >= skb_headlen(skb));
2580	csum = skb_checksum(skb, offset, skb->len - offset, 0);
2581
2582	offset += skb->csum_offset;
2583	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2584
2585	if (skb_cloned(skb) &&
2586	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2587		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2588		if (ret)
2589			goto out;
2590	}
2591
2592	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2593out_set_summed:
2594	skb->ip_summed = CHECKSUM_NONE;
2595out:
2596	return ret;
2597}
2598EXPORT_SYMBOL(skb_checksum_help);
2599
2600__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2601{
2602	__be16 type = skb->protocol;
2603
2604	/* Tunnel gso handlers can set protocol to ethernet. */
2605	if (type == htons(ETH_P_TEB)) {
2606		struct ethhdr *eth;
2607
2608		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2609			return 0;
2610
2611		eth = (struct ethhdr *)skb_mac_header(skb);
2612		type = eth->h_proto;
2613	}
2614
2615	return __vlan_get_protocol(skb, type, depth);
2616}
2617
2618/**
2619 *	skb_mac_gso_segment - mac layer segmentation handler.
2620 *	@skb: buffer to segment
2621 *	@features: features for the output path (see dev->features)
2622 */
2623struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2624				    netdev_features_t features)
2625{
2626	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2627	struct packet_offload *ptype;
2628	int vlan_depth = skb->mac_len;
2629	__be16 type = skb_network_protocol(skb, &vlan_depth);
2630
2631	if (unlikely(!type))
2632		return ERR_PTR(-EINVAL);
2633
2634	__skb_pull(skb, vlan_depth);
2635
2636	rcu_read_lock();
2637	list_for_each_entry_rcu(ptype, &offload_base, list) {
2638		if (ptype->type == type && ptype->callbacks.gso_segment) {
2639			segs = ptype->callbacks.gso_segment(skb, features);
2640			break;
2641		}
2642	}
2643	rcu_read_unlock();
2644
2645	__skb_push(skb, skb->data - skb_mac_header(skb));
2646
2647	return segs;
2648}
2649EXPORT_SYMBOL(skb_mac_gso_segment);
2650
2651
2652/* openvswitch calls this on rx path, so we need a different check.
2653 */
2654static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2655{
2656	if (tx_path)
2657		return skb->ip_summed != CHECKSUM_PARTIAL;
2658	else
2659		return skb->ip_summed == CHECKSUM_NONE;
2660}
2661
2662/**
2663 *	__skb_gso_segment - Perform segmentation on skb.
2664 *	@skb: buffer to segment
2665 *	@features: features for the output path (see dev->features)
2666 *	@tx_path: whether it is called in TX path
2667 *
2668 *	This function segments the given skb and returns a list of segments.
2669 *
2670 *	It may return NULL if the skb requires no segmentation.  This is
2671 *	only possible when GSO is used for verifying header integrity.
2672 *
2673 *	Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2674 */
2675struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2676				  netdev_features_t features, bool tx_path)
2677{
2678	if (unlikely(skb_needs_check(skb, tx_path))) {
2679		int err;
2680
2681		skb_warn_bad_offload(skb);
2682
2683		err = skb_cow_head(skb, 0);
2684		if (err < 0)
2685			return ERR_PTR(err);
2686	}
2687
2688	/* Only report GSO partial support if it will enable us to
2689	 * support segmentation on this frame without needing additional
2690	 * work.
2691	 */
2692	if (features & NETIF_F_GSO_PARTIAL) {
2693		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2694		struct net_device *dev = skb->dev;
2695
2696		partial_features |= dev->features & dev->gso_partial_features;
2697		if (!skb_gso_ok(skb, features | partial_features))
2698			features &= ~NETIF_F_GSO_PARTIAL;
2699	}
2700
2701	BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2702		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2703
2704	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2705	SKB_GSO_CB(skb)->encap_level = 0;
2706
2707	skb_reset_mac_header(skb);
2708	skb_reset_mac_len(skb);
2709
2710	return skb_mac_gso_segment(skb, features);
2711}
2712EXPORT_SYMBOL(__skb_gso_segment);
2713
/* Report a hardware receive checksum error: rate-limited log line naming
 * the device (or "<unknown>") followed by a stack dump.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	const char *who = dev ? dev->name : "<unknown>";

	if (!net_ratelimit())
		return;

	pr_err("%s: hw csum failure\n", who);
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2725
/* Report whether @skb carries a page fragment that @dev cannot DMA from.
 *
 * This check could be eliminated as soon as we know that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 when an offending fragment is found, 0 otherwise (always 0
 * without CONFIG_HIGHMEM).
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct device *parent;
	skb_frag_t *frag;
	dma_addr_t phys;
	int n;

	/* highmem pages are off limits without NETIF_F_HIGHDMA */
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (n = 0; n < skb_shinfo(skb)->nr_frags; n++) {
			frag = &skb_shinfo(skb)->frags[n];
			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	if (!(PCI_DMA_BUS_IS_PHYS))
		return 0;

	parent = dev->dev.parent;
	if (!parent)
		return 0;

	/* every fragment page must end below the parent's DMA mask */
	for (n = 0; n < skb_shinfo(skb)->nr_frags; n++) {
		frag = &skb_shinfo(skb)->frags[n];
		phys = page_to_phys(skb_frag_page(frag));
		if (!parent->dma_mask ||
		    phys + PAGE_SIZE - 1 > *parent->dma_mask)
			return 1;
	}
#endif
	return 0;
}
2758
2759/* If MPLS offload request, verify we are testing hardware MPLS features
2760 * instead of standard features for the netdev.
2761 */
2762#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2763static netdev_features_t net_mpls_features(struct sk_buff *skb,
2764					   netdev_features_t features,
2765					   __be16 type)
2766{
2767	if (eth_p_mpls(type))
2768		features &= skb->dev->mpls_features;
2769
2770	return features;
2771}
2772#else
2773static netdev_features_t net_mpls_features(struct sk_buff *skb,
2774					   netdev_features_t features,
2775					   __be16 type)
2776{
2777	return features;
2778}
2779#endif
2780
2781static netdev_features_t harmonize_features(struct sk_buff *skb,
2782	netdev_features_t features)
2783{
2784	int tmp;
2785	__be16 type;
2786
2787	type = skb_network_protocol(skb, &tmp);
2788	features = net_mpls_features(skb, features, type);
2789
2790	if (skb->ip_summed != CHECKSUM_NONE &&
2791	    !can_checksum_protocol(features, type)) {
2792		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2793	}
2794	if (illegal_highdma(skb->dev, skb))
2795		features &= ~NETIF_F_SG;
2796
2797	return features;
2798}
2799
/* Feature-check helper that accepts everything: @features is returned
 * unchanged, @skb and @dev are not looked at.  Its signature matches the
 * ndo_features_check call made in netif_skb_features().
 */
netdev_features_t passthru_features_check(struct sk_buff *skb,
					  struct net_device *dev,
					  netdev_features_t features)
{
	return features;
}
EXPORT_SYMBOL(passthru_features_check);
2807
/* Feature check used by netif_skb_features() when the driver provides no
 * ndo_features_check: defers entirely to vlan_features_check(); @dev is
 * unused.
 */
static netdev_features_t dflt_features_check(const struct sk_buff *skb,
					     struct net_device *dev,
					     netdev_features_t features)
{
	return vlan_features_check(skb, features);
}
2814
2815static netdev_features_t gso_features_check(const struct sk_buff *skb,
2816					    struct net_device *dev,
2817					    netdev_features_t features)
2818{
2819	u16 gso_segs = skb_shinfo(skb)->gso_segs;
2820
2821	if (gso_segs > dev->gso_max_segs)
2822		return features & ~NETIF_F_GSO_MASK;
2823
2824	/* Support for GSO partial features requires software
2825	 * intervention before we can actually process the packets
2826	 * so we need to strip support for any partial features now
2827	 * and we can pull them back in after we have partially
2828	 * segmented the frame.
2829	 */
2830	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2831		features &= ~dev->gso_partial_features;
2832
2833	/* Make sure to clear the IPv4 ID mangling feature if the
2834	 * IPv4 header has the potential to be fragmented.
2835	 */
2836	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2837		struct iphdr *iph = skb->encapsulation ?
2838				    inner_ip_hdr(skb) : ip_hdr(skb);
2839
2840		if (!(iph->frag_off & htons(IP_DF)))
2841			features &= ~NETIF_F_TSO_MANGLEID;
2842	}
2843
2844	return features;
2845}
2846
2847netdev_features_t netif_skb_features(struct sk_buff *skb)
2848{
2849	struct net_device *dev = skb->dev;
2850	netdev_features_t features = dev->features;
2851
2852	if (skb_is_gso(skb))
2853		features = gso_features_check(skb, dev, features);
2854
2855	/* If encapsulation offload request, verify we are testing
2856	 * hardware encapsulation features instead of standard
2857	 * features for the netdev
2858	 */
2859	if (skb->encapsulation)
2860		features &= dev->hw_enc_features;
2861
2862	if (skb_vlan_tagged(skb))
2863		features = netdev_intersect_features(features,
2864						     dev->vlan_features |
2865						     NETIF_F_HW_VLAN_CTAG_TX |
2866						     NETIF_F_HW_VLAN_STAG_TX);
2867
2868	if (dev->netdev_ops->ndo_features_check)
2869		features &= dev->netdev_ops->ndo_features_check(skb, dev,
2870								features);
2871	else
2872		features &= dflt_features_check(skb, dev, features);
2873
2874	return harmonize_features(skb, features);
2875}
2876EXPORT_SYMBOL(netif_skb_features);
2877
2878static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2879		    struct netdev_queue *txq, bool more)
2880{
2881	unsigned int len;
2882	int rc;
2883
2884	if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2885		dev_queue_xmit_nit(skb, dev);
2886
2887	len = skb->len;
2888	trace_net_dev_start_xmit(skb, dev);
2889	rc = netdev_start_xmit(skb, dev, txq, more);
2890	trace_net_dev_xmit(skb, rc, dev, len);
2891
2892	return rc;
2893}
2894
2895struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2896				    struct netdev_queue *txq, int *ret)
2897{
2898	struct sk_buff *skb = first;
2899	int rc = NETDEV_TX_OK;
2900
2901	while (skb) {
2902		struct sk_buff *next = skb->next;
2903
2904		skb->next = NULL;
2905		rc = xmit_one(skb, dev, txq, next != NULL);
2906		if (unlikely(!dev_xmit_complete(rc))) {
2907			skb->next = next;
2908			goto out;
2909		}
2910
2911		skb = next;
2912		if (netif_xmit_stopped(txq) && skb) {
2913			rc = NETDEV_TX_BUSY;
2914			break;
2915		}
2916	}
2917
2918out:
2919	*ret = rc;
2920	return skb;
2921}
2922
2923static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2924					  netdev_features_t features)
2925{
2926	if (skb_vlan_tag_present(skb) &&
2927	    !vlan_hw_offload_capable(features, skb->vlan_proto))
2928		skb = __vlan_hwaccel_push_inside(skb);
2929	return skb;
2930}
2931
2932static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2933{
2934	netdev_features_t features;
2935
2936	features = netif_skb_features(skb);
2937	skb = validate_xmit_vlan(skb, features);
2938	if (unlikely(!skb))
2939		goto out_null;
2940
2941	if (netif_needs_gso(skb, features)) {
2942		struct sk_buff *segs;
2943
2944		segs = skb_gso_segment(skb, features);
2945		if (IS_ERR(segs)) {
2946			goto out_kfree_skb;
2947		} else if (segs) {
2948			consume_skb(skb);
2949			skb = segs;
2950		}
2951	} else {
2952		if (skb_needs_linearize(skb, features) &&
2953		    __skb_linearize(skb))
2954			goto out_kfree_skb;
2955
2956		/* If packet is not checksummed and device does not
2957		 * support checksumming for this protocol, complete
2958		 * checksumming here.
2959		 */
2960		if (skb->ip_summed == CHECKSUM_PARTIAL) {
2961			if (skb->encapsulation)
2962				skb_set_inner_transport_header(skb,
2963							       skb_checksum_start_offset(skb));
2964			else
2965				skb_set_transport_header(skb,
2966							 skb_checksum_start_offset(skb));
2967			if (!(features & NETIF_F_CSUM_MASK) &&
2968			    skb_checksum_help(skb))
2969				goto out_kfree_skb;
2970		}
2971	}
2972
2973	return skb;
2974
2975out_kfree_skb:
2976	kfree_skb(skb);
2977out_null:
2978	atomic_long_inc(&dev->tx_dropped);
2979	return NULL;
2980}
2981
2982struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2983{
2984	struct sk_buff *next, *head = NULL, *tail;
2985
2986	for (; skb != NULL; skb = next) {
2987		next = skb->next;
2988		skb->next = NULL;
2989
2990		/* in case skb wont be segmented, point to itself */
2991		skb->prev = skb;
2992
2993		skb = validate_xmit_skb(skb, dev);
2994		if (!skb)
2995			continue;
2996
2997		if (!head)
2998			head = skb;
2999		else
3000			tail->next = skb;
3001		/* If skb was segmented, skb->prev points to
3002		 * the last segment. If not, it still contains skb.
3003		 */
3004		tail = skb->prev;
3005	}
3006	return head;
3007}
3008EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3009
3010static void qdisc_pkt_len_init(struct sk_buff *skb)
3011{
3012	const struct skb_shared_info *shinfo = skb_shinfo(skb);
3013
3014	qdisc_skb_cb(skb)->pkt_len = skb->len;
3015
3016	/* To get more precise estimation of bytes sent on wire,
3017	 * we add to pkt_len the headers size of all segments
3018	 */
3019	if (shinfo->gso_size)  {
3020		unsigned int hdr_len;
3021		u16 gso_segs = shinfo->gso_segs;
3022
3023		/* mac layer + network layer */
3024		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3025
3026		/* + transport layer */
3027		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
3028			hdr_len += tcp_hdrlen(skb);
3029		else
3030			hdr_len += sizeof(struct udphdr);
3031
3032		if (shinfo->gso_type & SKB_GSO_DODGY)
3033			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3034						shinfo->gso_size);
3035
3036		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3037	}
3038}
3039
/* Send @skb through qdisc @q of @txq on @dev.
 *
 * Under the qdisc root lock the skb is either dropped (qdisc
 * deactivated), transmitted directly (empty TCQ_F_CAN_BYPASS qdisc that
 * is not running) or enqueued, after which the qdisc is run if this
 * context manages to become its runner.  skbs collected on the local
 * to_free list are freed only after the root lock is released.
 *
 * Returns NET_XMIT_DROP, NET_XMIT_SUCCESS, or the enqueue result masked
 * with NET_XMIT_MASK.
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	struct sk_buff *to_free = NULL;
	bool contended;
	int rc;

	qdisc_calculate_pkt_len(skb, q);
	/*
	 * Heuristic to force contended enqueues to serialize on a
	 * separate lock before trying to get qdisc main lock.
	 * This permits qdisc->running owner to get the lock more
	 * often and dequeue packets faster.
	 */
	contended = qdisc_is_running(q);
	if (unlikely(contended))
		spin_lock(&q->busylock);

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		/* qdisc is deactivated: the skb goes onto to_free */
		__qdisc_drop(skb, &to_free);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   qdisc_run_begin(q)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */

		qdisc_bstats_update(q, skb);

		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
			/* busylock is released before running the qdisc */
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		} else
			qdisc_run_end(q);

		rc = NET_XMIT_SUCCESS;
	} else {
		/* regular path: enqueue, then run the qdisc if we can */
		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
		if (qdisc_run_begin(q)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		}
	}
	spin_unlock(root_lock);
	/* free dropped skbs outside of the root lock */
	if (unlikely(to_free))
		kfree_skb_list(to_free);
	/* still holding busylock if it was not released above */
	if (unlikely(contended))
		spin_unlock(&q->busylock);
	return rc;
}
3101
3102#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3103static void skb_update_prio(struct sk_buff *skb)
3104{
3105	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
3106
3107	if (!skb->priority && skb->sk && map) {
3108		unsigned int prioidx =
3109			sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
3110
3111		if (prioidx < map->priomap_len)
3112			skb->priority = map->priomap[prioidx];
3113	}
3114}
3115#else
3116#define skb_update_prio(skb)
3117#endif
3118
/* Per-CPU integer, exported to modules.
 * NOTE(review): presumably counts nested transmit calls on the current
 * CPU - confirm against the users of xmit_recursion.
 */
DEFINE_PER_CPU(int, xmit_recursion);
EXPORT_SYMBOL(xmit_recursion);
3121
3122/**
3123 *	dev_loopback_xmit - loop back @skb
3124 *	@net: network namespace this loopback is happening in
3125 *	@sk:  sk needed to be a netfilter okfn
3126 *	@skb: buffer to transmit
3127 */
3128int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3129{
3130	skb_reset_mac_header(skb);
3131	__skb_pull(skb, skb_network_offset(skb));
3132	skb->pkt_type = PACKET_LOOPBACK;
3133	skb->ip_summed = CHECKSUM_UNNECESSARY;
3134	WARN_ON(!skb_dst(skb));
3135	skb_dst_force(skb);
3136	netif_rx_ni(skb);
3137	return 0;
3138}
3139EXPORT_SYMBOL(dev_loopback_xmit);
3140
#ifdef CONFIG_NET_EGRESS
/* Run the egress classifier chain of @dev on @skb.
 *
 * Returns @skb when transmission should continue (no classifier
 * attached, or the verdict lets the packet pass).  Returns NULL when the
 * classifier took the packet away; *@ret then holds NET_XMIT_DROP for a
 * shot packet and NET_XMIT_SUCCESS for stolen, queued or redirected
 * ones.
 */
static struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
	struct tcf_proto *filter = rcu_dereference_bh(dev->egress_cl_list);
	struct tcf_result res;
	int verdict;

	if (!filter)
		return skb;

	/* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
	 * earlier by the caller.
	 */
	qdisc_bstats_cpu_update(filter->q, skb);

	verdict = tc_classify(skb, filter, &res, false);
	switch (verdict) {
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(res.classid);
		break;
	case TC_ACT_SHOT:
		qdisc_qstats_cpu_drop(filter->q);
		*ret = NET_XMIT_DROP;
		kfree_skb(skb);
		return NULL;
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
		*ret = NET_XMIT_SUCCESS;
		consume_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* No need to push/pop skb's mac_header here on egress! */
		skb_do_redirect(skb);
		*ret = NET_XMIT_SUCCESS;
		return NULL;
	default:
		break;
	}

	return skb;
}
#endif /* CONFIG_NET_EGRESS */
3183
/* Look up the XPS Tx queue for @skb on @dev.
 *
 * The map slot is selected by skb->sender_cpu - 1 and, when traffic
 * classes are configured, by the class skb->priority maps to.  A map
 * with several queues is indexed by the scaled skb hash.
 *
 * Returns the queue index, or -1 when XPS is not built in, no map
 * applies, or the chosen queue is not below dev->real_num_tx_queues.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *maps;
	struct xps_map *cpu_map;
	unsigned int tci;
	int txq = -1;

	rcu_read_lock();

	maps = rcu_dereference(dev->xps_maps);
	if (!maps)
		goto unlock;

	tci = skb->sender_cpu - 1;
	if (dev->num_tc) {
		tci *= dev->num_tc;
		tci += netdev_get_prio_tc_map(dev, skb->priority);
	}

	cpu_map = rcu_dereference(maps->cpu_map[tci]);
	if (!cpu_map)
		goto unlock;

	if (cpu_map->len == 1)
		txq = cpu_map->queues[0];
	else
		txq = cpu_map->queues[reciprocal_scale(skb_get_hash(skb),
						       cpu_map->len)];

	if (unlikely(txq >= dev->real_num_tx_queues))
		txq = -1;

unlock:
	rcu_read_unlock();

	return txq;
#else
	return -1;
#endif
}
3219
3220static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3221{
3222	struct sock *sk = skb->sk;
3223	int queue_index = sk_tx_queue_get(sk);
3224
3225	if (queue_index < 0 || skb->ooo_okay ||
3226	    queue_index >= dev->real_num_tx_queues) {
3227		int new_index = get_xps_queue(dev, skb);
3228		if (new_index < 0)
3229			new_index = skb_tx_hash(dev, skb);
3230
3231		if (queue_index != new_index && sk &&
3232		    sk_fullsock(sk) &&
3233		    rcu_access_pointer(sk->sk_dst_cache))
3234			sk_tx_queue_set(sk, new_index);
3235
3236		queue_index = new_index;
3237	}
3238
3239	return queue_index;
3240}
3241
3242struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3243				    struct sk_buff *skb,
3244				    void *accel_priv)
3245{
3246	int queue_index = 0;
3247
3248#ifdef CONFIG_XPS
3249	u32 sender_cpu = skb->sender_cpu - 1;
3250
3251	if (sender_cpu >= (u32)NR_CPUS)
3252		skb->sender_cpu = raw_smp_processor_id() + 1;
3253#endif
3254
3255	if (dev->real_num_tx_queues != 1) {
3256		const struct net_device_ops *ops = dev->netdev_ops;
3257		if (ops->ndo_select_queue)
3258			queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3259							    __netdev_pick_tx);
3260		else
3261			queue_index = __netdev_pick_tx(dev, skb);
3262
3263		if (!accel_priv)
3264			queue_index = netdev_cap_txqueue(dev, queue_index);
3265	}
3266
3267	skb_set_queue_mapping(skb, queue_index);
3268	return netdev_get_tx_queue(dev, queue_index);
3269}
3270
3271/**
3272 *	__dev_queue_xmit - transmit a buffer
3273 *	@skb: buffer to transmit
3274 *	@accel_priv: private data used for L2 forwarding offload
3275 *
3276 *	Queue a buffer for transmission to a network device. The caller must
3277 *	have set the device and priority and built the buffer before calling
3278 *	this function. The function can be called from an interrupt.
3279 *
3280 *	A negative errno code is returned on a failure. A success does not
3281 *	guarantee the frame will be transmitted as it may be dropped due
3282 *	to congestion or traffic shaping.
3283 *
3284 * -----------------------------------------------------------------------------------
3285 *      I notice this method can also return errors from the queue disciplines,
3286 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3287 *      be positive.
3288 *
3289 *      Regardless of the return value, the skb is consumed, so it is currently
3290 *      difficult to retry a send to this method.  (You can bump the ref count
3291 *      before sending to hold a reference for retry if you are careful.)
3292 *
3293 *      When calling this method, interrupts MUST be enabled.  This is because
3294 *      the BH enable code must have IRQs enabled so that it will not deadlock.
3295 *          --BLG
 *
 *	Return value, as visible in the code below: the value stored by the
 *	egress classifier hook when it takes the skb, the result of
 *	__dev_xmit_skb() when the queue has a qdisc with an enqueue operation,
 *	the result of dev_hard_start_xmit() when the driver completed the
 *	transmit, the initial -ENOMEM when validate_xmit_skb() returns NULL,
 *	and -ENETDOWN when the skb is dropped at the end of this function.
3296 */
3297static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3298{
3299	struct net_device *dev = skb->dev;
3300	struct netdev_queue *txq;
3301	struct Qdisc *q;
3302	int rc = -ENOMEM;
3303
3304	skb_reset_mac_header(skb);
3305
3306	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3307		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3308
3309	/* Disable soft irqs for various locks below. Also
3310	 * stops preemption for RCU.
3311	 */
3312	rcu_read_lock_bh();
3313
3314	skb_update_prio(skb);
3315
3316	qdisc_pkt_len_init(skb);
3317#ifdef CONFIG_NET_CLS_ACT
3318	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3319# ifdef CONFIG_NET_EGRESS
3320	if (static_key_false(&egress_needed)) {
		/* A NULL return means the egress hook took the skb and
		 * stored the return code in rc.
		 */
3321		skb = sch_handle_egress(skb, &rc, dev);
3322		if (!skb)
3323			goto out;
3324	}
3325# endif
3326#endif
3327	/* If device/qdisc don't need skb->dst, release it right now while
3328	 * its hot in this cpu cache.
3329	 */
3330	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3331		skb_dst_drop(skb);
3332	else
3333		skb_dst_force(skb);
3334
3335	txq = netdev_pick_tx(dev, skb, accel_priv);
3336	q = rcu_dereference_bh(txq->qdisc);
3337
3338	trace_net_dev_queue(skb);
	/* The qdisc has an enqueue operation: hand the skb to it. */
3339	if (q->enqueue) {
3340		rc = __dev_xmit_skb(skb, q, dev, txq);
3341		goto out;
3342	}
3343
3344	/* The device has no queue. Common case for software devices:
3345	   loopback, all the sorts of tunnels...
3346
3347	   Really, it is unlikely that netif_tx_lock protection is necessary
3348	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
3349	   counters.)
3350	   However, it is possible, that they rely on protection
3351	   made by us here.
3352
3353	   Check this and shot the lock. It is not prone from deadlocks.
3354	   Either shot noqueue qdisc, it is even simpler 8)
3355	 */
3356	if (dev->flags & IFF_UP) {
3357		int cpu = smp_processor_id(); /* ok because BHs are off */
3358
		/* If this CPU already owns the queue's xmit lock we have
		 * re-entered the transmit path of the same queue; that case
		 * is reported as a dead loop in the else branch.
		 */
3359		if (txq->xmit_lock_owner != cpu) {
			/* Bound the per-CPU nesting depth of direct transmits. */
3360			if (unlikely(__this_cpu_read(xmit_recursion) >
3361				     XMIT_RECURSION_LIMIT))
3362				goto recursion_alert;
3363
			/* NULL here: nothing left to send; rc is returned
			 * unchanged (presumably the skb was already freed by
			 * validate_xmit_skb() - confirm).
			 */
3364			skb = validate_xmit_skb(skb, dev);
3365			if (!skb)
3366				goto out;
3367
3368			HARD_TX_LOCK(dev, txq, cpu);
3369
3370			if (!netif_xmit_stopped(txq)) {
3371				__this_cpu_inc(xmit_recursion);
3372				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3373				__this_cpu_dec(xmit_recursion);
3374				if (dev_xmit_complete(rc)) {
3375					HARD_TX_UNLOCK(dev, txq);
3376					goto out;
3377				}
3378			}
3379			HARD_TX_UNLOCK(dev, txq);
3380			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3381					     dev->name);
3382		} else {
3383			/* Recursion is detected! It is possible,
3384			 * unfortunately
3385			 */
3386recursion_alert:
3387			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3388					     dev->name);
3389		}
3390	}
3391
	/* Drop path: reached when the device is not up, the queue is
	 * stopped, the driver did not complete the transmit, or recursion
	 * was detected.  Count the drop and free the skb list.
	 */
3392	rc = -ENETDOWN;
3393	rcu_read_unlock_bh();
3394
3395	atomic_long_inc(&dev->tx_dropped);
3396	kfree_skb_list(skb);
3397	return rc;
3398out:
3399	rcu_read_unlock_bh();
3400	return rc;
3401}
3402
3403int dev_queue_xmit(struct sk_buff *skb)
3404{
3405	return __dev_queue_xmit(skb, NULL);
3406}
3407EXPORT_SYMBOL(dev_queue_xmit);
3408
/**
 *	dev_queue_xmit_accel - transmit a buffer with L2 forwarding offload data
 *	@skb: buffer to transmit
 *	@accel_priv: private data handed through to __dev_queue_xmit()
 *
 *	Thin wrapper around __dev_queue_xmit().  Returns whatever
 *	__dev_queue_xmit() returns.
 */
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
	int rc = __dev_queue_xmit(skb, accel_priv);

	return rc;
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
3414
3415
3416/*=======================================================================
3417			Receiver routines
3418  =======================================================================*/
3419
/* Limit on the length of a per-CPU input_pkt_queue: enqueue_to_backlog()
 * drops the packet once the queue is longer than this, and skb_flow_limit()
 * starts its per-flow accounting at half of this value.
 */
3420int netdev_max_backlog __read_mostly = 1000;
3421EXPORT_SYMBOL(netdev_max_backlog);
3422
/* Passed to net_timestamp_check() as-is by netif_rx_internal() and
 * netif_receive_skb_internal(), and negated by __netif_receive_skb_core(),
 * so exactly one of the two stages performs the timestamp check.
 */
3423int netdev_tstamp_prequeue __read_mostly = 1;
/* NOTE(review): the next two are not referenced in this part of the file;
 * presumably NAPI polling budget/weight tunables - confirm against users.
 */
3424int netdev_budget __read_mostly = 300;
3425int weight_p __read_mostly = 64;            /* old backlog weight */
3426
3427/* Called with irq disabled */
3428static inline void ____napi_schedule(struct softnet_data *sd,
3429				     struct napi_struct *napi)
3430{
3431	list_add_tail(&napi->poll_list, &sd->poll_list);
3432	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3433}
3434
3435#ifdef CONFIG_RPS
3436
3437/* One global table that all flow-based protocols share. */
3438struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3439EXPORT_SYMBOL(rps_sock_flow_table);
/* Mask applied by get_rps_cpu() to an entry of the global table: the masked
 * bits give the desired CPU, the remaining bits are compared with the
 * packet hash to validate the entry.
 */
3440u32 rps_cpu_mask __read_mostly;
3441EXPORT_SYMBOL(rps_cpu_mask);
3442
/* Static key tested by netif_rx_internal() and netif_receive_skb_internal()
 * before they call get_rps_cpu().
 */
3443struct static_key rps_needed __read_mostly;
3444EXPORT_SYMBOL(rps_needed);
/* NOTE(review): not referenced in this part of the file; presumably the
 * matching static key for RFS - confirm against users.
 */
3445struct static_key rfs_needed __read_mostly;
3446EXPORT_SYMBOL(rfs_needed);
3447
3448static struct rps_dev_flow *
3449set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3450	    struct rps_dev_flow *rflow, u16 next_cpu)
3451{
3452	if (next_cpu < nr_cpu_ids) {
3453#ifdef CONFIG_RFS_ACCEL
3454		struct netdev_rx_queue *rxqueue;
3455		struct rps_dev_flow_table *flow_table;
3456		struct rps_dev_flow *old_rflow;
3457		u32 flow_id;
3458		u16 rxq_index;
3459		int rc;
3460
3461		/* Should we steer this flow to a different hardware queue? */
3462		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3463		    !(dev->features & NETIF_F_NTUPLE))
3464			goto out;
3465		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3466		if (rxq_index == skb_get_rx_queue(skb))
3467			goto out;
3468
3469		rxqueue = dev->_rx + rxq_index;
3470		flow_table = rcu_dereference(rxqueue->rps_flow_table);
3471		if (!flow_table)
3472			goto out;
3473		flow_id = skb_get_hash(skb) & flow_table->mask;
3474		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3475							rxq_index, flow_id);
3476		if (rc < 0)
3477			goto out;
3478		old_rflow = rflow;
3479		rflow = &flow_table->flows[flow_id];
3480		rflow->filter = rc;
3481		if (old_rflow->filter == rflow->filter)
3482			old_rflow->filter = RPS_NO_FILTER;
3483	out:
3484#endif
3485		rflow->last_qtail =
3486			per_cpu(softnet_data, next_cpu).input_queue_head;
3487	}
3488
3489	rflow->cpu = next_cpu;
3490	return rflow;
3491}
3492
3493/*
3494 * get_rps_cpu is called from netif_receive_skb and returns the target
3495 * CPU from the RPS map of the receiving queue for a given skb.
3496 * rcu_read_lock must be held on entry.
 *
 * Returns -1 when no CPU was selected (bad RX queue index, neither a flow
 * table nor an RPS map on the queue, a zero hash, or the candidate CPU is
 * not online).  *rflowp is only written when the flow-table (RFS) path
 * selects the CPU; otherwise it keeps the caller's value.
3497 */
3498static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3499		       struct rps_dev_flow **rflowp)
3500{
3501	const struct rps_sock_flow_table *sock_flow_table;
3502	struct netdev_rx_queue *rxqueue = dev->_rx;
3503	struct rps_dev_flow_table *flow_table;
3504	struct rps_map *map;
3505	int cpu = -1;
3506	u32 tcpu;
3507	u32 hash;
3508
	/* Use the RX queue recorded in the skb, if any; otherwise queue 0. */
3509	if (skb_rx_queue_recorded(skb)) {
3510		u16 index = skb_get_rx_queue(skb);
3511
3512		if (unlikely(index >= dev->real_num_rx_queues)) {
3513			WARN_ONCE(dev->real_num_rx_queues > 1,
3514				  "%s received packet on queue %u, but number "
3515				  "of RX queues is %u\n",
3516				  dev->name, index, dev->real_num_rx_queues);
3517			goto done;
3518		}
3519		rxqueue += index;
3520	}
3521
3522	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3523
3524	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3525	map = rcu_dereference(rxqueue->rps_map);
3526	if (!flow_table && !map)
3527		goto done;
3528
3529	skb_reset_network_header(skb);
3530	hash = skb_get_hash(skb);
3531	if (!hash)
3532		goto done;
3533
3534	sock_flow_table = rcu_dereference(rps_sock_flow_table);
3535	if (flow_table && sock_flow_table) {
3536		struct rps_dev_flow *rflow;
3537		u32 next_cpu;
3538		u32 ident;
3539
3540		/* First check into global flow table if there is a match */
3541		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
		/* The bits outside rps_cpu_mask must equal the hash's. */
3542		if ((ident ^ hash) & ~rps_cpu_mask)
3543			goto try_rps;
3544
3545		next_cpu = ident & rps_cpu_mask;
3546
3547		/* OK, now we know there is a match,
3548		 * we can look at the local (per receive queue) flow table
3549		 */
3550		rflow = &flow_table->flows[hash & flow_table->mask];
3551		tcpu = rflow->cpu;
3552
3553		/*
3554		 * If the desired CPU (where last recvmsg was done) is
3555		 * different from current CPU (one in the rx-queue flow
3556		 * table entry), switch if one of the following holds:
3557		 *   - Current CPU is unset (>= nr_cpu_ids).
3558		 *   - Current CPU is offline.
3559		 *   - The current CPU's queue tail has advanced beyond the
3560		 *     last packet that was enqueued using this table entry.
3561		 *     This guarantees that all previous packets for the flow
3562		 *     have been dequeued, thus preserving in order delivery.
3563		 */
3564		if (unlikely(tcpu != next_cpu) &&
3565		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3566		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3567		      rflow->last_qtail)) >= 0)) {
3568			tcpu = next_cpu;
3569			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3570		}
3571
3572		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3573			*rflowp = rflow;
3574			cpu = tcpu;
3575			goto done;
3576		}
3577	}
3578
3579try_rps:
3580
	/* Fall back to the queue's RPS map: pick a CPU by scaling the hash
	 * into the map and use it only if it is online.
	 */
3581	if (map) {
3582		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3583		if (cpu_online(tcpu)) {
3584			cpu = tcpu;
3585			goto done;
3586		}
3587	}
3588
3589done:
3590	return cpu;
3591}
3592
3593#ifdef CONFIG_RFS_ACCEL
3594
3595/**
3596 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3597 * @dev: Device on which the filter was set
3598 * @rxq_index: RX queue index
3599 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3600 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3601 *
3602 * Drivers that implement ndo_rx_flow_steer() should periodically call
3603 * this function for each installed filter and remove the filters for
3604 * which it returns %true.
3605 */
3606bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3607			 u32 flow_id, u16 filter_id)
3608{
3609	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3610	struct rps_dev_flow_table *flow_table;
3611	struct rps_dev_flow *rflow;
3612	bool expire = true;
3613	unsigned int cpu;
3614
3615	rcu_read_lock();
3616	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3617	if (flow_table && flow_id <= flow_table->mask) {
3618		rflow = &flow_table->flows[flow_id];
3619		cpu = ACCESS_ONCE(rflow->cpu);
3620		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3621		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3622			   rflow->last_qtail) <
3623		     (int)(10 * flow_table->mask)))
3624			expire = false;
3625	}
3626	rcu_read_unlock();
3627	return expire;
3628}
3629EXPORT_SYMBOL(rps_may_expire_flow);
3630
3631#endif /* CONFIG_RFS_ACCEL */
3632
3633/* Called from hardirq (IPI) context */
3634static void rps_trigger_softirq(void *data)
3635{
3636	struct softnet_data *sd = data;
3637
3638	____napi_schedule(sd, &sd->backlog);
3639	sd->received_rps++;
3640}
3641
3642#endif /* CONFIG_RPS */
3643
/*
 * If @sd belongs to another CPU, link it at the head of this CPU's
 * rps_ipi_list, raise NET_RX_SOFTIRQ and return 1.
 * Return 0 if @sd is this CPU's own softnet_data (or RPS is compiled out).
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local = this_cpu_ptr(&softnet_data);

	if (sd == local)
		return 0;

	sd->rps_ipi_next = local->rps_ipi_list;
	local->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
3664
/* NOTE(review): not referenced in this part of the file; presumably the
 * number of buckets used when a per-CPU sd_flow_limit table is set up
 * (skb_flow_limit() masks the hash with num_buckets - 1) - confirm.
 */
3665#ifdef CONFIG_NET_FLOW_LIMIT
3666int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3667#endif
3668
3669static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3670{
3671#ifdef CONFIG_NET_FLOW_LIMIT
3672	struct sd_flow_limit *fl;
3673	struct softnet_data *sd;
3674	unsigned int old_flow, new_flow;
3675
3676	if (qlen < (netdev_max_backlog >> 1))
3677		return false;
3678
3679	sd = this_cpu_ptr(&softnet_data);
3680
3681	rcu_read_lock();
3682	fl = rcu_dereference(sd->flow_limit);
3683	if (fl) {
3684		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3685		old_flow = fl->history[fl->history_head];
3686		fl->history[fl->history_head] = new_flow;
3687
3688		fl->history_head++;
3689		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3690
3691		if (likely(fl->buckets[old_flow]))
3692			fl->buckets[old_flow]--;
3693
3694		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3695			fl->count++;
3696			rcu_read_unlock();
3697			return true;
3698		}
3699	}
3700	rcu_read_unlock();
3701#endif
3702	return false;
3703}
3704
3705/*
3706 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3707 * queue (may be a remote CPU queue).
3708 */
3709static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3710			      unsigned int *qtail)
3711{
3712	struct softnet_data *sd;
3713	unsigned long flags;
3714	unsigned int qlen;
3715
3716	sd = &per_cpu(softnet_data, cpu);
3717
3718	local_irq_save(flags);
3719
3720	rps_lock(sd);
3721	if (!netif_running(skb->dev))
3722		goto drop;
3723	qlen = skb_queue_len(&sd->input_pkt_queue);
3724	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3725		if (qlen) {
3726enqueue:
3727			__skb_queue_tail(&sd->input_pkt_queue, skb);
3728			input_queue_tail_incr_save(sd, qtail);
3729			rps_unlock(sd);
3730			local_irq_restore(flags);
3731			return NET_RX_SUCCESS;
3732		}
3733
3734		/* Schedule NAPI for backlog device
3735		 * We can use non atomic operation since we own the queue lock
3736		 */
3737		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3738			if (!rps_ipi_queued(sd))
3739				____napi_schedule(sd, &sd->backlog);
3740		}
3741		goto enqueue;
3742	}
3743
3744drop:
3745	sd->dropped++;
3746	rps_unlock(sd);
3747
3748	local_irq_restore(flags);
3749
3750	atomic_long_inc(&skb->dev->rx_dropped);
3751	kfree_skb(skb);
3752	return NET_RX_DROP;
3753}
3754
3755static int netif_rx_internal(struct sk_buff *skb)
3756{
3757	int ret;
3758
3759	net_timestamp_check(netdev_tstamp_prequeue, skb);
3760
3761	trace_netif_rx(skb);
3762#ifdef CONFIG_RPS
3763	if (static_key_false(&rps_needed)) {
3764		struct rps_dev_flow voidflow, *rflow = &voidflow;
3765		int cpu;
3766
3767		preempt_disable();
3768		rcu_read_lock();
3769
3770		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3771		if (cpu < 0)
3772			cpu = smp_processor_id();
3773
3774		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3775
3776		rcu_read_unlock();
3777		preempt_enable();
3778	} else
3779#endif
3780	{
3781		unsigned int qtail;
3782		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3783		put_cpu();
3784	}
3785	return ret;
3786}
3787
/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	Takes a packet from a device driver and queues it for processing by
 *	the upper (protocol) levels.  Queueing always succeeds from the
 *	caller's point of view; the buffer may still be dropped for
 *	congestion control or by the protocol layers.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */
int netif_rx(struct sk_buff *skb)
{
	int ret;

	trace_netif_rx_entry(skb);

	ret = netif_rx_internal(skb);
	return ret;
}
EXPORT_SYMBOL(netif_rx);
3810
/*
 * Variant of netif_rx() that, with preemption disabled, queues @skb and
 * then runs do_softirq() if any softirq is pending on this CPU.
 * Returns the result of netif_rx_internal().
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();

	ret = netif_rx_internal(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
3826
3827static __latent_entropy void net_tx_action(struct softirq_action *h)
3828{
3829	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3830
3831	if (sd->completion_queue) {
3832		struct sk_buff *clist;
3833
3834		local_irq_disable();
3835		clist = sd->completion_queue;
3836		sd->completion_queue = NULL;
3837		local_irq_enable();
3838
3839		while (clist) {
3840			struct sk_buff *skb = clist;
3841			clist = clist->next;
3842
3843			WARN_ON(atomic_read(&skb->users));
3844			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3845				trace_consume_skb(skb);
3846			else
3847				trace_kfree_skb(skb, net_tx_action);
3848
3849			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3850				__kfree_skb(skb);
3851			else
3852				__kfree_skb_defer(skb);
3853		}
3854
3855		__kfree_skb_flush();
3856	}
3857
3858	if (sd->output_queue) {
3859		struct Qdisc *head;
3860
3861		local_irq_disable();
3862		head = sd->output_queue;
3863		sd->output_queue = NULL;
3864		sd->output_queue_tailp = &sd->output_queue;
3865		local_irq_enable();
3866
3867		while (head) {
3868			struct Qdisc *q = head;
3869			spinlock_t *root_lock;
3870
3871			head = head->next_sched;
3872
3873			root_lock = qdisc_lock(q);
3874			spin_lock(root_lock);
3875			/* We need to make sure head->next_sched is read
3876			 * before clearing __QDISC_STATE_SCHED
3877			 */
3878			smp_mb__before_atomic();
3879			clear_bit(__QDISC_STATE_SCHED, &q->state);
3880			qdisc_run(q);
3881			spin_unlock(root_lock);
3882		}
3883	}
3884}
3885
3886#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
3887/* This hook is defined here for ATM LANE */
/* Function pointer taking a device and an address.  It has no initializer
 * here and is exported to GPL modules.
 * NOTE(review): presumably installed by the bridge code and called from
 * ATM LANE - confirm against the users.
 */
3888int (*br_fdb_test_addr_hook)(struct net_device *dev,
3889			     unsigned char *addr) __read_mostly;
3890EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3891#endif
3892
/*
 * Run the ingress classifier chain of skb->dev on @skb.
 *
 * Returns the skb when processing should continue (no classifier attached,
 * or a verdict that lets the packet through) and NULL when the classifier
 * dropped, consumed or redirected it.  A pending *pt_prev handler is
 * delivered to first and then cleared, with its result stored in *ret.
 */
static inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
		   struct net_device *orig_dev)
{
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *filters = rcu_dereference_bh(skb->dev->ingress_cl_list);
	struct tcf_result result;
	int verdict;

	/* If there's at least one ingress present somewhere (so
	 * we get here via enabled static key), remaining devices
	 * that are not configured with an ingress qdisc will bail
	 * out here.
	 */
	if (!filters)
		return skb;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	qdisc_skb_cb(skb)->pkt_len = skb->len;
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
	qdisc_bstats_cpu_update(filters->q, skb);

	verdict = tc_classify(skb, filters, &result, false);
	switch (verdict) {
	case TC_ACT_SHOT:
		qdisc_qstats_cpu_drop(filters->q);
		kfree_skb(skb);
		return NULL;
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
		consume_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* skb_mac_header check was done by cls/act_bpf, so
		 * we can safely push the L2 header back before
		 * redirecting to another netdev
		 */
		__skb_push(skb, skb->mac_len);
		skb_do_redirect(skb);
		return NULL;
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(result.classid);
		break;
	default:
		break;
	}
#endif /* CONFIG_NET_CLS_ACT */
	return skb;
}
3944
3945/**
3946 *	netdev_is_rx_handler_busy - check if receive handler is registered
3947 *	@dev: device to check
3948 *
3949 *	Check if a receive handler is already registered for a given device.
3950 *	Return true if there one.
3951 *
3952 *	The caller must hold the rtnl_mutex.
3953 */
3954bool netdev_is_rx_handler_busy(struct net_device *dev)
3955{
3956	ASSERT_RTNL();
3957	return dev && rtnl_dereference(dev->rx_handler);
3958}
3959EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
3960
3961/**
3962 *	netdev_rx_handler_register - register receive handler
3963 *	@dev: device to register a handler for
3964 *	@rx_handler: receive handler to register
3965 *	@rx_handler_data: data pointer that is used by rx handler
3966 *
3967 *	Register a receive handler for a device. This handler will then be
3968 *	called from __netif_receive_skb. A negative errno code is returned
3969 *	on a failure.
3970 *
3971 *	The caller must hold the rtnl_mutex.
3972 *
3973 *	For a general description of rx_handler, see enum rx_handler_result.
3974 */
3975int netdev_rx_handler_register(struct net_device *dev,
3976			       rx_handler_func_t *rx_handler,
3977			       void *rx_handler_data)
3978{
3979	ASSERT_RTNL();
3980
3981	if (dev->rx_handler)
3982		return -EBUSY;
3983
3984	/* Note: rx_handler_data must be set before rx_handler */
3985	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3986	rcu_assign_pointer(dev->rx_handler, rx_handler);
3987
3988	return 0;
3989}
3990EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3991
3992/**
3993 *	netdev_rx_handler_unregister - unregister receive handler
3994 *	@dev: device to unregister a handler from
3995 *
3996 *	Unregister a receive handler from a device.
3997 *
3998 *	The caller must hold the rtnl_mutex.
3999 */
4000void netdev_rx_handler_unregister(struct net_device *dev)
4001{
4002
4003	ASSERT_RTNL();
4004	RCU_INIT_POINTER(dev->rx_handler, NULL);
4005	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
4006	 * section has a guarantee to see a non NULL rx_handler_data
4007	 * as well.
4008	 */
4009	synchronize_net();
4010	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4011}
4012EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
4013
4014/*
4015 * Limit the use of PFMEMALLOC reserves to those protocols that implement
4016 * the special handling of PFMEMALLOC skbs.
4017 */
4018static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4019{
4020	switch (skb->protocol) {
4021	case htons(ETH_P_ARP):
4022	case htons(ETH_P_IP):
4023	case htons(ETH_P_IPV6):
4024	case htons(ETH_P_8021Q):
4025	case htons(ETH_P_8021AD):
4026		return true;
4027	default:
4028		return false;
4029	}
4030}
4031
/*
 * Run the netfilter ingress hook on @skb when one is active.
 *
 * A pending *pt_prev handler is delivered to first and then cleared, with
 * its result stored in *ret.  Returns the value of nf_hook_ingress(), or 0
 * when no ingress hook is active or CONFIG_NETFILTER_INGRESS is disabled.
 */
static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
			     int *ret, struct net_device *orig_dev)
{
#ifdef CONFIG_NETFILTER_INGRESS
	int verdict;

	if (!nf_hook_ingress_active(skb))
		return 0;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	rcu_read_lock();
	verdict = nf_hook_ingress(skb);
	rcu_read_unlock();

	return verdict;
#else
	return 0;
#endif /* CONFIG_NETFILTER_INGRESS */
}
4052
/*
 * Core of the receive path for one skb; called by __netif_receive_skb().
 *
 * @skb: packet to process
 * @pfmemalloc: true when the caller handles a PFMEMALLOC skb; the taps are
 *	then skipped and packets whose protocol is rejected by
 *	skb_pfmemalloc_protocol() are dropped.
 *
 * Delivery is deferred by one handler: pt_prev remembers the most recently
 * matched packet_type, earlier matches receive the skb via deliver_skb(),
 * and the last one is invoked directly through ->func() at the end.
 *
 * Returns NET_RX_DROP unless a handler, hook or rx_handler supplies another
 * value through ret.
 */
4053static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
4054{
4055	struct packet_type *ptype, *pt_prev;
4056	rx_handler_func_t *rx_handler;
4057	struct net_device *orig_dev;
4058	bool deliver_exact = false;
4059	int ret = NET_RX_DROP;
4060	__be16 type;
4061
4062	net_timestamp_check(!netdev_tstamp_prequeue, skb);
4063
4064	trace_netif_receive_skb(skb);
4065
4066	orig_dev = skb->dev;
4067
4068	skb_reset_network_header(skb);
4069	if (!skb_transport_header_was_set(skb))
4070		skb_reset_transport_header(skb);
4071	skb_reset_mac_len(skb);
4072
4073	pt_prev = NULL;
4074
	/* Re-entered when VLAN processing or an rx_handler asks for another
	 * pass over the (possibly re-targeted) skb.
	 */
4075another_round:
4076	skb->skb_iif = skb->dev->ifindex;
4077
4078	__this_cpu_inc(softnet_data.processed);
4079
4080	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4081	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4082		skb = skb_vlan_untag(skb);
4083		if (unlikely(!skb))
4084			goto out;
4085	}
4086
4087#ifdef CONFIG_NET_CLS_ACT
	/* skb flagged TC_NCLS: clear the flag and skip the taps and the
	 * ingress hooks.
	 */
4088	if (skb->tc_verd & TC_NCLS) {
4089		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
4090		goto ncls;
4091	}
4092#endif
4093
4094	if (pfmemalloc)
4095		goto skip_taps;
4096
	/* Taps registered for all devices, then taps of this device. */
4097	list_for_each_entry_rcu(ptype, &ptype_all, list) {
4098		if (pt_prev)
4099			ret = deliver_skb(skb, pt_prev, orig_dev);
4100		pt_prev = ptype;
4101	}
4102
4103	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4104		if (pt_prev)
4105			ret = deliver_skb(skb, pt_prev, orig_dev);
4106		pt_prev = ptype;
4107	}
4108
4109skip_taps:
4110#ifdef CONFIG_NET_INGRESS
4111	if (static_key_false(&ingress_needed)) {
		/* A NULL skb means the ingress classifier took the packet. */
4112		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4113		if (!skb)
4114			goto out;
4115
4116		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4117			goto out;
4118	}
4119#endif
4120#ifdef CONFIG_NET_CLS_ACT
4121	skb->tc_verd = 0;
4122ncls:
4123#endif
4124	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4125		goto drop;
4126
4127	if (skb_vlan_tag_present(skb)) {
4128		if (pt_prev) {
4129			ret = deliver_skb(skb, pt_prev, orig_dev);
4130			pt_prev = NULL;
4131		}
4132		if (vlan_do_receive(&skb))
4133			goto another_round;
4134		else if (unlikely(!skb))
4135			goto out;
4136	}
4137
4138	rx_handler = rcu_dereference(skb->dev->rx_handler);
4139	if (rx_handler) {
4140		if (pt_prev) {
4141			ret = deliver_skb(skb, pt_prev, orig_dev);
4142			pt_prev = NULL;
4143		}
4144		switch (rx_handler(&skb)) {
4145		case RX_HANDLER_CONSUMED:
4146			ret = NET_RX_SUCCESS;
4147			goto out;
4148		case RX_HANDLER_ANOTHER:
4149			goto another_round;
4150		case RX_HANDLER_EXACT:
4151			deliver_exact = true;
			/* fall through */
4152		case RX_HANDLER_PASS:
4153			break;
4154		default:
4155			BUG();
4156		}
4157	}
4158
	/* A VLAN tag still present here was not consumed above. */
4159	if (unlikely(skb_vlan_tag_present(skb))) {
4160		if (skb_vlan_tag_get_id(skb))
4161			skb->pkt_type = PACKET_OTHERHOST;
4162		/* Note: we might in the future use prio bits
4163		 * and set skb->priority like in vlan_do_receive()
4164		 * For the time being, just ignore Priority Code Point
4165		 */
4166		skb->vlan_tci = 0;
4167	}
4168
4169	type = skb->protocol;
4170
4171	/* deliver only exact match when indicated */
4172	if (likely(!deliver_exact)) {
4173		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4174				       &ptype_base[ntohs(type) &
4175						   PTYPE_HASH_MASK]);
4176	}
4177
4178	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4179			       &orig_dev->ptype_specific);
4180
	/* skb->dev differs from the device the packet arrived on: also
	 * try the handlers specific to the current device.
	 */
4181	if (unlikely(skb->dev != orig_dev)) {
4182		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4183				       &skb->dev->ptype_specific);
4184	}
4185
	/* Final delivery: call the last remembered handler directly, or
	 * drop the packet if nobody wanted it.
	 */
4186	if (pt_prev) {
4187		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4188			goto drop;
4189		else
4190			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4191	} else {
4192drop:
4193		if (!deliver_exact)
4194			atomic_long_inc(&skb->dev->rx_dropped);
4195		else
4196			atomic_long_inc(&skb->dev->rx_nohandler);
4197		kfree_skb(skb);
4198		/* Jamal, now you will not able to escape explaining
4199		 * me how you were going to use this. :-)
4200		 */
4201		ret = NET_RX_DROP;
4202	}
4203
4204out:
4205	return ret;
4206}
4207
4208static int __netif_receive_skb(struct sk_buff *skb)
4209{
4210	int ret;
4211
4212	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4213		unsigned long pflags = current->flags;
4214
4215		/*
4216		 * PFMEMALLOC skbs are special, they should
4217		 * - be delivered to SOCK_MEMALLOC sockets only
4218		 * - stay away from userspace
4219		 * - have bounded memory usage
4220		 *
4221		 * Use PF_MEMALLOC as this saves us from propagating the allocation
4222		 * context down to all allocation sites.
4223		 */
4224		current->flags |= PF_MEMALLOC;
4225		ret = __netif_receive_skb_core(skb, true);
4226		tsk_restore_flags(current, pflags, PF_MEMALLOC);
4227	} else
4228		ret = __netif_receive_skb_core(skb, false);
4229
4230	return ret;
4231}
4232
4233static int netif_receive_skb_internal(struct sk_buff *skb)
4234{
4235	int ret;
4236
4237	net_timestamp_check(netdev_tstamp_prequeue, skb);
4238
4239	if (skb_defer_rx_timestamp(skb))
4240		return NET_RX_SUCCESS;
4241
4242	rcu_read_lock();
4243
4244#ifdef CONFIG_RPS
4245	if (static_key_false(&rps_needed)) {
4246		struct rps_dev_flow voidflow, *rflow = &voidflow;
4247		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4248
4249		if (cpu >= 0) {
4250			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4251			rcu_read_unlock();
4252			return ret;
4253		}
4254	}
4255#endif
4256	ret = __netif_receive_skb(skb);
4257	rcu_read_unlock();
4258	return ret;
4259}
4260
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	Main entry point for processing received data.  It always succeeds
 *	from the caller's point of view; the buffer may be dropped during
 *	processing for congestion control or by the protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	int ret;

	trace_netif_receive_skb_entry(skb);

	ret = netif_receive_skb_internal(skb);
	return ret;
}
EXPORT_SYMBOL(netif_receive_skb);
4283
/* One work item per CPU; flush_all_backlogs() queues each online CPU's item
 * on that CPU and then waits for it to finish.
 * NOTE(review): the work function is assigned outside this part of the
 * file - presumably flush_backlog(); confirm.
 */
4284DEFINE_PER_CPU(struct work_struct, flush_works);
4285
4286/* Network device is going away, flush any packets still pending */
4287static void flush_backlog(struct work_struct *work)
4288{
4289	struct sk_buff *skb, *tmp;
4290	struct softnet_data *sd;
4291
4292	local_bh_disable();
4293	sd = this_cpu_ptr(&softnet_data);
4294
4295	local_irq_disable();
4296	rps_lock(sd);
4297	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4298		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4299			__skb_unlink(skb, &sd->input_pkt_queue);
4300			kfree_skb(skb);
4301			input_queue_head_incr(sd);
4302		}
4303	}
4304	rps_unlock(sd);
4305	local_irq_enable();
4306
4307	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4308		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4309			__skb_unlink(skb, &sd->process_queue);
4310			kfree_skb(skb);
4311			input_queue_head_incr(sd);
4312		}
4313	}
4314	local_bh_enable();
4315}
4316
4317static void flush_all_backlogs(void)
4318{
4319	unsigned int cpu;
4320
4321	get_online_cpus();
4322
4323	for_each_online_cpu(cpu)
4324		queue_work_on(cpu, system_highpri_wq,
4325			      per_cpu_ptr(&flush_works, cpu));
4326
4327	for_each_online_cpu(cpu)
4328		flush_work(per_cpu_ptr(&flush_works, cpu));
4329
4330	put_online_cpus();
4331}
4332
4333static int napi_gro_complete(struct sk_buff *skb)
4334{
4335	struct packet_offload *ptype;
4336	__be16 type = skb->protocol;
4337	struct list_head *head = &offload_base;
4338	int err = -ENOENT;
4339
4340	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4341
4342	if (NAPI_GRO_CB(skb)->count == 1) {
4343		skb_shinfo(skb)->gso_size = 0;
4344		goto out;
4345	}
4346
4347	rcu_read_lock();
4348	list_for_each_entry_rcu(ptype, head, list) {
4349		if (ptype->type != type || !ptype->callbacks.gro_complete)
4350			continue;
4351
4352		err = ptype->callbacks.gro_complete(skb, 0);
4353		break;
4354	}
4355	rcu_read_unlock();
4356
4357	if (err) {
4358		WARN_ON(&ptype->list == head);
4359		kfree_skb(skb);
4360		return NET_RX_SUCCESS;
4361	}
4362
4363out:
4364	return netif_receive_skb_internal(skb);
4365}
4366
4367/* napi->gro_list contains packets ordered by age.
4368 * youngest packets at the head of it.
4369 * Complete skbs in reverse order to reduce latencies.
4370 */
4371void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4372{
4373	struct sk_buff *skb, *prev = NULL;
4374
4375	/* scan list and build reverse chain */
4376	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4377		skb->prev = prev;
4378		prev = skb;
4379	}
4380
4381	for (skb = prev; skb; skb = prev) {
4382		skb->next = NULL;
4383
4384		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4385			return;
4386
4387		prev = skb->prev;
4388		napi_gro_complete(skb);
4389		napi->gro_count--;
4390	}
4391
4392	napi->gro_list = NULL;
4393}
4394EXPORT_SYMBOL(napi_gro_flush);
4395
4396static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4397{
4398	struct sk_buff *p;
4399	unsigned int maclen = skb->dev->hard_header_len;
4400	u32 hash = skb_get_hash_raw(skb);
4401
4402	for (p = napi->gro_list; p; p = p->next) {
4403		unsigned long diffs;
4404
4405		NAPI_GRO_CB(p)->flush = 0;
4406
4407		if (hash != skb_get_hash_raw(p)) {
4408			NAPI_GRO_CB(p)->same_flow = 0;
4409			continue;
4410		}
4411
4412		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4413		diffs |= p->vlan_tci ^ skb->vlan_tci;
4414		diffs |= skb_metadata_dst_cmp(p, skb);
4415		if (maclen == ETH_HLEN)
4416			diffs |= compare_ether_header(skb_mac_header(p),
4417						      skb_mac_header(skb));
4418		else if (!diffs)
4419			diffs = memcmp(skb_mac_header(p),
4420				       skb_mac_header(skb),
4421				       maclen);
4422		NAPI_GRO_CB(p)->same_flow = !diffs;
4423	}
4424}
4425
4426static void skb_gro_reset_offset(struct sk_buff *skb)
4427{
4428	const struct skb_shared_info *pinfo = skb_shinfo(skb);
4429	const skb_frag_t *frag0 = &pinfo->frags[0];
4430
4431	NAPI_GRO_CB(skb)->data_offset = 0;
4432	NAPI_GRO_CB(skb)->frag0 = NULL;
4433	NAPI_GRO_CB(skb)->frag0_len = 0;
4434
4435	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4436	    pinfo->nr_frags &&
4437	    !PageHighMem(skb_frag_page(frag0))) {
4438		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4439		NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
4440						    skb_frag_size(frag0),
4441						    skb->end - skb->tail);
4442	}
4443}
4444
4445static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4446{
4447	struct skb_shared_info *pinfo = skb_shinfo(skb);
4448
4449	BUG_ON(skb->end - skb->tail < grow);
4450
4451	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4452
4453	skb->data_len -= grow;
4454	skb->tail += grow;
4455
4456	pinfo->frags[0].page_offset += grow;
4457	skb_frag_size_sub(&pinfo->frags[0], grow);
4458
4459	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4460		skb_frag_unref(skb, 0);
4461		memmove(pinfo->frags, pinfo->frags + 1,
4462			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
4463	}
4464}
4465
4466static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4467{
4468	struct sk_buff **pp = NULL;
4469	struct packet_offload *ptype;
4470	__be16 type = skb->protocol;
4471	struct list_head *head = &offload_base;
4472	int same_flow;
4473	enum gro_result ret;
4474	int grow;
4475
4476	if (!(skb->dev->features & NETIF_F_GRO))
4477		goto normal;
4478
4479	if (skb->csum_bad)
4480		goto normal;
4481
4482	gro_list_prepare(napi, skb);
4483
4484	rcu_read_lock();
4485	list_for_each_entry_rcu(ptype, head, list) {
4486		if (ptype->type != type || !ptype->callbacks.gro_receive)
4487			continue;
4488
4489		skb_set_network_header(skb, skb_gro_offset(skb));
4490		skb_reset_mac_len(skb);
4491		NAPI_GRO_CB(skb)->same_flow = 0;
4492		NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
4493		NAPI_GRO_CB(skb)->free = 0;
4494		NAPI_GRO_CB(skb)->encap_mark = 0;
4495		NAPI_GRO_CB(skb)->recursion_counter = 0;
4496		NAPI_GRO_CB(skb)->is_fou = 0;
4497		NAPI_GRO_CB(skb)->is_atomic = 1;
4498		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4499
4500		/* Setup for GRO checksum validation */
4501		switch (skb->ip_summed) {
4502		case CHECKSUM_COMPLETE:
4503			NAPI_GRO_CB(skb)->csum = skb->csum;
4504			NAPI_GRO_CB(skb)->csum_valid = 1;
4505			NAPI_GRO_CB(skb)->csum_cnt = 0;
4506			break;
4507		case CHECKSUM_UNNECESSARY:
4508			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4509			NAPI_GRO_CB(skb)->csum_valid = 0;
4510			break;
4511		default:
4512			NAPI_GRO_CB(skb)->csum_cnt = 0;
4513			NAPI_GRO_CB(skb)->csum_valid = 0;
4514		}
4515
4516		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4517		break;
4518	}
4519	rcu_read_unlock();
4520
4521	if (&ptype->list == head)
4522		goto normal;
4523
4524	same_flow = NAPI_GRO_CB(skb)->same_flow;
4525	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4526
4527	if (pp) {
4528		struct sk_buff *nskb = *pp;
4529
4530		*pp = nskb->next;
4531		nskb->next = NULL;
4532		napi_gro_complete(nskb);
4533		napi->gro_count--;
4534	}
4535
4536	if (same_flow)
4537		goto ok;
4538
4539	if (NAPI_GRO_CB(skb)->flush)
4540		goto normal;
4541
4542	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4543		struct sk_buff *nskb = napi->gro_list;
4544
4545		/* locate the end of the list to select the 'oldest' flow */
4546		while (nskb->next) {
4547			pp = &nskb->next;
4548			nskb = *pp;
4549		}
4550		*pp = NULL;
4551		nskb->next = NULL;
4552		napi_gro_complete(nskb);
4553	} else {
4554		napi->gro_count++;
4555	}
4556	NAPI_GRO_CB(skb)->count = 1;
4557	NAPI_GRO_CB(skb)->age = jiffies;
4558	NAPI_GRO_CB(skb)->last = skb;
4559	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4560	skb->next = napi->gro_list;
4561	napi->gro_list = skb;
4562	ret = GRO_HELD;
4563
4564pull:
4565	grow = skb_gro_offset(skb) - skb_headlen(skb);
4566	if (grow > 0)
4567		gro_pull_from_frag0(skb, grow);
4568ok:
4569	return ret;
4570
4571normal:
4572	ret = GRO_NORMAL;
4573	goto pull;
4574}
4575
4576struct packet_offload *gro_find_receive_by_type(__be16 type)
4577{
4578	struct list_head *offload_head = &offload_base;
4579	struct packet_offload *ptype;
4580
4581	list_for_each_entry_rcu(ptype, offload_head, list) {
4582		if (ptype->type != type || !ptype->callbacks.gro_receive)
4583			continue;
4584		return ptype;
4585	}
4586	return NULL;
4587}
4588EXPORT_SYMBOL(gro_find_receive_by_type);
4589
4590struct packet_offload *gro_find_complete_by_type(__be16 type)
4591{
4592	struct list_head *offload_head = &offload_base;
4593	struct packet_offload *ptype;
4594
4595	list_for_each_entry_rcu(ptype, offload_head, list) {
4596		if (ptype->type != type || !ptype->callbacks.gro_complete)
4597			continue;
4598		return ptype;
4599	}
4600	return NULL;
4601}
4602EXPORT_SYMBOL(gro_find_complete_by_type);
4603
4604static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4605{
4606	switch (ret) {
4607	case GRO_NORMAL:
4608		if (netif_receive_skb_internal(skb))
4609			ret = GRO_DROP;
4610		break;
4611
4612	case GRO_DROP:
4613		kfree_skb(skb);
4614		break;
4615
4616	case GRO_MERGED_FREE:
4617		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4618			skb_dst_drop(skb);
4619			kmem_cache_free(skbuff_head_cache, skb);
4620		} else {
4621			__kfree_skb(skb);
4622		}
4623		break;
4624
4625	case GRO_HELD:
4626	case GRO_MERGED:
4627		break;
4628	}
4629
4630	return ret;
4631}
4632
4633gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4634{
4635	skb_mark_napi_id(skb, napi);
4636	trace_napi_gro_receive_entry(skb);
4637
4638	skb_gro_reset_offset(skb);
4639
4640	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4641}
4642EXPORT_SYMBOL(napi_gro_receive);
4643
4644static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4645{
4646	if (unlikely(skb->pfmemalloc)) {
4647		consume_skb(skb);
4648		return;
4649	}
4650	__skb_pull(skb, skb_headlen(skb));
4651	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
4652	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4653	skb->vlan_tci = 0;
4654	skb->dev = napi->dev;
4655	skb->skb_iif = 0;
4656	skb->encapsulation = 0;
4657	skb_shinfo(skb)->gso_type = 0;
4658	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4659
4660	napi->skb = skb;
4661}
4662
4663struct sk_buff *napi_get_frags(struct napi_struct *napi)
4664{
4665	struct sk_buff *skb = napi->skb;
4666
4667	if (!skb) {
4668		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4669		if (skb) {
4670			napi->skb = skb;
4671			skb_mark_napi_id(skb, napi);
4672		}
4673	}
4674	return skb;
4675}
4676EXPORT_SYMBOL(napi_get_frags);
4677
4678static gro_result_t napi_frags_finish(struct napi_struct *napi,
4679				      struct sk_buff *skb,
4680				      gro_result_t ret)
4681{
4682	switch (ret) {
4683	case GRO_NORMAL:
4684	case GRO_HELD:
4685		__skb_push(skb, ETH_HLEN);
4686		skb->protocol = eth_type_trans(skb, skb->dev);
4687		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4688			ret = GRO_DROP;
4689		break;
4690
4691	case GRO_DROP:
4692	case GRO_MERGED_FREE:
4693		napi_reuse_skb(napi, skb);
4694		break;
4695
4696	case GRO_MERGED:
4697		break;
4698	}
4699
4700	return ret;
4701}
4702
4703/* Upper GRO stack assumes network header starts at gro_offset=0
4704 * Drivers could call both napi_gro_frags() and napi_gro_receive()
4705 * We copy ethernet header into skb->data to have a common layout.
4706 */
4707static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4708{
4709	struct sk_buff *skb = napi->skb;
4710	const struct ethhdr *eth;
4711	unsigned int hlen = sizeof(*eth);
4712
4713	napi->skb = NULL;
4714
4715	skb_reset_mac_header(skb);
4716	skb_gro_reset_offset(skb);
4717
4718	eth = skb_gro_header_fast(skb, 0);
4719	if (unlikely(skb_gro_header_hard(skb, hlen))) {
4720		eth = skb_gro_header_slow(skb, hlen, 0);
4721		if (unlikely(!eth)) {
4722			net_warn_ratelimited("%s: dropping impossible skb from %s\n",
4723					     __func__, napi->dev->name);
4724			napi_reuse_skb(napi, skb);
4725			return NULL;
4726		}
4727	} else {
4728		gro_pull_from_frag0(skb, hlen);
4729		NAPI_GRO_CB(skb)->frag0 += hlen;
4730		NAPI_GRO_CB(skb)->frag0_len -= hlen;
4731	}
4732	__skb_pull(skb, hlen);
4733
4734	/*
4735	 * This works because the only protocols we care about don't require
4736	 * special handling.
4737	 * We'll fix it up properly in napi_frags_finish()
4738	 */
4739	skb->protocol = eth->h_proto;
4740
4741	return skb;
4742}
4743
4744gro_result_t napi_gro_frags(struct napi_struct *napi)
4745{
4746	struct sk_buff *skb = napi_frags_skb(napi);
4747
4748	if (!skb)
4749		return GRO_DROP;
4750
4751	trace_napi_gro_frags_entry(skb);
4752
4753	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4754}
4755EXPORT_SYMBOL(napi_gro_frags);
4756
4757/* Compute the checksum from gro_offset and return the folded value
4758 * after adding in any pseudo checksum.
4759 */
4760__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4761{
4762	__wsum wsum;
4763	__sum16 sum;
4764
4765	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4766
4767	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4768	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4769	if (likely(!sum)) {
4770		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4771		    !skb->csum_complete_sw)
4772			netdev_rx_csum_fault(skb->dev);
4773	}
4774
4775	NAPI_GRO_CB(skb)->csum = wsum;
4776	NAPI_GRO_CB(skb)->csum_valid = 1;
4777
4778	return sum;
4779}
4780EXPORT_SYMBOL(__skb_gro_checksum_complete);
4781
/*
 * net_rps_action_and_irq_enable - send the pending RPS IPIs, if any.
 * Must be entered with local irqs disabled; always returns with them enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *target = sd->rps_ipi_list;

	if (!target) {
		local_irq_enable();
		return;
	}

	/* detach the whole list before irqs come back on */
	sd->rps_ipi_list = NULL;

	local_irq_enable();

	/* kick RPS processing on each remote cpu that is still online */
	while (target) {
		struct softnet_data *following = target->rps_ipi_next;

		if (cpu_online(target->cpu))
			smp_call_function_single_async(target->cpu,
						       &target->csd);
		target = following;
	}
#else
	local_irq_enable();
#endif
}
4809
4810static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4811{
4812#ifdef CONFIG_RPS
4813	return sd->rps_ipi_list != NULL;
4814#else
4815	return false;
4816#endif
4817}
4818
4819static int process_backlog(struct napi_struct *napi, int quota)
4820{
4821	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4822	bool again = true;
4823	int work = 0;
4824
4825	/* Check if we have pending ipi, its better to send them now,
4826	 * not waiting net_rx_action() end.
4827	 */
4828	if (sd_has_rps_ipi_waiting(sd)) {
4829		local_irq_disable();
4830		net_rps_action_and_irq_enable(sd);
4831	}
4832
4833	napi->weight = weight_p;
4834	while (again) {
4835		struct sk_buff *skb;
4836
4837		while ((skb = __skb_dequeue(&sd->process_queue))) {
4838			rcu_read_lock();
4839			__netif_receive_skb(skb);
4840			rcu_read_unlock();
4841			input_queue_head_incr(sd);
4842			if (++work >= quota)
4843				return work;
4844
4845		}
4846
4847		local_irq_disable();
4848		rps_lock(sd);
4849		if (skb_queue_empty(&sd->input_pkt_queue)) {
4850			/*
4851			 * Inline a custom version of __napi_complete().
4852			 * only current cpu owns and manipulates this napi,
4853			 * and NAPI_STATE_SCHED is the only possible flag set
4854			 * on backlog.
4855			 * We can use a plain write instead of clear_bit(),
4856			 * and we dont need an smp_mb() memory barrier.
4857			 */
4858			napi->state = 0;
4859			again = false;
4860		} else {
4861			skb_queue_splice_tail_init(&sd->input_pkt_queue,
4862						   &sd->process_queue);
4863		}
4864		rps_unlock(sd);
4865		local_irq_enable();
4866	}
4867
4868	return work;
4869}
4870
4871/**
4872 * __napi_schedule - schedule for receive
4873 * @n: entry to schedule
4874 *
4875 * The entry's receive function will be scheduled to run.
4876 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4877 */
4878void __napi_schedule(struct napi_struct *n)
4879{
4880	unsigned long flags;
4881
4882	local_irq_save(flags);
4883	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4884	local_irq_restore(flags);
4885}
4886EXPORT_SYMBOL(__napi_schedule);
4887
4888/**
4889 * __napi_schedule_irqoff - schedule for receive
4890 * @n: entry to schedule
4891 *
4892 * Variant of __napi_schedule() assuming hard irqs are masked
4893 */
4894void __napi_schedule_irqoff(struct napi_struct *n)
4895{
4896	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4897}
4898EXPORT_SYMBOL(__napi_schedule_irqoff);
4899
4900bool __napi_complete(struct napi_struct *n)
4901{
4902	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4903
4904	/* Some drivers call us directly, instead of calling
4905	 * napi_complete_done().
4906	 */
4907	if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state)))
4908		return false;
4909
4910	list_del_init(&n->poll_list);
4911	smp_mb__before_atomic();
4912	clear_bit(NAPI_STATE_SCHED, &n->state);
4913	return true;
4914}
4915EXPORT_SYMBOL(__napi_complete);
4916
4917bool napi_complete_done(struct napi_struct *n, int work_done)
4918{
4919	unsigned long flags;
4920
4921	/*
4922	 * 1) Don't let napi dequeue from the cpu poll list
4923	 *    just in case its running on a different cpu.
4924	 * 2) If we are busy polling, do nothing here, we have
4925	 *    the guarantee we will be called later.
4926	 */
4927	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
4928				 NAPIF_STATE_IN_BUSY_POLL)))
4929		return false;
4930
4931	if (n->gro_list) {
4932		unsigned long timeout = 0;
4933
4934		if (work_done)
4935			timeout = n->dev->gro_flush_timeout;
4936
4937		if (timeout)
4938			hrtimer_start(&n->timer, ns_to_ktime(timeout),
4939				      HRTIMER_MODE_REL_PINNED);
4940		else
4941			napi_gro_flush(n, false);
4942	}
4943	if (likely(list_empty(&n->poll_list))) {
4944		WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4945	} else {
4946		/* If n->poll_list is not empty, we need to mask irqs */
4947		local_irq_save(flags);
4948		__napi_complete(n);
4949		local_irq_restore(flags);
4950	}
4951	return true;
4952}
4953EXPORT_SYMBOL(napi_complete_done);
4954
4955/* must be called under rcu_read_lock(), as we dont take a reference */
4956static struct napi_struct *napi_by_id(unsigned int napi_id)
4957{
4958	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4959	struct napi_struct *napi;
4960
4961	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4962		if (napi->napi_id == napi_id)
4963			return napi;
4964
4965	return NULL;
4966}
4967
4968#if defined(CONFIG_NET_RX_BUSY_POLL)
4969
4970#define BUSY_POLL_BUDGET 8
4971
4972static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
4973{
4974	int rc;
4975
4976	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
4977
4978	local_bh_disable();
4979
4980	/* All we really want here is to re-enable device interrupts.
4981	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
4982	 */
4983	rc = napi->poll(napi, BUSY_POLL_BUDGET);
4984	netpoll_poll_unlock(have_poll_lock);
4985	if (rc == BUSY_POLL_BUDGET)
4986		__napi_schedule(napi);
4987	local_bh_enable();
4988	if (local_softirq_pending())
4989		do_softirq();
4990}
4991
4992bool sk_busy_loop(struct sock *sk, int nonblock)
4993{
4994	unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
4995	int (*napi_poll)(struct napi_struct *napi, int budget);
4996	int (*busy_poll)(struct napi_struct *dev);
4997	void *have_poll_lock = NULL;
4998	struct napi_struct *napi;
4999	int rc;
5000
5001restart:
5002	rc = false;
5003	napi_poll = NULL;
5004
5005	rcu_read_lock();
5006
5007	napi = napi_by_id(sk->sk_napi_id);
5008	if (!napi)
5009		goto out;
5010
5011	/* Note: ndo_busy_poll method is optional in linux-4.5 */
5012	busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
5013
5014	preempt_disable();
5015	for (;;) {
5016		rc = 0;
5017		local_bh_disable();
5018		if (busy_poll) {
5019			rc = busy_poll(napi);
5020			goto count;
5021		}
5022		if (!napi_poll) {
5023			unsigned long val = READ_ONCE(napi->state);
5024
5025			/* If multiple threads are competing for this napi,
5026			 * we avoid dirtying napi->state as much as we can.
5027			 */
5028			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
5029				   NAPIF_STATE_IN_BUSY_POLL))
5030				goto count;
5031			if (cmpxchg(&napi->state, val,
5032				    val | NAPIF_STATE_IN_BUSY_POLL |
5033					  NAPIF_STATE_SCHED) != val)
5034				goto count;
5035			have_poll_lock = netpoll_poll_lock(napi);
5036			napi_poll = napi->poll;
5037		}
5038		rc = napi_poll(napi, BUSY_POLL_BUDGET);
5039		trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
5040count:
5041		if (rc > 0)
5042			__NET_ADD_STATS(sock_net(sk),
5043					LINUX_MIB_BUSYPOLLRXPACKETS, rc);
5044		local_bh_enable();
5045
5046		if (rc == LL_FLUSH_FAILED)
5047			break; /* permanent failure */
5048
5049		if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
5050		    busy_loop_timeout(end_time))
5051			break;
5052
5053		if (unlikely(need_resched())) {
5054			if (napi_poll)
5055				busy_poll_stop(napi, have_poll_lock);
5056			preempt_enable();
5057			rcu_read_unlock();
5058			cond_resched();
5059			rc = !skb_queue_empty(&sk->sk_receive_queue);
5060			if (rc || busy_loop_timeout(end_time))
5061				return rc;
5062			goto restart;
5063		}
5064		cpu_relax();
5065	}
5066	if (napi_poll)
5067		busy_poll_stop(napi, have_poll_lock);
5068	preempt_enable();
5069	rc = !skb_queue_empty(&sk->sk_receive_queue);
5070out:
5071	rcu_read_unlock();
5072	return rc;
5073}
5074EXPORT_SYMBOL(sk_busy_loop);
5075
5076#endif /* CONFIG_NET_RX_BUSY_POLL */
5077
5078static void napi_hash_add(struct napi_struct *napi)
5079{
5080	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5081	    test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
5082		return;
5083
5084	spin_lock(&napi_hash_lock);
5085
5086	/* 0..NR_CPUS+1 range is reserved for sender_cpu use */
5087	do {
5088		if (unlikely(++napi_gen_id < NR_CPUS + 1))
5089			napi_gen_id = NR_CPUS + 1;
5090	} while (napi_by_id(napi_gen_id));
5091	napi->napi_id = napi_gen_id;
5092
5093	hlist_add_head_rcu(&napi->napi_hash_node,
5094			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5095
5096	spin_unlock(&napi_hash_lock);
5097}
5098
5099/* Warning : caller is responsible to make sure rcu grace period
5100 * is respected before freeing memory containing @napi
5101 */
5102bool napi_hash_del(struct napi_struct *napi)
5103{
5104	bool rcu_sync_needed = false;
5105
5106	spin_lock(&napi_hash_lock);
5107
5108	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5109		rcu_sync_needed = true;
5110		hlist_del_rcu(&napi->napi_hash_node);
5111	}
5112	spin_unlock(&napi_hash_lock);
5113	return rcu_sync_needed;
5114}
5115EXPORT_SYMBOL_GPL(napi_hash_del);
5116
5117static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5118{
5119	struct napi_struct *napi;
5120
5121	napi = container_of(timer, struct napi_struct, timer);
5122	if (napi->gro_list)
5123		napi_schedule(napi);
5124
5125	return HRTIMER_NORESTART;
5126}
5127
5128void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5129		    int (*poll)(struct napi_struct *, int), int weight)
5130{
5131	INIT_LIST_HEAD(&napi->poll_list);
5132	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5133	napi->timer.function = napi_watchdog;
5134	napi->gro_count = 0;
5135	napi->gro_list = NULL;
5136	napi->skb = NULL;
5137	napi->poll = poll;
5138	if (weight > NAPI_POLL_WEIGHT)
5139		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5140			    weight, dev->name);
5141	napi->weight = weight;
5142	list_add(&napi->dev_list, &dev->napi_list);
5143	napi->dev = dev;
5144#ifdef CONFIG_NETPOLL
5145	napi->poll_owner = -1;
5146#endif
5147	set_bit(NAPI_STATE_SCHED, &napi->state);
5148	napi_hash_add(napi);
5149}
5150EXPORT_SYMBOL(netif_napi_add);
5151
5152void napi_disable(struct napi_struct *n)
5153{
5154	might_sleep();
5155	set_bit(NAPI_STATE_DISABLE, &n->state);
5156
5157	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5158		msleep(1);
5159	while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5160		msleep(1);
5161
5162	hrtimer_cancel(&n->timer);
5163
5164	clear_bit(NAPI_STATE_DISABLE, &n->state);
5165}
5166EXPORT_SYMBOL(napi_disable);
5167
5168/* Must be called in process context */
5169void netif_napi_del(struct napi_struct *napi)
5170{
5171	might_sleep();
5172	if (napi_hash_del(napi))
5173		synchronize_net();
5174	list_del_init(&napi->dev_list);
5175	napi_free_frags(napi);
5176
5177	kfree_skb_list(napi->gro_list);
5178	napi->gro_list = NULL;
5179	napi->gro_count = 0;
5180}
5181EXPORT_SYMBOL(netif_napi_del);
5182
5183static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5184{
5185	void *have;
5186	int work, weight;
5187
5188	list_del_init(&n->poll_list);
5189
5190	have = netpoll_poll_lock(n);
5191
5192	weight = n->weight;
5193
5194	/* This NAPI_STATE_SCHED test is for avoiding a race
5195	 * with netpoll's poll_napi().  Only the entity which
5196	 * obtains the lock and sees NAPI_STATE_SCHED set will
5197	 * actually make the ->poll() call.  Therefore we avoid
5198	 * accidentally calling ->poll() when NAPI is not scheduled.
5199	 */
5200	work = 0;
5201	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5202		work = n->poll(n, weight);
5203		trace_napi_poll(n, work, weight);
5204	}
5205
5206	WARN_ON_ONCE(work > weight);
5207
5208	if (likely(work < weight))
5209		goto out_unlock;
5210
5211	/* Drivers must not modify the NAPI state if they
5212	 * consume the entire weight.  In such cases this code
5213	 * still "owns" the NAPI instance and therefore can
5214	 * move the instance around on the list at-will.
5215	 */
5216	if (unlikely(napi_disable_pending(n))) {
5217		napi_complete(n);
5218		goto out_unlock;
5219	}
5220
5221	if (n->gro_list) {
5222		/* flush too old packets
5223		 * If HZ < 1000, flush all packets.
5224		 */
5225		napi_gro_flush(n, HZ >= 1000);
5226	}
5227
5228	/* Some drivers may have called napi_schedule
5229	 * prior to exhausting their budget.
5230	 */
5231	if (unlikely(!list_empty(&n->poll_list))) {
5232		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5233			     n->dev ? n->dev->name : "backlog");
5234		goto out_unlock;
5235	}
5236
5237	list_add_tail(&n->poll_list, repoll);
5238
5239out_unlock:
5240	netpoll_poll_unlock(have);
5241
5242	return work;
5243}
5244
5245static __latent_entropy void net_rx_action(struct softirq_action *h)
5246{
5247	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5248	unsigned long time_limit = jiffies + 2;
5249	int budget = netdev_budget;
5250	LIST_HEAD(list);
5251	LIST_HEAD(repoll);
5252
5253	local_irq_disable();
5254	list_splice_init(&sd->poll_list, &list);
5255	local_irq_enable();
5256
5257	for (;;) {
5258		struct napi_struct *n;
5259
5260		if (list_empty(&list)) {
5261			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5262				goto out;
5263			break;
5264		}
5265
5266		n = list_first_entry(&list, struct napi_struct, poll_list);
5267		budget -= napi_poll(n, &repoll);
5268
5269		/* If softirq window is exhausted then punt.
5270		 * Allow this to run for 2 jiffies since which will allow
5271		 * an average latency of 1.5/HZ.
5272		 */
5273		if (unlikely(budget <= 0 ||
5274			     time_after_eq(jiffies, time_limit))) {
5275			sd->time_squeeze++;
5276			break;
5277		}
5278	}
5279
5280	local_irq_disable();
5281
5282	list_splice_tail_init(&sd->poll_list, &list);
5283	list_splice_tail(&repoll, &list);
5284	list_splice(&list, &sd->poll_list);
5285	if (!list_empty(&sd->poll_list))
5286		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
5287
5288	net_rps_action_and_irq_enable(sd);
5289out:
5290	__kfree_skb_flush();
5291}
5292
5293struct netdev_adjacent {
5294	struct net_device *dev;
5295
5296	/* upper master flag, there can only be one master device per list */
5297	bool master;
5298
5299	/* counter for the number of times this device was added to us */
5300	u16 ref_nr;
5301
5302	/* private field for the users */
5303	void *private;
5304
5305	struct list_head list;
5306	struct rcu_head rcu;
5307};
5308
5309static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5310						 struct list_head *adj_list)
5311{
5312	struct netdev_adjacent *adj;
5313
5314	list_for_each_entry(adj, adj_list, list) {
5315		if (adj->dev == adj_dev)
5316			return adj;
5317	}
5318	return NULL;
5319}
5320
/* Walker callback: non-zero when @upper_dev is the device passed in @data. */
static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
{
	const struct net_device *wanted = data;

	if (upper_dev == wanted)
		return 1;
	return 0;
}
5327
5328/**
5329 * netdev_has_upper_dev - Check if device is linked to an upper device
5330 * @dev: device
5331 * @upper_dev: upper device to check
5332 *
5333 * Find out if a device is linked to specified upper device and return true
5334 * in case it is. Note that this checks only immediate upper device,
5335 * not through a complete stack of devices. The caller must hold the RTNL lock.
5336 */
5337bool netdev_has_upper_dev(struct net_device *dev,
5338			  struct net_device *upper_dev)
5339{
5340	ASSERT_RTNL();
5341
5342	return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5343					     upper_dev);
5344}
5345EXPORT_SYMBOL(netdev_has_upper_dev);
5346
5347/**
5348 * netdev_has_upper_dev_all - Check if device is linked to an upper device
5349 * @dev: device
5350 * @upper_dev: upper device to check
5351 *
5352 * Find out if a device is linked to specified upper device and return true
5353 * in case it is. Note that this checks the entire upper device chain.
5354 * The caller must hold rcu lock.
5355 */
5356
5357bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
5358				  struct net_device *upper_dev)
5359{
5360	return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5361					       upper_dev);
5362}
5363EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
5364
5365/**
5366 * netdev_has_any_upper_dev - Check if device is linked to some device
5367 * @dev: device
5368 *
5369 * Find out if a device is linked to an upper device and return true in case
5370 * it is. The caller must hold the RTNL lock.
5371 */
5372static bool netdev_has_any_upper_dev(struct net_device *dev)
5373{
5374	ASSERT_RTNL();
5375
5376	return !list_empty(&dev->adj_list.upper);
5377}
5378
5379/**
5380 * netdev_master_upper_dev_get - Get master upper device
5381 * @dev: device
5382 *
5383 * Find a master upper device and return pointer to it or NULL in case
5384 * it's not there. The caller must hold the RTNL lock.
5385 */
5386struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5387{
5388	struct netdev_adjacent *upper;
5389
5390	ASSERT_RTNL();
5391
5392	if (list_empty(&dev->adj_list.upper))
5393		return NULL;
5394
5395	upper = list_first_entry(&dev->adj_list.upper,
5396				 struct netdev_adjacent, list);
5397	if (likely(upper->master))
5398		return upper->dev;
5399	return NULL;
5400}
5401EXPORT_SYMBOL(netdev_master_upper_dev_get);
5402
5403/**
5404 * netdev_has_any_lower_dev - Check if device is linked to some device
5405 * @dev: device
5406 *
5407 * Find out if a device is linked to a lower device and return true in case
5408 * it is. The caller must hold the RTNL lock.
5409 */
5410static bool netdev_has_any_lower_dev(struct net_device *dev)
5411{
5412	ASSERT_RTNL();
5413
5414	return !list_empty(&dev->adj_list.lower);
5415}
5416
5417void *netdev_adjacent_get_private(struct list_head *adj_list)
5418{
5419	struct netdev_adjacent *adj;
5420
5421	adj = list_entry(adj_list, struct netdev_adjacent, list);
5422
5423	return adj->private;
5424}
5425EXPORT_SYMBOL(netdev_adjacent_get_private);
5426
5427/**
5428 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5429 * @dev: device
5430 * @iter: list_head ** of the current position
5431 *
5432 * Gets the next device from the dev's upper list, starting from iter
5433 * position. The caller must hold RCU read lock.
5434 */
5435struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5436						 struct list_head **iter)
5437{
5438	struct netdev_adjacent *upper;
5439
5440	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5441
5442	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5443
5444	if (&upper->list == &dev->adj_list.upper)
5445		return NULL;
5446
5447	*iter = &upper->list;
5448
5449	return upper->dev;
5450}
5451EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5452
5453static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
5454						    struct list_head **iter)
5455{
5456	struct netdev_adjacent *upper;
5457
5458	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5459
5460	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5461
5462	if (&upper->list == &dev->adj_list.upper)
5463		return NULL;
5464
5465	*iter = &upper->list;
5466
5467	return upper->dev;
5468}
5469
5470int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
5471				  int (*fn)(struct net_device *dev,
5472					    void *data),
5473				  void *data)
5474{
5475	struct net_device *udev;
5476	struct list_head *iter;
5477	int ret;
5478
5479	for (iter = &dev->adj_list.upper,
5480	     udev = netdev_next_upper_dev_rcu(dev, &iter);
5481	     udev;
5482	     udev = netdev_next_upper_dev_rcu(dev, &iter)) {
5483		/* first is the upper device itself */
5484		ret = fn(udev, data);
5485		if (ret)
5486			return ret;
5487
5488		/* then look at all of its upper devices */
5489		ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
5490		if (ret)
5491			return ret;
5492	}
5493
5494	return 0;
5495}
5496EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
5497
5498/**
5499 * netdev_lower_get_next_private - Get the next ->private from the
5500 *				   lower neighbour list
5501 * @dev: device
5502 * @iter: list_head ** of the current position
5503 *
5504 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5505 * list, starting from iter position. The caller must hold either hold the
5506 * RTNL lock or its own locking that guarantees that the neighbour lower
5507 * list will remain unchanged.
5508 */
5509void *netdev_lower_get_next_private(struct net_device *dev,
5510				    struct list_head **iter)
5511{
5512	struct netdev_adjacent *lower;
5513
5514	lower = list_entry(*iter, struct netdev_adjacent, list);
5515
5516	if (&lower->list == &dev->adj_list.lower)
5517		return NULL;
5518
5519	*iter = lower->list.next;
5520
5521	return lower->private;
5522}
5523EXPORT_SYMBOL(netdev_lower_get_next_private);
5524
5525/**
5526 * netdev_lower_get_next_private_rcu - Get the next ->private from the
5527 *				       lower neighbour list, RCU
5528 *				       variant
5529 * @dev: device
5530 * @iter: list_head ** of the current position
5531 *
5532 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5533 * list, starting from iter position. The caller must hold RCU read lock.
5534 */
5535void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5536					struct list_head **iter)
5537{
5538	struct netdev_adjacent *lower;
5539
5540	WARN_ON_ONCE(!rcu_read_lock_held());
5541
5542	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5543
5544	if (&lower->list == &dev->adj_list.lower)
5545		return NULL;
5546
5547	*iter = &lower->list;
5548
5549	return lower->private;
5550}
5551EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5552
5553/**
5554 * netdev_lower_get_next - Get the next device from the lower neighbour
5555 *                         list
5556 * @dev: device
5557 * @iter: list_head ** of the current position
5558 *
5559 * Gets the next netdev_adjacent from the dev's lower neighbour
5560 * list, starting from iter position. The caller must hold RTNL lock or
5561 * its own locking that guarantees that the neighbour lower
5562 * list will remain unchanged.
5563 */
5564void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5565{
5566	struct netdev_adjacent *lower;
5567
5568	lower = list_entry(*iter, struct netdev_adjacent, list);
5569
5570	if (&lower->list == &dev->adj_list.lower)
5571		return NULL;
5572
5573	*iter = lower->list.next;
5574
5575	return lower->dev;
5576}
5577EXPORT_SYMBOL(netdev_lower_get_next);
5578
5579static struct net_device *netdev_next_lower_dev(struct net_device *dev,
5580						struct list_head **iter)
5581{
5582	struct netdev_adjacent *lower;
5583
5584	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5585
5586	if (&lower->list == &dev->adj_list.lower)
5587		return NULL;
5588
5589	*iter = &lower->list;
5590
5591	return lower->dev;
5592}
5593
5594int netdev_walk_all_lower_dev(struct net_device *dev,
5595			      int (*fn)(struct net_device *dev,
5596					void *data),
5597			      void *data)
5598{
5599	struct net_device *ldev;
5600	struct list_head *iter;
5601	int ret;
5602
5603	for (iter = &dev->adj_list.lower,
5604	     ldev = netdev_next_lower_dev(dev, &iter);
5605	     ldev;
5606	     ldev = netdev_next_lower_dev(dev, &iter)) {
5607		/* first is the lower device itself */
5608		ret = fn(ldev, data);
5609		if (ret)
5610			return ret;
5611
5612		/* then look at all of its lower devices */
5613		ret = netdev_walk_all_lower_dev(ldev, fn, data);
5614		if (ret)
5615			return ret;
5616	}
5617
5618	return 0;
5619}
5620EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
5621
5622static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
5623						    struct list_head **iter)
5624{
5625	struct netdev_adjacent *lower;
5626
5627	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5628	if (&lower->list == &dev->adj_list.lower)
5629		return NULL;
5630
5631	*iter = &lower->list;
5632
5633	return lower->dev;
5634}
5635
5636int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
5637				  int (*fn)(struct net_device *dev,
5638					    void *data),
5639				  void *data)
5640{
5641	struct net_device *ldev;
5642	struct list_head *iter;
5643	int ret;
5644
5645	for (iter = &dev->adj_list.lower,
5646	     ldev = netdev_next_lower_dev_rcu(dev, &iter);
5647	     ldev;
5648	     ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
5649		/* first is the lower device itself */
5650		ret = fn(ldev, data);
5651		if (ret)
5652			return ret;
5653
5654		/* then look at all of its lower devices */
5655		ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
5656		if (ret)
5657			return ret;
5658	}
5659
5660	return 0;
5661}
5662EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
5663
5664/**
5665 * netdev_lower_get_first_private_rcu - Get the first ->private from the
5666 *				       lower neighbour list, RCU
5667 *				       variant
5668 * @dev: device
5669 *
5670 * Gets the first netdev_adjacent->private from the dev's lower neighbour
5671 * list. The caller must hold RCU read lock.
5672 */
5673void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5674{
5675	struct netdev_adjacent *lower;
5676
5677	lower = list_first_or_null_rcu(&dev->adj_list.lower,
5678			struct netdev_adjacent, list);
5679	if (lower)
5680		return lower->private;
5681	return NULL;
5682}
5683EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5684
5685/**
5686 * netdev_master_upper_dev_get_rcu - Get master upper device
5687 * @dev: device
5688 *
5689 * Find a master upper device and return pointer to it or NULL in case
5690 * it's not there. The caller must hold the RCU read lock.
5691 */
5692struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5693{
5694	struct netdev_adjacent *upper;
5695
5696	upper = list_first_or_null_rcu(&dev->adj_list.upper,
5697				       struct netdev_adjacent, list);
5698	if (upper && likely(upper->master))
5699		return upper->dev;
5700	return NULL;
5701}
5702EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5703
5704static int netdev_adjacent_sysfs_add(struct net_device *dev,
5705			      struct net_device *adj_dev,
5706			      struct list_head *dev_list)
5707{
5708	char linkname[IFNAMSIZ+7];
5709	sprintf(linkname, dev_list == &dev->adj_list.upper ?
5710		"upper_%s" : "lower_%s", adj_dev->name);
5711	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5712				 linkname);
5713}
5714static void netdev_adjacent_sysfs_del(struct net_device *dev,
5715			       char *name,
5716			       struct list_head *dev_list)
5717{
5718	char linkname[IFNAMSIZ+7];
5719	sprintf(linkname, dev_list == &dev->adj_list.upper ?
5720		"upper_%s" : "lower_%s", name);
5721	sysfs_remove_link(&(dev->dev.kobj), linkname);
5722}
5723
5724static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5725						 struct net_device *adj_dev,
5726						 struct list_head *dev_list)
5727{
5728	return (dev_list == &dev->adj_list.upper ||
5729		dev_list == &dev->adj_list.lower) &&
5730		net_eq(dev_net(dev), dev_net(adj_dev));
5731}
5732
5733static int __netdev_adjacent_dev_insert(struct net_device *dev,
5734					struct net_device *adj_dev,
5735					struct list_head *dev_list,
5736					void *private, bool master)
5737{
5738	struct netdev_adjacent *adj;
5739	int ret;
5740
5741	adj = __netdev_find_adj(adj_dev, dev_list);
5742
5743	if (adj) {
5744		adj->ref_nr += 1;
5745		pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
5746			 dev->name, adj_dev->name, adj->ref_nr);
5747
5748		return 0;
5749	}
5750
5751	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5752	if (!adj)
5753		return -ENOMEM;
5754
5755	adj->dev = adj_dev;
5756	adj->master = master;
5757	adj->ref_nr = 1;
5758	adj->private = private;
5759	dev_hold(adj_dev);
5760
5761	pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
5762		 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
5763
5764	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5765		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5766		if (ret)
5767			goto free_adj;
5768	}
5769
5770	/* Ensure that master link is always the first item in list. */
5771	if (master) {
5772		ret = sysfs_create_link(&(dev->dev.kobj),
5773					&(adj_dev->dev.kobj), "master");
5774		if (ret)
5775			goto remove_symlinks;
5776
5777		list_add_rcu(&adj->list, dev_list);
5778	} else {
5779		list_add_tail_rcu(&adj->list, dev_list);
5780	}
5781
5782	return 0;
5783
5784remove_symlinks:
5785	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5786		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5787free_adj:
5788	kfree(adj);
5789	dev_put(adj_dev);
5790
5791	return ret;
5792}
5793
5794static void __netdev_adjacent_dev_remove(struct net_device *dev,
5795					 struct net_device *adj_dev,
5796					 u16 ref_nr,
5797					 struct list_head *dev_list)
5798{
5799	struct netdev_adjacent *adj;
5800
5801	pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
5802		 dev->name, adj_dev->name, ref_nr);
5803
5804	adj = __netdev_find_adj(adj_dev, dev_list);
5805
5806	if (!adj) {
5807		pr_err("Adjacency does not exist for device %s from %s\n",
5808		       dev->name, adj_dev->name);
5809		WARN_ON(1);
5810		return;
5811	}
5812
5813	if (adj->ref_nr > ref_nr) {
5814		pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
5815			 dev->name, adj_dev->name, ref_nr,
5816			 adj->ref_nr - ref_nr);
5817		adj->ref_nr -= ref_nr;
5818		return;
5819	}
5820
5821	if (adj->master)
5822		sysfs_remove_link(&(dev->dev.kobj), "master");
5823
5824	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5825		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5826
5827	list_del_rcu(&adj->list);
5828	pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
5829		 adj_dev->name, dev->name, adj_dev->name);
5830	dev_put(adj_dev);
5831	kfree_rcu(adj, rcu);
5832}
5833
5834static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5835					    struct net_device *upper_dev,
5836					    struct list_head *up_list,
5837					    struct list_head *down_list,
5838					    void *private, bool master)
5839{
5840	int ret;
5841
5842	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
5843					   private, master);
5844	if (ret)
5845		return ret;
5846
5847	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
5848					   private, false);
5849	if (ret) {
5850		__netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
5851		return ret;
5852	}
5853
5854	return 0;
5855}
5856
5857static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5858					       struct net_device *upper_dev,
5859					       u16 ref_nr,
5860					       struct list_head *up_list,
5861					       struct list_head *down_list)
5862{
5863	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5864	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
5865}
5866
5867static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5868						struct net_device *upper_dev,
5869						void *private, bool master)
5870{
5871	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5872						&dev->adj_list.upper,
5873						&upper_dev->adj_list.lower,
5874						private, master);
5875}
5876
5877static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5878						   struct net_device *upper_dev)
5879{
5880	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
5881					   &dev->adj_list.upper,
5882					   &upper_dev->adj_list.lower);
5883}
5884
5885static int __netdev_upper_dev_link(struct net_device *dev,
5886				   struct net_device *upper_dev, bool master,
5887				   void *upper_priv, void *upper_info)
5888{
5889	struct netdev_notifier_changeupper_info changeupper_info;
5890	int ret = 0;
5891
5892	ASSERT_RTNL();
5893
5894	if (dev == upper_dev)
5895		return -EBUSY;
5896
5897	/* To prevent loops, check if dev is not upper device to upper_dev. */
5898	if (netdev_has_upper_dev(upper_dev, dev))
5899		return -EBUSY;
5900
5901	if (netdev_has_upper_dev(dev, upper_dev))
5902		return -EEXIST;
5903
5904	if (master && netdev_master_upper_dev_get(dev))
5905		return -EBUSY;
5906
5907	changeupper_info.upper_dev = upper_dev;
5908	changeupper_info.master = master;
5909	changeupper_info.linking = true;
5910	changeupper_info.upper_info = upper_info;
5911
5912	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5913					    &changeupper_info.info);
5914	ret = notifier_to_errno(ret);
5915	if (ret)
5916		return ret;
5917
5918	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
5919						   master);
5920	if (ret)
5921		return ret;
5922
5923	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5924					    &changeupper_info.info);
5925	ret = notifier_to_errno(ret);
5926	if (ret)
5927		goto rollback;
5928
5929	return 0;
5930
5931rollback:
5932	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5933
5934	return ret;
5935}
5936
5937/**
5938 * netdev_upper_dev_link - Add a link to the upper device
5939 * @dev: device
5940 * @upper_dev: new upper device
5941 *
5942 * Adds a link to device which is upper to this one. The caller must hold
5943 * the RTNL lock. On a failure a negative errno code is returned.
5944 * On success the reference counts are adjusted and the function
5945 * returns zero.
5946 */
5947int netdev_upper_dev_link(struct net_device *dev,
5948			  struct net_device *upper_dev)
5949{
5950	return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
5951}
5952EXPORT_SYMBOL(netdev_upper_dev_link);
5953
5954/**
5955 * netdev_master_upper_dev_link - Add a master link to the upper device
5956 * @dev: device
5957 * @upper_dev: new upper device
5958 * @upper_priv: upper device private
5959 * @upper_info: upper info to be passed down via notifier
5960 *
5961 * Adds a link to device which is upper to this one. In this case, only
5962 * one master upper device can be linked, although other non-master devices
5963 * might be linked as well. The caller must hold the RTNL lock.
5964 * On a failure a negative errno code is returned. On success the reference
5965 * counts are adjusted and the function returns zero.
5966 */
5967int netdev_master_upper_dev_link(struct net_device *dev,
5968				 struct net_device *upper_dev,
5969				 void *upper_priv, void *upper_info)
5970{
5971	return __netdev_upper_dev_link(dev, upper_dev, true,
5972				       upper_priv, upper_info);
5973}
5974EXPORT_SYMBOL(netdev_master_upper_dev_link);
5975
5976/**
5977 * netdev_upper_dev_unlink - Removes a link to upper device
5978 * @dev: device
5979 * @upper_dev: new upper device
5980 *
5981 * Removes a link to device which is upper to this one. The caller must hold
5982 * the RTNL lock.
5983 */
5984void netdev_upper_dev_unlink(struct net_device *dev,
5985			     struct net_device *upper_dev)
5986{
5987	struct netdev_notifier_changeupper_info changeupper_info;
5988	ASSERT_RTNL();
5989
5990	changeupper_info.upper_dev = upper_dev;
5991	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
5992	changeupper_info.linking = false;
5993
5994	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5995				      &changeupper_info.info);
5996
5997	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5998
5999	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
6000				      &changeupper_info.info);
6001}
6002EXPORT_SYMBOL(netdev_upper_dev_unlink);
6003
6004/**
6005 * netdev_bonding_info_change - Dispatch event about slave change
6006 * @dev: device
6007 * @bonding_info: info to dispatch
6008 *
6009 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
6010 * The caller must hold the RTNL lock.
6011 */
6012void netdev_bonding_info_change(struct net_device *dev,
6013				struct netdev_bonding_info *bonding_info)
6014{
6015	struct netdev_notifier_bonding_info	info;
6016
6017	memcpy(&info.bonding_info, bonding_info,
6018	       sizeof(struct netdev_bonding_info));
6019	call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
6020				      &info.info);
6021}
6022EXPORT_SYMBOL(netdev_bonding_info_change);
6023
6024static void netdev_adjacent_add_links(struct net_device *dev)
6025{
6026	struct netdev_adjacent *iter;
6027
6028	struct net *net = dev_net(dev);
6029
6030	list_for_each_entry(iter, &dev->adj_list.upper, list) {
6031		if (!net_eq(net, dev_net(iter->dev)))
6032			continue;
6033		netdev_adjacent_sysfs_add(iter->dev, dev,
6034					  &iter->dev->adj_list.lower);
6035		netdev_adjacent_sysfs_add(dev, iter->dev,
6036					  &dev->adj_list.upper);
6037	}
6038
6039	list_for_each_entry(iter, &dev->adj_list.lower, list) {
6040		if (!net_eq(net, dev_net(iter->dev)))
6041			continue;
6042		netdev_adjacent_sysfs_add(iter->dev, dev,
6043					  &iter->dev->adj_list.upper);
6044		netdev_adjacent_sysfs_add(dev, iter->dev,
6045					  &dev->adj_list.lower);
6046	}
6047}
6048
6049static void netdev_adjacent_del_links(struct net_device *dev)
6050{
6051	struct netdev_adjacent *iter;
6052
6053	struct net *net = dev_net(dev);
6054
6055	list_for_each_entry(iter, &dev->adj_list.upper, list) {
6056		if (!net_eq(net, dev_net(iter->dev)))
6057			continue;
6058		netdev_adjacent_sysfs_del(iter->dev, dev->name,
6059					  &iter->dev->adj_list.lower);
6060		netdev_adjacent_sysfs_del(dev, iter->dev->name,
6061					  &dev->adj_list.upper);
6062	}
6063
6064	list_for_each_entry(iter, &dev->adj_list.lower, list) {
6065		if (!net_eq(net, dev_net(iter->dev)))
6066			continue;
6067		netdev_adjacent_sysfs_del(iter->dev, dev->name,
6068					  &iter->dev->adj_list.upper);
6069		netdev_adjacent_sysfs_del(dev, iter->dev->name,
6070					  &dev->adj_list.lower);
6071	}
6072}
6073
6074void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
6075{
6076	struct netdev_adjacent *iter;
6077
6078	struct net *net = dev_net(dev);
6079
6080	list_for_each_entry(iter, &dev->adj_list.upper, list) {
6081		if (!net_eq(net, dev_net(iter->dev)))
6082			continue;
6083		netdev_adjacent_sysfs_del(iter->dev, oldname,
6084					  &iter->dev->adj_list.lower);
6085		netdev_adjacent_sysfs_add(iter->dev, dev,
6086					  &iter->dev->adj_list.lower);
6087	}
6088
6089	list_for_each_entry(iter, &dev->adj_list.lower, list) {
6090		if (!net_eq(net, dev_net(iter->dev)))
6091			continue;
6092		netdev_adjacent_sysfs_del(iter->dev, oldname,
6093					  &iter->dev->adj_list.upper);
6094		netdev_adjacent_sysfs_add(iter->dev, dev,
6095					  &iter->dev->adj_list.upper);
6096	}
6097}
6098
6099void *netdev_lower_dev_get_private(struct net_device *dev,
6100				   struct net_device *lower_dev)
6101{
6102	struct netdev_adjacent *lower;
6103
6104	if (!lower_dev)
6105		return NULL;
6106	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
6107	if (!lower)
6108		return NULL;
6109
6110	return lower->private;
6111}
6112EXPORT_SYMBOL(netdev_lower_dev_get_private);
6113
6114
6115int dev_get_nest_level(struct net_device *dev)
6116{
6117	struct net_device *lower = NULL;
6118	struct list_head *iter;
6119	int max_nest = -1;
6120	int nest;
6121
6122	ASSERT_RTNL();
6123
6124	netdev_for_each_lower_dev(dev, lower, iter) {
6125		nest = dev_get_nest_level(lower);
6126		if (max_nest < nest)
6127			max_nest = nest;
6128	}
6129
6130	return max_nest + 1;
6131}
6132EXPORT_SYMBOL(dev_get_nest_level);
6133
6134/**
6135 * netdev_lower_change - Dispatch event about lower device state change
6136 * @lower_dev: device
6137 * @lower_state_info: state to dispatch
6138 *
6139 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
6140 * The caller must hold the RTNL lock.
6141 */
6142void netdev_lower_state_changed(struct net_device *lower_dev,
6143				void *lower_state_info)
6144{
6145	struct netdev_notifier_changelowerstate_info changelowerstate_info;
6146
6147	ASSERT_RTNL();
6148	changelowerstate_info.lower_state_info = lower_state_info;
6149	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
6150				      &changelowerstate_info.info);
6151}
6152EXPORT_SYMBOL(netdev_lower_state_changed);
6153
6154int netdev_default_l2upper_neigh_construct(struct net_device *dev,
6155					   struct neighbour *n)
6156{
6157	struct net_device *lower_dev, *stop_dev;
6158	struct list_head *iter;
6159	int err;
6160
6161	netdev_for_each_lower_dev(dev, lower_dev, iter) {
6162		if (!lower_dev->netdev_ops->ndo_neigh_construct)
6163			continue;
6164		err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
6165		if (err) {
6166			stop_dev = lower_dev;
6167			goto rollback;
6168		}
6169	}
6170	return 0;
6171
6172rollback:
6173	netdev_for_each_lower_dev(dev, lower_dev, iter) {
6174		if (lower_dev == stop_dev)
6175			break;
6176		if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6177			continue;
6178		lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6179	}
6180	return err;
6181}
6182EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
6183
6184void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
6185					  struct neighbour *n)
6186{
6187	struct net_device *lower_dev;
6188	struct list_head *iter;
6189
6190	netdev_for_each_lower_dev(dev, lower_dev, iter) {
6191		if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6192			continue;
6193		lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6194	}
6195}
6196EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
6197
6198static void dev_change_rx_flags(struct net_device *dev, int flags)
6199{
6200	const struct net_device_ops *ops = dev->netdev_ops;
6201
6202	if (ops->ndo_change_rx_flags)
6203		ops->ndo_change_rx_flags(dev, flags);
6204}
6205
6206static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6207{
6208	unsigned int old_flags = dev->flags;
6209	kuid_t uid;
6210	kgid_t gid;
6211
6212	ASSERT_RTNL();
6213
6214	dev->flags |= IFF_PROMISC;
6215	dev->promiscuity += inc;
6216	if (dev->promiscuity == 0) {
6217		/*
6218		 * Avoid overflow.
6219		 * If inc causes overflow, untouch promisc and return error.
6220		 */
6221		if (inc < 0)
6222			dev->flags &= ~IFF_PROMISC;
6223		else {
6224			dev->promiscuity -= inc;
6225			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6226				dev->name);
6227			return -EOVERFLOW;
6228		}
6229	}
6230	if (dev->flags != old_flags) {
6231		pr_info("device %s %s promiscuous mode\n",
6232			dev->name,
6233			dev->flags & IFF_PROMISC ? "entered" : "left");
6234		if (audit_enabled) {
6235			current_uid_gid(&uid, &gid);
6236			audit_log(current->audit_context, GFP_ATOMIC,
6237				AUDIT_ANOM_PROMISCUOUS,
6238				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6239				dev->name, (dev->flags & IFF_PROMISC),
6240				(old_flags & IFF_PROMISC),
6241				from_kuid(&init_user_ns, audit_get_loginuid(current)),
6242				from_kuid(&init_user_ns, uid),
6243				from_kgid(&init_user_ns, gid),
6244				audit_get_sessionid(current));
6245		}
6246
6247		dev_change_rx_flags(dev, IFF_PROMISC);
6248	}
6249	if (notify)
6250		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
6251	return 0;
6252}
6253
6254/**
6255 *	dev_set_promiscuity	- update promiscuity count on a device
6256 *	@dev: device
6257 *	@inc: modifier
6258 *
6259 *	Add or remove promiscuity from a device. While the count in the device
6260 *	remains above zero the interface remains promiscuous. Once it hits zero
6261 *	the device reverts back to normal filtering operation. A negative inc
6262 *	value is used to drop promiscuity on the device.
6263 *	Return 0 if successful or a negative errno code on error.
6264 */
6265int dev_set_promiscuity(struct net_device *dev, int inc)
6266{
6267	unsigned int old_flags = dev->flags;
6268	int err;
6269
6270	err = __dev_set_promiscuity(dev, inc, true);
6271	if (err < 0)
6272		return err;
6273	if (dev->flags != old_flags)
6274		dev_set_rx_mode(dev);
6275	return err;
6276}
6277EXPORT_SYMBOL(dev_set_promiscuity);
6278
6279static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6280{
6281	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6282
6283	ASSERT_RTNL();
6284
6285	dev->flags |= IFF_ALLMULTI;
6286	dev->allmulti += inc;
6287	if (dev->allmulti == 0) {
6288		/*
6289		 * Avoid overflow.
6290		 * If inc causes overflow, untouch allmulti and return error.
6291		 */
6292		if (inc < 0)
6293			dev->flags &= ~IFF_ALLMULTI;
6294		else {
6295			dev->allmulti -= inc;
6296			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6297				dev->name);
6298			return -EOVERFLOW;
6299		}
6300	}
6301	if (dev->flags ^ old_flags) {
6302		dev_change_rx_flags(dev, IFF_ALLMULTI);
6303		dev_set_rx_mode(dev);
6304		if (notify)
6305			__dev_notify_flags(dev, old_flags,
6306					   dev->gflags ^ old_gflags);
6307	}
6308	return 0;
6309}
6310
6311/**
6312 *	dev_set_allmulti	- update allmulti count on a device
6313 *	@dev: device
6314 *	@inc: modifier
6315 *
6316 *	Add or remove reception of all multicast frames to a device. While the
6317 *	count in the device remains above zero the interface remains listening
6318 *	to all interfaces. Once it hits zero the device reverts back to normal
6319 *	filtering operation. A negative @inc value is used to drop the counter
6320 *	when releasing a resource needing all multicasts.
6321 *	Return 0 if successful or a negative errno code on error.
6322 */
6323
6324int dev_set_allmulti(struct net_device *dev, int inc)
6325{
6326	return __dev_set_allmulti(dev, inc, true);
6327}
6328EXPORT_SYMBOL(dev_set_allmulti);
6329
6330/*
6331 *	Upload unicast and multicast address lists to device and
6332 *	configure RX filtering. When the device doesn't support unicast
6333 *	filtering it is put in promiscuous mode while unicast addresses
6334 *	are present.
6335 */
6336void __dev_set_rx_mode(struct net_device *dev)
6337{
6338	const struct net_device_ops *ops = dev->netdev_ops;
6339
6340	/* dev_open will call this function so the list will stay sane. */
6341	if (!(dev->flags&IFF_UP))
6342		return;
6343
6344	if (!netif_device_present(dev))
6345		return;
6346
6347	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6348		/* Unicast addresses changes may only happen under the rtnl,
6349		 * therefore calling __dev_set_promiscuity here is safe.
6350		 */
6351		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6352			__dev_set_promiscuity(dev, 1, false);
6353			dev->uc_promisc = true;
6354		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6355			__dev_set_promiscuity(dev, -1, false);
6356			dev->uc_promisc = false;
6357		}
6358	}
6359
6360	if (ops->ndo_set_rx_mode)
6361		ops->ndo_set_rx_mode(dev);
6362}
6363
/* Run __dev_set_rx_mode() on @dev with its address lock held (BH variant). */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
6370
6371/**
6372 *	dev_get_flags - get flags reported to userspace
6373 *	@dev: device
6374 *
6375 *	Get the combination of flag bits exported through APIs to userspace.
6376 */
6377unsigned int dev_get_flags(const struct net_device *dev)
6378{
6379	unsigned int flags;
6380
6381	flags = (dev->flags & ~(IFF_PROMISC |
6382				IFF_ALLMULTI |
6383				IFF_RUNNING |
6384				IFF_LOWER_UP |
6385				IFF_DORMANT)) |
6386		(dev->gflags & (IFF_PROMISC |
6387				IFF_ALLMULTI));
6388
6389	if (netif_running(dev)) {
6390		if (netif_oper_up(dev))
6391			flags |= IFF_RUNNING;
6392		if (netif_carrier_ok(dev))
6393			flags |= IFF_LOWER_UP;
6394		if (netif_dormant(dev))
6395			flags |= IFF_DORMANT;
6396	}
6397
6398	return flags;
6399}
6400EXPORT_SYMBOL(dev_get_flags);
6401
6402int __dev_change_flags(struct net_device *dev, unsigned int flags)
6403{
6404	unsigned int old_flags = dev->flags;
6405	int ret;
6406
6407	ASSERT_RTNL();
6408
6409	/*
6410	 *	Set the flags on our device.
6411	 */
6412
6413	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6414			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6415			       IFF_AUTOMEDIA)) |
6416		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6417				    IFF_ALLMULTI));
6418
6419	/*
6420	 *	Load in the correct multicast list now the flags have changed.
6421	 */
6422
6423	if ((old_flags ^ flags) & IFF_MULTICAST)
6424		dev_change_rx_flags(dev, IFF_MULTICAST);
6425
6426	dev_set_rx_mode(dev);
6427
6428	/*
6429	 *	Have we downed the interface. We handle IFF_UP ourselves
6430	 *	according to user attempts to set it, rather than blindly
6431	 *	setting it.
6432	 */
6433
6434	ret = 0;
6435	if ((old_flags ^ flags) & IFF_UP)
6436		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6437
6438	if ((flags ^ dev->gflags) & IFF_PROMISC) {
6439		int inc = (flags & IFF_PROMISC) ? 1 : -1;
6440		unsigned int old_flags = dev->flags;
6441
6442		dev->gflags ^= IFF_PROMISC;
6443
6444		if (__dev_set_promiscuity(dev, inc, false) >= 0)
6445			if (dev->flags != old_flags)
6446				dev_set_rx_mode(dev);
6447	}
6448
6449	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6450	   is important. Some (broken) drivers set IFF_PROMISC, when
6451	   IFF_ALLMULTI is requested not asking us and not reporting.
6452	 */
6453	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6454		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6455
6456		dev->gflags ^= IFF_ALLMULTI;
6457		__dev_set_allmulti(dev, inc, false);
6458	}
6459
6460	return ret;
6461}
6462
6463void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6464			unsigned int gchanges)
6465{
6466	unsigned int changes = dev->flags ^ old_flags;
6467
6468	if (gchanges)
6469		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6470
6471	if (changes & IFF_UP) {
6472		if (dev->flags & IFF_UP)
6473			call_netdevice_notifiers(NETDEV_UP, dev);
6474		else
6475			call_netdevice_notifiers(NETDEV_DOWN, dev);
6476	}
6477
6478	if (dev->flags & IFF_UP &&
6479	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6480		struct netdev_notifier_change_info change_info;
6481
6482		change_info.flags_changed = changes;
6483		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6484					      &change_info.info);
6485	}
6486}
6487
6488/**
6489 *	dev_change_flags - change device settings
6490 *	@dev: device
6491 *	@flags: device state flags
6492 *
6493 *	Change settings on device based state flags. The flags are
6494 *	in the userspace exported format.
6495 */
6496int dev_change_flags(struct net_device *dev, unsigned int flags)
6497{
6498	int ret;
6499	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6500
6501	ret = __dev_change_flags(dev, flags);
6502	if (ret < 0)
6503		return ret;
6504
6505	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6506	__dev_notify_flags(dev, old_flags, changes);
6507	return ret;
6508}
6509EXPORT_SYMBOL(dev_change_flags);
6510
6511static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6512{
6513	const struct net_device_ops *ops = dev->netdev_ops;
6514
6515	if (ops->ndo_change_mtu)
6516		return ops->ndo_change_mtu(dev, new_mtu);
6517
6518	dev->mtu = new_mtu;
6519	return 0;
6520}
6521
6522/**
6523 *	dev_set_mtu - Change maximum transfer unit
6524 *	@dev: device
6525 *	@new_mtu: new transfer unit
6526 *
6527 *	Change the maximum transfer size of the network device.
6528 */
6529int dev_set_mtu(struct net_device *dev, int new_mtu)
6530{
6531	int err, orig_mtu;
6532
6533	if (new_mtu == dev->mtu)
6534		return 0;
6535
6536	/* MTU must be positive, and in range */
6537	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
6538		net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
6539				    dev->name, new_mtu, dev->min_mtu);
6540		return -EINVAL;
6541	}
6542
6543	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
6544		net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
6545				    dev->name, new_mtu, dev->max_mtu);
6546		return -EINVAL;
6547	}
6548
6549	if (!netif_device_present(dev))
6550		return -ENODEV;
6551
6552	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6553	err = notifier_to_errno(err);
6554	if (err)
6555		return err;
6556
6557	orig_mtu = dev->mtu;
6558	err = __dev_set_mtu(dev, new_mtu);
6559
6560	if (!err) {
6561		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6562		err = notifier_to_errno(err);
6563		if (err) {
6564			/* setting mtu back and notifying everyone again,
6565			 * so that they have a chance to revert changes.
6566			 */
6567			__dev_set_mtu(dev, orig_mtu);
6568			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6569		}
6570	}
6571	return err;
6572}
6573EXPORT_SYMBOL(dev_set_mtu);
6574
6575/**
6576 *	dev_set_group - Change group this device belongs to
6577 *	@dev: device
6578 *	@new_group: group this device should belong to
6579 */
6580void dev_set_group(struct net_device *dev, int new_group)
6581{
6582	dev->group = new_group;
6583}
6584EXPORT_SYMBOL(dev_set_group);
6585
6586/**
6587 *	dev_set_mac_address - Change Media Access Control Address
6588 *	@dev: device
6589 *	@sa: new address
6590 *
6591 *	Change the hardware (MAC) address of the device
6592 */
6593int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6594{
6595	const struct net_device_ops *ops = dev->netdev_ops;
6596	int err;
6597
6598	if (!ops->ndo_set_mac_address)
6599		return -EOPNOTSUPP;
6600	if (sa->sa_family != dev->type)
6601		return -EINVAL;
6602	if (!netif_device_present(dev))
6603		return -ENODEV;
6604	err = ops->ndo_set_mac_address(dev, sa);
6605	if (err)
6606		return err;
6607	dev->addr_assign_type = NET_ADDR_SET;
6608	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6609	add_device_randomness(dev->dev_addr, dev->addr_len);
6610	return 0;
6611}
6612EXPORT_SYMBOL(dev_set_mac_address);
6613
6614/**
6615 *	dev_change_carrier - Change device carrier
6616 *	@dev: device
6617 *	@new_carrier: new value
6618 *
6619 *	Change device carrier
6620 */
6621int dev_change_carrier(struct net_device *dev, bool new_carrier)
6622{
6623	const struct net_device_ops *ops = dev->netdev_ops;
6624
6625	if (!ops->ndo_change_carrier)
6626		return -EOPNOTSUPP;
6627	if (!netif_device_present(dev))
6628		return -ENODEV;
6629	return ops->ndo_change_carrier(dev, new_carrier);
6630}
6631EXPORT_SYMBOL(dev_change_carrier);
6632
6633/**
6634 *	dev_get_phys_port_id - Get device physical port ID
6635 *	@dev: device
6636 *	@ppid: port ID
6637 *
6638 *	Get device physical port ID
6639 */
6640int dev_get_phys_port_id(struct net_device *dev,
6641			 struct netdev_phys_item_id *ppid)
6642{
6643	const struct net_device_ops *ops = dev->netdev_ops;
6644
6645	if (!ops->ndo_get_phys_port_id)
6646		return -EOPNOTSUPP;
6647	return ops->ndo_get_phys_port_id(dev, ppid);
6648}
6649EXPORT_SYMBOL(dev_get_phys_port_id);
6650
6651/**
6652 *	dev_get_phys_port_name - Get device physical port name
6653 *	@dev: device
6654 *	@name: port name
6655 *	@len: limit of bytes to copy to name
6656 *
6657 *	Get device physical port name
6658 */
6659int dev_get_phys_port_name(struct net_device *dev,
6660			   char *name, size_t len)
6661{
6662	const struct net_device_ops *ops = dev->netdev_ops;
6663
6664	if (!ops->ndo_get_phys_port_name)
6665		return -EOPNOTSUPP;
6666	return ops->ndo_get_phys_port_name(dev, name, len);
6667}
6668EXPORT_SYMBOL(dev_get_phys_port_name);
6669
6670/**
6671 *	dev_change_proto_down - update protocol port state information
6672 *	@dev: device
6673 *	@proto_down: new value
6674 *
6675 *	This info can be used by switch drivers to set the phys state of the
6676 *	port.
6677 */
6678int dev_change_proto_down(struct net_device *dev, bool proto_down)
6679{
6680	const struct net_device_ops *ops = dev->netdev_ops;
6681
6682	if (!ops->ndo_change_proto_down)
6683		return -EOPNOTSUPP;
6684	if (!netif_device_present(dev))
6685		return -ENODEV;
6686	return ops->ndo_change_proto_down(dev, proto_down);
6687}
6688EXPORT_SYMBOL(dev_change_proto_down);
6689
6690/**
6691 *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
6692 *	@dev: device
6693 *	@fd: new program fd or negative value to clear
6694 *	@flags: xdp-related flags
6695 *
6696 *	Set or clear a bpf program for a device
6697 */
6698int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags)
6699{
6700	const struct net_device_ops *ops = dev->netdev_ops;
6701	struct bpf_prog *prog = NULL;
6702	struct netdev_xdp xdp;
6703	int err;
6704
6705	ASSERT_RTNL();
6706
6707	if (!ops->ndo_xdp)
6708		return -EOPNOTSUPP;
6709	if (fd >= 0) {
6710		if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) {
6711			memset(&xdp, 0, sizeof(xdp));
6712			xdp.command = XDP_QUERY_PROG;
6713
6714			err = ops->ndo_xdp(dev, &xdp);
6715			if (err < 0)
6716				return err;
6717			if (xdp.prog_attached)
6718				return -EBUSY;
6719		}
6720
6721		prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
6722		if (IS_ERR(prog))
6723			return PTR_ERR(prog);
6724	}
6725
6726	memset(&xdp, 0, sizeof(xdp));
6727	xdp.command = XDP_SETUP_PROG;
6728	xdp.prog = prog;
6729
6730	err = ops->ndo_xdp(dev, &xdp);
6731	if (err < 0 && prog)
6732		bpf_prog_put(prog);
6733
6734	return err;
6735}
6736EXPORT_SYMBOL(dev_change_xdp_fd);
6737
6738/**
6739 *	dev_new_index	-	allocate an ifindex
6740 *	@net: the applicable net namespace
6741 *
6742 *	Returns a suitable unique value for a new device interface
6743 *	number.  The caller must hold the rtnl semaphore or the
6744 *	dev_base_lock to be sure it remains unique.
6745 */
6746static int dev_new_index(struct net *net)
6747{
6748	int ifindex = net->ifindex;
6749	for (;;) {
6750		if (++ifindex <= 0)
6751			ifindex = 1;
6752		if (!__dev_get_by_index(net, ifindex))
6753			return net->ifindex = ifindex;
6754	}
6755}
6756
6757/* Delayed registration/unregisteration */
6758static LIST_HEAD(net_todo_list);
6759DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6760
6761static void net_set_todo(struct net_device *dev)
6762{
6763	list_add_tail(&dev->todo_list, &net_todo_list);
6764	dev_net(dev)->dev_unreg_count++;
6765}
6766
6767static void rollback_registered_many(struct list_head *head)
6768{
6769	struct net_device *dev, *tmp;
6770	LIST_HEAD(close_head);
6771
6772	BUG_ON(dev_boot_phase);
6773	ASSERT_RTNL();
6774
6775	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6776		/* Some devices call without registering
6777		 * for initialization unwind. Remove those
6778		 * devices and proceed with the remaining.
6779		 */
6780		if (dev->reg_state == NETREG_UNINITIALIZED) {
6781			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6782				 dev->name, dev);
6783
6784			WARN_ON(1);
6785			list_del(&dev->unreg_list);
6786			continue;
6787		}
6788		dev->dismantle = true;
6789		BUG_ON(dev->reg_state != NETREG_REGISTERED);
6790	}
6791
6792	/* If device is running, close it first. */
6793	list_for_each_entry(dev, head, unreg_list)
6794		list_add_tail(&dev->close_list, &close_head);
6795	dev_close_many(&close_head, true);
6796
6797	list_for_each_entry(dev, head, unreg_list) {
6798		/* And unlink it from device chain. */
6799		unlist_netdevice(dev);
6800
6801		dev->reg_state = NETREG_UNREGISTERING;
6802	}
6803	flush_all_backlogs();
6804
6805	synchronize_net();
6806
6807	list_for_each_entry(dev, head, unreg_list) {
6808		struct sk_buff *skb = NULL;
6809
6810		/* Shutdown queueing discipline. */
6811		dev_shutdown(dev);
6812
6813
6814		/* Notify protocols, that we are about to destroy
6815		   this device. They should clean all the things.
6816		*/
6817		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6818
6819		if (!dev->rtnl_link_ops ||
6820		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6821			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6822						     GFP_KERNEL);
6823
6824		/*
6825		 *	Flush the unicast and multicast chains
6826		 */
6827		dev_uc_flush(dev);
6828		dev_mc_flush(dev);
6829
6830		if (dev->netdev_ops->ndo_uninit)
6831			dev->netdev_ops->ndo_uninit(dev);
6832
6833		if (skb)
6834			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6835
6836		/* Notifier chain MUST detach us all upper devices. */
6837		WARN_ON(netdev_has_any_upper_dev(dev));
6838		WARN_ON(netdev_has_any_lower_dev(dev));
6839
6840		/* Remove entries from kobject tree */
6841		netdev_unregister_kobject(dev);
6842#ifdef CONFIG_XPS
6843		/* Remove XPS queueing entries */
6844		netif_reset_xps_queues_gt(dev, 0);
6845#endif
6846	}
6847
6848	synchronize_net();
6849
6850	list_for_each_entry(dev, head, unreg_list)
6851		dev_put(dev);
6852}
6853
6854static void rollback_registered(struct net_device *dev)
6855{
6856	LIST_HEAD(single);
6857
6858	list_add(&dev->unreg_list, &single);
6859	rollback_registered_many(&single);
6860	list_del(&single);
6861}
6862
6863static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6864	struct net_device *upper, netdev_features_t features)
6865{
6866	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6867	netdev_features_t feature;
6868	int feature_bit;
6869
6870	for_each_netdev_feature(&upper_disables, feature_bit) {
6871		feature = __NETIF_F_BIT(feature_bit);
6872		if (!(upper->wanted_features & feature)
6873		    && (features & feature)) {
6874			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6875				   &feature, upper->name);
6876			features &= ~feature;
6877		}
6878	}
6879
6880	return features;
6881}
6882
6883static void netdev_sync_lower_features(struct net_device *upper,
6884	struct net_device *lower, netdev_features_t features)
6885{
6886	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6887	netdev_features_t feature;
6888	int feature_bit;
6889
6890	for_each_netdev_feature(&upper_disables, feature_bit) {
6891		feature = __NETIF_F_BIT(feature_bit);
6892		if (!(features & feature) && (lower->features & feature)) {
6893			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6894				   &feature, lower->name);
6895			lower->wanted_features &= ~feature;
6896			netdev_update_features(lower);
6897
6898			if (unlikely(lower->features & feature))
6899				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6900					    &feature, lower->name);
6901		}
6902	}
6903}
6904
6905static netdev_features_t netdev_fix_features(struct net_device *dev,
6906	netdev_features_t features)
6907{
6908	/* Fix illegal checksum combinations */
6909	if ((features & NETIF_F_HW_CSUM) &&
6910	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6911		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6912		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6913	}
6914
6915	/* TSO requires that SG is present as well. */
6916	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6917		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6918		features &= ~NETIF_F_ALL_TSO;
6919	}
6920
6921	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6922					!(features & NETIF_F_IP_CSUM)) {
6923		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6924		features &= ~NETIF_F_TSO;
6925		features &= ~NETIF_F_TSO_ECN;
6926	}
6927
6928	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6929					 !(features & NETIF_F_IPV6_CSUM)) {
6930		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6931		features &= ~NETIF_F_TSO6;
6932	}
6933
6934	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
6935	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
6936		features &= ~NETIF_F_TSO_MANGLEID;
6937
6938	/* TSO ECN requires that TSO is present as well. */
6939	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6940		features &= ~NETIF_F_TSO_ECN;
6941
6942	/* Software GSO depends on SG. */
6943	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6944		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6945		features &= ~NETIF_F_GSO;
6946	}
6947
6948	/* UFO needs SG and checksumming */
6949	if (features & NETIF_F_UFO) {
6950		/* maybe split UFO into V4 and V6? */
6951		if (!(features & NETIF_F_HW_CSUM) &&
6952		    ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
6953		     (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
6954			netdev_dbg(dev,
6955				"Dropping NETIF_F_UFO since no checksum offload features.\n");
6956			features &= ~NETIF_F_UFO;
6957		}
6958
6959		if (!(features & NETIF_F_SG)) {
6960			netdev_dbg(dev,
6961				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6962			features &= ~NETIF_F_UFO;
6963		}
6964	}
6965
6966	/* GSO partial features require GSO partial be set */
6967	if ((features & dev->gso_partial_features) &&
6968	    !(features & NETIF_F_GSO_PARTIAL)) {
6969		netdev_dbg(dev,
6970			   "Dropping partially supported GSO features since no GSO partial.\n");
6971		features &= ~dev->gso_partial_features;
6972	}
6973
6974#ifdef CONFIG_NET_RX_BUSY_POLL
6975	if (dev->netdev_ops->ndo_busy_poll)
6976		features |= NETIF_F_BUSY_POLL;
6977	else
6978#endif
6979		features &= ~NETIF_F_BUSY_POLL;
6980
6981	return features;
6982}
6983
6984int __netdev_update_features(struct net_device *dev)
6985{
6986	struct net_device *upper, *lower;
6987	netdev_features_t features;
6988	struct list_head *iter;
6989	int err = -1;
6990
6991	ASSERT_RTNL();
6992
6993	features = netdev_get_wanted_features(dev);
6994
6995	if (dev->netdev_ops->ndo_fix_features)
6996		features = dev->netdev_ops->ndo_fix_features(dev, features);
6997
6998	/* driver might be less strict about feature dependencies */
6999	features = netdev_fix_features(dev, features);
7000
7001	/* some features can't be enabled if they're off an an upper device */
7002	netdev_for_each_upper_dev_rcu(dev, upper, iter)
7003		features = netdev_sync_upper_features(dev, upper, features);
7004
7005	if (dev->features == features)
7006		goto sync_lower;
7007
7008	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
7009		&dev->features, &features);
7010
7011	if (dev->netdev_ops->ndo_set_features)
7012		err = dev->netdev_ops->ndo_set_features(dev, features);
7013	else
7014		err = 0;
7015
7016	if (unlikely(err < 0)) {
7017		netdev_err(dev,
7018			"set_features() failed (%d); wanted %pNF, left %pNF\n",
7019			err, &features, &dev->features);
7020		/* return non-0 since some features might have changed and
7021		 * it's better to fire a spurious notification than miss it
7022		 */
7023		return -1;
7024	}
7025
7026sync_lower:
7027	/* some features must be disabled on lower devices when disabled
7028	 * on an upper device (think: bonding master or bridge)
7029	 */
7030	netdev_for_each_lower_dev(dev, lower, iter)
7031		netdev_sync_lower_features(dev, lower, features);
7032
7033	if (!err)
7034		dev->features = features;
7035
7036	return err < 0 ? 0 : 1;
7037}
7038
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Runs __netdev_update_features() and calls netdev_features_change()
 *	when that returns a non-zero value. Should be called after driver
 *	or hardware dependent conditions might have changed that influence
 *	the features.
 */
void netdev_update_features(struct net_device *dev)
{
	int changed = __netdev_update_features(dev);

	if (changed)
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
7053
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Runs __netdev_update_features() and then calls
 *	netdev_features_change() unconditionally, whatever the update
 *	returned. Should be called instead of netdev_update_features()
 *	if also dev->vlan_features might have changed to allow the changes
 *	to be propagated to stacked VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	(void)__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
7070
7071/**
7072 *	netif_stacked_transfer_operstate -	transfer operstate
7073 *	@rootdev: the root or lower level device to transfer state from
7074 *	@dev: the device to transfer operstate to
7075 *
7076 *	Transfer operational state from root to device. This is normally
7077 *	called when a stacking relationship exists between the root
7078 *	device and the device(a leaf device).
7079 */
7080void netif_stacked_transfer_operstate(const struct net_device *rootdev,
7081					struct net_device *dev)
7082{
7083	if (rootdev->operstate == IF_OPER_DORMANT)
7084		netif_dormant_on(dev);
7085	else
7086		netif_dormant_off(dev);
7087
7088	if (netif_carrier_ok(rootdev)) {
7089		if (!netif_carrier_ok(dev))
7090			netif_carrier_on(dev);
7091	} else {
7092		if (netif_carrier_ok(dev))
7093			netif_carrier_off(dev);
7094	}
7095}
7096EXPORT_SYMBOL(netif_stacked_transfer_operstate);
7097
#ifdef CONFIG_SYSFS
/*
 * Allocate the zeroed array of dev->num_rx_queues RX queue structures,
 * store it in dev->_rx and point every entry back at @dev. kzalloc() is
 * tried first with vzalloc() as fallback. The queue count must be at
 * least one (BUG otherwise).
 *
 * Returns 0 on success or -ENOMEM when both allocators fail.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int nr_queues = dev->num_rx_queues;
	struct netdev_rx_queue *queues;
	size_t bytes = nr_queues * sizeof(*queues);
	unsigned int q;

	BUG_ON(nr_queues < 1);

	queues = kzalloc(bytes, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!queues)
		queues = vzalloc(bytes);
	if (!queues)
		return -ENOMEM;

	dev->_rx = queues;

	for (q = 0; q < nr_queues; q++)
		queues[q].dev = dev;

	return 0;
}
#endif
7120
7121static void netdev_init_one_queue(struct net_device *dev,
7122				  struct netdev_queue *queue, void *_unused)
7123{
7124	/* Initialize queue lock */
7125	spin_lock_init(&queue->_xmit_lock);
7126	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
7127	queue->xmit_lock_owner = -1;
7128	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
7129	queue->dev = dev;
7130#ifdef CONFIG_BQL
7131	dql_init(&queue->dql, HZ);
7132#endif
7133}
7134
7135static void netif_free_tx_queues(struct net_device *dev)
7136{
7137	kvfree(dev->_tx);
7138}
7139
7140static int netif_alloc_netdev_queues(struct net_device *dev)
7141{
7142	unsigned int count = dev->num_tx_queues;
7143	struct netdev_queue *tx;
7144	size_t sz = count * sizeof(*tx);
7145
7146	if (count < 1 || count > 0xffff)
7147		return -EINVAL;
7148
7149	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7150	if (!tx) {
7151		tx = vzalloc(sz);
7152		if (!tx)
7153			return -ENOMEM;
7154	}
7155	dev->_tx = tx;
7156
7157	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
7158	spin_lock_init(&dev->tx_global_lock);
7159
7160	return 0;
7161}
7162
7163void netif_tx_stop_all_queues(struct net_device *dev)
7164{
7165	unsigned int i;
7166
7167	for (i = 0; i < dev->num_tx_queues; i++) {
7168		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
7169		netif_tx_stop_queue(txq);
7170	}
7171}
7172EXPORT_SYMBOL(netif_tx_stop_all_queues);
7173
7174/**
7175 *	register_netdevice	- register a network device
7176 *	@dev: device to register
7177 *
7178 *	Take a completed network device structure and add it to the kernel
7179 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7180 *	chain. 0 is returned on success. A negative errno code is returned
7181 *	on a failure to set up the device, or if the name is a duplicate.
7182 *
7183 *	Callers must hold the rtnl semaphore. You may want
7184 *	register_netdev() instead of this.
7185 *
7186 *	BUGS:
7187 *	The locking appears insufficient to guarantee two parallel registers
7188 *	will not get the same name.
7189 */
7190
7191int register_netdevice(struct net_device *dev)
7192{
7193	int ret;
7194	struct net *net = dev_net(dev);
7195
7196	BUG_ON(dev_boot_phase);
7197	ASSERT_RTNL();
7198
7199	might_sleep();
7200
7201	/* When net_device's are persistent, this will be fatal. */
7202	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
7203	BUG_ON(!net);
7204
7205	spin_lock_init(&dev->addr_list_lock);
7206	netdev_set_addr_lockdep_class(dev);
7207
7208	ret = dev_get_valid_name(net, dev, dev->name);
7209	if (ret < 0)
7210		goto out;
7211
7212	/* Init, if this function is available */
7213	if (dev->netdev_ops->ndo_init) {
7214		ret = dev->netdev_ops->ndo_init(dev);
7215		if (ret) {
7216			if (ret > 0)
7217				ret = -EIO;
7218			goto out;
7219		}
7220	}
7221
7222	if (((dev->hw_features | dev->features) &
7223	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
7224	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7225	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7226		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7227		ret = -EINVAL;
7228		goto err_uninit;
7229	}
7230
7231	ret = -EBUSY;
7232	if (!dev->ifindex)
7233		dev->ifindex = dev_new_index(net);
7234	else if (__dev_get_by_index(net, dev->ifindex))
7235		goto err_uninit;
7236
7237	/* Transfer changeable features to wanted_features and enable
7238	 * software offloads (GSO and GRO).
7239	 */
7240	dev->hw_features |= NETIF_F_SOFT_FEATURES;
7241	dev->features |= NETIF_F_SOFT_FEATURES;
7242	dev->wanted_features = dev->features & dev->hw_features;
7243
7244	if (!(dev->flags & IFF_LOOPBACK))
7245		dev->hw_features |= NETIF_F_NOCACHE_COPY;
7246
7247	/* If IPv4 TCP segmentation offload is supported we should also
7248	 * allow the device to enable segmenting the frame with the option
7249	 * of ignoring a static IP ID value.  This doesn't enable the
7250	 * feature itself but allows the user to enable it later.
7251	 */
7252	if (dev->hw_features & NETIF_F_TSO)
7253		dev->hw_features |= NETIF_F_TSO_MANGLEID;
7254	if (dev->vlan_features & NETIF_F_TSO)
7255		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
7256	if (dev->mpls_features & NETIF_F_TSO)
7257		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
7258	if (dev->hw_enc_features & NETIF_F_TSO)
7259		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
7260
7261	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
7262	 */
7263	dev->vlan_features |= NETIF_F_HIGHDMA;
7264
7265	/* Make NETIF_F_SG inheritable to tunnel devices.
7266	 */
7267	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
7268
7269	/* Make NETIF_F_SG inheritable to MPLS.
7270	 */
7271	dev->mpls_features |= NETIF_F_SG;
7272
7273	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7274	ret = notifier_to_errno(ret);
7275	if (ret)
7276		goto err_uninit;
7277
7278	ret = netdev_register_kobject(dev);
7279	if (ret)
7280		goto err_uninit;
7281	dev->reg_state = NETREG_REGISTERED;
7282
7283	__netdev_update_features(dev);
7284
7285	/*
7286	 *	Default initial state at registry is that the
7287	 *	device is present.
7288	 */
7289
7290	set_bit(__LINK_STATE_PRESENT, &dev->state);
7291
7292	linkwatch_init_dev(dev);
7293
7294	dev_init_scheduler(dev);
7295	dev_hold(dev);
7296	list_netdevice(dev);
7297	add_device_randomness(dev->dev_addr, dev->addr_len);
7298
7299	/* If the device has permanent device address, driver should
7300	 * set dev_addr and also addr_assign_type should be set to
7301	 * NET_ADDR_PERM (default value).
7302	 */
7303	if (dev->addr_assign_type == NET_ADDR_PERM)
7304		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7305
7306	/* Notify protocols, that a new device appeared. */
7307	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7308	ret = notifier_to_errno(ret);
7309	if (ret) {
7310		rollback_registered(dev);
7311		dev->reg_state = NETREG_UNREGISTERED;
7312	}
7313	/*
7314	 *	Prevent userspace races by waiting until the network
7315	 *	device is fully setup before sending notifications.
7316	 */
7317	if (!dev->rtnl_link_ops ||
7318	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7319		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7320
7321out:
7322	return ret;
7323
7324err_uninit:
7325	if (dev->netdev_ops->ndo_uninit)
7326		dev->netdev_ops->ndo_uninit(dev);
7327	goto out;
7328}
7329EXPORT_SYMBOL(register_netdevice);
7330
7331/**
7332 *	init_dummy_netdev	- init a dummy network device for NAPI
7333 *	@dev: device to init
7334 *
7335 *	This takes a network device structure and initialize the minimum
7336 *	amount of fields so it can be used to schedule NAPI polls without
7337 *	registering a full blown interface. This is to be used by drivers
7338 *	that need to tie several hardware interfaces to a single NAPI
7339 *	poll scheduler due to HW limitations.
7340 */
7341int init_dummy_netdev(struct net_device *dev)
7342{
7343	/* Clear everything. Note we don't initialize spinlocks
7344	 * are they aren't supposed to be taken by any of the
7345	 * NAPI code and this dummy netdev is supposed to be
7346	 * only ever used for NAPI polls
7347	 */
7348	memset(dev, 0, sizeof(struct net_device));
7349
7350	/* make sure we BUG if trying to hit standard
7351	 * register/unregister code path
7352	 */
7353	dev->reg_state = NETREG_DUMMY;
7354
7355	/* NAPI wants this */
7356	INIT_LIST_HEAD(&dev->napi_list);
7357
7358	/* a dummy interface is started by default */
7359	set_bit(__LINK_STATE_PRESENT, &dev->state);
7360	set_bit(__LINK_STATE_START, &dev->state);
7361
7362	/* Note : We dont allocate pcpu_refcnt for dummy devices,
7363	 * because users of this 'device' dont need to change
7364	 * its refcount.
7365	 */
7366
7367	return 0;
7368}
7369EXPORT_SYMBOL_GPL(init_dummy_netdev);
7370
7371
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is a wrapper around register_netdevice() that takes the rtnl
 *	semaphore for the duration of the call. The device name is
 *	expanded there if you passed a format string to alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int rc;

	rtnl_lock();
	rc = register_netdevice(dev);
	rtnl_unlock();

	return rc;
}
EXPORT_SYMBOL(register_netdev);
7395
7396int netdev_refcnt_read(const struct net_device *dev)
7397{
7398	int i, refcnt = 0;
7399
7400	for_each_possible_cpu(i)
7401		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7402	return refcnt;
7403}
7404EXPORT_SYMBOL(netdev_refcnt_read);
7405
7406/**
7407 * netdev_wait_allrefs - wait until all references are gone.
7408 * @dev: target net_device
7409 *
7410 * This is called when unregistering network devices.
7411 *
7412 * Any protocol or device that holds a reference should register
7413 * for netdevice notification, and cleanup and put back the
7414 * reference if they receive an UNREGISTER event.
7415 * We can get stuck here if buggy protocols don't correctly
7416 * call dev_put.
7417 */
7418static void netdev_wait_allrefs(struct net_device *dev)
7419{
7420	unsigned long rebroadcast_time, warning_time;
7421	int refcnt;
7422
7423	linkwatch_forget_dev(dev);
7424
7425	rebroadcast_time = warning_time = jiffies;
7426	refcnt = netdev_refcnt_read(dev);
7427
7428	while (refcnt != 0) {
7429		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
7430			rtnl_lock();
7431
7432			/* Rebroadcast unregister notification */
7433			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7434
7435			__rtnl_unlock();
7436			rcu_barrier();
7437			rtnl_lock();
7438
7439			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7440			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
7441				     &dev->state)) {
7442				/* We must not have linkwatch events
7443				 * pending on unregister. If this
7444				 * happens, we simply run the queue
7445				 * unscheduled, resulting in a noop
7446				 * for this device.
7447				 */
7448				linkwatch_run_queue();
7449			}
7450
7451			__rtnl_unlock();
7452
7453			rebroadcast_time = jiffies;
7454		}
7455
7456		msleep(250);
7457
7458		refcnt = netdev_refcnt_read(dev);
7459
7460		if (time_after(jiffies, warning_time + 10 * HZ)) {
7461			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7462				 dev->name, refcnt);
7463			warning_time = jiffies;
7464		}
7465	}
7466}
7467
7468/* The sequence is:
7469 *
7470 *	rtnl_lock();
7471 *	...
7472 *	register_netdevice(x1);
7473 *	register_netdevice(x2);
7474 *	...
7475 *	unregister_netdevice(y1);
7476 *	unregister_netdevice(y2);
7477 *      ...
7478 *	rtnl_unlock();
7479 *	free_netdev(y1);
7480 *	free_netdev(y2);
7481 *
7482 * We are invoked by rtnl_unlock().
7483 * This allows us to deal with problems:
7484 * 1) We can delete sysfs objects which invoke hotplug
7485 *    without deadlocking with linkwatch via keventd.
7486 * 2) Since we run with the RTNL semaphore not held, we can sleep
7487 *    safely in order to wait for the netdev refcnt to drop to zero.
7488 *
7489 * We must not return until all unregister events added during
7490 * the interval the lock was held have been completed.
7491 */
7492void netdev_run_todo(void)
7493{
7494	struct list_head list;
7495
7496	/* Snapshot list, allow later requests */
7497	list_replace_init(&net_todo_list, &list);
7498
7499	__rtnl_unlock();
7500
7501
7502	/* Wait for rcu callbacks to finish before next phase */
7503	if (!list_empty(&list))
7504		rcu_barrier();
7505
7506	while (!list_empty(&list)) {
7507		struct net_device *dev
7508			= list_first_entry(&list, struct net_device, todo_list);
7509		list_del(&dev->todo_list);
7510
7511		rtnl_lock();
7512		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7513		__rtnl_unlock();
7514
7515		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7516			pr_err("network todo '%s' but state %d\n",
7517			       dev->name, dev->reg_state);
7518			dump_stack();
7519			continue;
7520		}
7521
7522		dev->reg_state = NETREG_UNREGISTERED;
7523
7524		netdev_wait_allrefs(dev);
7525
7526		/* paranoia */
7527		BUG_ON(netdev_refcnt_read(dev));
7528		BUG_ON(!list_empty(&dev->ptype_all));
7529		BUG_ON(!list_empty(&dev->ptype_specific));
7530		WARN_ON(rcu_access_pointer(dev->ip_ptr));
7531		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7532		WARN_ON(dev->dn_ptr);
7533
7534		if (dev->destructor)
7535			dev->destructor(dev);
7536
7537		/* Report a network device has been unregistered */
7538		rtnl_lock();
7539		dev_net(dev)->dev_unreg_count--;
7540		__rtnl_unlock();
7541		wake_up(&netdev_unregistering_wq);
7542
7543		/* Free network device */
7544		kobject_put(&dev->dev.kobj);
7545	}
7546}
7547
7548/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7549 * all the same fields in the same order as net_device_stats, with only
7550 * the type differing, but rtnl_link_stats64 may have additional fields
7551 * at the end for newer counters.
7552 */
7553void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7554			     const struct net_device_stats *netdev_stats)
7555{
7556#if BITS_PER_LONG == 64
7557	BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
7558	memcpy(stats64, netdev_stats, sizeof(*stats64));
7559	/* zero out counters that only exist in rtnl_link_stats64 */
7560	memset((char *)stats64 + sizeof(*netdev_stats), 0,
7561	       sizeof(*stats64) - sizeof(*netdev_stats));
7562#else
7563	size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
7564	const unsigned long *src = (const unsigned long *)netdev_stats;
7565	u64 *dst = (u64 *)stats64;
7566
7567	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
7568	for (i = 0; i < n; i++)
7569		dst[i] = src[i];
7570	/* zero out counters that only exist in rtnl_link_stats64 */
7571	memset((char *)stats64 + n * sizeof(u64), 0,
7572	       sizeof(*stats64) - n * sizeof(u64));
7573#endif
7574}
7575EXPORT_SYMBOL(netdev_stats_to_stats64);
7576
7577/**
7578 *	dev_get_stats	- get network device statistics
7579 *	@dev: device to get statistics from
7580 *	@storage: place to store stats
7581 *
7582 *	Get network statistics from device. Return @storage.
7583 *	The device driver may provide its own method by setting
7584 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7585 *	otherwise the internal statistics structure is used.
7586 */
7587struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7588					struct rtnl_link_stats64 *storage)
7589{
7590	const struct net_device_ops *ops = dev->netdev_ops;
7591
7592	if (ops->ndo_get_stats64) {
7593		memset(storage, 0, sizeof(*storage));
7594		ops->ndo_get_stats64(dev, storage);
7595	} else if (ops->ndo_get_stats) {
7596		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7597	} else {
7598		netdev_stats_to_stats64(storage, &dev->stats);
7599	}
7600	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7601	storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7602	storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
7603	return storage;
7604}
7605EXPORT_SYMBOL(dev_get_stats);
7606
/*
 * Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT a missing
 * queue is allocated, attached to noop_qdisc and published on the device;
 * NULL is returned if that allocation fails.  Without CONFIG_NET_CLS_ACT
 * whatever dev_ingress_queue() reports is returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			RCU_INIT_POINTER(ingress->qdisc, &noop_qdisc);
			ingress->qdisc_sleeping = &noop_qdisc;
			/* publish only once the queue is fully set up */
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
7624
7625static const struct ethtool_ops default_ethtool_ops;
7626
7627void netdev_set_default_ethtool_ops(struct net_device *dev,
7628				    const struct ethtool_ops *ops)
7629{
7630	if (dev->ethtool_ops == &default_ethtool_ops)
7631		dev->ethtool_ops = ops;
7632}
7633EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7634
7635void netdev_freemem(struct net_device *dev)
7636{
7637	char *addr = (char *)dev - dev->padded;
7638
7639	kvfree(addr);
7640}
7641
7642/**
7643 *	alloc_netdev_mqs - allocate network device
7644 *	@sizeof_priv:		size of private data to allocate space for
7645 *	@name:			device name format string
7646 *	@name_assign_type: 	origin of device name
7647 *	@setup:			callback to initialize device
7648 *	@txqs:			the number of TX subqueues to allocate
7649 *	@rxqs:			the number of RX subqueues to allocate
7650 *
7651 *	Allocates a struct net_device with private data area for driver use
7652 *	and performs basic initialization.  Also allocates subqueue structs
7653 *	for each queue on the device.
7654 */
7655struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7656		unsigned char name_assign_type,
7657		void (*setup)(struct net_device *),
7658		unsigned int txqs, unsigned int rxqs)
7659{
7660	struct net_device *dev;
7661	size_t alloc_size;
7662	struct net_device *p;
7663
7664	BUG_ON(strlen(name) >= sizeof(dev->name));
7665
7666	if (txqs < 1) {
7667		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7668		return NULL;
7669	}
7670
7671#ifdef CONFIG_SYSFS
7672	if (rxqs < 1) {
7673		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7674		return NULL;
7675	}
7676#endif
7677
7678	alloc_size = sizeof(struct net_device);
7679	if (sizeof_priv) {
7680		/* ensure 32-byte alignment of private area */
7681		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7682		alloc_size += sizeof_priv;
7683	}
7684	/* ensure 32-byte alignment of whole construct */
7685	alloc_size += NETDEV_ALIGN - 1;
7686
7687	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7688	if (!p)
7689		p = vzalloc(alloc_size);
7690	if (!p)
7691		return NULL;
7692
7693	dev = PTR_ALIGN(p, NETDEV_ALIGN);
7694	dev->padded = (char *)dev - (char *)p;
7695
7696	dev->pcpu_refcnt = alloc_percpu(int);
7697	if (!dev->pcpu_refcnt)
7698		goto free_dev;
7699
7700	if (dev_addr_init(dev))
7701		goto free_pcpu;
7702
7703	dev_mc_init(dev);
7704	dev_uc_init(dev);
7705
7706	dev_net_set(dev, &init_net);
7707
7708	dev->gso_max_size = GSO_MAX_SIZE;
7709	dev->gso_max_segs = GSO_MAX_SEGS;
7710
7711	INIT_LIST_HEAD(&dev->napi_list);
7712	INIT_LIST_HEAD(&dev->unreg_list);
7713	INIT_LIST_HEAD(&dev->close_list);
7714	INIT_LIST_HEAD(&dev->link_watch_list);
7715	INIT_LIST_HEAD(&dev->adj_list.upper);
7716	INIT_LIST_HEAD(&dev->adj_list.lower);
7717	INIT_LIST_HEAD(&dev->ptype_all);
7718	INIT_LIST_HEAD(&dev->ptype_specific);
7719#ifdef CONFIG_NET_SCHED
7720	hash_init(dev->qdisc_hash);
7721#endif
7722	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7723	setup(dev);
7724
7725	if (!dev->tx_queue_len) {
7726		dev->priv_flags |= IFF_NO_QUEUE;
7727		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
7728	}
7729
7730	dev->num_tx_queues = txqs;
7731	dev->real_num_tx_queues = txqs;
7732	if (netif_alloc_netdev_queues(dev))
7733		goto free_all;
7734
7735#ifdef CONFIG_SYSFS
7736	dev->num_rx_queues = rxqs;
7737	dev->real_num_rx_queues = rxqs;
7738	if (netif_alloc_rx_queues(dev))
7739		goto free_all;
7740#endif
7741
7742	strcpy(dev->name, name);
7743	dev->name_assign_type = name_assign_type;
7744	dev->group = INIT_NETDEV_GROUP;
7745	if (!dev->ethtool_ops)
7746		dev->ethtool_ops = &default_ethtool_ops;
7747
7748	nf_hook_ingress_init(dev);
7749
7750	return dev;
7751
7752free_all:
7753	free_netdev(dev);
7754	return NULL;
7755
7756free_pcpu:
7757	free_percpu(dev->pcpu_refcnt);
7758free_dev:
7759	netdev_freemem(dev);
7760	return NULL;
7761}
7762EXPORT_SYMBOL(alloc_netdev_mqs);
7763
7764/**
7765 *	free_netdev - free network device
7766 *	@dev: device
7767 *
7768 *	This function does the last stage of destroying an allocated device
7769 * 	interface. The reference to the device object is released.
7770 *	If this is the last reference then it will be freed.
7771 *	Must be called in process context.
7772 */
7773void free_netdev(struct net_device *dev)
7774{
7775	struct napi_struct *p, *n;
7776
7777	might_sleep();
7778	netif_free_tx_queues(dev);
7779#ifdef CONFIG_SYSFS
7780	kvfree(dev->_rx);
7781#endif
7782
7783	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7784
7785	/* Flush device addresses */
7786	dev_addr_flush(dev);
7787
7788	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7789		netif_napi_del(p);
7790
7791	free_percpu(dev->pcpu_refcnt);
7792	dev->pcpu_refcnt = NULL;
7793
7794	/*  Compatibility with error handling in drivers */
7795	if (dev->reg_state == NETREG_UNINITIALIZED) {
7796		netdev_freemem(dev);
7797		return;
7798	}
7799
7800	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7801	dev->reg_state = NETREG_RELEASED;
7802
7803	/* will free via device release */
7804	put_device(&dev->dev);
7805}
7806EXPORT_SYMBOL(free_netdev);
7807
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *	Uses the expedited RCU grace period when the rtnl lock is held.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked())
		synchronize_rcu();
	else
		synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
7823
7824/**
7825 *	unregister_netdevice_queue - remove device from the kernel
7826 *	@dev: device
7827 *	@head: list
7828 *
7829 *	This function shuts down a device interface and removes it
7830 *	from the kernel tables.
7831 *	If head not NULL, device is queued to be unregistered later.
7832 *
7833 *	Callers must hold the rtnl semaphore.  You may want
7834 *	unregister_netdev() instead of this.
7835 */
7836
7837void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7838{
7839	ASSERT_RTNL();
7840
7841	if (head) {
7842		list_move_tail(&dev->unreg_list, head);
7843	} else {
7844		rollback_registered(dev);
7845		/* Finish processing unregister after unlock */
7846		net_set_todo(dev);
7847	}
7848}
7849EXPORT_SYMBOL(unregister_netdevice_queue);
7850
7851/**
7852 *	unregister_netdevice_many - unregister many devices
7853 *	@head: list of devices
7854 *
7855 *  Note: As most callers use a stack allocated list_head,
7856 *  we force a list_del() to make sure stack wont be corrupted later.
7857 */
7858void unregister_netdevice_many(struct list_head *head)
7859{
7860	struct net_device *dev;
7861
7862	if (!list_empty(head)) {
7863		rollback_registered_many(head);
7864		list_for_each_entry(dev, head, unreg_list)
7865			net_set_todo(dev);
7866		list_del(head);
7867	}
7868}
7869EXPORT_SYMBOL(unregister_netdevice_many);
7870
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shuts down a device interface and removes it from the kernel
 *	tables.
 *
 *	Convenience wrapper: takes the rtnl semaphore, calls
 *	unregister_netdevice() and releases the semaphore again.  In
 *	general you want to use this and not unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
7889
7890/**
7891 *	dev_change_net_namespace - move device to different nethost namespace
7892 *	@dev: device
7893 *	@net: network namespace
7894 *	@pat: If not NULL name pattern to try if the current device name
7895 *	      is already taken in the destination network namespace.
7896 *
7897 *	This function shuts down a device interface and moves it
7898 *	to a new network namespace. On success 0 is returned, on
7899 *	a failure a netagive errno code is returned.
7900 *
7901 *	Callers must hold the rtnl semaphore.
7902 */
7903
7904int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7905{
7906	int err;
7907
7908	ASSERT_RTNL();
7909
7910	/* Don't allow namespace local devices to be moved. */
7911	err = -EINVAL;
7912	if (dev->features & NETIF_F_NETNS_LOCAL)
7913		goto out;
7914
7915	/* Ensure the device has been registrered */
7916	if (dev->reg_state != NETREG_REGISTERED)
7917		goto out;
7918
7919	/* Get out if there is nothing todo */
7920	err = 0;
7921	if (net_eq(dev_net(dev), net))
7922		goto out;
7923
7924	/* Pick the destination device name, and ensure
7925	 * we can use it in the destination network namespace.
7926	 */
7927	err = -EEXIST;
7928	if (__dev_get_by_name(net, dev->name)) {
7929		/* We get here if we can't use the current device name */
7930		if (!pat)
7931			goto out;
7932		if (dev_get_valid_name(net, dev, pat) < 0)
7933			goto out;
7934	}
7935
7936	/*
7937	 * And now a mini version of register_netdevice unregister_netdevice.
7938	 */
7939
7940	/* If device is running close it first. */
7941	dev_close(dev);
7942
7943	/* And unlink it from device chain */
7944	err = -ENODEV;
7945	unlist_netdevice(dev);
7946
7947	synchronize_net();
7948
7949	/* Shutdown queueing discipline. */
7950	dev_shutdown(dev);
7951
7952	/* Notify protocols, that we are about to destroy
7953	   this device. They should clean all the things.
7954
7955	   Note that dev->reg_state stays at NETREG_REGISTERED.
7956	   This is wanted because this way 8021q and macvlan know
7957	   the device is just moving and can keep their slaves up.
7958	*/
7959	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7960	rcu_barrier();
7961	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7962	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7963
7964	/*
7965	 *	Flush the unicast and multicast chains
7966	 */
7967	dev_uc_flush(dev);
7968	dev_mc_flush(dev);
7969
7970	/* Send a netdev-removed uevent to the old namespace */
7971	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7972	netdev_adjacent_del_links(dev);
7973
7974	/* Actually switch the network namespace */
7975	dev_net_set(dev, net);
7976
7977	/* If there is an ifindex conflict assign a new one */
7978	if (__dev_get_by_index(net, dev->ifindex))
7979		dev->ifindex = dev_new_index(net);
7980
7981	/* Send a netdev-add uevent to the new namespace */
7982	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7983	netdev_adjacent_add_links(dev);
7984
7985	/* Fixup kobjects */
7986	err = device_rename(&dev->dev, dev->name);
7987	WARN_ON(err);
7988
7989	/* Add the device back in the hashes */
7990	list_netdevice(dev);
7991
7992	/* Notify protocols, that a new device appeared. */
7993	call_netdevice_notifiers(NETDEV_REGISTER, dev);
7994
7995	/*
7996	 *	Prevent userspace races by waiting until the network
7997	 *	device is fully setup before sending notifications.
7998	 */
7999	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
8000
8001	synchronize_net();
8002	err = 0;
8003out:
8004	return err;
8005}
8006EXPORT_SYMBOL_GPL(dev_change_net_namespace);
8007
8008static int dev_cpu_dead(unsigned int oldcpu)
8009{
8010	struct sk_buff **list_skb;
8011	struct sk_buff *skb;
8012	unsigned int cpu;
8013	struct softnet_data *sd, *oldsd;
8014
8015	local_irq_disable();
8016	cpu = smp_processor_id();
8017	sd = &per_cpu(softnet_data, cpu);
8018	oldsd = &per_cpu(softnet_data, oldcpu);
8019
8020	/* Find end of our completion_queue. */
8021	list_skb = &sd->completion_queue;
8022	while (*list_skb)
8023		list_skb = &(*list_skb)->next;
8024	/* Append completion queue from offline CPU. */
8025	*list_skb = oldsd->completion_queue;
8026	oldsd->completion_queue = NULL;
8027
8028	/* Append output queue from offline CPU. */
8029	if (oldsd->output_queue) {
8030		*sd->output_queue_tailp = oldsd->output_queue;
8031		sd->output_queue_tailp = oldsd->output_queue_tailp;
8032		oldsd->output_queue = NULL;
8033		oldsd->output_queue_tailp = &oldsd->output_queue;
8034	}
8035	/* Append NAPI poll list from offline CPU, with one exception :
8036	 * process_backlog() must be called by cpu owning percpu backlog.
8037	 * We properly handle process_queue & input_pkt_queue later.
8038	 */
8039	while (!list_empty(&oldsd->poll_list)) {
8040		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
8041							    struct napi_struct,
8042							    poll_list);
8043
8044		list_del_init(&napi->poll_list);
8045		if (napi->poll == process_backlog)
8046			napi->state = 0;
8047		else
8048			____napi_schedule(sd, napi);
8049	}
8050
8051	raise_softirq_irqoff(NET_TX_SOFTIRQ);
8052	local_irq_enable();
8053
8054	/* Process offline CPU's input_pkt_queue */
8055	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
8056		netif_rx_ni(skb);
8057		input_queue_head_incr(oldsd);
8058	}
8059	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
8060		netif_rx_ni(skb);
8061		input_queue_head_incr(oldsd);
8062	}
8063
8064	return 0;
8065}
8066
8067/**
8068 *	netdev_increment_features - increment feature set by one
8069 *	@all: current feature set
8070 *	@one: new feature set
8071 *	@mask: mask feature set
8072 *
8073 *	Computes a new feature set after adding a device with feature set
8074 *	@one to the master device with current feature set @all.  Will not
8075 *	enable anything that is off in @mask. Returns the new feature set.
8076 */
8077netdev_features_t netdev_increment_features(netdev_features_t all,
8078	netdev_features_t one, netdev_features_t mask)
8079{
8080	if (mask & NETIF_F_HW_CSUM)
8081		mask |= NETIF_F_CSUM_MASK;
8082	mask |= NETIF_F_VLAN_CHALLENGED;
8083
8084	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
8085	all &= one | ~NETIF_F_ALL_FOR_ALL;
8086
8087	/* If one device supports hw checksumming, set for all. */
8088	if (all & NETIF_F_HW_CSUM)
8089		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
8090
8091	return all;
8092}
8093EXPORT_SYMBOL(netdev_increment_features);
8094
8095static struct hlist_head * __net_init netdev_create_hash(void)
8096{
8097	int i;
8098	struct hlist_head *hash;
8099
8100	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
8101	if (hash != NULL)
8102		for (i = 0; i < NETDEV_HASHENTRIES; i++)
8103			INIT_HLIST_HEAD(&hash[i]);
8104
8105	return hash;
8106}
8107
8108/* Initialize per network namespace state */
8109static int __net_init netdev_init(struct net *net)
8110{
8111	if (net != &init_net)
8112		INIT_LIST_HEAD(&net->dev_base_head);
8113
8114	net->dev_name_head = netdev_create_hash();
8115	if (net->dev_name_head == NULL)
8116		goto err_name;
8117
8118	net->dev_index_head = netdev_create_hash();
8119	if (net->dev_index_head == NULL)
8120		goto err_idx;
8121
8122	return 0;
8123
8124err_idx:
8125	kfree(net->dev_name_head);
8126err_name:
8127	return -ENOMEM;
8128}
8129
8130/**
8131 *	netdev_drivername - network driver for the device
8132 *	@dev: network device
8133 *
8134 *	Determine network driver for device.
8135 */
8136const char *netdev_drivername(const struct net_device *dev)
8137{
8138	const struct device_driver *driver;
8139	const struct device *parent;
8140	const char *empty = "";
8141
8142	parent = dev->dev.parent;
8143	if (!parent)
8144		return empty;
8145
8146	driver = parent->driver;
8147	if (driver && driver->name)
8148		return driver->name;
8149	return empty;
8150}
8151
8152static void __netdev_printk(const char *level, const struct net_device *dev,
8153			    struct va_format *vaf)
8154{
8155	if (dev && dev->dev.parent) {
8156		dev_printk_emit(level[1] - '0',
8157				dev->dev.parent,
8158				"%s %s %s%s: %pV",
8159				dev_driver_string(dev->dev.parent),
8160				dev_name(dev->dev.parent),
8161				netdev_name(dev), netdev_reg_state(dev),
8162				vaf);
8163	} else if (dev) {
8164		printk("%s%s%s: %pV",
8165		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
8166	} else {
8167		printk("%s(NULL net_device): %pV", level, vaf);
8168	}
8169}
8170
8171void netdev_printk(const char *level, const struct net_device *dev,
8172		   const char *format, ...)
8173{
8174	struct va_format vaf;
8175	va_list args;
8176
8177	va_start(args, format);
8178
8179	vaf.fmt = format;
8180	vaf.va = &args;
8181
8182	__netdev_printk(level, dev, &vaf);
8183
8184	va_end(args);
8185}
8186EXPORT_SYMBOL(netdev_printk);
8187
8188#define define_netdev_printk_level(func, level)			\
8189void func(const struct net_device *dev, const char *fmt, ...)	\
8190{								\
8191	struct va_format vaf;					\
8192	va_list args;						\
8193								\
8194	va_start(args, fmt);					\
8195								\
8196	vaf.fmt = fmt;						\
8197	vaf.va = &args;						\
8198								\
8199	__netdev_printk(level, dev, &vaf);			\
8200								\
8201	va_end(args);						\
8202}								\
8203EXPORT_SYMBOL(func);
8204
8205define_netdev_printk_level(netdev_emerg, KERN_EMERG);
8206define_netdev_printk_level(netdev_alert, KERN_ALERT);
8207define_netdev_printk_level(netdev_crit, KERN_CRIT);
8208define_netdev_printk_level(netdev_err, KERN_ERR);
8209define_netdev_printk_level(netdev_warn, KERN_WARNING);
8210define_netdev_printk_level(netdev_notice, KERN_NOTICE);
8211define_netdev_printk_level(netdev_info, KERN_INFO);
8212
8213static void __net_exit netdev_exit(struct net *net)
8214{
8215	kfree(net->dev_name_head);
8216	kfree(net->dev_index_head);
8217}
8218
8219static struct pernet_operations __net_initdata netdev_net_ops = {
8220	.init = netdev_init,
8221	.exit = netdev_exit,
8222};
8223
8224static void __net_exit default_device_exit(struct net *net)
8225{
8226	struct net_device *dev, *aux;
8227	/*
8228	 * Push all migratable network devices back to the
8229	 * initial network namespace
8230	 */
8231	rtnl_lock();
8232	for_each_netdev_safe(net, dev, aux) {
8233		int err;
8234		char fb_name[IFNAMSIZ];
8235
8236		/* Ignore unmoveable devices (i.e. loopback) */
8237		if (dev->features & NETIF_F_NETNS_LOCAL)
8238			continue;
8239
8240		/* Leave virtual devices for the generic cleanup */
8241		if (dev->rtnl_link_ops)
8242			continue;
8243
8244		/* Push remaining network devices to init_net */
8245		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
8246		err = dev_change_net_namespace(dev, &init_net, fb_name);
8247		if (err) {
8248			pr_emerg("%s: failed to move %s to init_net: %d\n",
8249				 __func__, dev->name, err);
8250			BUG();
8251		}
8252	}
8253	rtnl_unlock();
8254}
8255
8256static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
8257{
8258	/* Return with the rtnl_lock held when there are no network
8259	 * devices unregistering in any network namespace in net_list.
8260	 */
8261	struct net *net;
8262	bool unregistering;
8263	DEFINE_WAIT_FUNC(wait, woken_wake_function);
8264
8265	add_wait_queue(&netdev_unregistering_wq, &wait);
8266	for (;;) {
8267		unregistering = false;
8268		rtnl_lock();
8269		list_for_each_entry(net, net_list, exit_list) {
8270			if (net->dev_unreg_count > 0) {
8271				unregistering = true;
8272				break;
8273			}
8274		}
8275		if (!unregistering)
8276			break;
8277		__rtnl_unlock();
8278
8279		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
8280	}
8281	remove_wait_queue(&netdev_unregistering_wq, &wait);
8282}
8283
8284static void __net_exit default_device_exit_batch(struct list_head *net_list)
8285{
8286	/* At exit all network devices most be removed from a network
8287	 * namespace.  Do this in the reverse order of registration.
8288	 * Do this across as many network namespaces as possible to
8289	 * improve batching efficiency.
8290	 */
8291	struct net_device *dev;
8292	struct net *net;
8293	LIST_HEAD(dev_kill_list);
8294
8295	/* To prevent network device cleanup code from dereferencing
8296	 * loopback devices or network devices that have been freed
8297	 * wait here for all pending unregistrations to complete,
8298	 * before unregistring the loopback device and allowing the
8299	 * network namespace be freed.
8300	 *
8301	 * The netdev todo list containing all network devices
8302	 * unregistrations that happen in default_device_exit_batch
8303	 * will run in the rtnl_unlock() at the end of
8304	 * default_device_exit_batch.
8305	 */
8306	rtnl_lock_unregistering(net_list);
8307	list_for_each_entry(net, net_list, exit_list) {
8308		for_each_netdev_reverse(net, dev) {
8309			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
8310				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
8311			else
8312				unregister_netdevice_queue(dev, &dev_kill_list);
8313		}
8314	}
8315	unregister_netdevice_many(&dev_kill_list);
8316	rtnl_unlock();
8317}
8318
8319static struct pernet_operations __net_initdata default_device_ops = {
8320	.exit = default_device_exit,
8321	.exit_batch = default_device_exit_batch,
8322};
8323
8324/*
8325 *	Initialize the DEV module. At boot time this walks the device list and
8326 *	unhooks any devices that fail to initialise (normally hardware not
8327 *	present) and leaves us with a valid list of present and active devices.
8328 *
8329 */
8330
8331/*
8332 *       This is called single threaded during boot, so no need
8333 *       to take the rtnl semaphore.
8334 */
8335static int __init net_dev_init(void)
8336{
8337	int i, rc = -ENOMEM;
8338
8339	BUG_ON(!dev_boot_phase);
8340
8341	if (dev_proc_init())
8342		goto out;
8343
8344	if (netdev_kobject_init())
8345		goto out;
8346
8347	INIT_LIST_HEAD(&ptype_all);
8348	for (i = 0; i < PTYPE_HASH_SIZE; i++)
8349		INIT_LIST_HEAD(&ptype_base[i]);
8350
8351	INIT_LIST_HEAD(&offload_base);
8352
8353	if (register_pernet_subsys(&netdev_net_ops))
8354		goto out;
8355
8356	/*
8357	 *	Initialise the packet receive queues.
8358	 */
8359
8360	for_each_possible_cpu(i) {
8361		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
8362		struct softnet_data *sd = &per_cpu(softnet_data, i);
8363
8364		INIT_WORK(flush, flush_backlog);
8365
8366		skb_queue_head_init(&sd->input_pkt_queue);
8367		skb_queue_head_init(&sd->process_queue);
8368		INIT_LIST_HEAD(&sd->poll_list);
8369		sd->output_queue_tailp = &sd->output_queue;
8370#ifdef CONFIG_RPS
8371		sd->csd.func = rps_trigger_softirq;
8372		sd->csd.info = sd;
8373		sd->cpu = i;
8374#endif
8375
8376		sd->backlog.poll = process_backlog;
8377		sd->backlog.weight = weight_p;
8378	}
8379
8380	dev_boot_phase = 0;
8381
8382	/* The loopback device is special if any other network devices
8383	 * is present in a network namespace the loopback device must
8384	 * be present. Since we now dynamically allocate and free the
8385	 * loopback device ensure this invariant is maintained by
8386	 * keeping the loopback device as the first device on the
8387	 * list of network devices.  Ensuring the loopback devices
8388	 * is the first device that appears and the last network device
8389	 * that disappears.
8390	 */
8391	if (register_pernet_device(&loopback_net_ops))
8392		goto out;
8393
8394	if (register_pernet_device(&default_device_ops))
8395		goto out;
8396
8397	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8398	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
8399
8400	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
8401				       NULL, dev_cpu_dead);
8402	WARN_ON(rc < 0);
8403	dst_subsys_init();
8404	rc = 0;
8405out:
8406	return rc;
8407}
8408
8409subsys_initcall(net_dev_init);