net/core/dev.c at v3.16 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / core / dev.c
at v3.16 183 kB view raw
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <linux/bitops.h>
  77#include <linux/capability.h>
  78#include <linux/cpu.h>
  79#include <linux/types.h>
  80#include <linux/kernel.h>
  81#include <linux/hash.h>
  82#include <linux/slab.h>
  83#include <linux/sched.h>
  84#include <linux/mutex.h>
  85#include <linux/string.h>
  86#include <linux/mm.h>
  87#include <linux/socket.h>
  88#include <linux/sockios.h>
  89#include <linux/errno.h>
  90#include <linux/interrupt.h>
  91#include <linux/if_ether.h>
  92#include <linux/netdevice.h>
  93#include <linux/etherdevice.h>
  94#include <linux/ethtool.h>
  95#include <linux/notifier.h>
  96#include <linux/skbuff.h>
  97#include <net/net_namespace.h>
  98#include <net/sock.h>
  99#include <linux/rtnetlink.h>
 100#include <linux/stat.h>
 101#include <net/dst.h>
 102#include <net/pkt_sched.h>
 103#include <net/checksum.h>
 104#include <net/xfrm.h>
 105#include <linux/highmem.h>
 106#include <linux/init.h>
 107#include <linux/module.h>
 108#include <linux/netpoll.h>
 109#include <linux/rcupdate.h>
 110#include <linux/delay.h>
 111#include <net/iw_handler.h>
 112#include <asm/current.h>
 113#include <linux/audit.h>
 114#include <linux/dmaengine.h>
 115#include <linux/err.h>
 116#include <linux/ctype.h>
 117#include <linux/if_arp.h>
 118#include <linux/if_vlan.h>
 119#include <linux/ip.h>
 120#include <net/ip.h>
 121#include <linux/ipv6.h>
 122#include <linux/in.h>
 123#include <linux/jhash.h>
 124#include <linux/random.h>
 125#include <trace/events/napi.h>
 126#include <trace/events/net.h>
 127#include <trace/events/skb.h>
 128#include <linux/pci.h>
 129#include <linux/inetdevice.h>
 130#include <linux/cpu_rmap.h>
 131#include <linux/static_key.h>
 132#include <linux/hashtable.h>
 133#include <linux/vmalloc.h>
 134#include <linux/if_macvlan.h>
 135
 136#include "net-sysfs.h"
 137
 138/* Instead of increasing this, you should create a hash table. */
 139#define MAX_GRO_SKBS 8
 140
 141/* This should be increased if a protocol with a bigger head is added. */
 142#define GRO_MAX_HEAD (MAX_HEADER + 128)
 143
/* Taken by dev_add_pack()/__dev_remove_pack() around ptype list updates. */
 144static DEFINE_SPINLOCK(ptype_lock);
/* Taken by dev_add_offload()/__dev_remove_offload() around offload_base updates. */
 145static DEFINE_SPINLOCK(offload_lock);
/* Packet handlers, bucketed by ntohs(type) & PTYPE_HASH_MASK (see ptype_head()). */
 146struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 147struct list_head ptype_all __read_mostly;	/* Taps: handlers whose type is ETH_P_ALL */
/* Offload handlers linked in by dev_add_offload(). */
 148static struct list_head offload_base __read_mostly;
 149
/* Forward declarations of static helpers that are used before their definitions. */
 150static int netif_rx_internal(struct sk_buff *skb);
 151static int call_netdevice_notifiers_info(unsigned long val,
 152					 struct net_device *dev,
 153					 struct netdev_notifier_info *info);
 154
 155/*
 156 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 157 * semaphore.
 158 *
 159 * Pure readers hold dev_base_lock for reading, or rcu_read_lock().
 160 *
 161 * Writers must hold the rtnl semaphore while they loop through the
 162 * dev_base_head list, and hold dev_base_lock for writing when they do the
 163 * actual updates.  This allows pure readers to access the list even
 164 * while a writer is preparing to update it.
 165 *
 166 * To put it another way, dev_base_lock is held for writing only to
 167 * protect against pure readers; the rtnl semaphore provides the
 168 * protection against other writers.
 169 *
 170 * For example usages, see register_netdevice() and
 171 * unregister_netdevice(), which must be called with the rtnl
 172 * semaphore held.
 173 */
 174DEFINE_RWLOCK(dev_base_lock);
 175EXPORT_SYMBOL(dev_base_lock);
 176
 177/* protects napi_hash addition/deletion and napi_gen_id */
 178static DEFINE_SPINLOCK(napi_hash_lock);
 179
 180static unsigned int napi_gen_id;
 181static DEFINE_HASHTABLE(napi_hash, 8);
 182
/*
 * Sequence counter sampled by netdev_get_name(), which retries its name
 * copy when the counter changed underneath it.  The writer side is not
 * in this part of the file; presumably the device rename path - TODO confirm.
 */
 183static seqcount_t devnet_rename_seq;
 184
 185static inline void dev_base_seq_inc(struct net *net)
 186{
 187	while (++net->dev_base_seq == 0);
 188}
 189
 190static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 191{
 192	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 193
 194	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 195}
 196
 197static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 198{
 199	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 200}
 201
 /*
 * Take the spinlock of @sd's input packet queue.  Compiles to nothing
 * when CONFIG_RPS is not set.
 */
#ifdef CONFIG_RPS
static inline void rps_lock(struct softnet_data *sd)
{
	spin_lock(&sd->input_pkt_queue.lock);
}
#else
static inline void rps_lock(struct softnet_data *sd)
{
}
#endif
 208
 /*
 * Release the spinlock of @sd's input packet queue; counterpart of
 * rps_lock().  Compiles to nothing when CONFIG_RPS is not set.
 */
#ifdef CONFIG_RPS
static inline void rps_unlock(struct softnet_data *sd)
{
	spin_unlock(&sd->input_pkt_queue.lock);
}
#else
static inline void rps_unlock(struct softnet_data *sd)
{
}
#endif
 215
 216/* Device list insertion */
 217static void list_netdevice(struct net_device *dev)
 218{
 219	struct net *net = dev_net(dev);
 220
 221	ASSERT_RTNL();
 222
 223	write_lock_bh(&dev_base_lock);
 224	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 225	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 226	hlist_add_head_rcu(&dev->index_hlist,
 227			   dev_index_hash(net, dev->ifindex));
 228	write_unlock_bh(&dev_base_lock);
 229
 230	dev_base_seq_inc(net);
 231}
 232
 233/* Device list removal
 234 * caller must respect a RCU grace period before freeing/reusing dev
 235 */
 236static void unlist_netdevice(struct net_device *dev)
 237{
 238	ASSERT_RTNL();
 239
 240	/* Unlink dev from the device chain */
 241	write_lock_bh(&dev_base_lock);
 242	list_del_rcu(&dev->dev_list);
 243	hlist_del_rcu(&dev->name_hlist);
 244	hlist_del_rcu(&dev->index_hlist);
 245	write_unlock_bh(&dev_base_lock);
 246
 247	dev_base_seq_inc(dev_net(dev));
 248}
 249
 250/*
 251 *	Our notifier list: head of the raw notifier chain named netdev_chain.
 252 */
 253
 254static RAW_NOTIFIER_HEAD(netdev_chain);
 255
 256/*
 257 *	Device drivers call our routines to queue packets here. We empty the
 258 *	queue in the local softnet handler.  One instance exists per CPU.
 259 */
 260
 261DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 262EXPORT_PER_CPU_SYMBOL(softnet_data);
 263
 264#ifdef CONFIG_LOCKDEP
 265/*
 266 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 267 * according to dev->type
 268 */
 269static const unsigned short netdev_lock_type[] =
 270	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 271	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 272	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 273	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 274	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 275	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 276	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 277	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 278	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 279	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 280	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 281	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 282	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 283	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 284	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 285
 286static const char *const netdev_lock_name[] =
 287	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 288	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 289	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 290	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 291	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 292	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 293	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 294	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 295	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 296	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 297	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 298	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 299	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 300	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 301	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 302
 303static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 304static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 305
 306static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 307{
 308	int i;
 309
 310	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 311		if (netdev_lock_type[i] == dev_type)
 312			return i;
 313	/* the last key is used by default */
 314	return ARRAY_SIZE(netdev_lock_type) - 1;
 315}
 316
 317static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 318						 unsigned short dev_type)
 319{
 320	int i;
 321
 322	i = netdev_lock_pos(dev_type);
 323	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 324				   netdev_lock_name[i]);
 325}
 326
 327static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 328{
 329	int i;
 330
 331	i = netdev_lock_pos(dev->type);
 332	lockdep_set_class_and_name(&dev->addr_list_lock,
 333				   &netdev_addr_lock_key[i],
 334				   netdev_lock_name[i]);
 335}
 336#else
 337static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 338						 unsigned short dev_type)
 339{
 340}
 341static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 342{
 343}
 344#endif
 345
 346/*******************************************************************************
 347
 348		Protocol management and registration routines
 349
 350*******************************************************************************/
 351
 352/*
 353 *	Add a protocol ID to the list. Now that the input handler is
 354 *	smarter we can dispense with all the messy stuff that used to be
 355 *	here.
 356 *
 357 *	BEWARE!!! Protocol handlers that mangle input packets
 358 *	MUST BE last in their hash buckets, and the check of protocol
 359 *	handlers MUST start from the promiscuous ptype_all chain in net_bh.
 360 *	This is true now; do not change it.
 361 *	Explanation: if a protocol handler that mangles packets were
 362 *	first on the list, it could not tell that the packet is cloned
 363 *	and must be copied on write, so it would modify the packet in
 364 *	place and subsequent readers would get a broken packet.
 365 *							--ANK (980803)
 366 */
 367
 368static inline struct list_head *ptype_head(const struct packet_type *pt)
 369{
 370	if (pt->type == htons(ETH_P_ALL))
 371		return &ptype_all;
 372	else
 373		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 374}
 375
 376/**
 377 *	dev_add_pack - add packet handler
 378 *	@pt: packet type declaration
 379 *
 380 *	Add a protocol handler to the networking stack. The passed &packet_type
 381 *	is linked into kernel lists and may not be freed until it has been
 382 *	removed from the kernel lists.
 383 *
 384 *	This call does not sleep therefore it can not
 385 *	guarantee all CPU's that are in middle of receiving packets
 386 *	will see the new packet type (until the next received packet).
 387 */
 388
 389void dev_add_pack(struct packet_type *pt)
 390{
 391	struct list_head *head = ptype_head(pt);
 392
 393	spin_lock(&ptype_lock);
 394	list_add_rcu(&pt->list, head);
 395	spin_unlock(&ptype_lock);
 396}
 397EXPORT_SYMBOL(dev_add_pack);
 398
 399/**
 400 *	__dev_remove_pack	 - remove packet handler
 401 *	@pt: packet type declaration
 402 *
 403 *	Remove a protocol handler that was previously added to the kernel
 404 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 405 *	from the kernel lists and can be freed or reused once this function
 406 *	returns.
 407 *
 408 *      The packet type might still be in use by receivers
 409 *	and must not be freed until after all the CPU's have gone
 410 *	through a quiescent state.
 411 */
 412void __dev_remove_pack(struct packet_type *pt)
 413{
 414	struct list_head *head = ptype_head(pt);
 415	struct packet_type *pt1;
 416
 417	spin_lock(&ptype_lock);
 418
 419	list_for_each_entry(pt1, head, list) {
 420		if (pt == pt1) {
 421			list_del_rcu(&pt->list);
 422			goto out;
 423		}
 424	}
 425
 426	pr_warn("dev_remove_pack: %p not found\n", pt);
 427out:
 428	spin_unlock(&ptype_lock);
 429}
 430EXPORT_SYMBOL(__dev_remove_pack);
 431
 /**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Unlink a protocol handler that was previously added with
 *	dev_add_pack(), then wait via synchronize_net().  @pt can be
 *	freed or reused once this function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	/* Unlink first ... */
	__dev_remove_pack(pt);

	/* ... then wait until no receiver can still reference it. */
	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
 451
 452
 453/**
 454 *	dev_add_offload - register offload handlers
 455 *	@po: protocol offload declaration
 456 *
 457 *	Add protocol offload handlers to the networking stack. The passed
 458 *	&proto_offload is linked into kernel lists and may not be freed until
 459 *	it has been removed from the kernel lists.
 460 *
 461 *	This call does not sleep therefore it can not
 462 *	guarantee all CPU's that are in middle of receiving packets
 463 *	will see the new offload handlers (until the next received packet).
 464 */
 465void dev_add_offload(struct packet_offload *po)
 466{
 467	struct list_head *head = &offload_base;
 468
 469	spin_lock(&offload_lock);
 470	list_add_rcu(&po->list, head);
 471	spin_unlock(&offload_lock);
 472}
 473EXPORT_SYMBOL(dev_add_offload);
 474
 475/**
 476 *	__dev_remove_offload	 - remove offload handler
 477 *	@po: packet offload declaration
 478 *
 479 *	Remove a protocol offload handler that was previously added to the
 480 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 481 *	is removed from the kernel lists and can be freed or reused once this
 482 *	function returns.
 483 *
 484 *      The packet type might still be in use by receivers
 485 *	and must not be freed until after all the CPU's have gone
 486 *	through a quiescent state.
 487 */
 488static void __dev_remove_offload(struct packet_offload *po)
 489{
 490	struct list_head *head = &offload_base;
 491	struct packet_offload *po1;
 492
 493	spin_lock(&offload_lock);
 494
 495	list_for_each_entry(po1, head, list) {
 496		if (po == po1) {
 497			list_del_rcu(&po->list);
 498			goto out;
 499		}
 500	}
 501
 502	pr_warn("dev_remove_offload: %p not found\n", po);
 503out:
 504	spin_unlock(&offload_lock);
 505}
 506
 /**
 *	dev_remove_offload	 - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Unlink a packet offload handler that was previously added with
 *	dev_add_offload(), then wait via synchronize_net().  @po can be
 *	freed or reused once this function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the
 *	offload handler after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	/* Unlink first ... */
	__dev_remove_offload(po);

	/* ... then wait until no receiver can still reference it. */
	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);
 526
 527/******************************************************************************
 528
 529		      Device Boot-time Settings Routines
 530
 531*******************************************************************************/
 532
 533/*
 * Boot time configuration table.  A slot whose name starts with '\0' or
 * ' ' is treated as free by netdev_boot_setup_add() and skipped by
 * netdev_boot_setup_check().
 */
 534static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 535
 536/**
 537 *	netdev_boot_setup_add	- add new setup entry
 538 *	@name: name of the device
 539 *	@map: configured settings for the device
 540 *
 541 *	Adds new setup entry to the dev_boot_setup list.  The function
 542 *	returns 0 on error and 1 on success.  This is a generic routine to
 543 *	all netdevices.
 544 */
 545static int netdev_boot_setup_add(char *name, struct ifmap *map)
 546{
 547	struct netdev_boot_setup *s;
 548	int i;
 549
 550	s = dev_boot_setup;
 551	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 552		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 553			memset(s[i].name, 0, sizeof(s[i].name));
 554			strlcpy(s[i].name, name, IFNAMSIZ);
 555			memcpy(&s[i].map, map, sizeof(s[i].map));
 556			break;
 557		}
 558	}
 559
 560	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 561}
 562
 563/**
 564 *	netdev_boot_setup_check	- check boot time settings
 565 *	@dev: the netdevice
 566 *
 567 * 	Check boot time settings for the device.
 568 *	The found settings are set for the device to be used
 569 *	later in the device probing.
 570 *	Returns 0 if no settings found, 1 if they are.
 571 */
 572int netdev_boot_setup_check(struct net_device *dev)
 573{
 574	struct netdev_boot_setup *s = dev_boot_setup;
 575	int i;
 576
 577	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 578		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 579		    !strcmp(dev->name, s[i].name)) {
 580			dev->irq 	= s[i].map.irq;
 581			dev->base_addr 	= s[i].map.base_addr;
 582			dev->mem_start 	= s[i].map.mem_start;
 583			dev->mem_end 	= s[i].map.mem_end;
 584			return 1;
 585		}
 586	}
 587	return 0;
 588}
 589EXPORT_SYMBOL(netdev_boot_setup_check);
 590
 591
 592/**
 593 *	netdev_boot_base	- get address from boot time settings
 594 *	@prefix: prefix for network device
 595 *	@unit: id for network device
 596 *
 597 * 	Check boot time settings for the base address of device.
 598 *	The found settings are set for the device to be used
 599 *	later in the device probing.
 600 *	Returns 0 if no settings found.
 601 */
 602unsigned long netdev_boot_base(const char *prefix, int unit)
 603{
 604	const struct netdev_boot_setup *s = dev_boot_setup;
 605	char name[IFNAMSIZ];
 606	int i;
 607
 608	sprintf(name, "%s%d", prefix, unit);
 609
 610	/*
 611	 * If device already registered then return base of 1
 612	 * to indicate not to probe for this interface
 613	 */
 614	if (__dev_get_by_name(&init_net, name))
 615		return 1;
 616
 617	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 618		if (!strcmp(name, s[i].name))
 619			return s[i].map.base_addr;
 620	return 0;
 621}
 622
 623/*
 624 * Saves at boot time configured settings for any netdevice.
 625 */
 626int __init netdev_boot_setup(char *str)
 627{
 628	int ints[5];
 629	struct ifmap map;
 630
 631	str = get_options(str, ARRAY_SIZE(ints), ints);
 632	if (!str || !*str)
 633		return 0;
 634
 635	/* Save settings */
 636	memset(&map, 0, sizeof(map));
 637	if (ints[0] > 0)
 638		map.irq = ints[1];
 639	if (ints[0] > 1)
 640		map.base_addr = ints[2];
 641	if (ints[0] > 2)
 642		map.mem_start = ints[3];
 643	if (ints[0] > 3)
 644		map.mem_end = ints[4];
 645
 646	/* Add new entry to the list */
 647	return netdev_boot_setup_add(str, &map);
 648}
 649
 650__setup("netdev=", netdev_boot_setup);
 651
 652/*******************************************************************************
 653
 654			    Device Interface Subroutines
 655
 656*******************************************************************************/
 657
 658/**
 659 *	__dev_get_by_name	- find a device by its name
 660 *	@net: the applicable net namespace
 661 *	@name: name to find
 662 *
 663 *	Find an interface by name. Must be called under RTNL semaphore
 664 *	or @dev_base_lock. If the name is found a pointer to the device
 665 *	is returned. If the name is not found then %NULL is returned. The
 666 *	reference counters are not incremented so the caller must be
 667 *	careful with locks.
 668 */
 669
 670struct net_device *__dev_get_by_name(struct net *net, const char *name)
 671{
 672	struct net_device *dev;
 673	struct hlist_head *head = dev_name_hash(net, name);
 674
 675	hlist_for_each_entry(dev, head, name_hlist)
 676		if (!strncmp(dev->name, name, IFNAMSIZ))
 677			return dev;
 678
 679	return NULL;
 680}
 681EXPORT_SYMBOL(__dev_get_by_name);
 682
 683/**
 684 *	dev_get_by_name_rcu	- find a device by its name
 685 *	@net: the applicable net namespace
 686 *	@name: name to find
 687 *
 688 *	Find an interface by name.
 689 *	If the name is found a pointer to the device is returned.
 690 * 	If the name is not found then %NULL is returned.
 691 *	The reference counters are not incremented so the caller must be
 692 *	careful with locks. The caller must hold RCU lock.
 693 */
 694
 695struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 696{
 697	struct net_device *dev;
 698	struct hlist_head *head = dev_name_hash(net, name);
 699
 700	hlist_for_each_entry_rcu(dev, head, name_hlist)
 701		if (!strncmp(dev->name, name, IFNAMSIZ))
 702			return dev;
 703
 704	return NULL;
 705}
 706EXPORT_SYMBOL(dev_get_by_name_rcu);
 707
 708/**
 709 *	dev_get_by_name		- find a device by its name
 710 *	@net: the applicable net namespace
 711 *	@name: name to find
 712 *
 713 *	Find an interface by name. This can be called from any
 714 *	context and does its own locking. The returned handle has
 715 *	the usage count incremented and the caller must use dev_put() to
 716 *	release it when it is no longer needed. %NULL is returned if no
 717 *	matching device is found.
 718 */
 719
 720struct net_device *dev_get_by_name(struct net *net, const char *name)
 721{
 722	struct net_device *dev;
 723
 724	rcu_read_lock();
 725	dev = dev_get_by_name_rcu(net, name);
 726	if (dev)
 727		dev_hold(dev);
 728	rcu_read_unlock();
 729	return dev;
 730}
 731EXPORT_SYMBOL(dev_get_by_name);
 732
 733/**
 734 *	__dev_get_by_index - find a device by its ifindex
 735 *	@net: the applicable net namespace
 736 *	@ifindex: index of device
 737 *
 738 *	Search for an interface by index. Returns %NULL if the device
 739 *	is not found or a pointer to the device. The device has not
 740 *	had its reference counter increased so the caller must be careful
 741 *	about locking. The caller must hold either the RTNL semaphore
 742 *	or @dev_base_lock.
 743 */
 744
 745struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 746{
 747	struct net_device *dev;
 748	struct hlist_head *head = dev_index_hash(net, ifindex);
 749
 750	hlist_for_each_entry(dev, head, index_hlist)
 751		if (dev->ifindex == ifindex)
 752			return dev;
 753
 754	return NULL;
 755}
 756EXPORT_SYMBOL(__dev_get_by_index);
 757
 758/**
 759 *	dev_get_by_index_rcu - find a device by its ifindex
 760 *	@net: the applicable net namespace
 761 *	@ifindex: index of device
 762 *
 763 *	Search for an interface by index. Returns %NULL if the device
 764 *	is not found or a pointer to the device. The device has not
 765 *	had its reference counter increased so the caller must be careful
 766 *	about locking. The caller must hold RCU lock.
 767 */
 768
 769struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 770{
 771	struct net_device *dev;
 772	struct hlist_head *head = dev_index_hash(net, ifindex);
 773
 774	hlist_for_each_entry_rcu(dev, head, index_hlist)
 775		if (dev->ifindex == ifindex)
 776			return dev;
 777
 778	return NULL;
 779}
 780EXPORT_SYMBOL(dev_get_by_index_rcu);
 781
 782
 783/**
 784 *	dev_get_by_index - find a device by its ifindex
 785 *	@net: the applicable net namespace
 786 *	@ifindex: index of device
 787 *
 788 *	Search for an interface by index. Returns NULL if the device
 789 *	is not found or a pointer to the device. The device returned has
 790 *	had a reference added and the pointer is safe until the user calls
 791 *	dev_put to indicate they have finished with it.
 792 */
 793
 794struct net_device *dev_get_by_index(struct net *net, int ifindex)
 795{
 796	struct net_device *dev;
 797
 798	rcu_read_lock();
 799	dev = dev_get_by_index_rcu(net, ifindex);
 800	if (dev)
 801		dev_hold(dev);
 802	rcu_read_unlock();
 803	return dev;
 804}
 805EXPORT_SYMBOL(dev_get_by_index);
 806
 807/**
 808 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 809 *	@net: network namespace
 810 *	@name: a pointer to the buffer where the name will be stored.
 811 *	@ifindex: the ifindex of the interface to get the name from.
 812 *
 813 *	The use of raw_seqcount_begin() and cond_resched() before
 814 *	retrying is required as we want to give the writers a chance
 815 *	to complete when CONFIG_PREEMPT is not set.
 816 */
 817int netdev_get_name(struct net *net, char *name, int ifindex)
 818{
 819	struct net_device *dev;
 820	unsigned int seq;
 821
 822retry:
 823	seq = raw_seqcount_begin(&devnet_rename_seq);
 824	rcu_read_lock();
 825	dev = dev_get_by_index_rcu(net, ifindex);
 826	if (!dev) {
 827		rcu_read_unlock();
 828		return -ENODEV;
 829	}
 830
 831	strcpy(name, dev->name);
 832	rcu_read_unlock();
 833	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 834		cond_resched();
 835		goto retry;
 836	}
 837
 838	return 0;
 839}
 840
 841/**
 842 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 843 *	@net: the applicable net namespace
 844 *	@type: media type of device
 845 *	@ha: hardware address
 846 *
 847 *	Search for an interface by MAC address. Returns NULL if the device
 848 *	is not found or a pointer to the device.
 849 *	The caller must hold RCU or RTNL.
 850 *	The returned device has not had its ref count increased
 851 *	and the caller must therefore be careful about locking
 852 *
 853 */
 854
 855struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 856				       const char *ha)
 857{
 858	struct net_device *dev;
 859
 860	for_each_netdev_rcu(net, dev)
 861		if (dev->type == type &&
 862		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 863			return dev;
 864
 865	return NULL;
 866}
 867EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 868
 869struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 870{
 871	struct net_device *dev;
 872
 873	ASSERT_RTNL();
 874	for_each_netdev(net, dev)
 875		if (dev->type == type)
 876			return dev;
 877
 878	return NULL;
 879}
 880EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 881
 882struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 883{
 884	struct net_device *dev, *ret = NULL;
 885
 886	rcu_read_lock();
 887	for_each_netdev_rcu(net, dev)
 888		if (dev->type == type) {
 889			dev_hold(dev);
 890			ret = dev;
 891			break;
 892		}
 893	rcu_read_unlock();
 894	return ret;
 895}
 896EXPORT_SYMBOL(dev_getfirstbyhwtype);
 897
 898/**
 899 *	dev_get_by_flags_rcu - find any device with given flags
 900 *	@net: the applicable net namespace
 901 *	@if_flags: IFF_* values
 902 *	@mask: bitmask of bits in if_flags to check
 903 *
 904 *	Search for any interface with the given flags. Returns NULL if a device
 905 *	is not found or a pointer to the device. Must be called inside
 906 *	rcu_read_lock(), and result refcount is unchanged.
 907 */
 908
 909struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 910				    unsigned short mask)
 911{
 912	struct net_device *dev, *ret;
 913
 914	ret = NULL;
 915	for_each_netdev_rcu(net, dev) {
 916		if (((dev->flags ^ if_flags) & mask) == 0) {
 917			ret = dev;
 918			break;
 919		}
 920	}
 921	return ret;
 922}
 923EXPORT_SYMBOL(dev_get_by_flags_rcu);
 924
 /**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to allow sysfs
 *	to work.  We also disallow any kind of whitespace.
 *
 *	Returns false for an empty name, a name of IFNAMSIZ or more
 *	characters, the names "." and "..", and any name containing '/'
 *	or a whitespace character; returns true otherwise.
 */
bool dev_valid_name(const char *name)
{
	const char *c;

	if (*name == '\0')
		return false;
	/*
	 * Bounded length check: stop scanning after IFNAMSIZ bytes instead
	 * of walking an over-long (or unterminated) string to its end.
	 */
	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
		return false;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	for (c = name; *c; c++) {
		if (*c == '/' || isspace(*c))
			return false;
	}
	return true;
}
 949EXPORT_SYMBOL(dev_valid_name);
 950
 951/**
 952 *	__dev_alloc_name - allocate a name for a device
 953 *	@net: network namespace to allocate the device name in
 954 *	@name: name format string
 955 *	@buf:  scratch buffer and result name string
 956 *
 957 *	Passed a format string - eg "lt%d" it will try and find a suitable
 958 *	id. It scans list of devices to build up a free map, then chooses
 959 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 960 *	while allocating the name and adding the device in order to avoid
 961 *	duplicates.
 962 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 963 *	Returns the number of the unit assigned or a negative errno code.
 964 */
 965
 966static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 967{
 968	int i = 0;
 969	const char *p;
 970	const int max_netdevices = 8*PAGE_SIZE;
 971	unsigned long *inuse;
 972	struct net_device *d;
 973
 974	p = strnchr(name, IFNAMSIZ-1, '%');
 975	if (p) {
 976		/*
 977		 * Verify the string as this thing may have come from
 978		 * the user.  There must be either one "%d" and no other "%"
 979		 * characters.
 980		 */
 981		if (p[1] != 'd' || strchr(p + 2, '%'))
 982			return -EINVAL;
 983
 984		/* Use one page as a bit array of possible slots */
 985		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 986		if (!inuse)
 987			return -ENOMEM;
 988
 989		for_each_netdev(net, d) {
 990			if (!sscanf(d->name, name, &i))
 991				continue;
 992			if (i < 0 || i >= max_netdevices)
 993				continue;
 994
 995			/*  avoid cases where sscanf is not exact inverse of printf */
 996			snprintf(buf, IFNAMSIZ, name, i);
 997			if (!strncmp(buf, d->name, IFNAMSIZ))
 998				set_bit(i, inuse);
 999		}
1000
1001		i = find_first_zero_bit(inuse, max_netdevices);
1002		free_page((unsigned long) inuse);
1003	}
1004
1005	if (buf != name)
1006		snprintf(buf, IFNAMSIZ, name, i);
1007	if (!__dev_get_by_name(net, buf))
1008		return i;
1009
1010	/* It is possible to run out of possible slots
1011	 * when the name is long and there isn't enough space left
1012	 * for the digits, or if all bits are used.
1013	 */
1014	return -ENFILE;
1015}
1016
1017/**
1018 *	dev_alloc_name - allocate a name for a device
1019 *	@dev: device
1020 *	@name: name format string
1021 *
1022 *	Passed a format string - eg "lt%d" it will try and find a suitable
1023 *	id. It scans list of devices to build up a free map, then chooses
1024 *	the first empty slot. The caller must hold the dev_base or rtnl lock
1025 *	while allocating the name and adding the device in order to avoid
1026 *	duplicates.
1027 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1028 *	Returns the number of the unit assigned or a negative errno code.
1029 */
1030
1031int dev_alloc_name(struct net_device *dev, const char *name)
1032{
1033	char buf[IFNAMSIZ];
1034	struct net *net;
1035	int ret;
1036
1037	BUG_ON(!dev_net(dev));
1038	net = dev_net(dev);
1039	ret = __dev_alloc_name(net, name, buf);
1040	if (ret >= 0)
1041		strlcpy(dev->name, buf, IFNAMSIZ);
1042	return ret;
1043}
1044EXPORT_SYMBOL(dev_alloc_name);
1045
1046static int dev_alloc_name_ns(struct net *net,
1047			     struct net_device *dev,
1048			     const char *name)
1049{
1050	char buf[IFNAMSIZ];
1051	int ret;
1052
1053	ret = __dev_alloc_name(net, name, buf);
1054	if (ret >= 0)
1055		strlcpy(dev->name, buf, IFNAMSIZ);
1056	return ret;
1057}
1058
1059static int dev_get_valid_name(struct net *net,
1060			      struct net_device *dev,
1061			      const char *name)
1062{
1063	BUG_ON(!net);
1064
1065	if (!dev_valid_name(name))
1066		return -EINVAL;
1067
1068	if (strchr(name, '%'))
1069		return dev_alloc_name_ns(net, dev, name);
1070	else if (__dev_get_by_name(net, name))
1071		return -EEXIST;
1072	else if (dev->name != name)
1073		strlcpy(dev->name, name, IFNAMSIZ);
1074
1075	return 0;
1076}
1077
/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device, can pass format strings "eth%d"
 *	for wildcarding.
 *
 *	Must be called with RTNL held.  Returns -EBUSY if the device is up,
 *	0 if @newname equals the current name, a negative errno if name
 *	validation, device_rename() or a NETDEV_CHANGENAME notifier fails,
 *	and otherwise the non-negative result of dev_get_valid_name().
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	/* Renaming is refused while the interface is up */
	if (dev->flags & IFF_UP)
		return -EBUSY;

	/* Every exit path up to the end of device_rename() closes this
	 * devnet_rename_seq write section again.
	 */
	write_seqcount_begin(&devnet_rename_seq);

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		write_seqcount_end(&devnet_rename_seq);
		return 0;
	}

	/* Keep the current name so that it can be restored on failure */
	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		write_seqcount_end(&devnet_rename_seq);
		return err;
	}

	/* Entered a second time (with dev->name already reset to the old name)
	 * when a notifier rejects the rename further down.
	 */
rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		write_seqcount_end(&devnet_rename_seq);
		return ret;
	}

	write_seqcount_end(&devnet_rename_seq);

	netdev_adjacent_rename_links(dev, oldname);

	/* Re-hash under the new name: unlink, wait for an RCU grace period,
	 * then insert into the bucket computed from dev->name.
	 */
	write_lock_bh(&dev_base_lock);
	hlist_del_rcu(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			/* First failure: remember the errno, put the old name
			 * back and run the rename steps again.  oldname is
			 * overwritten with @newname for the second pass.
			 */
			err = ret;
			write_seqcount_begin(&devnet_rename_seq);
			memcpy(dev->name, oldname, IFNAMSIZ);
			memcpy(oldname, newname, IFNAMSIZ);
			goto rollback;
		} else {
			/* The notifier also failed during rollback: only log */
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}
1156
1157/**
1158 *	dev_set_alias - change ifalias of a device
1159 *	@dev: device
1160 *	@alias: name up to IFALIASZ
1161 *	@len: limit of bytes to copy from info
1162 *
1163 *	Set ifalias for a device,
1164 */
1165int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1166{
1167	char *new_ifalias;
1168
1169	ASSERT_RTNL();
1170
1171	if (len >= IFALIASZ)
1172		return -EINVAL;
1173
1174	if (!len) {
1175		kfree(dev->ifalias);
1176		dev->ifalias = NULL;
1177		return 0;
1178	}
1179
1180	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1181	if (!new_ifalias)
1182		return -ENOMEM;
1183	dev->ifalias = new_ifalias;
1184
1185	strlcpy(dev->ifalias, alias, len+1);
1186	return len;
1187}
1188
1189
/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.  Sends
 *	%NETDEV_FEAT_CHANGE for @dev down the netdevice notifier chain.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);
1201
1202/**
1203 *	netdev_state_change - device changes state
1204 *	@dev: device to cause notification
1205 *
1206 *	Called to indicate a device has changed state. This function calls
1207 *	the notifier chains for netdev_chain and sends a NEWLINK message
1208 *	to the routing socket.
1209 */
1210void netdev_state_change(struct net_device *dev)
1211{
1212	if (dev->flags & IFF_UP) {
1213		struct netdev_notifier_change_info change_info;
1214
1215		change_info.flags_changed = 0;
1216		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1217					      &change_info.info);
1218		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1219	}
1220}
1221EXPORT_SYMBOL(netdev_state_change);
1222
/**
 * 	netdev_notify_peers - notify network peers about existence of @dev
 * 	@dev: network device
 *
 * Generate traffic such that interested network peers are aware of
 * @dev, such as by generating a gratuitous ARP. This may be used when
 * a device wants to inform the rest of the network about some sort of
 * reconfiguration such as a failover event or virtual machine
 * migration.
 *
 * Takes and releases the RTNL lock itself around the
 * %NETDEV_NOTIFY_PEERS notification.
 */
void netdev_notify_peers(struct net_device *dev)
{
	rtnl_lock();
	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);
1240
1241static int __dev_open(struct net_device *dev)
1242{
1243	const struct net_device_ops *ops = dev->netdev_ops;
1244	int ret;
1245
1246	ASSERT_RTNL();
1247
1248	if (!netif_device_present(dev))
1249		return -ENODEV;
1250
1251	/* Block netpoll from trying to do any rx path servicing.
1252	 * If we don't do this there is a chance ndo_poll_controller
1253	 * or ndo_poll may be running while we open the device
1254	 */
1255	netpoll_poll_disable(dev);
1256
1257	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1258	ret = notifier_to_errno(ret);
1259	if (ret)
1260		return ret;
1261
1262	set_bit(__LINK_STATE_START, &dev->state);
1263
1264	if (ops->ndo_validate_addr)
1265		ret = ops->ndo_validate_addr(dev);
1266
1267	if (!ret && ops->ndo_open)
1268		ret = ops->ndo_open(dev);
1269
1270	netpoll_poll_enable(dev);
1271
1272	if (ret)
1273		clear_bit(__LINK_STATE_START, &dev->state);
1274	else {
1275		dev->flags |= IFF_UP;
1276		net_dmaengine_get();
1277		dev_set_rx_mode(dev);
1278		dev_activate(dev);
1279		add_device_randomness(dev->dev_addr, dev->addr_len);
1280	}
1281
1282	return ret;
1283}
1284
1285/**
1286 *	dev_open	- prepare an interface for use.
1287 *	@dev:	device to open
1288 *
1289 *	Takes a device from down to up state. The device's private open
1290 *	function is invoked and then the multicast lists are loaded. Finally
1291 *	the device is moved into the up state and a %NETDEV_UP message is
1292 *	sent to the netdev notifier chain.
1293 *
1294 *	Calling this function on an active interface is a nop. On a failure
1295 *	a negative errno code is returned.
1296 */
1297int dev_open(struct net_device *dev)
1298{
1299	int ret;
1300
1301	if (dev->flags & IFF_UP)
1302		return 0;
1303
1304	ret = __dev_open(dev);
1305	if (ret < 0)
1306		return ret;
1307
1308	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1309	call_netdevice_notifiers(NETDEV_UP, dev);
1310
1311	return ret;
1312}
1313EXPORT_SYMBOL(dev_open);
1314
/*
 * Take down every device linked on @head through its close_list member.
 *
 * Works in three passes: send NETDEV_GOING_DOWN and clear
 * __LINK_STATE_START for each device, deactivate all of them in one
 * dev_deactivate_many() call, then call each driver's ndo_stop and clear
 * IFF_UP.  NETDEV_DOWN is not sent from here.
 *
 * Must be called with RTNL held; may sleep.  Always returns 0.
 */
static int __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	ASSERT_RTNL();
	might_sleep();

	list_for_each_entry(dev, head, close_list) {
		/* Temporarily disable netpoll until the interface is down */
		netpoll_poll_disable(dev);

		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * can be even on different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of its
		 * napi_struct instances on this device.
		 */
		smp_mb__after_atomic(); /* Commit netif_running(). */
	}

	dev_deactivate_many(head);

	list_for_each_entry(dev, head, close_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 *	Call the device specific close. This cannot fail.
		 *	Only if device is UP
		 *
		 *	We allow it to be called even after a DETACH hot-plug
		 *	event.
		 */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		dev->flags &= ~IFF_UP;
		net_dmaengine_put();
		/* Pairs with the netpoll_poll_disable() in the first pass */
		netpoll_poll_enable(dev);
	}

	return 0;
}
1361
1362static int __dev_close(struct net_device *dev)
1363{
1364	int retval;
1365	LIST_HEAD(single);
1366
1367	list_add(&dev->close_list, &single);
1368	retval = __dev_close_many(&single);
1369	list_del(&single);
1370
1371	return retval;
1372}
1373
1374static int dev_close_many(struct list_head *head)
1375{
1376	struct net_device *dev, *tmp;
1377
1378	/* Remove the devices that don't need to be closed */
1379	list_for_each_entry_safe(dev, tmp, head, close_list)
1380		if (!(dev->flags & IFF_UP))
1381			list_del_init(&dev->close_list);
1382
1383	__dev_close_many(head);
1384
1385	list_for_each_entry_safe(dev, tmp, head, close_list) {
1386		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1387		call_netdevice_notifiers(NETDEV_DOWN, dev);
1388		list_del_init(&dev->close_list);
1389	}
1390
1391	return 0;
1392}
1393
1394/**
1395 *	dev_close - shutdown an interface.
1396 *	@dev: device to shutdown
1397 *
1398 *	This function moves an active device into down state. A
1399 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1400 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1401 *	chain.
1402 */
1403int dev_close(struct net_device *dev)
1404{
1405	if (dev->flags & IFF_UP) {
1406		LIST_HEAD(single);
1407
1408		list_add(&dev->close_list, &single);
1409		dev_close_many(&single);
1410		list_del(&single);
1411	}
1412	return 0;
1413}
1414EXPORT_SYMBOL(dev_close);
1415
1416
1417/**
1418 *	dev_disable_lro - disable Large Receive Offload on a device
1419 *	@dev: device
1420 *
1421 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1422 *	called under RTNL.  This is needed if received packets may be
1423 *	forwarded to another interface.
1424 */
1425void dev_disable_lro(struct net_device *dev)
1426{
1427	/*
1428	 * If we're trying to disable lro on a vlan device
1429	 * use the underlying physical device instead
1430	 */
1431	if (is_vlan_dev(dev))
1432		dev = vlan_dev_real_dev(dev);
1433
1434	/* the same for macvlan devices */
1435	if (netif_is_macvlan(dev))
1436		dev = macvlan_dev_real_dev(dev);
1437
1438	dev->wanted_features &= ~NETIF_F_LRO;
1439	netdev_update_features(dev);
1440
1441	if (unlikely(dev->features & NETIF_F_LRO))
1442		netdev_WARN(dev, "failed to disable LRO!\n");
1443}
1444EXPORT_SYMBOL(dev_disable_lro);
1445
/*
 * Deliver event @val for @dev to the single notifier block @nb, bypassing
 * the notifier chain.  Returns the notifier callback's return value.
 */
static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
				   struct net_device *dev)
{
	struct netdev_notifier_info info;

	netdev_notifier_info_init(&info, dev);
	return nb->notifier_call(nb, val, &info);
}
1454
/* While non-zero, register_netdevice_notifier() adds the notifier to the
 * chain but replays no events to it.  Presumably cleared once the networking
 * core is initialised - TODO confirm where.
 */
static int dev_boot_phase = 1;
1456
/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered. A negative errno code
 *	is returned on a failure.
 *
 * 	When registered all registration and up events are replayed
 *	to the new notifier to allow device to have a race free
 *	view of the network device list.
 *
 *	If the notifier rejects a replayed %NETDEV_REGISTER, the devices
 *	already replayed receive down/unregister events and the notifier
 *	is removed from the chain again.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	/* No replay while dev_boot_phase is set */
	if (dev_boot_phase)
		goto unlock;
	/* Replay REGISTER (and UP for running devices) to the new notifier only */
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			call_netdevice_notifier(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	/* Undo the replay for every device visited before the failing one;
	 * the walk stops when it reaches that device again.
	 */
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				goto outroll;

			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}

outroll:
	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
1523
1524/**
1525 *	unregister_netdevice_notifier - unregister a network notifier block
1526 *	@nb: notifier
1527 *
1528 *	Unregister a notifier previously registered by
1529 *	register_netdevice_notifier(). The notifier is unlinked into the
1530 *	kernel structures and may then be reused. A negative errno code
1531 *	is returned on a failure.
1532 *
1533 * 	After unregistering unregister and down device events are synthesized
1534 *	for all devices on the device list to the removed notifier to remove
1535 *	the need for special case cleanup code.
1536 */
1537
1538int unregister_netdevice_notifier(struct notifier_block *nb)
1539{
1540	struct net_device *dev;
1541	struct net *net;
1542	int err;
1543
1544	rtnl_lock();
1545	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1546	if (err)
1547		goto unlock;
1548
1549	for_each_net(net) {
1550		for_each_netdev(net, dev) {
1551			if (dev->flags & IFF_UP) {
1552				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1553							dev);
1554				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1555			}
1556			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1557		}
1558	}
1559unlock:
1560	rtnl_unlock();
1561	return err;
1562}
1563EXPORT_SYMBOL(unregister_netdevice_notifier);
1564
/**
 *	call_netdevice_notifiers_info - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *	@info: notifier information data
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 *
 *	@info is initialised for @dev here before the chain is run.
 *	Must be called with RTNL held.
 */

static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info)
{
	ASSERT_RTNL();
	netdev_notifier_info_init(info, dev);
	return raw_notifier_call_chain(&netdev_chain, val, info);
}
1583
/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *      @val: value passed unmodified to notifier function
 *      @dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 *
 *	Uses a plain on-stack struct netdev_notifier_info carrying no
 *	event-specific data.
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	struct netdev_notifier_info info;

	return call_netdevice_notifiers_info(val, dev, &info);
}
EXPORT_SYMBOL(call_netdevice_notifiers);
1600
/* Static key checked by net_timestamp_set() and net_timestamp_check() before
 * an skb is timestamped; adjusted by net_enable_timestamp() and
 * net_disable_timestamp().
 */
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context
 * If net_disable_timestamp() is called from irq context, defer the
 * static_key_slow_dec() calls.
 *
 * This counter holds the number of deferred decrements; it is drained
 * by net_enable_timestamp().
 */
static atomic_t netstamp_needed_deferred;
#endif
1609
/*
 * Take one reference on the netstamp_needed static key.
 *
 * With jump labels, decrements deferred by net_disable_timestamp() are
 * settled first: the deferred counter is fetched and zeroed atomically, one
 * pending decrement cancels against this enable, and the rest are applied.
 */
void net_enable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);

	if (deferred) {
		/* Pre-decrement: one deferred dec is offset by this enable */
		while (--deferred)
			static_key_slow_dec(&netstamp_needed);
		return;
	}
#endif
	static_key_slow_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);
1624
/*
 * Drop one reference on the netstamp_needed static key.
 *
 * With jump labels, a call made in interrupt context only bumps
 * netstamp_needed_deferred; the decrement itself is left to a later
 * net_enable_timestamp().
 */
void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	if (in_interrupt()) {
		atomic_inc(&netstamp_needed_deferred);
		return;
	}
#endif
	static_key_slow_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);
1636
/*
 * Clear @skb's timestamp, then stamp it again if the netstamp_needed
 * static key is enabled.
 */
static inline void net_timestamp_set(struct sk_buff *skb)
{
	skb->tstamp.tv64 = 0;
	if (static_key_false(&netstamp_needed))
		__net_timestamp(skb);
}
1643
/*
 * net_timestamp_check - timestamp @SKB if it has no timestamp yet
 * @COND: extra condition that must hold for the skb to be stamped
 * @SKB: the sk_buff to stamp
 *
 * Does nothing unless the netstamp_needed static key is enabled.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement.  A bare "if" expansion would capture a following
 * "else" at the call site.  Callers must end the invocation with ';'.
 */
#define net_timestamp_check(COND, SKB)				\
	do {							\
		if (static_key_false(&netstamp_needed)) {	\
			if ((COND) && !(SKB)->tstamp.tv64)	\
				__net_timestamp(SKB);		\
		}						\
	} while (0)
1649
1650bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1651{
1652	unsigned int len;
1653
1654	if (!(dev->flags & IFF_UP))
1655		return false;
1656
1657	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1658	if (skb->len <= len)
1659		return true;
1660
1661	/* if TSO is enabled, we don't care about the length as the packet
1662	 * could be forwarded without being segmented before
1663	 */
1664	if (skb_is_gso(skb))
1665		return true;
1666
1667	return false;
1668}
1669EXPORT_SYMBOL_GPL(is_skb_forwardable);
1670
1671int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1672{
1673	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1674		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1675			atomic_long_inc(&dev->rx_dropped);
1676			kfree_skb(skb);
1677			return NET_RX_DROP;
1678		}
1679	}
1680
1681	if (unlikely(!is_skb_forwardable(dev, skb))) {
1682		atomic_long_inc(&dev->rx_dropped);
1683		kfree_skb(skb);
1684		return NET_RX_DROP;
1685	}
1686
1687	skb_scrub_packet(skb, true);
1688	skb->protocol = eth_type_trans(skb, dev);
1689
1690	return 0;
1691}
1692EXPORT_SYMBOL_GPL(__dev_forward_skb);
1693
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int rc = __dev_forward_skb(dev, skb);

	/* A non-zero result means the skb was already dropped and freed. */
	if (rc)
		return rc;

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
1717
/*
 * Hand @skb to the packet handler @pt_prev while the caller keeps its own
 * reference: skb->users is incremented before the handler is called.
 *
 * Returns -ENOMEM (without calling the handler) if skb_orphan_frags()
 * fails, otherwise the handler's return value.
 */
static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
		return -ENOMEM;
	atomic_inc(&skb->users);
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
1727
1728static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1729{
1730	if (!ptype->af_packet_priv || !skb->sk)
1731		return false;
1732
1733	if (ptype->id_match)
1734		return ptype->id_match(ptype, skb->sk);
1735	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1736		return true;
1737
1738	return false;
1739}
1740
/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 *
 *	Walks ptype_all under rcu_read_lock().  The skb is cloned once, when
 *	the first matching tap is found, and that clone is shared by all
 *	taps.  Delivery runs one tap behind: every tap except the last gets
 *	the clone through deliver_skb(), which takes an extra reference;
 *	the last tap is called directly with the clone.
 */

static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;
	struct sk_buff *skb2 = NULL;
	struct packet_type *pt_prev = NULL;

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if ((ptype->dev == dev || !ptype->dev) &&
		    (!skb_loop_sk(ptype, skb))) {
			if (pt_prev) {
				/* Clone already exists: deliver to the
				 * previous tap and remember this one.
				 */
				deliver_skb(skb2, pt_prev, skb->dev);
				pt_prev = ptype;
				continue;
			}

			/* First matching tap: create the shared clone */
			skb2 = skb_clone(skb, GFP_ATOMIC);
			if (!skb2)
				break;

			net_timestamp_set(skb2);

			/* skb->nh should be correctly
			   set by sender, so that the second statement is
			   just protection against buggy protocols.
			 */
			skb_reset_mac_header(skb2);

			if (skb_network_header(skb2) < skb2->data ||
			    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
				net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
						     ntohs(skb2->protocol),
						     dev->name);
				skb_reset_network_header(skb2);
			}

			skb2->transport_header = skb2->network_header;
			skb2->pkt_type = PACKET_OUTGOING;
			pt_prev = ptype;
		}
	}
	/* Last tap: called directly, without the extra reference */
	if (pt_prev)
		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
	rcu_read_unlock();
}
1794
1795/**
1796 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1797 * @dev: Network device
1798 * @txq: number of queues available
1799 *
1800 * If real_num_tx_queues is changed the tc mappings may no longer be
1801 * valid. To resolve this verify the tc mapping remains valid and if
1802 * not NULL the mapping. With no priorities mapping to this
1803 * offset/count pair it will no longer be used. In the worst case TC0
1804 * is invalid nothing can be done so disable priority mappings. If is
1805 * expected that drivers will fix this mapping if they can before
1806 * calling netif_set_real_num_tx_queues.
1807 */
1808static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1809{
1810	int i;
1811	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1812
1813	/* If TC0 is invalidated disable TC mapping */
1814	if (tc->offset + tc->count > txq) {
1815		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1816		dev->num_tc = 0;
1817		return;
1818	}
1819
1820	/* Invalidated prio to tc mappings set to TC0 */
1821	for (i = 1; i < TC_BITMASK + 1; i++) {
1822		int q = netdev_get_prio_tc_map(dev, i);
1823
1824		tc = &dev->tc_to_txq[q];
1825		if (tc->offset + tc->count > txq) {
1826			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1827				i, q);
1828			netdev_set_prio_tc_map(dev, i, 0);
1829		}
1830	}
1831}
1832
1833#ifdef CONFIG_XPS
/* Held by netif_reset_xps_queues_gt() and netif_set_xps_queue() while they
 * modify dev->xps_maps and the per-CPU maps behind it.
 */
static DEFINE_MUTEX(xps_map_mutex);
/* Update-side dereference of an XPS RCU pointer; lockdep verifies that
 * xps_map_mutex is held.
 */
#define xmap_dereference(P)		\
	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1837
1838static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1839					int cpu, u16 index)
1840{
1841	struct xps_map *map = NULL;
1842	int pos;
1843
1844	if (dev_maps)
1845		map = xmap_dereference(dev_maps->cpu_map[cpu]);
1846
1847	for (pos = 0; map && pos < map->len; pos++) {
1848		if (map->queues[pos] == index) {
1849			if (map->len > 1) {
1850				map->queues[pos] = map->queues[--map->len];
1851			} else {
1852				RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1853				kfree_rcu(map, rcu);
1854				map = NULL;
1855			}
1856			break;
1857		}
1858	}
1859
1860	return map;
1861}
1862
/*
 * Drop tx queues @index .. num_tx_queues - 1 from every CPU's XPS map and
 * reset the NUMA node of those queues to NUMA_NO_NODE.  If no CPU has a map
 * left afterwards, dev->xps_maps itself is released.  Does nothing when
 * the device has no XPS maps.
 */
static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
{
	struct xps_dev_maps *dev_maps;
	int cpu, i;
	bool active = false;

	mutex_lock(&xps_map_mutex);
	dev_maps = xmap_dereference(dev->xps_maps);

	if (!dev_maps)
		goto out_no_maps;

	for_each_possible_cpu(cpu) {
		/* remove_xps_queue() returns NULL once this CPU has no map,
		 * which ends the inner loop early.
		 */
		for (i = index; i < dev->num_tx_queues; i++) {
			if (!remove_xps_queue(dev_maps, cpu, i))
				break;
		}
		/* Ran to the end: this CPU still has a map */
		if (i == dev->num_tx_queues)
			active = true;
	}

	if (!active) {
		RCU_INIT_POINTER(dev->xps_maps, NULL);
		kfree_rcu(dev_maps, rcu);
	}

	for (i = index; i < dev->num_tx_queues; i++)
		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
					     NUMA_NO_NODE);

out_no_maps:
	mutex_unlock(&xps_map_mutex);
}
1896
1897static struct xps_map *expand_xps_map(struct xps_map *map,
1898				      int cpu, u16 index)
1899{
1900	struct xps_map *new_map;
1901	int alloc_len = XPS_MIN_MAP_ALLOC;
1902	int i, pos;
1903
1904	for (pos = 0; map && pos < map->len; pos++) {
1905		if (map->queues[pos] != index)
1906			continue;
1907		return map;
1908	}
1909
1910	/* Need to add queue to this CPU's existing map */
1911	if (map) {
1912		if (pos < map->alloc_len)
1913			return map;
1914
1915		alloc_len = map->alloc_len * 2;
1916	}
1917
1918	/* Need to allocate new map to store queue on this CPU's map */
1919	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1920			       cpu_to_node(cpu));
1921	if (!new_map)
1922		return NULL;
1923
1924	for (i = 0; i < pos; i++)
1925		new_map->queues[i] = map->queues[i];
1926	new_map->alloc_len = alloc_len;
1927	new_map->len = pos;
1928
1929	return new_map;
1930}
1931
/*
 * netif_set_xps_queue - set the CPUs that transmit on tx queue @index
 * @dev: network device
 * @mask: CPUs that should use the queue
 * @index: tx queue number
 *
 * Builds a new xps_dev_maps in which every online CPU in @mask lists
 * @index, publishes it with rcu_assign_pointer() and frees the replaced
 * maps after an RCU grace period.  The queue is removed from the map of
 * every CPU that is not both in @mask and online, and the queue's NUMA
 * node is updated.
 *
 * Returns 0 on success or -ENOMEM if an allocation fails.
 */
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
			u16 index)
{
	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
	struct xps_map *map, *new_map;
	int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
	/* -2: no CPU seen yet; -1: the selected CPUs span several nodes */
	int cpu, numa_node_id = -2;
	bool active = false;

	mutex_lock(&xps_map_mutex);

	dev_maps = xmap_dereference(dev->xps_maps);

	/* allocate memory for queue storage */
	for_each_online_cpu(cpu) {
		if (!cpumask_test_cpu(cpu, mask))
			continue;

		/* The container is allocated lazily, on the first selected CPU */
		if (!new_dev_maps)
			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
		if (!new_dev_maps) {
			mutex_unlock(&xps_map_mutex);
			return -ENOMEM;
		}

		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
				 NULL;

		/* May return the old map unchanged or a larger copy of it */
		map = expand_xps_map(map, cpu, index);
		if (!map)
			goto error;

		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
	}

	/* No online CPU in @mask: nothing new to publish */
	if (!new_dev_maps)
		goto out_no_new_maps;

	for_each_possible_cpu(cpu) {
		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
			/* add queue to CPU maps */
			int pos = 0;

			map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
			while ((pos < map->len) && (map->queues[pos] != index))
				pos++;

			if (pos == map->len)
				map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
			if (numa_node_id == -2)
				numa_node_id = cpu_to_node(cpu);
			else if (numa_node_id != cpu_to_node(cpu))
				numa_node_id = -1;
#endif
		} else if (dev_maps) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->cpu_map[cpu]);
			RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
		}

	}

	rcu_assign_pointer(dev->xps_maps, new_dev_maps);

	/* Cleanup old maps */
	if (dev_maps) {
		for_each_possible_cpu(cpu) {
			new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
			map = xmap_dereference(dev_maps->cpu_map[cpu]);
			/* Maps carried over into the new container stay alive */
			if (map && map != new_map)
				kfree_rcu(map, rcu);
		}

		kfree_rcu(dev_maps, rcu);
	}

	dev_maps = new_dev_maps;
	active = true;

out_no_new_maps:
	/* update Tx queue numa node */
	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
				     (numa_node_id >= 0) ? numa_node_id :
				     NUMA_NO_NODE);

	if (!dev_maps)
		goto out_no_maps;

	/* removes queue from unused CPUs */
	for_each_possible_cpu(cpu) {
		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
			continue;

		if (remove_xps_queue(dev_maps, cpu, index))
			active = true;
	}

	/* free map if not active */
	if (!active) {
		RCU_INIT_POINTER(dev->xps_maps, NULL);
		kfree_rcu(dev_maps, rcu);
	}

out_no_maps:
	mutex_unlock(&xps_map_mutex);

	return 0;
error:
	/* remove any maps that we added */
	for_each_possible_cpu(cpu) {
		new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
				 NULL;
		/* Only free maps allocated here, not ones still owned by
		 * the old container.
		 */
		if (new_map && new_map != map)
			kfree(new_map);
	}

	mutex_unlock(&xps_map_mutex);

	kfree(new_dev_maps);
	return -ENOMEM;
}
EXPORT_SYMBOL(netif_set_xps_queue);
2056
2057#endif
2058/*
2059 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2060 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2061 */
2062int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2063{
2064	int rc;
2065
2066	if (txq < 1 || txq > dev->num_tx_queues)
2067		return -EINVAL;
2068
2069	if (dev->reg_state == NETREG_REGISTERED ||
2070	    dev->reg_state == NETREG_UNREGISTERING) {
2071		ASSERT_RTNL();
2072
2073		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2074						  txq);
2075		if (rc)
2076			return rc;
2077
2078		if (dev->num_tc)
2079			netif_setup_tc(dev, txq);
2080
2081		if (txq < dev->real_num_tx_queues) {
2082			qdisc_reset_all_tx_gt(dev, txq);
2083#ifdef CONFIG_XPS
2084			netif_reset_xps_queues_gt(dev, txq);
2085#endif
2086		}
2087	}
2088
2089	dev->real_num_tx_queues = txq;
2090	return 0;
2091}
2092EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2093
2094#ifdef CONFIG_SYSFS
2095/**
2096 *	netif_set_real_num_rx_queues - set actual number of RX queues used
2097 *	@dev: Network device
2098 *	@rxq: Actual number of RX queues
2099 *
2100 *	This must be called either with the rtnl_lock held or before
2101 *	registration of the net device.  Returns 0 on success, or a
2102 *	negative error code.  If called before registration, it always
2103 *	succeeds.
2104 */
2105int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2106{
2107	int rc;
2108
2109	if (rxq < 1 || rxq > dev->num_rx_queues)
2110		return -EINVAL;
2111
2112	if (dev->reg_state == NETREG_REGISTERED) {
2113		ASSERT_RTNL();
2114
2115		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2116						  rxq);
2117		if (rc)
2118			return rc;
2119	}
2120
2121	dev->real_num_rx_queues = rxq;
2122	return 0;
2123}
2124EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2125#endif
2126
2127/**
2128 * netif_get_num_default_rss_queues - default number of RSS queues
2129 *
2130 * This routine should set an upper limit on the number of RSS queues
2131 * used by default by multiqueue devices.
2132 */
2133int netif_get_num_default_rss_queues(void)
2134{
2135	return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2136}
2137EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2138
2139static inline void __netif_reschedule(struct Qdisc *q)
2140{
2141	struct softnet_data *sd;
2142	unsigned long flags;
2143
2144	local_irq_save(flags);
2145	sd = &__get_cpu_var(softnet_data);
2146	q->next_sched = NULL;
2147	*sd->output_queue_tailp = q;
2148	sd->output_queue_tailp = &q->next_sched;
2149	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2150	local_irq_restore(flags);
2151}
2152
2153void __netif_schedule(struct Qdisc *q)
2154{
2155	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2156		__netif_reschedule(q);
2157}
2158EXPORT_SYMBOL(__netif_schedule);
2159
/*
 * Layout imposed on skb->cb while an skb waits on the per-cpu completion
 * queue: it records why the buffer is being released.
 */
struct dev_kfree_skb_cb {
	enum skb_free_reason reason;
};

/* View the control block of @skb as a struct dev_kfree_skb_cb. */
static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
{
	return (struct dev_kfree_skb_cb *)skb->cb;
}
2168
/**
 *	__dev_kfree_skb_irq - defer the release of an skb to the TX softirq
 *	@skb: buffer to release
 *	@reason: release reason, stored in the skb control block
 *
 *	Drops one reference on @skb.  If other references remain the
 *	function returns without queueing anything.  Otherwise @skb is
 *	pushed onto this CPU's softnet_data.completion_queue and
 *	NET_TX_SOFTIRQ is raised.
 */
void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
{
	unsigned long flags;

	/* Single reference: zero the count directly instead of doing an
	 * atomic decrement-and-test.
	 */
	if (likely(atomic_read(&skb->users) == 1)) {
		smp_rmb();
		atomic_set(&skb->users, 0);
	} else if (likely(!atomic_dec_and_test(&skb->users))) {
		/* Not the last reference; nothing to queue. */
		return;
	}
	/* Remember the reason for whoever drains the completion queue. */
	get_kfree_skb_cb(skb)->reason = reason;
	/* Push onto the head of the per-cpu list with IRQs disabled. */
	local_irq_save(flags);
	skb->next = __this_cpu_read(softnet_data.completion_queue);
	__this_cpu_write(softnet_data.completion_queue, skb);
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(__dev_kfree_skb_irq);
2187
2188void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2189{
2190	if (in_irq() || irqs_disabled())
2191		__dev_kfree_skb_irq(skb, reason);
2192	else
2193		dev_kfree_skb(skb);
2194}
2195EXPORT_SYMBOL(__dev_kfree_skb_any);
2196
2197
2198/**
2199 * netif_device_detach - mark device as removed
2200 * @dev: network device
2201 *
2202 * Mark device as removed from system and therefore no longer available.
2203 */
2204void netif_device_detach(struct net_device *dev)
2205{
2206	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2207	    netif_running(dev)) {
2208		netif_tx_stop_all_queues(dev);
2209	}
2210}
2211EXPORT_SYMBOL(netif_device_detach);
2212
2213/**
2214 * netif_device_attach - mark device as attached
2215 * @dev: network device
2216 *
2217 * Mark device as attached from system and restart if needed.
2218 */
2219void netif_device_attach(struct net_device *dev)
2220{
2221	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2222	    netif_running(dev)) {
2223		netif_tx_wake_all_queues(dev);
2224		__netdev_watchdog_up(dev);
2225	}
2226}
2227EXPORT_SYMBOL(netif_device_attach);
2228
2229static void skb_warn_bad_offload(const struct sk_buff *skb)
2230{
2231	static const netdev_features_t null_features = 0;
2232	struct net_device *dev = skb->dev;
2233	const char *driver = "";
2234
2235	if (!net_ratelimit())
2236		return;
2237
2238	if (dev && dev->dev.parent)
2239		driver = dev_driver_string(dev->dev.parent);
2240
2241	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2242	     "gso_type=%d ip_summed=%d\n",
2243	     driver, dev ? &dev->features : &null_features,
2244	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
2245	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2246	     skb_shinfo(skb)->gso_type, skb->ip_summed);
2247}
2248
/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 *
 * Returns 0 on success, -EINVAL if the skb still carries a GSO size, or
 * the error from __skb_linearize() / pskb_expand_head().  On success
 * skb->ip_summed is left as CHECKSUM_NONE; on the error paths it is not
 * modified.
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	/* Nothing to compute here; only reset the checksum state. */
	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	/* A GSO skb must not reach this point: warn and refuse. */
	if (unlikely(skb_shinfo(skb)->gso_size)) {
		skb_warn_bad_offload(skb);
		return -EINVAL;
	}

	/* Before computing a checksum, we should make sure no frag could
	 * be modified by an external entity : checksum could be wrong.
	 */
	if (skb_has_shared_frag(skb)) {
		ret = __skb_linearize(skb);
		if (ret)
			goto out;
	}

	/* Checksum everything from the checksum start to the end of the
	 * packet; the start must lie inside the linear area.
	 */
	offset = skb_checksum_start_offset(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	/* From here on, offset is where the folded checksum is stored. */
	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	/* Get a private copy of the header if the bytes up to and
	 * including the checksum field are not writable in a clone.
	 */
	if (skb_cloned(skb) &&
	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}
EXPORT_SYMBOL(skb_checksum_help);
2296
/*
 * Find the protocol carried by @skb once ETH_P_TEB and any stacked
 * 802.1Q/802.1AD headers have been looked through.
 *
 * On success the protocol is returned and *@depth receives the offset
 * reached after the last VLAN header (or skb->mac_len when no VLAN header
 * was walked).  Returns 0, leaving *@depth untouched, if the needed
 * header bytes cannot be pulled or if mac_len is non-zero but smaller
 * than VLAN_HLEN.
 */
__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
	unsigned int vlan_depth = skb->mac_len;
	__be16 type = skb->protocol;

	/* Tunnel gso handlers can set protocol to ethernet. */
	if (type == htons(ETH_P_TEB)) {
		struct ethhdr *eth;

		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
			return 0;

		/* Use the protocol from the Ethernet header instead. */
		eth = (struct ethhdr *)skb_mac_header(skb);
		type = eth->h_proto;
	}

	/* if skb->protocol is 802.1Q/AD then the header should already be
	 * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
	 * ETH_HLEN otherwise
	 */
	if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
		if (vlan_depth) {
			if (unlikely(WARN_ON(vlan_depth < VLAN_HLEN)))
				return 0;
			vlan_depth -= VLAN_HLEN;
		} else {
			vlan_depth = ETH_HLEN;
		}
		/* Step over each VLAN header until a non-VLAN protocol
		 * shows up.
		 */
		do {
			struct vlan_hdr *vh;

			if (unlikely(!pskb_may_pull(skb,
						    vlan_depth + VLAN_HLEN)))
				return 0;

			vh = (struct vlan_hdr *)(skb->data + vlan_depth);
			type = vh->h_vlan_encapsulated_proto;
			vlan_depth += VLAN_HLEN;
		} while (type == htons(ETH_P_8021Q) ||
			 type == htons(ETH_P_8021AD));
	}

	*depth = vlan_depth;

	return type;
}
2343
/**
 *	skb_mac_gso_segment - mac layer segmentation handler.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *
 *	Looks up the packet_offload entry matching the skb's network
 *	protocol and lets its gso_segment callback do the work.  Returns
 *	the callback's result, ERR_PTR(-EINVAL) if no protocol could be
 *	determined, or ERR_PTR(-EPROTONOSUPPORT) if no handler matched.
 *	Before returning, skb->data is pushed back to the mac header.
 */
struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
				    netdev_features_t features)
{
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_offload *ptype;
	int vlan_depth = skb->mac_len;
	__be16 type = skb_network_protocol(skb, &vlan_depth);

	if (unlikely(!type))
		return ERR_PTR(-EINVAL);

	/* Advance skb->data by the depth reported above. */
	__skb_pull(skb, vlan_depth);

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, &offload_base, list) {
		if (ptype->type == type && ptype->callbacks.gso_segment) {
			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
				int err;

				/* Stop on error (segs = ERR_PTR(err)) or when
				 * the device can take the skb unsegmented
				 * (segs = ERR_PTR(0), i.e. NULL).
				 */
				err = ptype->callbacks.gso_send_check(skb);
				segs = ERR_PTR(err);
				if (err || skb_gso_ok(skb, features))
					break;
				/* Move skb->data back to the network header
				 * before segmenting.
				 */
				__skb_push(skb, (skb->data -
						 skb_network_header(skb)));
			}
			segs = ptype->callbacks.gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	/* Restore skb->data to the start of the mac header. */
	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}
EXPORT_SYMBOL(skb_mac_gso_segment);
2386
2387
2388/* openvswitch calls this on rx path, so we need a different check.
2389 */
2390static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2391{
2392	if (tx_path)
2393		return skb->ip_summed != CHECKSUM_PARTIAL;
2394	else
2395		return skb->ip_summed == CHECKSUM_NONE;
2396}
2397
2398/**
2399 *	__skb_gso_segment - Perform segmentation on skb.
2400 *	@skb: buffer to segment
2401 *	@features: features for the output path (see dev->features)
2402 *	@tx_path: whether it is called in TX path
2403 *
2404 *	This function segments the given skb and returns a list of segments.
2405 *
2406 *	It may return NULL if the skb requires no segmentation.  This is
2407 *	only possible when GSO is used for verifying header integrity.
2408 */
2409struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2410				  netdev_features_t features, bool tx_path)
2411{
2412	if (unlikely(skb_needs_check(skb, tx_path))) {
2413		int err;
2414
2415		skb_warn_bad_offload(skb);
2416
2417		if (skb_header_cloned(skb) &&
2418		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2419			return ERR_PTR(err);
2420	}
2421
2422	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2423	SKB_GSO_CB(skb)->encap_level = 0;
2424
2425	skb_reset_mac_header(skb);
2426	skb_reset_mac_len(skb);
2427
2428	return skb_mac_gso_segment(skb, features);
2429}
2430EXPORT_SYMBOL(__skb_gso_segment);
2431
/*
 * Report a hardware receive checksum failure: a rate-limited error
 * message naming the device (or "<unknown>" when @dev is NULL) followed
 * by a stack dump.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	const char *name;

	if (!net_ratelimit())
		return;

	name = dev ? dev->name : "<unknown>";
	pr_err("%s: hw csum failure\n", name);
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2443
/*
 * Return 1 if @skb has a page fragment that @dev cannot DMA from, else 0.
 * Without CONFIG_HIGHMEM the answer is always 0.
 *
 * This check could be dropped once it is known that:
 * 1. an IOMMU is present and allows mapping all of memory, and
 * 2. no high memory really exists on this machine.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct device *parent;
	int idx;

	/* Device without HIGHDMA: any highmem fragment is illegal. */
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[idx];

			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	if (!PCI_DMA_BUS_IS_PHYS)
		return 0;

	parent = dev->dev.parent;
	if (!parent)
		return 0;

	/* Every fragment page must end at or below the parent's DMA mask. */
	for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[idx];
		dma_addr_t addr = page_to_phys(skb_frag_page(frag));

		if (!parent->dma_mask ||
		    addr + PAGE_SIZE - 1 > *parent->dma_mask)
			return 1;
	}
#endif
	return 0;
}
2476
/*
 * Control-block layout used on the head skb of a software-segmented
 * packet: holds the destructor the skb had before dev_gso_segment()
 * installed dev_gso_skb_destructor().
 */
struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

/* Access skb->cb as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2482
2483static void dev_gso_skb_destructor(struct sk_buff *skb)
2484{
2485	struct dev_gso_cb *cb;
2486
2487	kfree_skb_list(skb->next);
2488	skb->next = NULL;
2489
2490	cb = DEV_GSO_CB(skb);
2491	if (cb->destructor)
2492		cb->destructor(skb);
2493}
2494
2495/**
2496 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
2497 *	@skb: buffer to segment
2498 *	@features: device features as applicable to this skb
2499 *
2500 *	This function segments the given skb and stores the list of segments
2501 *	in skb->next.
2502 */
2503static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2504{
2505	struct sk_buff *segs;
2506
2507	segs = skb_gso_segment(skb, features);
2508
2509	/* Verifying header integrity only. */
2510	if (!segs)
2511		return 0;
2512
2513	if (IS_ERR(segs))
2514		return PTR_ERR(segs);
2515
2516	skb->next = segs;
2517	DEV_GSO_CB(skb)->destructor = skb->destructor;
2518	skb->destructor = dev_gso_skb_destructor;
2519
2520	return 0;
2521}
2522
2523/* If MPLS offload request, verify we are testing hardware MPLS features
2524 * instead of standard features for the netdev.
2525 */
2526#ifdef CONFIG_NET_MPLS_GSO
2527static netdev_features_t net_mpls_features(struct sk_buff *skb,
2528					   netdev_features_t features,
2529					   __be16 type)
2530{
2531	if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC))
2532		features &= skb->dev->mpls_features;
2533
2534	return features;
2535}
2536#else
2537static netdev_features_t net_mpls_features(struct sk_buff *skb,
2538					   netdev_features_t features,
2539					   __be16 type)
2540{
2541	return features;
2542}
2543#endif
2544
2545static netdev_features_t harmonize_features(struct sk_buff *skb,
2546	netdev_features_t features)
2547{
2548	int tmp;
2549	__be16 type;
2550
2551	type = skb_network_protocol(skb, &tmp);
2552	features = net_mpls_features(skb, features, type);
2553
2554	if (skb->ip_summed != CHECKSUM_NONE &&
2555	    !can_checksum_protocol(features, type)) {
2556		features &= ~NETIF_F_ALL_CSUM;
2557	} else if (illegal_highdma(skb->dev, skb)) {
2558		features &= ~NETIF_F_SG;
2559	}
2560
2561	return features;
2562}
2563
2564netdev_features_t netif_skb_features(struct sk_buff *skb)
2565{
2566	__be16 protocol = skb->protocol;
2567	netdev_features_t features = skb->dev->features;
2568
2569	if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2570		features &= ~NETIF_F_GSO_MASK;
2571
2572	if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2573		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2574		protocol = veh->h_vlan_encapsulated_proto;
2575	} else if (!vlan_tx_tag_present(skb)) {
2576		return harmonize_features(skb, features);
2577	}
2578
2579	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2580					       NETIF_F_HW_VLAN_STAG_TX);
2581
2582	if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2583		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2584				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2585				NETIF_F_HW_VLAN_STAG_TX;
2586
2587	return harmonize_features(skb, features);
2588}
2589EXPORT_SYMBOL(netif_skb_features);
2590
/*
 * Hand @skb to the driver's ndo_start_xmit() on queue @txq.
 *
 * For an skb without a segment list (skb->next == NULL) this first
 * prepares the packet: inserts the VLAN tag in software when the device
 * cannot offload it, segments in software when GSO is needed but not
 * supported, or otherwise linearizes and completes the checksum as the
 * feature set demands.  A packet that was segmented here, or that arrives
 * with a segment list from an earlier partial transmit, is sent one
 * segment at a time in the loop at the "gso" label.
 *
 * Returns the driver's return code.  If preparation fails the skb is
 * freed and NETDEV_TX_OK is returned.
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
			struct netdev_queue *txq)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int rc = NETDEV_TX_OK;
	unsigned int skb_len;

	if (likely(!skb->next)) {
		netdev_features_t features;

		/*
		 * If device doesn't need skb->dst, release it right now while
		 * its hot in this cpu cache
		 */
		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
			skb_dst_drop(skb);

		features = netif_skb_features(skb);

		/* Device cannot insert this VLAN tag: put it in the frame. */
		if (vlan_tx_tag_present(skb) &&
		    !vlan_hw_offload_capable(features, skb->vlan_proto)) {
			skb = __vlan_put_tag(skb, skb->vlan_proto,
					     vlan_tx_tag_get(skb));
			if (unlikely(!skb))
				goto out;

			skb->vlan_tci = 0;
		}

		/* If encapsulation offload request, verify we are testing
		 * hardware encapsulation features instead of standard
		 * features for the netdev
		 */
		if (skb->encapsulation)
			features &= dev->hw_enc_features;

		if (netif_needs_gso(skb, features)) {
			if (unlikely(dev_gso_segment(skb, features)))
				goto out_kfree_skb;
			/* Segments were produced: send them one by one. */
			if (skb->next)
				goto gso;
		} else {
			if (skb_needs_linearize(skb, features) &&
			    __skb_linearize(skb))
				goto out_kfree_skb;

			/* If packet is not checksummed and device does not
			 * support checksumming for this protocol, complete
			 * checksumming here.
			 */
			if (skb->ip_summed == CHECKSUM_PARTIAL) {
				if (skb->encapsulation)
					skb_set_inner_transport_header(skb,
						skb_checksum_start_offset(skb));
				else
					skb_set_transport_header(skb,
						skb_checksum_start_offset(skb));
				if (!(features & NETIF_F_ALL_CSUM) &&
				     skb_checksum_help(skb))
					goto out_kfree_skb;
			}
		}

		/* Deliver a copy to taps registered for all packets. */
		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(skb, dev);

		/* Save the length before the driver is called; it is used
		 * for tracing afterwards.
		 */
		skb_len = skb->len;
		trace_net_dev_start_xmit(skb, dev);
		rc = ops->ndo_start_xmit(skb, dev);
		trace_net_dev_xmit(skb, rc, dev, skb_len);
		if (rc == NETDEV_TX_OK)
			txq_trans_update(txq);
		return rc;
	}

gso:
	do {
		struct sk_buff *nskb = skb->next;

		/* Detach the first segment from the list. */
		skb->next = nskb->next;
		nskb->next = NULL;

		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(nskb, dev);

		skb_len = nskb->len;
		trace_net_dev_start_xmit(nskb, dev);
		rc = ops->ndo_start_xmit(nskb, dev);
		trace_net_dev_xmit(nskb, rc, dev, skb_len);
		if (unlikely(rc != NETDEV_TX_OK)) {
			if (rc & ~NETDEV_TX_MASK)
				goto out_kfree_gso_skb;
			/* Put the segment back at the head of the list and
			 * report the code to the caller.
			 */
			nskb->next = skb->next;
			skb->next = nskb;
			return rc;
		}
		txq_trans_update(txq);
		/* Queue stopped with segments still pending. */
		if (unlikely(netif_xmit_stopped(txq) && skb->next))
			return NETDEV_TX_BUSY;
	} while (skb->next);

out_kfree_gso_skb:
	/* All segments are gone: restore the saved destructor and release
	 * the head skb as consumed.  Otherwise fall through and free it
	 * with the GSO destructor still installed.
	 */
	if (likely(skb->next == NULL)) {
		skb->destructor = DEV_GSO_CB(skb)->destructor;
		consume_skb(skb);
		return rc;
	}
out_kfree_skb:
	kfree_skb(skb);
out:
	return rc;
}
EXPORT_SYMBOL_GPL(dev_hard_start_xmit);
2704
2705static void qdisc_pkt_len_init(struct sk_buff *skb)
2706{
2707	const struct skb_shared_info *shinfo = skb_shinfo(skb);
2708
2709	qdisc_skb_cb(skb)->pkt_len = skb->len;
2710
2711	/* To get more precise estimation of bytes sent on wire,
2712	 * we add to pkt_len the headers size of all segments
2713	 */
2714	if (shinfo->gso_size)  {
2715		unsigned int hdr_len;
2716		u16 gso_segs = shinfo->gso_segs;
2717
2718		/* mac layer + network layer */
2719		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2720
2721		/* + transport layer */
2722		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2723			hdr_len += tcp_hdrlen(skb);
2724		else
2725			hdr_len += sizeof(struct udphdr);
2726
2727		if (shinfo->gso_type & SKB_GSO_DODGY)
2728			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2729						shinfo->gso_size);
2730
2731		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2732	}
2733}
2734
/*
 * Send @skb through qdisc @q of queue @txq on @dev.
 *
 * Under the qdisc root lock one of three things happens:
 *  - the qdisc is deactivated: the skb is freed, NET_XMIT_DROP;
 *  - the qdisc can be bypassed, is empty and not running: the skb is
 *    transmitted directly, NET_XMIT_SUCCESS;
 *  - otherwise the skb is enqueued and the qdisc is run if nobody else
 *    is running it; the masked enqueue result is returned.
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	bool contended;
	int rc;

	qdisc_pkt_len_init(skb);
	qdisc_calculate_pkt_len(skb, q);
	/*
	 * Heuristic to force contended enqueues to serialize on a
	 * separate lock before trying to get qdisc main lock.
	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
	 * and dequeue packets faster.
	 */
	contended = qdisc_is_running(q);
	if (unlikely(contended))
		spin_lock(&q->busylock);

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		kfree_skb(skb);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   qdisc_run_begin(q)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */
		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
			skb_dst_force(skb);

		qdisc_bstats_update(q, skb);

		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
			/* Release busylock before running the qdisc. */
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		} else
			qdisc_run_end(q);

		rc = NET_XMIT_SUCCESS;
	} else {
		skb_dst_force(skb);
		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
		if (qdisc_run_begin(q)) {
			/* Release busylock before running the qdisc. */
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		}
	}
	spin_unlock(root_lock);
	/* busylock is still held if it was not dropped above. */
	if (unlikely(contended))
		spin_unlock(&q->busylock);
	return rc;
}
2797
2798#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2799static void skb_update_prio(struct sk_buff *skb)
2800{
2801	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2802
2803	if (!skb->priority && skb->sk && map) {
2804		unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2805
2806		if (prioidx < map->priomap_len)
2807			skb->priority = map->priomap[prioidx];
2808	}
2809}
2810#else
2811#define skb_update_prio(skb)
2812#endif
2813
/*
 * Per-cpu nesting depth of dev_hard_start_xmit() calls made from
 * __dev_queue_xmit() for queueless devices.  Transmission is refused
 * once the depth exceeds RECURSION_LIMIT.
 */
static DEFINE_PER_CPU(int, xmit_recursion);
#define RECURSION_LIMIT 10
2816
/**
 *	dev_loopback_xmit - loop back @skb
 *	@skb: buffer to transmit
 *
 *	Feeds @skb back into the receive path via netif_rx_ni(), marked as
 *	PACKET_LOOPBACK with its checksum flagged as already verified.
 *	Always returns 0.
 */
int dev_loopback_xmit(struct sk_buff *skb)
{
	/* The mac header starts at the current data pointer; then advance
	 * the data pointer to the network header.
	 */
	skb_reset_mac_header(skb);
	__skb_pull(skb, skb_network_offset(skb));
	skb->pkt_type = PACKET_LOOPBACK;
	skb->ip_summed = CHECKSUM_UNNECESSARY;
	/* A looped-back skb is expected to carry a dst entry. */
	WARN_ON(!skb_dst(skb));
	skb_dst_force(skb);
	netif_rx_ni(skb);
	return 0;
}
EXPORT_SYMBOL(dev_loopback_xmit);
2833
/**
 *	__dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *	@accel_priv: private data used for L2 forwarding offload
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 */
static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;

	skb_reset_mac_header(skb);

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	skb_update_prio(skb);

	/* Choose the TX queue and fetch the qdisc attached to it. */
	txq = netdev_pick_tx(dev, skb, accel_priv);
	q = rcu_dereference_bh(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
	trace_net_dev_queue(skb);
	/* A qdisc with an enqueue method takes the packet from here. */
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all the sorts of tunnels...

	   Really, it is unlikely that netif_tx_lock protection is necessary
	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	   counters.)
	   However, it is possible that they rely on the protection
	   provided by us here.

	   Check this and take the lock. It is not prone to deadlocks.
	   Alternatively, use the noqueue qdisc, which is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		/* The TX lock being owned by this CPU already means we were
		 * re-entered while transmitting on this queue.
		 */
		if (txq->xmit_lock_owner != cpu) {

			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
				goto recursion_alert;

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_xmit_stopped(txq)) {
				/* Track nesting depth across the driver call. */
				__this_cpu_inc(xmit_recursion);
				rc = dev_hard_start_xmit(skb, dev, txq);
				__this_cpu_dec(xmit_recursion);
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
					     dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
recursion_alert:
			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
					     dev->name);
		}
	}

	/* Failure on the queueless path: count the drop, free the skb. */
	rc = -ENETDOWN;
	rcu_read_unlock_bh();

	atomic_long_inc(&dev->tx_dropped);
	kfree_skb(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
2942
/*
 * Queue @skb for transmission on skb->dev.  Equivalent to
 * __dev_queue_xmit() with no L2 forwarding offload data; its return
 * value is passed through.
 */
int dev_queue_xmit(struct sk_buff *skb)
{
	return __dev_queue_xmit(skb, NULL);
}
EXPORT_SYMBOL(dev_queue_xmit);
2948
/*
 * Queue @skb for transmission on skb->dev, handing @accel_priv (private
 * data used for L2 forwarding offload) through to __dev_queue_xmit().
 * The return value is passed through.
 */
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
	return __dev_queue_xmit(skb, accel_priv);
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
2954
2955
2956/*=======================================================================
2957			Receiver routines
2958  =======================================================================*/
2959
/*
 * Limit on the length of a per-cpu input_pkt_queue: enqueue_to_backlog()
 * compares the queue length against it, and skb_flow_limit() starts
 * working once the queue is at least half this long.
 */
int netdev_max_backlog __read_mostly = 1000;
EXPORT_SYMBOL(netdev_max_backlog);

/* NOTE(review): the three tunables below are not referenced in this part
 * of the file; confirm their exact meaning against their users before
 * documenting them further.
 */
int netdev_tstamp_prequeue __read_mostly = 1;
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */
2966
/*
 * Add @napi to the tail of @sd's poll list and raise NET_RX_SOFTIRQ.
 * Called with irq disabled.
 */
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
2974
2975#ifdef CONFIG_RPS
2976
/* One global table that all flow-based protocols share.  It is RCU
 * protected; get_rps_cpu() reads it with rcu_dereference().
 */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);

/* NOTE(review): presumably enabled while any RPS/RFS configuration is
 * active; its users are outside this part of the file - confirm.
 */
struct static_key rps_needed __read_mostly;
2982
/*
 * Point flow entry @rflow at @next_cpu and return the flow entry that is
 * now in use for the flow.
 *
 * With CONFIG_RFS_ACCEL, and when the device supports it, the driver is
 * also asked to steer the flow to the hardware RX queue associated with
 * @next_cpu; on success the returned entry is the one in that queue's
 * flow table instead of @rflow.
 */
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
	    struct rps_dev_flow *rflow, u16 next_cpu)
{
	if (next_cpu != RPS_NO_CPU) {
#ifdef CONFIG_RFS_ACCEL
		struct netdev_rx_queue *rxqueue;
		struct rps_dev_flow_table *flow_table;
		struct rps_dev_flow *old_rflow;
		u32 flow_id;
		u16 rxq_index;
		int rc;

		/* Should we steer this flow to a different hardware queue? */
		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
		    !(dev->features & NETIF_F_NTUPLE))
			goto out;
		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
		/* Already arriving on the right queue. */
		if (rxq_index == skb_get_rx_queue(skb))
			goto out;

		rxqueue = dev->_rx + rxq_index;
		flow_table = rcu_dereference(rxqueue->rps_flow_table);
		if (!flow_table)
			goto out;
		flow_id = skb_get_hash(skb) & flow_table->mask;
		/* A non-negative return value is stored as the filter id. */
		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
							rxq_index, flow_id);
		if (rc < 0)
			goto out;
		/* Switch to the entry in the target queue's table and clear
		 * the old entry's filter if it carried the same id.
		 */
		old_rflow = rflow;
		rflow = &flow_table->flows[flow_id];
		rflow->filter = rc;
		if (old_rflow->filter == rflow->filter)
			old_rflow->filter = RPS_NO_FILTER;
	out:
#endif
		/* Record the target CPU's current input queue head. */
		rflow->last_qtail =
			per_cpu(softnet_data, next_cpu).input_queue_head;
	}

	rflow->cpu = next_cpu;
	return rflow;
}
3027
/*
 * get_rps_cpu is called from netif_receive_skb and returns the target
 * CPU from the RPS map of the receiving queue for a given skb.
 * rcu_read_lock must be held on entry.
 *
 * Returns -1 when no target CPU was selected.  *@rflowp is written only
 * when the CPU was chosen through the flow tables.
 */
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
		       struct rps_dev_flow **rflowp)
{
	struct netdev_rx_queue *rxqueue;
	struct rps_map *map;
	struct rps_dev_flow_table *flow_table;
	struct rps_sock_flow_table *sock_flow_table;
	int cpu = -1;
	u16 tcpu;
	u32 hash;

	/* Find the RX queue the packet arrived on; queue 0 if unknown. */
	if (skb_rx_queue_recorded(skb)) {
		u16 index = skb_get_rx_queue(skb);
		if (unlikely(index >= dev->real_num_rx_queues)) {
			WARN_ONCE(dev->real_num_rx_queues > 1,
				  "%s received packet on queue %u, but number "
				  "of RX queues is %u\n",
				  dev->name, index, dev->real_num_rx_queues);
			goto done;
		}
		rxqueue = dev->_rx + index;
	} else
		rxqueue = dev->_rx;

	map = rcu_dereference(rxqueue->rps_map);
	if (map) {
		/* Single-CPU map and no flow table: no hash is needed. */
		if (map->len == 1 &&
		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
			tcpu = map->cpus[0];
			if (cpu_online(tcpu))
				cpu = tcpu;
			goto done;
		}
	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
		/* Neither a map nor a flow table is configured. */
		goto done;
	}

	skb_reset_network_header(skb);
	hash = skb_get_hash(skb);
	/* A zero hash is treated as "no hash available". */
	if (!hash)
		goto done;

	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	sock_flow_table = rcu_dereference(rps_sock_flow_table);
	if (flow_table && sock_flow_table) {
		u16 next_cpu;
		struct rps_dev_flow *rflow;

		rflow = &flow_table->flows[hash & flow_table->mask];
		tcpu = rflow->cpu;

		next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];

		/*
		 * If the desired CPU (where last recvmsg was done) is
		 * different from current CPU (one in the rx-queue flow
		 * table entry), switch if one of the following holds:
		 *   - Current CPU is unset (equal to RPS_NO_CPU).
		 *   - Current CPU is offline.
		 *   - The current CPU's queue tail has advanced beyond the
		 *     last packet that was enqueued using this table entry.
		 *     This guarantees that all previous packets for the flow
		 *     have been dequeued, thus preserving in order delivery.
		 */
		if (unlikely(tcpu != next_cpu) &&
		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
		      rflow->last_qtail)) >= 0)) {
			tcpu = next_cpu;
			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
		}

		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
			*rflowp = rflow;
			cpu = tcpu;
			goto done;
		}
	}

	/* Fall back to the RPS map: scale the 32-bit hash into the range
	 * [0, map->len) to pick an entry.
	 */
	if (map) {
		tcpu = map->cpus[((u64) hash * map->len) >> 32];

		if (cpu_online(tcpu)) {
			cpu = tcpu;
			goto done;
		}
	}

done:
	return cpu;
}
3124
3125#ifdef CONFIG_RFS_ACCEL
3126
3127/**
3128 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3129 * @dev: Device on which the filter was set
3130 * @rxq_index: RX queue index
3131 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3132 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3133 *
3134 * Drivers that implement ndo_rx_flow_steer() should periodically call
3135 * this function for each installed filter and remove the filters for
3136 * which it returns %true.
3137 */
3138bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3139			 u32 flow_id, u16 filter_id)
3140{
3141	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3142	struct rps_dev_flow_table *flow_table;
3143	struct rps_dev_flow *rflow;
3144	bool expire = true;
3145	int cpu;
3146
3147	rcu_read_lock();
3148	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3149	if (flow_table && flow_id <= flow_table->mask) {
3150		rflow = &flow_table->flows[flow_id];
3151		cpu = ACCESS_ONCE(rflow->cpu);
3152		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3153		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3154			   rflow->last_qtail) <
3155		     (int)(10 * flow_table->mask)))
3156			expire = false;
3157	}
3158	rcu_read_unlock();
3159	return expire;
3160}
3161EXPORT_SYMBOL(rps_may_expire_flow);
3162
3163#endif /* CONFIG_RFS_ACCEL */
3164
/*
 * Called from hardirq (IPI) context.
 * @data is the softnet_data whose backlog NAPI instance is to be
 * scheduled; its received_rps counter is bumped as well.
 */
static void rps_trigger_softirq(void *data)
{
	struct softnet_data *sd = data;

	____napi_schedule(sd, &sd->backlog);
	sd->received_rps++;
}
3173
3174#endif /* CONFIG_RPS */
3175
/*
 * Check whether @sd belongs to another CPU.
 * If so, link it into this CPU's rps_ipi_list, raise NET_RX_SOFTIRQ and
 * return 1.  If it is this CPU's own structure, or RPS is not configured,
 * return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *mysd = &__get_cpu_var(softnet_data);

	if (sd == mysd)
		return 0;

	/* Push sd onto the head of our pending-IPI list. */
	sd->rps_ipi_next = mysd->rps_ipi_list;
	mysd->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
3196
#ifdef CONFIG_NET_FLOW_LIMIT
/* Flow limit table length; starts out at 4096 (1 << 12). */
int netdev_flow_limit_table_len __read_mostly = 1 << 12;
#endif /* CONFIG_NET_FLOW_LIMIT */
3200
3201static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3202{
3203#ifdef CONFIG_NET_FLOW_LIMIT
3204	struct sd_flow_limit *fl;
3205	struct softnet_data *sd;
3206	unsigned int old_flow, new_flow;
3207
3208	if (qlen < (netdev_max_backlog >> 1))
3209		return false;
3210
3211	sd = &__get_cpu_var(softnet_data);
3212
3213	rcu_read_lock();
3214	fl = rcu_dereference(sd->flow_limit);
3215	if (fl) {
3216		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3217		old_flow = fl->history[fl->history_head];
3218		fl->history[fl->history_head] = new_flow;
3219
3220		fl->history_head++;
3221		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3222
3223		if (likely(fl->buckets[old_flow]))
3224			fl->buckets[old_flow]--;
3225
3226		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3227			fl->count++;
3228			rcu_read_unlock();
3229			return true;
3230		}
3231	}
3232	rcu_read_unlock();
3233#endif
3234	return false;
3235}
3236
3237/*
3238 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3239 * queue (may be a remote CPU queue).
3240 */
3241static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3242			      unsigned int *qtail)
3243{
3244	struct softnet_data *sd;
3245	unsigned long flags;
3246	unsigned int qlen;
3247
3248	sd = &per_cpu(softnet_data, cpu);
3249
3250	local_irq_save(flags);
3251
3252	rps_lock(sd);
3253	qlen = skb_queue_len(&sd->input_pkt_queue);
3254	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3255		if (skb_queue_len(&sd->input_pkt_queue)) {
3256enqueue:
3257			__skb_queue_tail(&sd->input_pkt_queue, skb);
3258			input_queue_tail_incr_save(sd, qtail);
3259			rps_unlock(sd);
3260			local_irq_restore(flags);
3261			return NET_RX_SUCCESS;
3262		}
3263
3264		/* Schedule NAPI for backlog device
3265		 * We can use non atomic operation since we own the queue lock
3266		 */
3267		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3268			if (!rps_ipi_queued(sd))
3269				____napi_schedule(sd, &sd->backlog);
3270		}
3271		goto enqueue;
3272	}
3273
3274	sd->dropped++;
3275	rps_unlock(sd);
3276
3277	local_irq_restore(flags);
3278
3279	atomic_long_inc(&skb->dev->rx_dropped);
3280	kfree_skb(skb);
3281	return NET_RX_DROP;
3282}
3283
3284static int netif_rx_internal(struct sk_buff *skb)
3285{
3286	int ret;
3287
3288	net_timestamp_check(netdev_tstamp_prequeue, skb);
3289
3290	trace_netif_rx(skb);
3291#ifdef CONFIG_RPS
3292	if (static_key_false(&rps_needed)) {
3293		struct rps_dev_flow voidflow, *rflow = &voidflow;
3294		int cpu;
3295
3296		preempt_disable();
3297		rcu_read_lock();
3298
3299		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3300		if (cpu < 0)
3301			cpu = smp_processor_id();
3302
3303		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3304
3305		rcu_read_unlock();
3306		preempt_enable();
3307	} else
3308#endif
3309	{
3310		unsigned int qtail;
3311		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3312		put_cpu();
3313	}
3314	return ret;
3315}
3316
/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	Takes a packet handed over by a device driver and queues it for
 *	processing by the upper (protocol) layers.  The call itself always
 *	succeeds; the buffer may still be dropped for congestion control
 *	or later on by the protocol layers.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
	int ret;

	trace_netif_rx_entry(skb);
	ret = netif_rx_internal(skb);

	return ret;
}
EXPORT_SYMBOL(netif_rx);
3339
/*
 * Variant of netif_rx() that, with preemption disabled, queues the skb
 * and then runs do_softirq() itself if a softirq is pending on this CPU.
 * Returns the result of netif_rx_internal().
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();

	ret = netif_rx_internal(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
3355
3356static void net_tx_action(struct softirq_action *h)
3357{
3358	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3359
3360	if (sd->completion_queue) {
3361		struct sk_buff *clist;
3362
3363		local_irq_disable();
3364		clist = sd->completion_queue;
3365		sd->completion_queue = NULL;
3366		local_irq_enable();
3367
3368		while (clist) {
3369			struct sk_buff *skb = clist;
3370			clist = clist->next;
3371
3372			WARN_ON(atomic_read(&skb->users));
3373			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3374				trace_consume_skb(skb);
3375			else
3376				trace_kfree_skb(skb, net_tx_action);
3377			__kfree_skb(skb);
3378		}
3379	}
3380
3381	if (sd->output_queue) {
3382		struct Qdisc *head;
3383
3384		local_irq_disable();
3385		head = sd->output_queue;
3386		sd->output_queue = NULL;
3387		sd->output_queue_tailp = &sd->output_queue;
3388		local_irq_enable();
3389
3390		while (head) {
3391			struct Qdisc *q = head;
3392			spinlock_t *root_lock;
3393
3394			head = head->next_sched;
3395
3396			root_lock = qdisc_lock(q);
3397			if (spin_trylock(root_lock)) {
3398				smp_mb__before_atomic();
3399				clear_bit(__QDISC_STATE_SCHED,
3400					  &q->state);
3401				qdisc_run(q);
3402				spin_unlock(root_lock);
3403			} else {
3404				if (!test_bit(__QDISC_STATE_DEACTIVATED,
3405					      &q->state)) {
3406					__netif_reschedule(q);
3407				} else {
3408					smp_mb__before_atomic();
3409					clear_bit(__QDISC_STATE_SCHED,
3410						  &q->state);
3411				}
3412			}
3413		}
3414	}
3415}
3416
/*
 * Hook pointer defined here for ATM LANE.  It is only built when both
 * the bridge and ATM LANE are enabled, either built-in or as modules.
 */
#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif /* CONFIG_ATM_LANE || CONFIG_ATM_LANE_MODULE */
#endif /* CONFIG_BRIDGE || CONFIG_BRIDGE_MODULE */
3424
3425#ifdef CONFIG_NET_CLS_ACT
3426/* TODO: Maybe we should just force sch_ingress to be compiled in
3427 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3428 * a compare and 2 stores extra right now if we dont have it on
3429 * but have CONFIG_NET_CLS_ACT
3430 * NOTE: This doesn't stop any functionality; if you dont have
3431 * the ingress scheduler, you just can't add policies on ingress.
3432 *
3433 */
3434static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3435{
3436	struct net_device *dev = skb->dev;
3437	u32 ttl = G_TC_RTTL(skb->tc_verd);
3438	int result = TC_ACT_OK;
3439	struct Qdisc *q;
3440
3441	if (unlikely(MAX_RED_LOOP < ttl++)) {
3442		net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3443				     skb->skb_iif, dev->ifindex);
3444		return TC_ACT_SHOT;
3445	}
3446
3447	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3448	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3449
3450	q = rxq->qdisc;
3451	if (q != &noop_qdisc) {
3452		spin_lock(qdisc_lock(q));
3453		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3454			result = qdisc_enqueue_root(skb, q);
3455		spin_unlock(qdisc_lock(q));
3456	}
3457
3458	return result;
3459}
3460
3461static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3462					 struct packet_type **pt_prev,
3463					 int *ret, struct net_device *orig_dev)
3464{
3465	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3466
3467	if (!rxq || rxq->qdisc == &noop_qdisc)
3468		goto out;
3469
3470	if (*pt_prev) {
3471		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3472		*pt_prev = NULL;
3473	}
3474
3475	switch (ing_filter(skb, rxq)) {
3476	case TC_ACT_SHOT:
3477	case TC_ACT_STOLEN:
3478		kfree_skb(skb);
3479		return NULL;
3480	}
3481
3482out:
3483	skb->tc_verd = 0;
3484	return skb;
3485}
3486#endif
3487
3488/**
3489 *	netdev_rx_handler_register - register receive handler
3490 *	@dev: device to register a handler for
3491 *	@rx_handler: receive handler to register
3492 *	@rx_handler_data: data pointer that is used by rx handler
3493 *
3494 *	Register a receive handler for a device. This handler will then be
3495 *	called from __netif_receive_skb. A negative errno code is returned
3496 *	on a failure.
3497 *
3498 *	The caller must hold the rtnl_mutex.
3499 *
3500 *	For a general description of rx_handler, see enum rx_handler_result.
3501 */
3502int netdev_rx_handler_register(struct net_device *dev,
3503			       rx_handler_func_t *rx_handler,
3504			       void *rx_handler_data)
3505{
3506	ASSERT_RTNL();
3507
3508	if (dev->rx_handler)
3509		return -EBUSY;
3510
3511	/* Note: rx_handler_data must be set before rx_handler */
3512	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3513	rcu_assign_pointer(dev->rx_handler, rx_handler);
3514
3515	return 0;
3516}
3517EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3518
3519/**
3520 *	netdev_rx_handler_unregister - unregister receive handler
3521 *	@dev: device to unregister a handler from
3522 *
3523 *	Unregister a receive handler from a device.
3524 *
3525 *	The caller must hold the rtnl_mutex.
3526 */
3527void netdev_rx_handler_unregister(struct net_device *dev)
3528{
3529
3530	ASSERT_RTNL();
3531	RCU_INIT_POINTER(dev->rx_handler, NULL);
3532	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3533	 * section has a guarantee to see a non NULL rx_handler_data
3534	 * as well.
3535	 */
3536	synchronize_net();
3537	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3538}
3539EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3540
3541/*
3542 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3543 * the special handling of PFMEMALLOC skbs.
3544 */
3545static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3546{
3547	switch (skb->protocol) {
3548	case htons(ETH_P_ARP):
3549	case htons(ETH_P_IP):
3550	case htons(ETH_P_IPV6):
3551	case htons(ETH_P_8021Q):
3552	case htons(ETH_P_8021AD):
3553		return true;
3554	default:
3555		return false;
3556	}
3557}
3558
3559static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3560{
3561	struct packet_type *ptype, *pt_prev;
3562	rx_handler_func_t *rx_handler;
3563	struct net_device *orig_dev;
3564	struct net_device *null_or_dev;
3565	bool deliver_exact = false;
3566	int ret = NET_RX_DROP;
3567	__be16 type;
3568
3569	net_timestamp_check(!netdev_tstamp_prequeue, skb);
3570
3571	trace_netif_receive_skb(skb);
3572
3573	orig_dev = skb->dev;
3574
3575	skb_reset_network_header(skb);
3576	if (!skb_transport_header_was_set(skb))
3577		skb_reset_transport_header(skb);
3578	skb_reset_mac_len(skb);
3579
3580	pt_prev = NULL;
3581
3582	rcu_read_lock();
3583
3584another_round:
3585	skb->skb_iif = skb->dev->ifindex;
3586
3587	__this_cpu_inc(softnet_data.processed);
3588
3589	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3590	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3591		skb = vlan_untag(skb);
3592		if (unlikely(!skb))
3593			goto unlock;
3594	}
3595
3596#ifdef CONFIG_NET_CLS_ACT
3597	if (skb->tc_verd & TC_NCLS) {
3598		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3599		goto ncls;
3600	}
3601#endif
3602
3603	if (pfmemalloc)
3604		goto skip_taps;
3605
3606	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3607		if (!ptype->dev || ptype->dev == skb->dev) {
3608			if (pt_prev)
3609				ret = deliver_skb(skb, pt_prev, orig_dev);
3610			pt_prev = ptype;
3611		}
3612	}
3613
3614skip_taps:
3615#ifdef CONFIG_NET_CLS_ACT
3616	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3617	if (!skb)
3618		goto unlock;
3619ncls:
3620#endif
3621
3622	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3623		goto drop;
3624
3625	if (vlan_tx_tag_present(skb)) {
3626		if (pt_prev) {
3627			ret = deliver_skb(skb, pt_prev, orig_dev);
3628			pt_prev = NULL;
3629		}
3630		if (vlan_do_receive(&skb))
3631			goto another_round;
3632		else if (unlikely(!skb))
3633			goto unlock;
3634	}
3635
3636	rx_handler = rcu_dereference(skb->dev->rx_handler);
3637	if (rx_handler) {
3638		if (pt_prev) {
3639			ret = deliver_skb(skb, pt_prev, orig_dev);
3640			pt_prev = NULL;
3641		}
3642		switch (rx_handler(&skb)) {
3643		case RX_HANDLER_CONSUMED:
3644			ret = NET_RX_SUCCESS;
3645			goto unlock;
3646		case RX_HANDLER_ANOTHER:
3647			goto another_round;
3648		case RX_HANDLER_EXACT:
3649			deliver_exact = true;
3650		case RX_HANDLER_PASS:
3651			break;
3652		default:
3653			BUG();
3654		}
3655	}
3656
3657	if (unlikely(vlan_tx_tag_present(skb))) {
3658		if (vlan_tx_tag_get_id(skb))
3659			skb->pkt_type = PACKET_OTHERHOST;
3660		/* Note: we might in the future use prio bits
3661		 * and set skb->priority like in vlan_do_receive()
3662		 * For the time being, just ignore Priority Code Point
3663		 */
3664		skb->vlan_tci = 0;
3665	}
3666
3667	/* deliver only exact match when indicated */
3668	null_or_dev = deliver_exact ? skb->dev : NULL;
3669
3670	type = skb->protocol;
3671	list_for_each_entry_rcu(ptype,
3672			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3673		if (ptype->type == type &&
3674		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3675		     ptype->dev == orig_dev)) {
3676			if (pt_prev)
3677				ret = deliver_skb(skb, pt_prev, orig_dev);
3678			pt_prev = ptype;
3679		}
3680	}
3681
3682	if (pt_prev) {
3683		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3684			goto drop;
3685		else
3686			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3687	} else {
3688drop:
3689		atomic_long_inc(&skb->dev->rx_dropped);
3690		kfree_skb(skb);
3691		/* Jamal, now you will not able to escape explaining
3692		 * me how you were going to use this. :-)
3693		 */
3694		ret = NET_RX_DROP;
3695	}
3696
3697unlock:
3698	rcu_read_unlock();
3699	return ret;
3700}
3701
3702static int __netif_receive_skb(struct sk_buff *skb)
3703{
3704	int ret;
3705
3706	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3707		unsigned long pflags = current->flags;
3708
3709		/*
3710		 * PFMEMALLOC skbs are special, they should
3711		 * - be delivered to SOCK_MEMALLOC sockets only
3712		 * - stay away from userspace
3713		 * - have bounded memory usage
3714		 *
3715		 * Use PF_MEMALLOC as this saves us from propagating the allocation
3716		 * context down to all allocation sites.
3717		 */
3718		current->flags |= PF_MEMALLOC;
3719		ret = __netif_receive_skb_core(skb, true);
3720		tsk_restore_flags(current, pflags, PF_MEMALLOC);
3721	} else
3722		ret = __netif_receive_skb_core(skb, false);
3723
3724	return ret;
3725}
3726
3727static int netif_receive_skb_internal(struct sk_buff *skb)
3728{
3729	net_timestamp_check(netdev_tstamp_prequeue, skb);
3730
3731	if (skb_defer_rx_timestamp(skb))
3732		return NET_RX_SUCCESS;
3733
3734#ifdef CONFIG_RPS
3735	if (static_key_false(&rps_needed)) {
3736		struct rps_dev_flow voidflow, *rflow = &voidflow;
3737		int cpu, ret;
3738
3739		rcu_read_lock();
3740
3741		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3742
3743		if (cpu >= 0) {
3744			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3745			rcu_read_unlock();
3746			return ret;
3747		}
3748		rcu_read_unlock();
3749	}
3750#endif
3751	return __netif_receive_skb(skb);
3752}
3753
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	Main entry point for processing received data.  The call always
 *	succeeds; the buffer may nevertheless be dropped during processing,
 *	either for congestion control or by the protocol layers.
 *
 *	Must only be called from softirq context, with interrupts enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	int ret;

	trace_netif_receive_skb_entry(skb);
	ret = netif_receive_skb_internal(skb);

	return ret;
}
EXPORT_SYMBOL(netif_receive_skb);
3776
3777/* Network device is going away, flush any packets still pending
3778 * Called with irqs disabled.
3779 */
3780static void flush_backlog(void *arg)
3781{
3782	struct net_device *dev = arg;
3783	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3784	struct sk_buff *skb, *tmp;
3785
3786	rps_lock(sd);
3787	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3788		if (skb->dev == dev) {
3789			__skb_unlink(skb, &sd->input_pkt_queue);
3790			kfree_skb(skb);
3791			input_queue_head_incr(sd);
3792		}
3793	}
3794	rps_unlock(sd);
3795
3796	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3797		if (skb->dev == dev) {
3798			__skb_unlink(skb, &sd->process_queue);
3799			kfree_skb(skb);
3800			input_queue_head_incr(sd);
3801		}
3802	}
3803}
3804
3805static int napi_gro_complete(struct sk_buff *skb)
3806{
3807	struct packet_offload *ptype;
3808	__be16 type = skb->protocol;
3809	struct list_head *head = &offload_base;
3810	int err = -ENOENT;
3811
3812	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3813
3814	if (NAPI_GRO_CB(skb)->count == 1) {
3815		skb_shinfo(skb)->gso_size = 0;
3816		goto out;
3817	}
3818
3819	rcu_read_lock();
3820	list_for_each_entry_rcu(ptype, head, list) {
3821		if (ptype->type != type || !ptype->callbacks.gro_complete)
3822			continue;
3823
3824		err = ptype->callbacks.gro_complete(skb, 0);
3825		break;
3826	}
3827	rcu_read_unlock();
3828
3829	if (err) {
3830		WARN_ON(&ptype->list == head);
3831		kfree_skb(skb);
3832		return NET_RX_SUCCESS;
3833	}
3834
3835out:
3836	return netif_receive_skb_internal(skb);
3837}
3838
3839/* napi->gro_list contains packets ordered by age.
3840 * youngest packets at the head of it.
3841 * Complete skbs in reverse order to reduce latencies.
3842 */
3843void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3844{
3845	struct sk_buff *skb, *prev = NULL;
3846
3847	/* scan list and build reverse chain */
3848	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3849		skb->prev = prev;
3850		prev = skb;
3851	}
3852
3853	for (skb = prev; skb; skb = prev) {
3854		skb->next = NULL;
3855
3856		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3857			return;
3858
3859		prev = skb->prev;
3860		napi_gro_complete(skb);
3861		napi->gro_count--;
3862	}
3863
3864	napi->gro_list = NULL;
3865}
3866EXPORT_SYMBOL(napi_gro_flush);
3867
3868static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3869{
3870	struct sk_buff *p;
3871	unsigned int maclen = skb->dev->hard_header_len;
3872	u32 hash = skb_get_hash_raw(skb);
3873
3874	for (p = napi->gro_list; p; p = p->next) {
3875		unsigned long diffs;
3876
3877		NAPI_GRO_CB(p)->flush = 0;
3878
3879		if (hash != skb_get_hash_raw(p)) {
3880			NAPI_GRO_CB(p)->same_flow = 0;
3881			continue;
3882		}
3883
3884		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3885		diffs |= p->vlan_tci ^ skb->vlan_tci;
3886		if (maclen == ETH_HLEN)
3887			diffs |= compare_ether_header(skb_mac_header(p),
3888						      skb_mac_header(skb));
3889		else if (!diffs)
3890			diffs = memcmp(skb_mac_header(p),
3891				       skb_mac_header(skb),
3892				       maclen);
3893		NAPI_GRO_CB(p)->same_flow = !diffs;
3894	}
3895}
3896
3897static void skb_gro_reset_offset(struct sk_buff *skb)
3898{
3899	const struct skb_shared_info *pinfo = skb_shinfo(skb);
3900	const skb_frag_t *frag0 = &pinfo->frags[0];
3901
3902	NAPI_GRO_CB(skb)->data_offset = 0;
3903	NAPI_GRO_CB(skb)->frag0 = NULL;
3904	NAPI_GRO_CB(skb)->frag0_len = 0;
3905
3906	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3907	    pinfo->nr_frags &&
3908	    !PageHighMem(skb_frag_page(frag0))) {
3909		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3910		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3911	}
3912}
3913
3914static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3915{
3916	struct skb_shared_info *pinfo = skb_shinfo(skb);
3917
3918	BUG_ON(skb->end - skb->tail < grow);
3919
3920	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3921
3922	skb->data_len -= grow;
3923	skb->tail += grow;
3924
3925	pinfo->frags[0].page_offset += grow;
3926	skb_frag_size_sub(&pinfo->frags[0], grow);
3927
3928	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
3929		skb_frag_unref(skb, 0);
3930		memmove(pinfo->frags, pinfo->frags + 1,
3931			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
3932	}
3933}
3934
3935static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3936{
3937	struct sk_buff **pp = NULL;
3938	struct packet_offload *ptype;
3939	__be16 type = skb->protocol;
3940	struct list_head *head = &offload_base;
3941	int same_flow;
3942	enum gro_result ret;
3943	int grow;
3944
3945	if (!(skb->dev->features & NETIF_F_GRO))
3946		goto normal;
3947
3948	if (skb_is_gso(skb) || skb_has_frag_list(skb))
3949		goto normal;
3950
3951	gro_list_prepare(napi, skb);
3952	NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */
3953
3954	rcu_read_lock();
3955	list_for_each_entry_rcu(ptype, head, list) {
3956		if (ptype->type != type || !ptype->callbacks.gro_receive)
3957			continue;
3958
3959		skb_set_network_header(skb, skb_gro_offset(skb));
3960		skb_reset_mac_len(skb);
3961		NAPI_GRO_CB(skb)->same_flow = 0;
3962		NAPI_GRO_CB(skb)->flush = 0;
3963		NAPI_GRO_CB(skb)->free = 0;
3964		NAPI_GRO_CB(skb)->udp_mark = 0;
3965
3966		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3967		break;
3968	}
3969	rcu_read_unlock();
3970
3971	if (&ptype->list == head)
3972		goto normal;
3973
3974	same_flow = NAPI_GRO_CB(skb)->same_flow;
3975	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3976
3977	if (pp) {
3978		struct sk_buff *nskb = *pp;
3979
3980		*pp = nskb->next;
3981		nskb->next = NULL;
3982		napi_gro_complete(nskb);
3983		napi->gro_count--;
3984	}
3985
3986	if (same_flow)
3987		goto ok;
3988
3989	if (NAPI_GRO_CB(skb)->flush)
3990		goto normal;
3991
3992	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
3993		struct sk_buff *nskb = napi->gro_list;
3994
3995		/* locate the end of the list to select the 'oldest' flow */
3996		while (nskb->next) {
3997			pp = &nskb->next;
3998			nskb = *pp;
3999		}
4000		*pp = NULL;
4001		nskb->next = NULL;
4002		napi_gro_complete(nskb);
4003	} else {
4004		napi->gro_count++;
4005	}
4006	NAPI_GRO_CB(skb)->count = 1;
4007	NAPI_GRO_CB(skb)->age = jiffies;
4008	NAPI_GRO_CB(skb)->last = skb;
4009	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4010	skb->next = napi->gro_list;
4011	napi->gro_list = skb;
4012	ret = GRO_HELD;
4013
4014pull:
4015	grow = skb_gro_offset(skb) - skb_headlen(skb);
4016	if (grow > 0)
4017		gro_pull_from_frag0(skb, grow);
4018ok:
4019	return ret;
4020
4021normal:
4022	ret = GRO_NORMAL;
4023	goto pull;
4024}
4025
4026struct packet_offload *gro_find_receive_by_type(__be16 type)
4027{
4028	struct list_head *offload_head = &offload_base;
4029	struct packet_offload *ptype;
4030
4031	list_for_each_entry_rcu(ptype, offload_head, list) {
4032		if (ptype->type != type || !ptype->callbacks.gro_receive)
4033			continue;
4034		return ptype;
4035	}
4036	return NULL;
4037}
4038EXPORT_SYMBOL(gro_find_receive_by_type);
4039
4040struct packet_offload *gro_find_complete_by_type(__be16 type)
4041{
4042	struct list_head *offload_head = &offload_base;
4043	struct packet_offload *ptype;
4044
4045	list_for_each_entry_rcu(ptype, offload_head, list) {
4046		if (ptype->type != type || !ptype->callbacks.gro_complete)
4047			continue;
4048		return ptype;
4049	}
4050	return NULL;
4051}
4052EXPORT_SYMBOL(gro_find_complete_by_type);
4053
4054static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4055{
4056	switch (ret) {
4057	case GRO_NORMAL:
4058		if (netif_receive_skb_internal(skb))
4059			ret = GRO_DROP;
4060		break;
4061
4062	case GRO_DROP:
4063		kfree_skb(skb);
4064		break;
4065
4066	case GRO_MERGED_FREE:
4067		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4068			kmem_cache_free(skbuff_head_cache, skb);
4069		else
4070			__kfree_skb(skb);
4071		break;
4072
4073	case GRO_HELD:
4074	case GRO_MERGED:
4075		break;
4076	}
4077
4078	return ret;
4079}
4080
4081gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4082{
4083	trace_napi_gro_receive_entry(skb);
4084
4085	skb_gro_reset_offset(skb);
4086
4087	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4088}
4089EXPORT_SYMBOL(napi_gro_receive);
4090
4091static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4092{
4093	__skb_pull(skb, skb_headlen(skb));
4094	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
4095	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4096	skb->vlan_tci = 0;
4097	skb->dev = napi->dev;
4098	skb->skb_iif = 0;
4099	skb->encapsulation = 0;
4100	skb_shinfo(skb)->gso_type = 0;
4101	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4102
4103	napi->skb = skb;
4104}
4105
4106struct sk_buff *napi_get_frags(struct napi_struct *napi)
4107{
4108	struct sk_buff *skb = napi->skb;
4109
4110	if (!skb) {
4111		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
4112		napi->skb = skb;
4113	}
4114	return skb;
4115}
4116EXPORT_SYMBOL(napi_get_frags);
4117
4118static gro_result_t napi_frags_finish(struct napi_struct *napi,
4119				      struct sk_buff *skb,
4120				      gro_result_t ret)
4121{
4122	switch (ret) {
4123	case GRO_NORMAL:
4124	case GRO_HELD:
4125		__skb_push(skb, ETH_HLEN);
4126		skb->protocol = eth_type_trans(skb, skb->dev);
4127		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4128			ret = GRO_DROP;
4129		break;
4130
4131	case GRO_DROP:
4132	case GRO_MERGED_FREE:
4133		napi_reuse_skb(napi, skb);
4134		break;
4135
4136	case GRO_MERGED:
4137		break;
4138	}
4139
4140	return ret;
4141}
4142
4143/* Upper GRO stack assumes network header starts at gro_offset=0
4144 * Drivers could call both napi_gro_frags() and napi_gro_receive()
4145 * We copy ethernet header into skb->data to have a common layout.
4146 */
4147static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4148{
4149	struct sk_buff *skb = napi->skb;
4150	const struct ethhdr *eth;
4151	unsigned int hlen = sizeof(*eth);
4152
4153	napi->skb = NULL;
4154
4155	skb_reset_mac_header(skb);
4156	skb_gro_reset_offset(skb);
4157
4158	eth = skb_gro_header_fast(skb, 0);
4159	if (unlikely(skb_gro_header_hard(skb, hlen))) {
4160		eth = skb_gro_header_slow(skb, hlen, 0);
4161		if (unlikely(!eth)) {
4162			napi_reuse_skb(napi, skb);
4163			return NULL;
4164		}
4165	} else {
4166		gro_pull_from_frag0(skb, hlen);
4167		NAPI_GRO_CB(skb)->frag0 += hlen;
4168		NAPI_GRO_CB(skb)->frag0_len -= hlen;
4169	}
4170	__skb_pull(skb, hlen);
4171
4172	/*
4173	 * This works because the only protocols we care about don't require
4174	 * special handling.
4175	 * We'll fix it up properly in napi_frags_finish()
4176	 */
4177	skb->protocol = eth->h_proto;
4178
4179	return skb;
4180}
4181
4182gro_result_t napi_gro_frags(struct napi_struct *napi)
4183{
4184	struct sk_buff *skb = napi_frags_skb(napi);
4185
4186	if (!skb)
4187		return GRO_DROP;
4188
4189	trace_napi_gro_frags_entry(skb);
4190
4191	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4192}
4193EXPORT_SYMBOL(napi_gro_frags);
4194
/*
 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
 * Note: must be entered with local irqs disabled; always returns with
 * local irqs enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *pending = sd->rps_ipi_list;

	if (!pending) {
		local_irq_enable();
		return;
	}

	/* detach the whole list before interrupts come back on */
	sd->rps_ipi_list = NULL;
	local_irq_enable();

	/* Kick RPS processing on every remote CPU that is still online. */
	while (pending) {
		struct softnet_data *next = pending->rps_ipi_next;

		if (cpu_online(pending->cpu))
			smp_call_function_single_async(pending->cpu,
						       &pending->csd);
		pending = next;
	}
#else
	local_irq_enable();
#endif
}
4222
4223static int process_backlog(struct napi_struct *napi, int quota)
4224{
4225	int work = 0;
4226	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4227
4228#ifdef CONFIG_RPS
4229	/* Check if we have pending ipi, its better to send them now,
4230	 * not waiting net_rx_action() end.
4231	 */
4232	if (sd->rps_ipi_list) {
4233		local_irq_disable();
4234		net_rps_action_and_irq_enable(sd);
4235	}
4236#endif
4237	napi->weight = weight_p;
4238	local_irq_disable();
4239	while (1) {
4240		struct sk_buff *skb;
4241
4242		while ((skb = __skb_dequeue(&sd->process_queue))) {
4243			local_irq_enable();
4244			__netif_receive_skb(skb);
4245			local_irq_disable();
4246			input_queue_head_incr(sd);
4247			if (++work >= quota) {
4248				local_irq_enable();
4249				return work;
4250			}
4251		}
4252
4253		rps_lock(sd);
4254		if (skb_queue_empty(&sd->input_pkt_queue)) {
4255			/*
4256			 * Inline a custom version of __napi_complete().
4257			 * only current cpu owns and manipulates this napi,
4258			 * and NAPI_STATE_SCHED is the only possible flag set
4259			 * on backlog.
4260			 * We can use a plain write instead of clear_bit(),
4261			 * and we dont need an smp_mb() memory barrier.
4262			 */
4263			list_del(&napi->poll_list);
4264			napi->state = 0;
4265			rps_unlock(sd);
4266
4267			break;
4268		}
4269
4270		skb_queue_splice_tail_init(&sd->input_pkt_queue,
4271					   &sd->process_queue);
4272		rps_unlock(sd);
4273	}
4274	local_irq_enable();
4275
4276	return work;
4277}
4278
4279/**
4280 * __napi_schedule - schedule for receive
4281 * @n: entry to schedule
4282 *
4283 * The entry's receive function will be scheduled to run
4284 */
4285void __napi_schedule(struct napi_struct *n)
4286{
4287	unsigned long flags;
4288
4289	local_irq_save(flags);
4290	____napi_schedule(&__get_cpu_var(softnet_data), n);
4291	local_irq_restore(flags);
4292}
4293EXPORT_SYMBOL(__napi_schedule);
4294
4295void __napi_complete(struct napi_struct *n)
4296{
4297	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4298	BUG_ON(n->gro_list);
4299
4300	list_del(&n->poll_list);
4301	smp_mb__before_atomic();
4302	clear_bit(NAPI_STATE_SCHED, &n->state);
4303}
4304EXPORT_SYMBOL(__napi_complete);
4305
4306void napi_complete(struct napi_struct *n)
4307{
4308	unsigned long flags;
4309
4310	/*
4311	 * don't let napi dequeue from the cpu poll list
4312	 * just in case its running on a different cpu
4313	 */
4314	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4315		return;
4316
4317	napi_gro_flush(n, false);
4318	local_irq_save(flags);
4319	__napi_complete(n);
4320	local_irq_restore(flags);
4321}
4322EXPORT_SYMBOL(napi_complete);
4323
4324/* must be called under rcu_read_lock(), as we dont take a reference */
4325struct napi_struct *napi_by_id(unsigned int napi_id)
4326{
4327	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4328	struct napi_struct *napi;
4329
4330	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4331		if (napi->napi_id == napi_id)
4332			return napi;
4333
4334	return NULL;
4335}
4336EXPORT_SYMBOL_GPL(napi_by_id);
4337
4338void napi_hash_add(struct napi_struct *napi)
4339{
4340	if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4341
4342		spin_lock(&napi_hash_lock);
4343
4344		/* 0 is not a valid id, we also skip an id that is taken
4345		 * we expect both events to be extremely rare
4346		 */
4347		napi->napi_id = 0;
4348		while (!napi->napi_id) {
4349			napi->napi_id = ++napi_gen_id;
4350			if (napi_by_id(napi->napi_id))
4351				napi->napi_id = 0;
4352		}
4353
4354		hlist_add_head_rcu(&napi->napi_hash_node,
4355			&napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4356
4357		spin_unlock(&napi_hash_lock);
4358	}
4359}
4360EXPORT_SYMBOL_GPL(napi_hash_add);
4361
4362/* Warning : caller is responsible to make sure rcu grace period
4363 * is respected before freeing memory containing @napi
4364 */
4365void napi_hash_del(struct napi_struct *napi)
4366{
4367	spin_lock(&napi_hash_lock);
4368
4369	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4370		hlist_del_rcu(&napi->napi_hash_node);
4371
4372	spin_unlock(&napi_hash_lock);
4373}
4374EXPORT_SYMBOL_GPL(napi_hash_del);
4375
4376void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4377		    int (*poll)(struct napi_struct *, int), int weight)
4378{
4379	INIT_LIST_HEAD(&napi->poll_list);
4380	napi->gro_count = 0;
4381	napi->gro_list = NULL;
4382	napi->skb = NULL;
4383	napi->poll = poll;
4384	if (weight > NAPI_POLL_WEIGHT)
4385		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4386			    weight, dev->name);
4387	napi->weight = weight;
4388	list_add(&napi->dev_list, &dev->napi_list);
4389	napi->dev = dev;
4390#ifdef CONFIG_NETPOLL
4391	spin_lock_init(&napi->poll_lock);
4392	napi->poll_owner = -1;
4393#endif
4394	set_bit(NAPI_STATE_SCHED, &napi->state);
4395}
4396EXPORT_SYMBOL(netif_napi_add);
4397
4398void netif_napi_del(struct napi_struct *napi)
4399{
4400	list_del_init(&napi->dev_list);
4401	napi_free_frags(napi);
4402
4403	kfree_skb_list(napi->gro_list);
4404	napi->gro_list = NULL;
4405	napi->gro_count = 0;
4406}
4407EXPORT_SYMBOL(netif_napi_del);
4408
4409static void net_rx_action(struct softirq_action *h)
4410{
4411	struct softnet_data *sd = &__get_cpu_var(softnet_data);
4412	unsigned long time_limit = jiffies + 2;
4413	int budget = netdev_budget;
4414	void *have;
4415
4416	local_irq_disable();
4417
4418	while (!list_empty(&sd->poll_list)) {
4419		struct napi_struct *n;
4420		int work, weight;
4421
4422		/* If softirq window is exhuasted then punt.
4423		 * Allow this to run for 2 jiffies since which will allow
4424		 * an average latency of 1.5/HZ.
4425		 */
4426		if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4427			goto softnet_break;
4428
4429		local_irq_enable();
4430
4431		/* Even though interrupts have been re-enabled, this
4432		 * access is safe because interrupts can only add new
4433		 * entries to the tail of this list, and only ->poll()
4434		 * calls can remove this head entry from the list.
4435		 */
4436		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4437
4438		have = netpoll_poll_lock(n);
4439
4440		weight = n->weight;
4441
4442		/* This NAPI_STATE_SCHED test is for avoiding a race
4443		 * with netpoll's poll_napi().  Only the entity which
4444		 * obtains the lock and sees NAPI_STATE_SCHED set will
4445		 * actually make the ->poll() call.  Therefore we avoid
4446		 * accidentally calling ->poll() when NAPI is not scheduled.
4447		 */
4448		work = 0;
4449		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4450			work = n->poll(n, weight);
4451			trace_napi_poll(n);
4452		}
4453
4454		WARN_ON_ONCE(work > weight);
4455
4456		budget -= work;
4457
4458		local_irq_disable();
4459
4460		/* Drivers must not modify the NAPI state if they
4461		 * consume the entire weight.  In such cases this code
4462		 * still "owns" the NAPI instance and therefore can
4463		 * move the instance around on the list at-will.
4464		 */
4465		if (unlikely(work == weight)) {
4466			if (unlikely(napi_disable_pending(n))) {
4467				local_irq_enable();
4468				napi_complete(n);
4469				local_irq_disable();
4470			} else {
4471				if (n->gro_list) {
4472					/* flush too old packets
4473					 * If HZ < 1000, flush all packets.
4474					 */
4475					local_irq_enable();
4476					napi_gro_flush(n, HZ >= 1000);
4477					local_irq_disable();
4478				}
4479				list_move_tail(&n->poll_list, &sd->poll_list);
4480			}
4481		}
4482
4483		netpoll_poll_unlock(have);
4484	}
4485out:
4486	net_rps_action_and_irq_enable(sd);
4487
4488#ifdef CONFIG_NET_DMA
4489	/*
4490	 * There may not be any more sk_buffs coming right now, so push
4491	 * any pending DMA copies to hardware
4492	 */
4493	dma_issue_pending_all();
4494#endif
4495
4496	return;
4497
4498softnet_break:
4499	sd->time_squeeze++;
4500	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
4501	goto out;
4502}
4503
4504struct netdev_adjacent {
4505	struct net_device *dev;
4506
4507	/* upper master flag, there can only be one master device per list */
4508	bool master;
4509
4510	/* counter for the number of times this device was added to us */
4511	u16 ref_nr;
4512
4513	/* private field for the users */
4514	void *private;
4515
4516	struct list_head list;
4517	struct rcu_head rcu;
4518};
4519
4520static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4521						 struct net_device *adj_dev,
4522						 struct list_head *adj_list)
4523{
4524	struct netdev_adjacent *adj;
4525
4526	list_for_each_entry(adj, adj_list, list) {
4527		if (adj->dev == adj_dev)
4528			return adj;
4529	}
4530	return NULL;
4531}
4532
4533/**
4534 * netdev_has_upper_dev - Check if device is linked to an upper device
4535 * @dev: device
4536 * @upper_dev: upper device to check
4537 *
4538 * Find out if a device is linked to specified upper device and return true
4539 * in case it is. Note that this checks only immediate upper device,
4540 * not through a complete stack of devices. The caller must hold the RTNL lock.
4541 */
4542bool netdev_has_upper_dev(struct net_device *dev,
4543			  struct net_device *upper_dev)
4544{
4545	ASSERT_RTNL();
4546
4547	return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4548}
4549EXPORT_SYMBOL(netdev_has_upper_dev);
4550
4551/**
4552 * netdev_has_any_upper_dev - Check if device is linked to some device
4553 * @dev: device
4554 *
4555 * Find out if a device is linked to an upper device and return true in case
4556 * it is. The caller must hold the RTNL lock.
4557 */
4558static bool netdev_has_any_upper_dev(struct net_device *dev)
4559{
4560	ASSERT_RTNL();
4561
4562	return !list_empty(&dev->all_adj_list.upper);
4563}
4564
4565/**
4566 * netdev_master_upper_dev_get - Get master upper device
4567 * @dev: device
4568 *
4569 * Find a master upper device and return pointer to it or NULL in case
4570 * it's not there. The caller must hold the RTNL lock.
4571 */
4572struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4573{
4574	struct netdev_adjacent *upper;
4575
4576	ASSERT_RTNL();
4577
4578	if (list_empty(&dev->adj_list.upper))
4579		return NULL;
4580
4581	upper = list_first_entry(&dev->adj_list.upper,
4582				 struct netdev_adjacent, list);
4583	if (likely(upper->master))
4584		return upper->dev;
4585	return NULL;
4586}
4587EXPORT_SYMBOL(netdev_master_upper_dev_get);
4588
4589void *netdev_adjacent_get_private(struct list_head *adj_list)
4590{
4591	struct netdev_adjacent *adj;
4592
4593	adj = list_entry(adj_list, struct netdev_adjacent, list);
4594
4595	return adj->private;
4596}
4597EXPORT_SYMBOL(netdev_adjacent_get_private);
4598
4599/**
4600 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4601 * @dev: device
4602 * @iter: list_head ** of the current position
4603 *
4604 * Gets the next device from the dev's upper list, starting from iter
4605 * position. The caller must hold RCU read lock.
4606 */
4607struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4608						 struct list_head **iter)
4609{
4610	struct netdev_adjacent *upper;
4611
4612	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4613
4614	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4615
4616	if (&upper->list == &dev->adj_list.upper)
4617		return NULL;
4618
4619	*iter = &upper->list;
4620
4621	return upper->dev;
4622}
4623EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4624
4625/**
4626 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4627 * @dev: device
4628 * @iter: list_head ** of the current position
4629 *
4630 * Gets the next device from the dev's upper list, starting from iter
4631 * position. The caller must hold RCU read lock.
4632 */
4633struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4634						     struct list_head **iter)
4635{
4636	struct netdev_adjacent *upper;
4637
4638	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4639
4640	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4641
4642	if (&upper->list == &dev->all_adj_list.upper)
4643		return NULL;
4644
4645	*iter = &upper->list;
4646
4647	return upper->dev;
4648}
4649EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4650
4651/**
4652 * netdev_lower_get_next_private - Get the next ->private from the
4653 *				   lower neighbour list
4654 * @dev: device
4655 * @iter: list_head ** of the current position
4656 *
4657 * Gets the next netdev_adjacent->private from the dev's lower neighbour
4658 * list, starting from iter position. The caller must hold either hold the
4659 * RTNL lock or its own locking that guarantees that the neighbour lower
4660 * list will remain unchainged.
4661 */
4662void *netdev_lower_get_next_private(struct net_device *dev,
4663				    struct list_head **iter)
4664{
4665	struct netdev_adjacent *lower;
4666
4667	lower = list_entry(*iter, struct netdev_adjacent, list);
4668
4669	if (&lower->list == &dev->adj_list.lower)
4670		return NULL;
4671
4672	*iter = lower->list.next;
4673
4674	return lower->private;
4675}
4676EXPORT_SYMBOL(netdev_lower_get_next_private);
4677
4678/**
4679 * netdev_lower_get_next_private_rcu - Get the next ->private from the
4680 *				       lower neighbour list, RCU
4681 *				       variant
4682 * @dev: device
4683 * @iter: list_head ** of the current position
4684 *
4685 * Gets the next netdev_adjacent->private from the dev's lower neighbour
4686 * list, starting from iter position. The caller must hold RCU read lock.
4687 */
4688void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4689					struct list_head **iter)
4690{
4691	struct netdev_adjacent *lower;
4692
4693	WARN_ON_ONCE(!rcu_read_lock_held());
4694
4695	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4696
4697	if (&lower->list == &dev->adj_list.lower)
4698		return NULL;
4699
4700	*iter = &lower->list;
4701
4702	return lower->private;
4703}
4704EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4705
4706/**
4707 * netdev_lower_get_next - Get the next device from the lower neighbour
4708 *                         list
4709 * @dev: device
4710 * @iter: list_head ** of the current position
4711 *
4712 * Gets the next netdev_adjacent from the dev's lower neighbour
4713 * list, starting from iter position. The caller must hold RTNL lock or
4714 * its own locking that guarantees that the neighbour lower
4715 * list will remain unchainged.
4716 */
4717void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4718{
4719	struct netdev_adjacent *lower;
4720
4721	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4722
4723	if (&lower->list == &dev->adj_list.lower)
4724		return NULL;
4725
4726	*iter = &lower->list;
4727
4728	return lower->dev;
4729}
4730EXPORT_SYMBOL(netdev_lower_get_next);
4731
4732/**
4733 * netdev_lower_get_first_private_rcu - Get the first ->private from the
4734 *				       lower neighbour list, RCU
4735 *				       variant
4736 * @dev: device
4737 *
4738 * Gets the first netdev_adjacent->private from the dev's lower neighbour
4739 * list. The caller must hold RCU read lock.
4740 */
4741void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4742{
4743	struct netdev_adjacent *lower;
4744
4745	lower = list_first_or_null_rcu(&dev->adj_list.lower,
4746			struct netdev_adjacent, list);
4747	if (lower)
4748		return lower->private;
4749	return NULL;
4750}
4751EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4752
4753/**
4754 * netdev_master_upper_dev_get_rcu - Get master upper device
4755 * @dev: device
4756 *
4757 * Find a master upper device and return pointer to it or NULL in case
4758 * it's not there. The caller must hold the RCU read lock.
4759 */
4760struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4761{
4762	struct netdev_adjacent *upper;
4763
4764	upper = list_first_or_null_rcu(&dev->adj_list.upper,
4765				       struct netdev_adjacent, list);
4766	if (upper && likely(upper->master))
4767		return upper->dev;
4768	return NULL;
4769}
4770EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4771
4772static int netdev_adjacent_sysfs_add(struct net_device *dev,
4773			      struct net_device *adj_dev,
4774			      struct list_head *dev_list)
4775{
4776	char linkname[IFNAMSIZ+7];
4777	sprintf(linkname, dev_list == &dev->adj_list.upper ?
4778		"upper_%s" : "lower_%s", adj_dev->name);
4779	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4780				 linkname);
4781}
4782static void netdev_adjacent_sysfs_del(struct net_device *dev,
4783			       char *name,
4784			       struct list_head *dev_list)
4785{
4786	char linkname[IFNAMSIZ+7];
4787	sprintf(linkname, dev_list == &dev->adj_list.upper ?
4788		"upper_%s" : "lower_%s", name);
4789	sysfs_remove_link(&(dev->dev.kobj), linkname);
4790}
4791
/*
 * True when @dev_list is one of @dev's neighbour lists (adj_list.upper or
 * adj_list.lower).  The arguments are parenthesised so that any expression
 * can be passed; note that @dev_list is evaluated up to twice.
 */
#define netdev_adjacent_is_neigh_list(dev, dev_list) \
		((dev_list) == &(dev)->adj_list.upper || \
		 (dev_list) == &(dev)->adj_list.lower)
4795
4796static int __netdev_adjacent_dev_insert(struct net_device *dev,
4797					struct net_device *adj_dev,
4798					struct list_head *dev_list,
4799					void *private, bool master)
4800{
4801	struct netdev_adjacent *adj;
4802	int ret;
4803
4804	adj = __netdev_find_adj(dev, adj_dev, dev_list);
4805
4806	if (adj) {
4807		adj->ref_nr++;
4808		return 0;
4809	}
4810
4811	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4812	if (!adj)
4813		return -ENOMEM;
4814
4815	adj->dev = adj_dev;
4816	adj->master = master;
4817	adj->ref_nr = 1;
4818	adj->private = private;
4819	dev_hold(adj_dev);
4820
4821	pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4822		 adj_dev->name, dev->name, adj_dev->name);
4823
4824	if (netdev_adjacent_is_neigh_list(dev, dev_list)) {
4825		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
4826		if (ret)
4827			goto free_adj;
4828	}
4829
4830	/* Ensure that master link is always the first item in list. */
4831	if (master) {
4832		ret = sysfs_create_link(&(dev->dev.kobj),
4833					&(adj_dev->dev.kobj), "master");
4834		if (ret)
4835			goto remove_symlinks;
4836
4837		list_add_rcu(&adj->list, dev_list);
4838	} else {
4839		list_add_tail_rcu(&adj->list, dev_list);
4840	}
4841
4842	return 0;
4843
4844remove_symlinks:
4845	if (netdev_adjacent_is_neigh_list(dev, dev_list))
4846		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4847free_adj:
4848	kfree(adj);
4849	dev_put(adj_dev);
4850
4851	return ret;
4852}
4853
4854static void __netdev_adjacent_dev_remove(struct net_device *dev,
4855					 struct net_device *adj_dev,
4856					 struct list_head *dev_list)
4857{
4858	struct netdev_adjacent *adj;
4859
4860	adj = __netdev_find_adj(dev, adj_dev, dev_list);
4861
4862	if (!adj) {
4863		pr_err("tried to remove device %s from %s\n",
4864		       dev->name, adj_dev->name);
4865		BUG();
4866	}
4867
4868	if (adj->ref_nr > 1) {
4869		pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
4870			 adj->ref_nr-1);
4871		adj->ref_nr--;
4872		return;
4873	}
4874
4875	if (adj->master)
4876		sysfs_remove_link(&(dev->dev.kobj), "master");
4877
4878	if (netdev_adjacent_is_neigh_list(dev, dev_list))
4879		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4880
4881	list_del_rcu(&adj->list);
4882	pr_debug("dev_put for %s, because link removed from %s to %s\n",
4883		 adj_dev->name, dev->name, adj_dev->name);
4884	dev_put(adj_dev);
4885	kfree_rcu(adj, rcu);
4886}
4887
4888static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
4889					    struct net_device *upper_dev,
4890					    struct list_head *up_list,
4891					    struct list_head *down_list,
4892					    void *private, bool master)
4893{
4894	int ret;
4895
4896	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
4897					   master);
4898	if (ret)
4899		return ret;
4900
4901	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
4902					   false);
4903	if (ret) {
4904		__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
4905		return ret;
4906	}
4907
4908	return 0;
4909}
4910
4911static int __netdev_adjacent_dev_link(struct net_device *dev,
4912				      struct net_device *upper_dev)
4913{
4914	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
4915						&dev->all_adj_list.upper,
4916						&upper_dev->all_adj_list.lower,
4917						NULL, false);
4918}
4919
/*
 * Undo a two-way link: remove @upper_dev from @up_list first, then @dev
 * from @down_list.
 */
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
					       struct net_device *upper_dev,
					       struct list_head *up_list,
					       struct list_head *down_list)
{
	/* upward direction: dev -> upper_dev */
	__netdev_adjacent_dev_remove(dev, upper_dev, up_list);

	/* downward direction: upper_dev -> dev */
	__netdev_adjacent_dev_remove(upper_dev, dev, down_list);
}
4928
4929static void __netdev_adjacent_dev_unlink(struct net_device *dev,
4930					 struct net_device *upper_dev)
4931{
4932	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4933					   &dev->all_adj_list.upper,
4934					   &upper_dev->all_adj_list.lower);
4935}
4936
4937static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
4938						struct net_device *upper_dev,
4939						void *private, bool master)
4940{
4941	int ret = __netdev_adjacent_dev_link(dev, upper_dev);
4942
4943	if (ret)
4944		return ret;
4945
4946	ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
4947					       &dev->adj_list.upper,
4948					       &upper_dev->adj_list.lower,
4949					       private, master);
4950	if (ret) {
4951		__netdev_adjacent_dev_unlink(dev, upper_dev);
4952		return ret;
4953	}
4954
4955	return 0;
4956}
4957
4958static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
4959						   struct net_device *upper_dev)
4960{
4961	__netdev_adjacent_dev_unlink(dev, upper_dev);
4962	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4963					   &dev->adj_list.upper,
4964					   &upper_dev->adj_list.lower);
4965}
4966
4967static int __netdev_upper_dev_link(struct net_device *dev,
4968				   struct net_device *upper_dev, bool master,
4969				   void *private)
4970{
4971	struct netdev_adjacent *i, *j, *to_i, *to_j;
4972	int ret = 0;
4973
4974	ASSERT_RTNL();
4975
4976	if (dev == upper_dev)
4977		return -EBUSY;
4978
4979	/* To prevent loops, check if dev is not upper device to upper_dev. */
4980	if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
4981		return -EBUSY;
4982
4983	if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
4984		return -EEXIST;
4985
4986	if (master && netdev_master_upper_dev_get(dev))
4987		return -EBUSY;
4988
4989	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
4990						   master);
4991	if (ret)
4992		return ret;
4993
4994	/* Now that we linked these devs, make all the upper_dev's
4995	 * all_adj_list.upper visible to every dev's all_adj_list.lower an
4996	 * versa, and don't forget the devices itself. All of these
4997	 * links are non-neighbours.
4998	 */
4999	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5000		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5001			pr_debug("Interlinking %s with %s, non-neighbour\n",
5002				 i->dev->name, j->dev->name);
5003			ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5004			if (ret)
5005				goto rollback_mesh;
5006		}
5007	}
5008
5009	/* add dev to every upper_dev's upper device */
5010	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5011		pr_debug("linking %s's upper device %s with %s\n",
5012			 upper_dev->name, i->dev->name, dev->name);
5013		ret = __netdev_adjacent_dev_link(dev, i->dev);
5014		if (ret)
5015			goto rollback_upper_mesh;
5016	}
5017
5018	/* add upper_dev to every dev's lower device */
5019	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5020		pr_debug("linking %s's lower device %s with %s\n", dev->name,
5021			 i->dev->name, upper_dev->name);
5022		ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5023		if (ret)
5024			goto rollback_lower_mesh;
5025	}
5026
5027	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5028	return 0;
5029
5030rollback_lower_mesh:
5031	to_i = i;
5032	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5033		if (i == to_i)
5034			break;
5035		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
5036	}
5037
5038	i = NULL;
5039
5040rollback_upper_mesh:
5041	to_i = i;
5042	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5043		if (i == to_i)
5044			break;
5045		__netdev_adjacent_dev_unlink(dev, i->dev);
5046	}
5047
5048	i = j = NULL;
5049
5050rollback_mesh:
5051	to_i = i;
5052	to_j = j;
5053	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5054		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5055			if (i == to_i && j == to_j)
5056				break;
5057			__netdev_adjacent_dev_unlink(i->dev, j->dev);
5058		}
5059		if (i == to_i)
5060			break;
5061	}
5062
5063	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5064
5065	return ret;
5066}
5067
5068/**
5069 * netdev_upper_dev_link - Add a link to the upper device
5070 * @dev: device
5071 * @upper_dev: new upper device
5072 *
5073 * Adds a link to device which is upper to this one. The caller must hold
5074 * the RTNL lock. On a failure a negative errno code is returned.
5075 * On success the reference counts are adjusted and the function
5076 * returns zero.
5077 */
5078int netdev_upper_dev_link(struct net_device *dev,
5079			  struct net_device *upper_dev)
5080{
5081	return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5082}
5083EXPORT_SYMBOL(netdev_upper_dev_link);
5084
5085/**
5086 * netdev_master_upper_dev_link - Add a master link to the upper device
5087 * @dev: device
5088 * @upper_dev: new upper device
5089 *
5090 * Adds a link to device which is upper to this one. In this case, only
5091 * one master upper device can be linked, although other non-master devices
5092 * might be linked as well. The caller must hold the RTNL lock.
5093 * On a failure a negative errno code is returned. On success the reference
5094 * counts are adjusted and the function returns zero.
5095 */
5096int netdev_master_upper_dev_link(struct net_device *dev,
5097				 struct net_device *upper_dev)
5098{
5099	return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5100}
5101EXPORT_SYMBOL(netdev_master_upper_dev_link);
5102
5103int netdev_master_upper_dev_link_private(struct net_device *dev,
5104					 struct net_device *upper_dev,
5105					 void *private)
5106{
5107	return __netdev_upper_dev_link(dev, upper_dev, true, private);
5108}
5109EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5110
5111/**
5112 * netdev_upper_dev_unlink - Removes a link to upper device
5113 * @dev: device
5114 * @upper_dev: new upper device
5115 *
5116 * Removes a link to device which is upper to this one. The caller must hold
5117 * the RTNL lock.
5118 */
5119void netdev_upper_dev_unlink(struct net_device *dev,
5120			     struct net_device *upper_dev)
5121{
5122	struct netdev_adjacent *i, *j;
5123	ASSERT_RTNL();
5124
5125	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5126
5127	/* Here is the tricky part. We must remove all dev's lower
5128	 * devices from all upper_dev's upper devices and vice
5129	 * versa, to maintain the graph relationship.
5130	 */
5131	list_for_each_entry(i, &dev->all_adj_list.lower, list)
5132		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5133			__netdev_adjacent_dev_unlink(i->dev, j->dev);
5134
5135	/* remove also the devices itself from lower/upper device
5136	 * list
5137	 */
5138	list_for_each_entry(i, &dev->all_adj_list.lower, list)
5139		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
5140
5141	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5142		__netdev_adjacent_dev_unlink(dev, i->dev);
5143
5144	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5145}
5146EXPORT_SYMBOL(netdev_upper_dev_unlink);
5147
5148void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5149{
5150	struct netdev_adjacent *iter;
5151
5152	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5153		netdev_adjacent_sysfs_del(iter->dev, oldname,
5154					  &iter->dev->adj_list.lower);
5155		netdev_adjacent_sysfs_add(iter->dev, dev,
5156					  &iter->dev->adj_list.lower);
5157	}
5158
5159	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5160		netdev_adjacent_sysfs_del(iter->dev, oldname,
5161					  &iter->dev->adj_list.upper);
5162		netdev_adjacent_sysfs_add(iter->dev, dev,
5163					  &iter->dev->adj_list.upper);
5164	}
5165}
5166
5167void *netdev_lower_dev_get_private(struct net_device *dev,
5168				   struct net_device *lower_dev)
5169{
5170	struct netdev_adjacent *lower;
5171
5172	if (!lower_dev)
5173		return NULL;
5174	lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5175	if (!lower)
5176		return NULL;
5177
5178	return lower->private;
5179}
5180EXPORT_SYMBOL(netdev_lower_dev_get_private);
5181
5182
5183int dev_get_nest_level(struct net_device *dev,
5184		       bool (*type_check)(struct net_device *dev))
5185{
5186	struct net_device *lower = NULL;
5187	struct list_head *iter;
5188	int max_nest = -1;
5189	int nest;
5190
5191	ASSERT_RTNL();
5192
5193	netdev_for_each_lower_dev(dev, lower, iter) {
5194		nest = dev_get_nest_level(lower, type_check);
5195		if (max_nest < nest)
5196			max_nest = nest;
5197	}
5198
5199	if (type_check(dev))
5200		max_nest++;
5201
5202	return max_nest;
5203}
5204EXPORT_SYMBOL(dev_get_nest_level);
5205
5206static void dev_change_rx_flags(struct net_device *dev, int flags)
5207{
5208	const struct net_device_ops *ops = dev->netdev_ops;
5209
5210	if (ops->ndo_change_rx_flags)
5211		ops->ndo_change_rx_flags(dev, flags);
5212}
5213
5214static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5215{
5216	unsigned int old_flags = dev->flags;
5217	kuid_t uid;
5218	kgid_t gid;
5219
5220	ASSERT_RTNL();
5221
5222	dev->flags |= IFF_PROMISC;
5223	dev->promiscuity += inc;
5224	if (dev->promiscuity == 0) {
5225		/*
5226		 * Avoid overflow.
5227		 * If inc causes overflow, untouch promisc and return error.
5228		 */
5229		if (inc < 0)
5230			dev->flags &= ~IFF_PROMISC;
5231		else {
5232			dev->promiscuity -= inc;
5233			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5234				dev->name);
5235			return -EOVERFLOW;
5236		}
5237	}
5238	if (dev->flags != old_flags) {
5239		pr_info("device %s %s promiscuous mode\n",
5240			dev->name,
5241			dev->flags & IFF_PROMISC ? "entered" : "left");
5242		if (audit_enabled) {
5243			current_uid_gid(&uid, &gid);
5244			audit_log(current->audit_context, GFP_ATOMIC,
5245				AUDIT_ANOM_PROMISCUOUS,
5246				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5247				dev->name, (dev->flags & IFF_PROMISC),
5248				(old_flags & IFF_PROMISC),
5249				from_kuid(&init_user_ns, audit_get_loginuid(current)),
5250				from_kuid(&init_user_ns, uid),
5251				from_kgid(&init_user_ns, gid),
5252				audit_get_sessionid(current));
5253		}
5254
5255		dev_change_rx_flags(dev, IFF_PROMISC);
5256	}
5257	if (notify)
5258		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
5259	return 0;
5260}
5261
5262/**
5263 *	dev_set_promiscuity	- update promiscuity count on a device
5264 *	@dev: device
5265 *	@inc: modifier
5266 *
5267 *	Add or remove promiscuity from a device. While the count in the device
5268 *	remains above zero the interface remains promiscuous. Once it hits zero
5269 *	the device reverts back to normal filtering operation. A negative inc
5270 *	value is used to drop promiscuity on the device.
5271 *	Return 0 if successful or a negative errno code on error.
5272 */
5273int dev_set_promiscuity(struct net_device *dev, int inc)
5274{
5275	unsigned int old_flags = dev->flags;
5276	int err;
5277
5278	err = __dev_set_promiscuity(dev, inc, true);
5279	if (err < 0)
5280		return err;
5281	if (dev->flags != old_flags)
5282		dev_set_rx_mode(dev);
5283	return err;
5284}
5285EXPORT_SYMBOL(dev_set_promiscuity);
5286
5287static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5288{
5289	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5290
5291	ASSERT_RTNL();
5292
5293	dev->flags |= IFF_ALLMULTI;
5294	dev->allmulti += inc;
5295	if (dev->allmulti == 0) {
5296		/*
5297		 * Avoid overflow.
5298		 * If inc causes overflow, untouch allmulti and return error.
5299		 */
5300		if (inc < 0)
5301			dev->flags &= ~IFF_ALLMULTI;
5302		else {
5303			dev->allmulti -= inc;
5304			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5305				dev->name);
5306			return -EOVERFLOW;
5307		}
5308	}
5309	if (dev->flags ^ old_flags) {
5310		dev_change_rx_flags(dev, IFF_ALLMULTI);
5311		dev_set_rx_mode(dev);
5312		if (notify)
5313			__dev_notify_flags(dev, old_flags,
5314					   dev->gflags ^ old_gflags);
5315	}
5316	return 0;
5317}
5318
5319/**
5320 *	dev_set_allmulti	- update allmulti count on a device
5321 *	@dev: device
5322 *	@inc: modifier
5323 *
5324 *	Add or remove reception of all multicast frames to a device. While the
5325 *	count in the device remains above zero the interface remains listening
5326 *	to all interfaces. Once it hits zero the device reverts back to normal
5327 *	filtering operation. A negative @inc value is used to drop the counter
5328 *	when releasing a resource needing all multicasts.
5329 *	Return 0 if successful or a negative errno code on error.
5330 */
5331
5332int dev_set_allmulti(struct net_device *dev, int inc)
5333{
5334	return __dev_set_allmulti(dev, inc, true);
5335}
5336EXPORT_SYMBOL(dev_set_allmulti);
5337
5338/*
5339 *	Upload unicast and multicast address lists to device and
5340 *	configure RX filtering. When the device doesn't support unicast
5341 *	filtering it is put in promiscuous mode while unicast addresses
5342 *	are present.
5343 */
5344void __dev_set_rx_mode(struct net_device *dev)
5345{
5346	const struct net_device_ops *ops = dev->netdev_ops;
5347
5348	/* dev_open will call this function so the list will stay sane. */
5349	if (!(dev->flags&IFF_UP))
5350		return;
5351
5352	if (!netif_device_present(dev))
5353		return;
5354
5355	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5356		/* Unicast addresses changes may only happen under the rtnl,
5357		 * therefore calling __dev_set_promiscuity here is safe.
5358		 */
5359		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5360			__dev_set_promiscuity(dev, 1, false);
5361			dev->uc_promisc = true;
5362		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5363			__dev_set_promiscuity(dev, -1, false);
5364			dev->uc_promisc = false;
5365		}
5366	}
5367
5368	if (ops->ndo_set_rx_mode)
5369		ops->ndo_set_rx_mode(dev);
5370}
5371
/*
 * Locked wrapper: run __dev_set_rx_mode() on @dev with the device address
 * lock held (taken and released with the _bh variants).
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
5378
5379/**
5380 *	dev_get_flags - get flags reported to userspace
5381 *	@dev: device
5382 *
5383 *	Get the combination of flag bits exported through APIs to userspace.
5384 */
5385unsigned int dev_get_flags(const struct net_device *dev)
5386{
5387	unsigned int flags;
5388
5389	flags = (dev->flags & ~(IFF_PROMISC |
5390				IFF_ALLMULTI |
5391				IFF_RUNNING |
5392				IFF_LOWER_UP |
5393				IFF_DORMANT)) |
5394		(dev->gflags & (IFF_PROMISC |
5395				IFF_ALLMULTI));
5396
5397	if (netif_running(dev)) {
5398		if (netif_oper_up(dev))
5399			flags |= IFF_RUNNING;
5400		if (netif_carrier_ok(dev))
5401			flags |= IFF_LOWER_UP;
5402		if (netif_dormant(dev))
5403			flags |= IFF_DORMANT;
5404	}
5405
5406	return flags;
5407}
5408EXPORT_SYMBOL(dev_get_flags);
5409
5410int __dev_change_flags(struct net_device *dev, unsigned int flags)
5411{
5412	unsigned int old_flags = dev->flags;
5413	int ret;
5414
5415	ASSERT_RTNL();
5416
5417	/*
5418	 *	Set the flags on our device.
5419	 */
5420
5421	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5422			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5423			       IFF_AUTOMEDIA)) |
5424		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5425				    IFF_ALLMULTI));
5426
5427	/*
5428	 *	Load in the correct multicast list now the flags have changed.
5429	 */
5430
5431	if ((old_flags ^ flags) & IFF_MULTICAST)
5432		dev_change_rx_flags(dev, IFF_MULTICAST);
5433
5434	dev_set_rx_mode(dev);
5435
5436	/*
5437	 *	Have we downed the interface. We handle IFF_UP ourselves
5438	 *	according to user attempts to set it, rather than blindly
5439	 *	setting it.
5440	 */
5441
5442	ret = 0;
5443	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
5444		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5445
5446		if (!ret)
5447			dev_set_rx_mode(dev);
5448	}
5449
5450	if ((flags ^ dev->gflags) & IFF_PROMISC) {
5451		int inc = (flags & IFF_PROMISC) ? 1 : -1;
5452		unsigned int old_flags = dev->flags;
5453
5454		dev->gflags ^= IFF_PROMISC;
5455
5456		if (__dev_set_promiscuity(dev, inc, false) >= 0)
5457			if (dev->flags != old_flags)
5458				dev_set_rx_mode(dev);
5459	}
5460
5461	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5462	   is important. Some (broken) drivers set IFF_PROMISC, when
5463	   IFF_ALLMULTI is requested not asking us and not reporting.
5464	 */
5465	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5466		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5467
5468		dev->gflags ^= IFF_ALLMULTI;
5469		__dev_set_allmulti(dev, inc, false);
5470	}
5471
5472	return ret;
5473}
5474
5475void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5476			unsigned int gchanges)
5477{
5478	unsigned int changes = dev->flags ^ old_flags;
5479
5480	if (gchanges)
5481		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5482
5483	if (changes & IFF_UP) {
5484		if (dev->flags & IFF_UP)
5485			call_netdevice_notifiers(NETDEV_UP, dev);
5486		else
5487			call_netdevice_notifiers(NETDEV_DOWN, dev);
5488	}
5489
5490	if (dev->flags & IFF_UP &&
5491	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5492		struct netdev_notifier_change_info change_info;
5493
5494		change_info.flags_changed = changes;
5495		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5496					      &change_info.info);
5497	}
5498}
5499
5500/**
5501 *	dev_change_flags - change device settings
5502 *	@dev: device
5503 *	@flags: device state flags
5504 *
5505 *	Change settings on device based state flags. The flags are
5506 *	in the userspace exported format.
5507 */
5508int dev_change_flags(struct net_device *dev, unsigned int flags)
5509{
5510	int ret;
5511	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5512
5513	ret = __dev_change_flags(dev, flags);
5514	if (ret < 0)
5515		return ret;
5516
5517	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5518	__dev_notify_flags(dev, old_flags, changes);
5519	return ret;
5520}
5521EXPORT_SYMBOL(dev_change_flags);
5522
5523static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5524{
5525	const struct net_device_ops *ops = dev->netdev_ops;
5526
5527	if (ops->ndo_change_mtu)
5528		return ops->ndo_change_mtu(dev, new_mtu);
5529
5530	dev->mtu = new_mtu;
5531	return 0;
5532}
5533
5534/**
5535 *	dev_set_mtu - Change maximum transfer unit
5536 *	@dev: device
5537 *	@new_mtu: new transfer unit
5538 *
5539 *	Change the maximum transfer size of the network device.
5540 */
5541int dev_set_mtu(struct net_device *dev, int new_mtu)
5542{
5543	int err, orig_mtu;
5544
5545	if (new_mtu == dev->mtu)
5546		return 0;
5547
5548	/*	MTU must be positive.	 */
5549	if (new_mtu < 0)
5550		return -EINVAL;
5551
5552	if (!netif_device_present(dev))
5553		return -ENODEV;
5554
5555	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5556	err = notifier_to_errno(err);
5557	if (err)
5558		return err;
5559
5560	orig_mtu = dev->mtu;
5561	err = __dev_set_mtu(dev, new_mtu);
5562
5563	if (!err) {
5564		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5565		err = notifier_to_errno(err);
5566		if (err) {
5567			/* setting mtu back and notifying everyone again,
5568			 * so that they have a chance to revert changes.
5569			 */
5570			__dev_set_mtu(dev, orig_mtu);
5571			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5572		}
5573	}
5574	return err;
5575}
5576EXPORT_SYMBOL(dev_set_mtu);
5577
5578/**
5579 *	dev_set_group - Change group this device belongs to
5580 *	@dev: device
5581 *	@new_group: group this device should belong to
5582 */
5583void dev_set_group(struct net_device *dev, int new_group)
5584{
5585	dev->group = new_group;
5586}
5587EXPORT_SYMBOL(dev_set_group);
5588
5589/**
5590 *	dev_set_mac_address - Change Media Access Control Address
5591 *	@dev: device
5592 *	@sa: new address
5593 *
5594 *	Change the hardware (MAC) address of the device
5595 */
5596int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5597{
5598	const struct net_device_ops *ops = dev->netdev_ops;
5599	int err;
5600
5601	if (!ops->ndo_set_mac_address)
5602		return -EOPNOTSUPP;
5603	if (sa->sa_family != dev->type)
5604		return -EINVAL;
5605	if (!netif_device_present(dev))
5606		return -ENODEV;
5607	err = ops->ndo_set_mac_address(dev, sa);
5608	if (err)
5609		return err;
5610	dev->addr_assign_type = NET_ADDR_SET;
5611	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5612	add_device_randomness(dev->dev_addr, dev->addr_len);
5613	return 0;
5614}
5615EXPORT_SYMBOL(dev_set_mac_address);
5616
5617/**
5618 *	dev_change_carrier - Change device carrier
5619 *	@dev: device
5620 *	@new_carrier: new value
5621 *
5622 *	Change device carrier
5623 */
5624int dev_change_carrier(struct net_device *dev, bool new_carrier)
5625{
5626	const struct net_device_ops *ops = dev->netdev_ops;
5627
5628	if (!ops->ndo_change_carrier)
5629		return -EOPNOTSUPP;
5630	if (!netif_device_present(dev))
5631		return -ENODEV;
5632	return ops->ndo_change_carrier(dev, new_carrier);
5633}
5634EXPORT_SYMBOL(dev_change_carrier);
5635
5636/**
5637 *	dev_get_phys_port_id - Get device physical port ID
5638 *	@dev: device
5639 *	@ppid: port ID
5640 *
5641 *	Get device physical port ID
5642 */
5643int dev_get_phys_port_id(struct net_device *dev,
5644			 struct netdev_phys_port_id *ppid)
5645{
5646	const struct net_device_ops *ops = dev->netdev_ops;
5647
5648	if (!ops->ndo_get_phys_port_id)
5649		return -EOPNOTSUPP;
5650	return ops->ndo_get_phys_port_id(dev, ppid);
5651}
5652EXPORT_SYMBOL(dev_get_phys_port_id);
5653
5654/**
5655 *	dev_new_index	-	allocate an ifindex
5656 *	@net: the applicable net namespace
5657 *
5658 *	Returns a suitable unique value for a new device interface
5659 *	number.  The caller must hold the rtnl semaphore or the
5660 *	dev_base_lock to be sure it remains unique.
5661 */
5662static int dev_new_index(struct net *net)
5663{
5664	int ifindex = net->ifindex;
5665	for (;;) {
5666		if (++ifindex <= 0)
5667			ifindex = 1;
5668		if (!__dev_get_by_index(net, ifindex))
5669			return net->ifindex = ifindex;
5670	}
5671}
5672
5673/* Delayed registration/unregisteration */
5674static LIST_HEAD(net_todo_list);
5675DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5676
5677static void net_set_todo(struct net_device *dev)
5678{
5679	list_add_tail(&dev->todo_list, &net_todo_list);
5680	dev_net(dev)->dev_unreg_count++;
5681}
5682
5683static void rollback_registered_many(struct list_head *head)
5684{
5685	struct net_device *dev, *tmp;
5686	LIST_HEAD(close_head);
5687
5688	BUG_ON(dev_boot_phase);
5689	ASSERT_RTNL();
5690
5691	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5692		/* Some devices call without registering
5693		 * for initialization unwind. Remove those
5694		 * devices and proceed with the remaining.
5695		 */
5696		if (dev->reg_state == NETREG_UNINITIALIZED) {
5697			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5698				 dev->name, dev);
5699
5700			WARN_ON(1);
5701			list_del(&dev->unreg_list);
5702			continue;
5703		}
5704		dev->dismantle = true;
5705		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5706	}
5707
5708	/* If device is running, close it first. */
5709	list_for_each_entry(dev, head, unreg_list)
5710		list_add_tail(&dev->close_list, &close_head);
5711	dev_close_many(&close_head);
5712
5713	list_for_each_entry(dev, head, unreg_list) {
5714		/* And unlink it from device chain. */
5715		unlist_netdevice(dev);
5716
5717		dev->reg_state = NETREG_UNREGISTERING;
5718	}
5719
5720	synchronize_net();
5721
5722	list_for_each_entry(dev, head, unreg_list) {
5723		/* Shutdown queueing discipline. */
5724		dev_shutdown(dev);
5725
5726
5727		/* Notify protocols, that we are about to destroy
5728		   this device. They should clean all the things.
5729		*/
5730		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5731
5732		/*
5733		 *	Flush the unicast and multicast chains
5734		 */
5735		dev_uc_flush(dev);
5736		dev_mc_flush(dev);
5737
5738		if (dev->netdev_ops->ndo_uninit)
5739			dev->netdev_ops->ndo_uninit(dev);
5740
5741		if (!dev->rtnl_link_ops ||
5742		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5743			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
5744
5745		/* Notifier chain MUST detach us all upper devices. */
5746		WARN_ON(netdev_has_any_upper_dev(dev));
5747
5748		/* Remove entries from kobject tree */
5749		netdev_unregister_kobject(dev);
5750#ifdef CONFIG_XPS
5751		/* Remove XPS queueing entries */
5752		netif_reset_xps_queues_gt(dev, 0);
5753#endif
5754	}
5755
5756	synchronize_net();
5757
5758	list_for_each_entry(dev, head, unreg_list)
5759		dev_put(dev);
5760}
5761
5762static void rollback_registered(struct net_device *dev)
5763{
5764	LIST_HEAD(single);
5765
5766	list_add(&dev->unreg_list, &single);
5767	rollback_registered_many(&single);
5768	list_del(&single);
5769}
5770
5771static netdev_features_t netdev_fix_features(struct net_device *dev,
5772	netdev_features_t features)
5773{
5774	/* Fix illegal checksum combinations */
5775	if ((features & NETIF_F_HW_CSUM) &&
5776	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5777		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5778		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5779	}
5780
5781	/* TSO requires that SG is present as well. */
5782	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5783		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5784		features &= ~NETIF_F_ALL_TSO;
5785	}
5786
5787	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5788					!(features & NETIF_F_IP_CSUM)) {
5789		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5790		features &= ~NETIF_F_TSO;
5791		features &= ~NETIF_F_TSO_ECN;
5792	}
5793
5794	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5795					 !(features & NETIF_F_IPV6_CSUM)) {
5796		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5797		features &= ~NETIF_F_TSO6;
5798	}
5799
5800	/* TSO ECN requires that TSO is present as well. */
5801	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5802		features &= ~NETIF_F_TSO_ECN;
5803
5804	/* Software GSO depends on SG. */
5805	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5806		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5807		features &= ~NETIF_F_GSO;
5808	}
5809
5810	/* UFO needs SG and checksumming */
5811	if (features & NETIF_F_UFO) {
5812		/* maybe split UFO into V4 and V6? */
5813		if (!((features & NETIF_F_GEN_CSUM) ||
5814		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5815			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5816			netdev_dbg(dev,
5817				"Dropping NETIF_F_UFO since no checksum offload features.\n");
5818			features &= ~NETIF_F_UFO;
5819		}
5820
5821		if (!(features & NETIF_F_SG)) {
5822			netdev_dbg(dev,
5823				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5824			features &= ~NETIF_F_UFO;
5825		}
5826	}
5827
5828#ifdef CONFIG_NET_RX_BUSY_POLL
5829	if (dev->netdev_ops->ndo_busy_poll)
5830		features |= NETIF_F_BUSY_POLL;
5831	else
5832#endif
5833		features &= ~NETIF_F_BUSY_POLL;
5834
5835	return features;
5836}
5837
5838int __netdev_update_features(struct net_device *dev)
5839{
5840	netdev_features_t features;
5841	int err = 0;
5842
5843	ASSERT_RTNL();
5844
5845	features = netdev_get_wanted_features(dev);
5846
5847	if (dev->netdev_ops->ndo_fix_features)
5848		features = dev->netdev_ops->ndo_fix_features(dev, features);
5849
5850	/* driver might be less strict about feature dependencies */
5851	features = netdev_fix_features(dev, features);
5852
5853	if (dev->features == features)
5854		return 0;
5855
5856	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5857		&dev->features, &features);
5858
5859	if (dev->netdev_ops->ndo_set_features)
5860		err = dev->netdev_ops->ndo_set_features(dev, features);
5861
5862	if (unlikely(err < 0)) {
5863		netdev_err(dev,
5864			"set_features() failed (%d); wanted %pNF, left %pNF\n",
5865			err, &features, &dev->features);
5866		return -1;
5867	}
5868
5869	if (!err)
5870		dev->features = features;
5871
5872	return 1;
5873}
5874
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features and send a feature-change notification
 *	whenever __netdev_update_features() returns non-zero.  Should be
 *	called after driver or hardware dependent conditions that
 *	influence the features might have changed.
 */
void netdev_update_features(struct net_device *dev)
{
	if (!__netdev_update_features(dev))
		return;

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5889
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features and send the notification
 *	unconditionally, even if nothing changed.  Use this instead of
 *	netdev_update_features() when dev->vlan_features might have
 *	changed as well, so that the change is propagated to stacked
 *	VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* Result deliberately ignored: the notification is always sent. */
	__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5906
5907/**
5908 *	netif_stacked_transfer_operstate -	transfer operstate
5909 *	@rootdev: the root or lower level device to transfer state from
5910 *	@dev: the device to transfer operstate to
5911 *
5912 *	Transfer operational state from root to device. This is normally
5913 *	called when a stacking relationship exists between the root
5914 *	device and the device(a leaf device).
5915 */
5916void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5917					struct net_device *dev)
5918{
5919	if (rootdev->operstate == IF_OPER_DORMANT)
5920		netif_dormant_on(dev);
5921	else
5922		netif_dormant_off(dev);
5923
5924	if (netif_carrier_ok(rootdev)) {
5925		if (!netif_carrier_ok(dev))
5926			netif_carrier_on(dev);
5927	} else {
5928		if (netif_carrier_ok(dev))
5929			netif_carrier_off(dev);
5930	}
5931}
5932EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5933
#ifdef CONFIG_SYSFS
/*
 * Allocate the zeroed array of dev->num_rx_queues RX queue structures,
 * point each entry back at @dev and publish the array in dev->_rx.
 * Returns 0 or -ENOMEM.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int nr = dev->num_rx_queues;
	struct netdev_rx_queue *queues;
	unsigned int idx;

	BUG_ON(nr < 1);

	queues = kcalloc(nr, sizeof(struct netdev_rx_queue), GFP_KERNEL);
	if (!queues)
		return -ENOMEM;

	for (idx = 0; idx < nr; idx++)
		queues[idx].dev = dev;

	dev->_rx = queues;
	return 0;
}
#endif
5953
5954static void netdev_init_one_queue(struct net_device *dev,
5955				  struct netdev_queue *queue, void *_unused)
5956{
5957	/* Initialize queue lock */
5958	spin_lock_init(&queue->_xmit_lock);
5959	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5960	queue->xmit_lock_owner = -1;
5961	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5962	queue->dev = dev;
5963#ifdef CONFIG_BQL
5964	dql_init(&queue->dql, HZ);
5965#endif
5966}
5967
5968static void netif_free_tx_queues(struct net_device *dev)
5969{
5970	kvfree(dev->_tx);
5971}
5972
5973static int netif_alloc_netdev_queues(struct net_device *dev)
5974{
5975	unsigned int count = dev->num_tx_queues;
5976	struct netdev_queue *tx;
5977	size_t sz = count * sizeof(*tx);
5978
5979	BUG_ON(count < 1 || count > 0xffff);
5980
5981	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
5982	if (!tx) {
5983		tx = vzalloc(sz);
5984		if (!tx)
5985			return -ENOMEM;
5986	}
5987	dev->_tx = tx;
5988
5989	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5990	spin_lock_init(&dev->tx_global_lock);
5991
5992	return 0;
5993}
5994
5995/**
5996 *	register_netdevice	- register a network device
5997 *	@dev: device to register
5998 *
5999 *	Take a completed network device structure and add it to the kernel
6000 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6001 *	chain. 0 is returned on success. A negative errno code is returned
6002 *	on a failure to set up the device, or if the name is a duplicate.
6003 *
6004 *	Callers must hold the rtnl semaphore. You may want
6005 *	register_netdev() instead of this.
6006 *
6007 *	BUGS:
6008 *	The locking appears insufficient to guarantee two parallel registers
6009 *	will not get the same name.
6010 */
6011
6012int register_netdevice(struct net_device *dev)
6013{
6014	int ret;
6015	struct net *net = dev_net(dev);
6016
6017	BUG_ON(dev_boot_phase);
6018	ASSERT_RTNL();
6019
6020	might_sleep();
6021
6022	/* When net_device's are persistent, this will be fatal. */
6023	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6024	BUG_ON(!net);
6025
6026	spin_lock_init(&dev->addr_list_lock);
6027	netdev_set_addr_lockdep_class(dev);
6028
6029	dev->iflink = -1;
6030
6031	ret = dev_get_valid_name(net, dev, dev->name);
6032	if (ret < 0)
6033		goto out;
6034
6035	/* Init, if this function is available */
6036	if (dev->netdev_ops->ndo_init) {
6037		ret = dev->netdev_ops->ndo_init(dev);
6038		if (ret) {
6039			if (ret > 0)
6040				ret = -EIO;
6041			goto out;
6042		}
6043	}
6044
6045	if (((dev->hw_features | dev->features) &
6046	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
6047	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6048	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6049		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6050		ret = -EINVAL;
6051		goto err_uninit;
6052	}
6053
6054	ret = -EBUSY;
6055	if (!dev->ifindex)
6056		dev->ifindex = dev_new_index(net);
6057	else if (__dev_get_by_index(net, dev->ifindex))
6058		goto err_uninit;
6059
6060	if (dev->iflink == -1)
6061		dev->iflink = dev->ifindex;
6062
6063	/* Transfer changeable features to wanted_features and enable
6064	 * software offloads (GSO and GRO).
6065	 */
6066	dev->hw_features |= NETIF_F_SOFT_FEATURES;
6067	dev->features |= NETIF_F_SOFT_FEATURES;
6068	dev->wanted_features = dev->features & dev->hw_features;
6069
6070	if (!(dev->flags & IFF_LOOPBACK)) {
6071		dev->hw_features |= NETIF_F_NOCACHE_COPY;
6072	}
6073
6074	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6075	 */
6076	dev->vlan_features |= NETIF_F_HIGHDMA;
6077
6078	/* Make NETIF_F_SG inheritable to tunnel devices.
6079	 */
6080	dev->hw_enc_features |= NETIF_F_SG;
6081
6082	/* Make NETIF_F_SG inheritable to MPLS.
6083	 */
6084	dev->mpls_features |= NETIF_F_SG;
6085
6086	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6087	ret = notifier_to_errno(ret);
6088	if (ret)
6089		goto err_uninit;
6090
6091	ret = netdev_register_kobject(dev);
6092	if (ret)
6093		goto err_uninit;
6094	dev->reg_state = NETREG_REGISTERED;
6095
6096	__netdev_update_features(dev);
6097
6098	/*
6099	 *	Default initial state at registry is that the
6100	 *	device is present.
6101	 */
6102
6103	set_bit(__LINK_STATE_PRESENT, &dev->state);
6104
6105	linkwatch_init_dev(dev);
6106
6107	dev_init_scheduler(dev);
6108	dev_hold(dev);
6109	list_netdevice(dev);
6110	add_device_randomness(dev->dev_addr, dev->addr_len);
6111
6112	/* If the device has permanent device address, driver should
6113	 * set dev_addr and also addr_assign_type should be set to
6114	 * NET_ADDR_PERM (default value).
6115	 */
6116	if (dev->addr_assign_type == NET_ADDR_PERM)
6117		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6118
6119	/* Notify protocols, that a new device appeared. */
6120	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6121	ret = notifier_to_errno(ret);
6122	if (ret) {
6123		rollback_registered(dev);
6124		dev->reg_state = NETREG_UNREGISTERED;
6125	}
6126	/*
6127	 *	Prevent userspace races by waiting until the network
6128	 *	device is fully setup before sending notifications.
6129	 */
6130	if (!dev->rtnl_link_ops ||
6131	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6132		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6133
6134out:
6135	return ret;
6136
6137err_uninit:
6138	if (dev->netdev_ops->ndo_uninit)
6139		dev->netdev_ops->ndo_uninit(dev);
6140	goto out;
6141}
6142EXPORT_SYMBOL(register_netdevice);
6143
6144/**
6145 *	init_dummy_netdev	- init a dummy network device for NAPI
6146 *	@dev: device to init
6147 *
6148 *	This takes a network device structure and initialize the minimum
6149 *	amount of fields so it can be used to schedule NAPI polls without
6150 *	registering a full blown interface. This is to be used by drivers
6151 *	that need to tie several hardware interfaces to a single NAPI
6152 *	poll scheduler due to HW limitations.
6153 */
6154int init_dummy_netdev(struct net_device *dev)
6155{
6156	/* Clear everything. Note we don't initialize spinlocks
6157	 * are they aren't supposed to be taken by any of the
6158	 * NAPI code and this dummy netdev is supposed to be
6159	 * only ever used for NAPI polls
6160	 */
6161	memset(dev, 0, sizeof(struct net_device));
6162
6163	/* make sure we BUG if trying to hit standard
6164	 * register/unregister code path
6165	 */
6166	dev->reg_state = NETREG_DUMMY;
6167
6168	/* NAPI wants this */
6169	INIT_LIST_HEAD(&dev->napi_list);
6170
6171	/* a dummy interface is started by default */
6172	set_bit(__LINK_STATE_PRESENT, &dev->state);
6173	set_bit(__LINK_STATE_START, &dev->state);
6174
6175	/* Note : We dont allocate pcpu_refcnt for dummy devices,
6176	 * because users of this 'device' dont need to change
6177	 * its refcount.
6178	 */
6179
6180	return 0;
6181}
6182EXPORT_SYMBOL_GPL(init_dummy_netdev);
6183
6184
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is register_netdevice() wrapped in rtnl_lock()/rtnl_unlock();
 *	the device name is expanded if a format string was passed to
 *	alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int ret;

	rtnl_lock();
	ret = register_netdevice(dev);
	rtnl_unlock();

	return ret;
}
EXPORT_SYMBOL(register_netdev);
6208
6209int netdev_refcnt_read(const struct net_device *dev)
6210{
6211	int i, refcnt = 0;
6212
6213	for_each_possible_cpu(i)
6214		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6215	return refcnt;
6216}
6217EXPORT_SYMBOL(netdev_refcnt_read);
6218
6219/**
6220 * netdev_wait_allrefs - wait until all references are gone.
6221 * @dev: target net_device
6222 *
6223 * This is called when unregistering network devices.
6224 *
6225 * Any protocol or device that holds a reference should register
6226 * for netdevice notification, and cleanup and put back the
6227 * reference if they receive an UNREGISTER event.
6228 * We can get stuck here if buggy protocols don't correctly
6229 * call dev_put.
6230 */
6231static void netdev_wait_allrefs(struct net_device *dev)
6232{
6233	unsigned long rebroadcast_time, warning_time;
6234	int refcnt;
6235
6236	linkwatch_forget_dev(dev);
6237
6238	rebroadcast_time = warning_time = jiffies;
6239	refcnt = netdev_refcnt_read(dev);
6240
6241	while (refcnt != 0) {
6242		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6243			rtnl_lock();
6244
6245			/* Rebroadcast unregister notification */
6246			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6247
6248			__rtnl_unlock();
6249			rcu_barrier();
6250			rtnl_lock();
6251
6252			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6253			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6254				     &dev->state)) {
6255				/* We must not have linkwatch events
6256				 * pending on unregister. If this
6257				 * happens, we simply run the queue
6258				 * unscheduled, resulting in a noop
6259				 * for this device.
6260				 */
6261				linkwatch_run_queue();
6262			}
6263
6264			__rtnl_unlock();
6265
6266			rebroadcast_time = jiffies;
6267		}
6268
6269		msleep(250);
6270
6271		refcnt = netdev_refcnt_read(dev);
6272
6273		if (time_after(jiffies, warning_time + 10 * HZ)) {
6274			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6275				 dev->name, refcnt);
6276			warning_time = jiffies;
6277		}
6278	}
6279}
6280
6281/* The sequence is:
6282 *
6283 *	rtnl_lock();
6284 *	...
6285 *	register_netdevice(x1);
6286 *	register_netdevice(x2);
6287 *	...
6288 *	unregister_netdevice(y1);
6289 *	unregister_netdevice(y2);
6290 *      ...
6291 *	rtnl_unlock();
6292 *	free_netdev(y1);
6293 *	free_netdev(y2);
6294 *
6295 * We are invoked by rtnl_unlock().
6296 * This allows us to deal with problems:
6297 * 1) We can delete sysfs objects which invoke hotplug
6298 *    without deadlocking with linkwatch via keventd.
6299 * 2) Since we run with the RTNL semaphore not held, we can sleep
6300 *    safely in order to wait for the netdev refcnt to drop to zero.
6301 *
6302 * We must not return until all unregister events added during
6303 * the interval the lock was held have been completed.
6304 */
6305void netdev_run_todo(void)
6306{
6307	struct list_head list;
6308
6309	/* Snapshot list, allow later requests */
6310	list_replace_init(&net_todo_list, &list);
6311
6312	__rtnl_unlock();
6313
6314
6315	/* Wait for rcu callbacks to finish before next phase */
6316	if (!list_empty(&list))
6317		rcu_barrier();
6318
6319	while (!list_empty(&list)) {
6320		struct net_device *dev
6321			= list_first_entry(&list, struct net_device, todo_list);
6322		list_del(&dev->todo_list);
6323
6324		rtnl_lock();
6325		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6326		__rtnl_unlock();
6327
6328		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6329			pr_err("network todo '%s' but state %d\n",
6330			       dev->name, dev->reg_state);
6331			dump_stack();
6332			continue;
6333		}
6334
6335		dev->reg_state = NETREG_UNREGISTERED;
6336
6337		on_each_cpu(flush_backlog, dev, 1);
6338
6339		netdev_wait_allrefs(dev);
6340
6341		/* paranoia */
6342		BUG_ON(netdev_refcnt_read(dev));
6343		WARN_ON(rcu_access_pointer(dev->ip_ptr));
6344		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6345		WARN_ON(dev->dn_ptr);
6346
6347		if (dev->destructor)
6348			dev->destructor(dev);
6349
6350		/* Report a network device has been unregistered */
6351		rtnl_lock();
6352		dev_net(dev)->dev_unreg_count--;
6353		__rtnl_unlock();
6354		wake_up(&netdev_unregistering_wq);
6355
6356		/* Free network device */
6357		kobject_put(&dev->dev.kobj);
6358	}
6359}
6360
6361/* Convert net_device_stats to rtnl_link_stats64.  They have the same
6362 * fields in the same order, with only the type differing.
6363 */
6364void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6365			     const struct net_device_stats *netdev_stats)
6366{
6367#if BITS_PER_LONG == 64
6368	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6369	memcpy(stats64, netdev_stats, sizeof(*stats64));
6370#else
6371	size_t i, n = sizeof(*stats64) / sizeof(u64);
6372	const unsigned long *src = (const unsigned long *)netdev_stats;
6373	u64 *dst = (u64 *)stats64;
6374
6375	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6376		     sizeof(*stats64) / sizeof(u64));
6377	for (i = 0; i < n; i++)
6378		dst[i] = src[i];
6379#endif
6380}
6381EXPORT_SYMBOL(netdev_stats_to_stats64);
6382
6383/**
6384 *	dev_get_stats	- get network device statistics
6385 *	@dev: device to get statistics from
6386 *	@storage: place to store stats
6387 *
6388 *	Get network statistics from device. Return @storage.
6389 *	The device driver may provide its own method by setting
6390 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6391 *	otherwise the internal statistics structure is used.
6392 */
6393struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6394					struct rtnl_link_stats64 *storage)
6395{
6396	const struct net_device_ops *ops = dev->netdev_ops;
6397
6398	if (ops->ndo_get_stats64) {
6399		memset(storage, 0, sizeof(*storage));
6400		ops->ndo_get_stats64(dev, storage);
6401	} else if (ops->ndo_get_stats) {
6402		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6403	} else {
6404		netdev_stats_to_stats64(storage, &dev->stats);
6405	}
6406	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6407	storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6408	return storage;
6409}
6410EXPORT_SYMBOL(dev_get_stats);
6411
/*
 * Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT a missing
 * queue is allocated, initialized with the noop qdisc and published in
 * dev->ingress_queue; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the result of dev_ingress_queue() is
 * returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!queue) {
		queue = kzalloc(sizeof(*queue), GFP_KERNEL);
		if (queue) {
			netdev_init_one_queue(dev, queue, NULL);
			queue->qdisc = &noop_qdisc;
			queue->qdisc_sleeping = &noop_qdisc;
			rcu_assign_pointer(dev->ingress_queue, queue);
		}
	}
#endif
	return queue;
}
6429
6430static const struct ethtool_ops default_ethtool_ops;
6431
6432void netdev_set_default_ethtool_ops(struct net_device *dev,
6433				    const struct ethtool_ops *ops)
6434{
6435	if (dev->ethtool_ops == &default_ethtool_ops)
6436		dev->ethtool_ops = ops;
6437}
6438EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6439
6440void netdev_freemem(struct net_device *dev)
6441{
6442	char *addr = (char *)dev - dev->padded;
6443
6444	kvfree(addr);
6445}
6446
6447/**
6448 *	alloc_netdev_mqs - allocate network device
6449 *	@sizeof_priv:	size of private data to allocate space for
6450 *	@name:		device name format string
6451 *	@setup:		callback to initialize device
6452 *	@txqs:		the number of TX subqueues to allocate
6453 *	@rxqs:		the number of RX subqueues to allocate
6454 *
6455 *	Allocates a struct net_device with private data area for driver use
6456 *	and performs basic initialization.  Also allocates subqueue structs
6457 *	for each queue on the device.
6458 */
6459struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6460		void (*setup)(struct net_device *),
6461		unsigned int txqs, unsigned int rxqs)
6462{
6463	struct net_device *dev;
6464	size_t alloc_size;
6465	struct net_device *p;
6466
6467	BUG_ON(strlen(name) >= sizeof(dev->name));
6468
6469	if (txqs < 1) {
6470		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6471		return NULL;
6472	}
6473
6474#ifdef CONFIG_SYSFS
6475	if (rxqs < 1) {
6476		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6477		return NULL;
6478	}
6479#endif
6480
6481	alloc_size = sizeof(struct net_device);
6482	if (sizeof_priv) {
6483		/* ensure 32-byte alignment of private area */
6484		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6485		alloc_size += sizeof_priv;
6486	}
6487	/* ensure 32-byte alignment of whole construct */
6488	alloc_size += NETDEV_ALIGN - 1;
6489
6490	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6491	if (!p)
6492		p = vzalloc(alloc_size);
6493	if (!p)
6494		return NULL;
6495
6496	dev = PTR_ALIGN(p, NETDEV_ALIGN);
6497	dev->padded = (char *)dev - (char *)p;
6498
6499	dev->pcpu_refcnt = alloc_percpu(int);
6500	if (!dev->pcpu_refcnt)
6501		goto free_dev;
6502
6503	if (dev_addr_init(dev))
6504		goto free_pcpu;
6505
6506	dev_mc_init(dev);
6507	dev_uc_init(dev);
6508
6509	dev_net_set(dev, &init_net);
6510
6511	dev->gso_max_size = GSO_MAX_SIZE;
6512	dev->gso_max_segs = GSO_MAX_SEGS;
6513
6514	INIT_LIST_HEAD(&dev->napi_list);
6515	INIT_LIST_HEAD(&dev->unreg_list);
6516	INIT_LIST_HEAD(&dev->close_list);
6517	INIT_LIST_HEAD(&dev->link_watch_list);
6518	INIT_LIST_HEAD(&dev->adj_list.upper);
6519	INIT_LIST_HEAD(&dev->adj_list.lower);
6520	INIT_LIST_HEAD(&dev->all_adj_list.upper);
6521	INIT_LIST_HEAD(&dev->all_adj_list.lower);
6522	dev->priv_flags = IFF_XMIT_DST_RELEASE;
6523	setup(dev);
6524
6525	dev->num_tx_queues = txqs;
6526	dev->real_num_tx_queues = txqs;
6527	if (netif_alloc_netdev_queues(dev))
6528		goto free_all;
6529
6530#ifdef CONFIG_SYSFS
6531	dev->num_rx_queues = rxqs;
6532	dev->real_num_rx_queues = rxqs;
6533	if (netif_alloc_rx_queues(dev))
6534		goto free_all;
6535#endif
6536
6537	strcpy(dev->name, name);
6538	dev->group = INIT_NETDEV_GROUP;
6539	if (!dev->ethtool_ops)
6540		dev->ethtool_ops = &default_ethtool_ops;
6541	return dev;
6542
6543free_all:
6544	free_netdev(dev);
6545	return NULL;
6546
6547free_pcpu:
6548	free_percpu(dev->pcpu_refcnt);
6549free_dev:
6550	netdev_freemem(dev);
6551	return NULL;
6552}
6553EXPORT_SYMBOL(alloc_netdev_mqs);
6554
6555/**
6556 *	free_netdev - free network device
6557 *	@dev: device
6558 *
6559 *	This function does the last stage of destroying an allocated device
6560 * 	interface. The reference to the device object is released.
6561 *	If this is the last reference then it will be freed.
6562 */
6563void free_netdev(struct net_device *dev)
6564{
6565	struct napi_struct *p, *n;
6566
6567	release_net(dev_net(dev));
6568
6569	netif_free_tx_queues(dev);
6570#ifdef CONFIG_SYSFS
6571	kfree(dev->_rx);
6572#endif
6573
6574	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6575
6576	/* Flush device addresses */
6577	dev_addr_flush(dev);
6578
6579	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6580		netif_napi_del(p);
6581
6582	free_percpu(dev->pcpu_refcnt);
6583	dev->pcpu_refcnt = NULL;
6584
6585	/*  Compatibility with error handling in drivers */
6586	if (dev->reg_state == NETREG_UNINITIALIZED) {
6587		netdev_freemem(dev);
6588		return;
6589	}
6590
6591	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6592	dev->reg_state = NETREG_RELEASED;
6593
6594	/* will free via device release */
6595	put_device(&dev->dev);
6596}
6597EXPORT_SYMBOL(free_netdev);
6598
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Waits until packets that are being received right now have been
 *	processed; packets arriving afterwards are not held back.  May
 *	sleep.  The expedited RCU variant is used when the rtnl lock is
 *	currently held.
 */
void synchronize_net(void)
{
	might_sleep();
	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}
	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6614
6615/**
6616 *	unregister_netdevice_queue - remove device from the kernel
6617 *	@dev: device
6618 *	@head: list
6619 *
6620 *	This function shuts down a device interface and removes it
6621 *	from the kernel tables.
6622 *	If head not NULL, device is queued to be unregistered later.
6623 *
6624 *	Callers must hold the rtnl semaphore.  You may want
6625 *	unregister_netdev() instead of this.
6626 */
6627
6628void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6629{
6630	ASSERT_RTNL();
6631
6632	if (head) {
6633		list_move_tail(&dev->unreg_list, head);
6634	} else {
6635		rollback_registered(dev);
6636		/* Finish processing unregister after unlock */
6637		net_set_todo(dev);
6638	}
6639}
6640EXPORT_SYMBOL(unregister_netdevice_queue);
6641
6642/**
6643 *	unregister_netdevice_many - unregister many devices
6644 *	@head: list of devices
6645 *
6646 *  Note: As most callers use a stack allocated list_head,
6647 *  we force a list_del() to make sure stack wont be corrupted later.
6648 */
6649void unregister_netdevice_many(struct list_head *head)
6650{
6651	struct net_device *dev;
6652
6653	if (!list_empty(head)) {
6654		rollback_registered_many(head);
6655		list_for_each_entry(dev, head, unreg_list)
6656			net_set_todo(dev);
6657		list_del(head);
6658	}
6659}
6660EXPORT_SYMBOL(unregister_netdevice_many);
6661
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shuts down a device interface and removes it from the kernel
 *	tables.
 *
 *	Convenience wrapper that takes the rtnl semaphore around
 *	unregister_netdevice().  This is normally the function to call;
 *	use unregister_netdevice() directly only when the rtnl semaphore
 *	is already held.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6680
6681/**
6682 *	dev_change_net_namespace - move device to different nethost namespace
6683 *	@dev: device
6684 *	@net: network namespace
6685 *	@pat: If not NULL name pattern to try if the current device name
6686 *	      is already taken in the destination network namespace.
6687 *
6688 *	This function shuts down a device interface and moves it
6689 *	to a new network namespace. On success 0 is returned, on
6690 *	a failure a netagive errno code is returned.
6691 *
6692 *	Callers must hold the rtnl semaphore.
6693 */
6694
6695int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6696{
6697	int err;
6698
6699	ASSERT_RTNL();
6700
6701	/* Don't allow namespace local devices to be moved. */
6702	err = -EINVAL;
6703	if (dev->features & NETIF_F_NETNS_LOCAL)
6704		goto out;
6705
6706	/* Ensure the device has been registrered */
6707	if (dev->reg_state != NETREG_REGISTERED)
6708		goto out;
6709
6710	/* Get out if there is nothing todo */
6711	err = 0;
6712	if (net_eq(dev_net(dev), net))
6713		goto out;
6714
6715	/* Pick the destination device name, and ensure
6716	 * we can use it in the destination network namespace.
6717	 */
6718	err = -EEXIST;
6719	if (__dev_get_by_name(net, dev->name)) {
6720		/* We get here if we can't use the current device name */
6721		if (!pat)
6722			goto out;
6723		if (dev_get_valid_name(net, dev, pat) < 0)
6724			goto out;
6725	}
6726
6727	/*
6728	 * And now a mini version of register_netdevice unregister_netdevice.
6729	 */
6730
6731	/* If device is running close it first. */
6732	dev_close(dev);
6733
6734	/* And unlink it from device chain */
6735	err = -ENODEV;
6736	unlist_netdevice(dev);
6737
6738	synchronize_net();
6739
6740	/* Shutdown queueing discipline. */
6741	dev_shutdown(dev);
6742
6743	/* Notify protocols, that we are about to destroy
6744	   this device. They should clean all the things.
6745
6746	   Note that dev->reg_state stays at NETREG_REGISTERED.
6747	   This is wanted because this way 8021q and macvlan know
6748	   the device is just moving and can keep their slaves up.
6749	*/
6750	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6751	rcu_barrier();
6752	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6753	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
6754
6755	/*
6756	 *	Flush the unicast and multicast chains
6757	 */
6758	dev_uc_flush(dev);
6759	dev_mc_flush(dev);
6760
6761	/* Send a netdev-removed uevent to the old namespace */
6762	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6763
6764	/* Actually switch the network namespace */
6765	dev_net_set(dev, net);
6766
6767	/* If there is an ifindex conflict assign a new one */
6768	if (__dev_get_by_index(net, dev->ifindex)) {
6769		int iflink = (dev->iflink == dev->ifindex);
6770		dev->ifindex = dev_new_index(net);
6771		if (iflink)
6772			dev->iflink = dev->ifindex;
6773	}
6774
6775	/* Send a netdev-add uevent to the new namespace */
6776	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6777
6778	/* Fixup kobjects */
6779	err = device_rename(&dev->dev, dev->name);
6780	WARN_ON(err);
6781
6782	/* Add the device back in the hashes */
6783	list_netdevice(dev);
6784
6785	/* Notify protocols, that a new device appeared. */
6786	call_netdevice_notifiers(NETDEV_REGISTER, dev);
6787
6788	/*
6789	 *	Prevent userspace races by waiting until the network
6790	 *	device is fully setup before sending notifications.
6791	 */
6792	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6793
6794	synchronize_net();
6795	err = 0;
6796out:
6797	return err;
6798}
6799EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6800
6801static int dev_cpu_callback(struct notifier_block *nfb,
6802			    unsigned long action,
6803			    void *ocpu)
6804{
6805	struct sk_buff **list_skb;
6806	struct sk_buff *skb;
6807	unsigned int cpu, oldcpu = (unsigned long)ocpu;
6808	struct softnet_data *sd, *oldsd;
6809
6810	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6811		return NOTIFY_OK;
6812
6813	local_irq_disable();
6814	cpu = smp_processor_id();
6815	sd = &per_cpu(softnet_data, cpu);
6816	oldsd = &per_cpu(softnet_data, oldcpu);
6817
6818	/* Find end of our completion_queue. */
6819	list_skb = &sd->completion_queue;
6820	while (*list_skb)
6821		list_skb = &(*list_skb)->next;
6822	/* Append completion queue from offline CPU. */
6823	*list_skb = oldsd->completion_queue;
6824	oldsd->completion_queue = NULL;
6825
6826	/* Append output queue from offline CPU. */
6827	if (oldsd->output_queue) {
6828		*sd->output_queue_tailp = oldsd->output_queue;
6829		sd->output_queue_tailp = oldsd->output_queue_tailp;
6830		oldsd->output_queue = NULL;
6831		oldsd->output_queue_tailp = &oldsd->output_queue;
6832	}
6833	/* Append NAPI poll list from offline CPU. */
6834	if (!list_empty(&oldsd->poll_list)) {
6835		list_splice_init(&oldsd->poll_list, &sd->poll_list);
6836		raise_softirq_irqoff(NET_RX_SOFTIRQ);
6837	}
6838
6839	raise_softirq_irqoff(NET_TX_SOFTIRQ);
6840	local_irq_enable();
6841
6842	/* Process offline CPU's input_pkt_queue */
6843	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6844		netif_rx_internal(skb);
6845		input_queue_head_incr(oldsd);
6846	}
6847	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6848		netif_rx_internal(skb);
6849		input_queue_head_incr(oldsd);
6850	}
6851
6852	return NOTIFY_OK;
6853}
6854
6855
6856/**
6857 *	netdev_increment_features - increment feature set by one
6858 *	@all: current feature set
6859 *	@one: new feature set
6860 *	@mask: mask feature set
6861 *
6862 *	Computes a new feature set after adding a device with feature set
6863 *	@one to the master device with current feature set @all.  Will not
6864 *	enable anything that is off in @mask. Returns the new feature set.
6865 */
6866netdev_features_t netdev_increment_features(netdev_features_t all,
6867	netdev_features_t one, netdev_features_t mask)
6868{
6869	if (mask & NETIF_F_GEN_CSUM)
6870		mask |= NETIF_F_ALL_CSUM;
6871	mask |= NETIF_F_VLAN_CHALLENGED;
6872
6873	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6874	all &= one | ~NETIF_F_ALL_FOR_ALL;
6875
6876	/* If one device supports hw checksumming, set for all. */
6877	if (all & NETIF_F_GEN_CSUM)
6878		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6879
6880	return all;
6881}
6882EXPORT_SYMBOL(netdev_increment_features);
6883
6884static struct hlist_head * __net_init netdev_create_hash(void)
6885{
6886	int i;
6887	struct hlist_head *hash;
6888
6889	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6890	if (hash != NULL)
6891		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6892			INIT_HLIST_HEAD(&hash[i]);
6893
6894	return hash;
6895}
6896
6897/* Initialize per network namespace state */
6898static int __net_init netdev_init(struct net *net)
6899{
6900	if (net != &init_net)
6901		INIT_LIST_HEAD(&net->dev_base_head);
6902
6903	net->dev_name_head = netdev_create_hash();
6904	if (net->dev_name_head == NULL)
6905		goto err_name;
6906
6907	net->dev_index_head = netdev_create_hash();
6908	if (net->dev_index_head == NULL)
6909		goto err_idx;
6910
6911	return 0;
6912
6913err_idx:
6914	kfree(net->dev_name_head);
6915err_name:
6916	return -ENOMEM;
6917}
6918
6919/**
6920 *	netdev_drivername - network driver for the device
6921 *	@dev: network device
6922 *
6923 *	Determine network driver for device.
6924 */
6925const char *netdev_drivername(const struct net_device *dev)
6926{
6927	const struct device_driver *driver;
6928	const struct device *parent;
6929	const char *empty = "";
6930
6931	parent = dev->dev.parent;
6932	if (!parent)
6933		return empty;
6934
6935	driver = parent->driver;
6936	if (driver && driver->name)
6937		return driver->name;
6938	return empty;
6939}
6940
6941static int __netdev_printk(const char *level, const struct net_device *dev,
6942			   struct va_format *vaf)
6943{
6944	int r;
6945
6946	if (dev && dev->dev.parent) {
6947		r = dev_printk_emit(level[1] - '0',
6948				    dev->dev.parent,
6949				    "%s %s %s: %pV",
6950				    dev_driver_string(dev->dev.parent),
6951				    dev_name(dev->dev.parent),
6952				    netdev_name(dev), vaf);
6953	} else if (dev) {
6954		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6955	} else {
6956		r = printk("%s(NULL net_device): %pV", level, vaf);
6957	}
6958
6959	return r;
6960}
6961
6962int netdev_printk(const char *level, const struct net_device *dev,
6963		  const char *format, ...)
6964{
6965	struct va_format vaf;
6966	va_list args;
6967	int r;
6968
6969	va_start(args, format);
6970
6971	vaf.fmt = format;
6972	vaf.va = &args;
6973
6974	r = __netdev_printk(level, dev, &vaf);
6975
6976	va_end(args);
6977
6978	return r;
6979}
6980EXPORT_SYMBOL(netdev_printk);
6981
6982#define define_netdev_printk_level(func, level)			\
6983int func(const struct net_device *dev, const char *fmt, ...)	\
6984{								\
6985	int r;							\
6986	struct va_format vaf;					\
6987	va_list args;						\
6988								\
6989	va_start(args, fmt);					\
6990								\
6991	vaf.fmt = fmt;						\
6992	vaf.va = &args;						\
6993								\
6994	r = __netdev_printk(level, dev, &vaf);			\
6995								\
6996	va_end(args);						\
6997								\
6998	return r;						\
6999}								\
7000EXPORT_SYMBOL(func);
7001
7002define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7003define_netdev_printk_level(netdev_alert, KERN_ALERT);
7004define_netdev_printk_level(netdev_crit, KERN_CRIT);
7005define_netdev_printk_level(netdev_err, KERN_ERR);
7006define_netdev_printk_level(netdev_warn, KERN_WARNING);
7007define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7008define_netdev_printk_level(netdev_info, KERN_INFO);
7009
7010static void __net_exit netdev_exit(struct net *net)
7011{
7012	kfree(net->dev_name_head);
7013	kfree(net->dev_index_head);
7014}
7015
7016static struct pernet_operations __net_initdata netdev_net_ops = {
7017	.init = netdev_init,
7018	.exit = netdev_exit,
7019};
7020
7021static void __net_exit default_device_exit(struct net *net)
7022{
7023	struct net_device *dev, *aux;
7024	/*
7025	 * Push all migratable network devices back to the
7026	 * initial network namespace
7027	 */
7028	rtnl_lock();
7029	for_each_netdev_safe(net, dev, aux) {
7030		int err;
7031		char fb_name[IFNAMSIZ];
7032
7033		/* Ignore unmoveable devices (i.e. loopback) */
7034		if (dev->features & NETIF_F_NETNS_LOCAL)
7035			continue;
7036
7037		/* Leave virtual devices for the generic cleanup */
7038		if (dev->rtnl_link_ops)
7039			continue;
7040
7041		/* Push remaining network devices to init_net */
7042		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7043		err = dev_change_net_namespace(dev, &init_net, fb_name);
7044		if (err) {
7045			pr_emerg("%s: failed to move %s to init_net: %d\n",
7046				 __func__, dev->name, err);
7047			BUG();
7048		}
7049	}
7050	rtnl_unlock();
7051}
7052
7053static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7054{
7055	/* Return with the rtnl_lock held when there are no network
7056	 * devices unregistering in any network namespace in net_list.
7057	 */
7058	struct net *net;
7059	bool unregistering;
7060	DEFINE_WAIT(wait);
7061
7062	for (;;) {
7063		prepare_to_wait(&netdev_unregistering_wq, &wait,
7064				TASK_UNINTERRUPTIBLE);
7065		unregistering = false;
7066		rtnl_lock();
7067		list_for_each_entry(net, net_list, exit_list) {
7068			if (net->dev_unreg_count > 0) {
7069				unregistering = true;
7070				break;
7071			}
7072		}
7073		if (!unregistering)
7074			break;
7075		__rtnl_unlock();
7076		schedule();
7077	}
7078	finish_wait(&netdev_unregistering_wq, &wait);
7079}
7080
7081static void __net_exit default_device_exit_batch(struct list_head *net_list)
7082{
7083	/* At exit all network devices most be removed from a network
7084	 * namespace.  Do this in the reverse order of registration.
7085	 * Do this across as many network namespaces as possible to
7086	 * improve batching efficiency.
7087	 */
7088	struct net_device *dev;
7089	struct net *net;
7090	LIST_HEAD(dev_kill_list);
7091
7092	/* To prevent network device cleanup code from dereferencing
7093	 * loopback devices or network devices that have been freed
7094	 * wait here for all pending unregistrations to complete,
7095	 * before unregistring the loopback device and allowing the
7096	 * network namespace be freed.
7097	 *
7098	 * The netdev todo list containing all network devices
7099	 * unregistrations that happen in default_device_exit_batch
7100	 * will run in the rtnl_unlock() at the end of
7101	 * default_device_exit_batch.
7102	 */
7103	rtnl_lock_unregistering(net_list);
7104	list_for_each_entry(net, net_list, exit_list) {
7105		for_each_netdev_reverse(net, dev) {
7106			if (dev->rtnl_link_ops)
7107				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7108			else
7109				unregister_netdevice_queue(dev, &dev_kill_list);
7110		}
7111	}
7112	unregister_netdevice_many(&dev_kill_list);
7113	rtnl_unlock();
7114}
7115
7116static struct pernet_operations __net_initdata default_device_ops = {
7117	.exit = default_device_exit,
7118	.exit_batch = default_device_exit_batch,
7119};
7120
7121/*
7122 *	Initialize the DEV module. At boot time this walks the device list and
7123 *	unhooks any devices that fail to initialise (normally hardware not
7124 *	present) and leaves us with a valid list of present and active devices.
7125 *
7126 */
7127
7128/*
7129 *       This is called single threaded during boot, so no need
7130 *       to take the rtnl semaphore.
7131 */
7132static int __init net_dev_init(void)
7133{
7134	int i, rc = -ENOMEM;
7135
7136	BUG_ON(!dev_boot_phase);
7137
7138	if (dev_proc_init())
7139		goto out;
7140
7141	if (netdev_kobject_init())
7142		goto out;
7143
7144	INIT_LIST_HEAD(&ptype_all);
7145	for (i = 0; i < PTYPE_HASH_SIZE; i++)
7146		INIT_LIST_HEAD(&ptype_base[i]);
7147
7148	INIT_LIST_HEAD(&offload_base);
7149
7150	if (register_pernet_subsys(&netdev_net_ops))
7151		goto out;
7152
7153	/*
7154	 *	Initialise the packet receive queues.
7155	 */
7156
7157	for_each_possible_cpu(i) {
7158		struct softnet_data *sd = &per_cpu(softnet_data, i);
7159
7160		skb_queue_head_init(&sd->input_pkt_queue);
7161		skb_queue_head_init(&sd->process_queue);
7162		INIT_LIST_HEAD(&sd->poll_list);
7163		sd->output_queue_tailp = &sd->output_queue;
7164#ifdef CONFIG_RPS
7165		sd->csd.func = rps_trigger_softirq;
7166		sd->csd.info = sd;
7167		sd->cpu = i;
7168#endif
7169
7170		sd->backlog.poll = process_backlog;
7171		sd->backlog.weight = weight_p;
7172	}
7173
7174	dev_boot_phase = 0;
7175
7176	/* The loopback device is special if any other network devices
7177	 * is present in a network namespace the loopback device must
7178	 * be present. Since we now dynamically allocate and free the
7179	 * loopback device ensure this invariant is maintained by
7180	 * keeping the loopback device as the first device on the
7181	 * list of network devices.  Ensuring the loopback devices
7182	 * is the first device that appears and the last network device
7183	 * that disappears.
7184	 */
7185	if (register_pernet_device(&loopback_net_ops))
7186		goto out;
7187
7188	if (register_pernet_device(&default_device_ops))
7189		goto out;
7190
7191	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7192	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7193
7194	hotcpu_notifier(dev_cpu_callback, 0);
7195	dst_init();
7196	rc = 0;
7197out:
7198	return rc;
7199}
7200
7201subsys_initcall(net_dev_init);