net/core/dev.c at v3.14 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / core / dev.c
at v3.14 179 kB view raw
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <linux/bitops.h>
  77#include <linux/capability.h>
  78#include <linux/cpu.h>
  79#include <linux/types.h>
  80#include <linux/kernel.h>
  81#include <linux/hash.h>
  82#include <linux/slab.h>
  83#include <linux/sched.h>
  84#include <linux/mutex.h>
  85#include <linux/string.h>
  86#include <linux/mm.h>
  87#include <linux/socket.h>
  88#include <linux/sockios.h>
  89#include <linux/errno.h>
  90#include <linux/interrupt.h>
  91#include <linux/if_ether.h>
  92#include <linux/netdevice.h>
  93#include <linux/etherdevice.h>
  94#include <linux/ethtool.h>
  95#include <linux/notifier.h>
  96#include <linux/skbuff.h>
  97#include <net/net_namespace.h>
  98#include <net/sock.h>
  99#include <linux/rtnetlink.h>
 100#include <linux/stat.h>
 101#include <net/dst.h>
 102#include <net/pkt_sched.h>
 103#include <net/checksum.h>
 104#include <net/xfrm.h>
 105#include <linux/highmem.h>
 106#include <linux/init.h>
 107#include <linux/module.h>
 108#include <linux/netpoll.h>
 109#include <linux/rcupdate.h>
 110#include <linux/delay.h>
 111#include <net/iw_handler.h>
 112#include <asm/current.h>
 113#include <linux/audit.h>
 114#include <linux/dmaengine.h>
 115#include <linux/err.h>
 116#include <linux/ctype.h>
 117#include <linux/if_arp.h>
 118#include <linux/if_vlan.h>
 119#include <linux/ip.h>
 120#include <net/ip.h>
 121#include <linux/ipv6.h>
 122#include <linux/in.h>
 123#include <linux/jhash.h>
 124#include <linux/random.h>
 125#include <trace/events/napi.h>
 126#include <trace/events/net.h>
 127#include <trace/events/skb.h>
 128#include <linux/pci.h>
 129#include <linux/inetdevice.h>
 130#include <linux/cpu_rmap.h>
 131#include <linux/static_key.h>
 132#include <linux/hashtable.h>
 133#include <linux/vmalloc.h>
 134#include <linux/if_macvlan.h>
 135
 136#include "net-sysfs.h"
 137
/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

/* Serializes updates of the ptype_base[] / ptype_all handler lists. */
static DEFINE_SPINLOCK(ptype_lock);
/* Serializes updates of the offload_base list. */
static DEFINE_SPINLOCK(offload_lock);
/* Packet handlers, bucketed by ntohs(type) & PTYPE_HASH_MASK (see ptype_head()). */
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
/* Handlers registered for ETH_P_ALL. */
struct list_head ptype_all __read_mostly;	/* Taps */
/* Registered packet_offload handlers. */
static struct list_head offload_base __read_mostly;

/* Forward declaration; the definition is further down in this file. */
static int netif_rx_internal(struct sk_buff *skb);
 151
 152/*
 153 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 154 * semaphore.
 155 *
 156 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 157 *
 158 * Writers must hold the rtnl semaphore while they loop through the
 159 * dev_base_head list, and hold dev_base_lock for writing when they do the
 160 * actual updates.  This allows pure readers to access the list even
 161 * while a writer is preparing to update it.
 162 *
 163 * To put it another way, dev_base_lock is held for writing only to
 164 * protect against pure readers; the rtnl semaphore provides the
 165 * protection against other writers.
 166 *
 167 * See, for example usages, register_netdevice() and
 168 * unregister_netdevice(), which must be called with the rtnl
 169 * semaphore held.
 170 */
/* rwlock for the device lists; taken for writing by list/unlist_netdevice(). */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

/* NOTE(review): presumably the most recent NAPI id handed out; users are outside this chunk. */
static unsigned int napi_gen_id;
/* Hash table declared with 8 hash bits (second DEFINE_HASHTABLE argument). */
static DEFINE_HASHTABLE(napi_hash, 8);

/*
 * Sequence counter sampled by netdev_get_name(): the reader redoes its name
 * copy when the counter changed underneath it.
 * NOTE(review): the writer side (presumably the rename path) is not visible
 * in this chunk - confirm.
 */
static seqcount_t devnet_rename_seq;
 181
 182static inline void dev_base_seq_inc(struct net *net)
 183{
 184	while (++net->dev_base_seq == 0);
 185}
 186
 187static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 188{
 189	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 190
 191	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 192}
 193
 194static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 195{
 196	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 197}
 198
 199static inline void rps_lock(struct softnet_data *sd)
 200{
 201#ifdef CONFIG_RPS
 202	spin_lock(&sd->input_pkt_queue.lock);
 203#endif
 204}
 205
 206static inline void rps_unlock(struct softnet_data *sd)
 207{
 208#ifdef CONFIG_RPS
 209	spin_unlock(&sd->input_pkt_queue.lock);
 210#endif
 211}
 212
 213/* Device list insertion */
 214static void list_netdevice(struct net_device *dev)
 215{
 216	struct net *net = dev_net(dev);
 217
 218	ASSERT_RTNL();
 219
 220	write_lock_bh(&dev_base_lock);
 221	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 222	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 223	hlist_add_head_rcu(&dev->index_hlist,
 224			   dev_index_hash(net, dev->ifindex));
 225	write_unlock_bh(&dev_base_lock);
 226
 227	dev_base_seq_inc(net);
 228}
 229
 230/* Device list removal
 231 * caller must respect a RCU grace period before freeing/reusing dev
 232 */
 233static void unlist_netdevice(struct net_device *dev)
 234{
 235	ASSERT_RTNL();
 236
 237	/* Unlink dev from the device chain */
 238	write_lock_bh(&dev_base_lock);
 239	list_del_rcu(&dev->dev_list);
 240	hlist_del_rcu(&dev->name_hlist);
 241	hlist_del_rcu(&dev->index_hlist);
 242	write_unlock_bh(&dev_base_lock);
 243
 244	dev_base_seq_inc(dev_net(dev));
 245}
 246
/*
 *	Our notifier list
 *
 *	Declared as a raw notifier chain.
 *	NOTE(review): raw chains leave locking to the caller; presumably
 *	RTNL serializes users here - confirm against the registration code,
 *	which is outside this chunk.
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 *
 *	One cache-aligned instance per CPU; the per-CPU symbol is exported.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
 260
 261#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 *
 * netdev_lock_type[] and netdev_lock_name[] are parallel tables: entry i of
 * the name table is the lockdep name used for the ARPHRD_* type at entry i.
 * The two tables must be kept the same length and in the same order.
 * The final entry (ARPHRD_NONE) doubles as the fallback for device types
 * that are not listed; see netdev_lock_pos().
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

/* Lockdep class names, index-matched with netdev_lock_type[]. */
static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

/* One lock class key per table entry, for the xmit lock and the address-list lock. */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 302
 303static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 304{
 305	int i;
 306
 307	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 308		if (netdev_lock_type[i] == dev_type)
 309			return i;
 310	/* the last key is used by default */
 311	return ARRAY_SIZE(netdev_lock_type) - 1;
 312}
 313
 314static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 315						 unsigned short dev_type)
 316{
 317	int i;
 318
 319	i = netdev_lock_pos(dev_type);
 320	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 321				   netdev_lock_name[i]);
 322}
 323
 324static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 325{
 326	int i;
 327
 328	i = netdev_lock_pos(dev->type);
 329	lockdep_set_class_and_name(&dev->addr_list_lock,
 330				   &netdev_addr_lock_key[i],
 331				   netdev_lock_name[i]);
 332}
 333#else
/* Without CONFIG_LOCKDEP there are no lock classes to assign: both are no-ops. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
 341#endif
 342
 343/*******************************************************************************
 344
 345		Protocol management and registration routines
 346
 347*******************************************************************************/
 348
/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers that mangle input packets
 *	MUST BE last in their hash buckets, and the walk over protocol
 *	handlers MUST start from the promiscuous ptype_all chain in net_bh.
 *	This is true now; do not change it.
 *	Explanation: if a protocol handler that mangles packets were
 *	first on the list, it could not tell that the packet is cloned
 *	and should be copied on write, so it would modify the packet
 *	and subsequent readers would get a broken packet.
 *							--ANK (980803)
 */
 364
 365static inline struct list_head *ptype_head(const struct packet_type *pt)
 366{
 367	if (pt->type == htons(ETH_P_ALL))
 368		return &ptype_all;
 369	else
 370		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 371}
 372
 373/**
 374 *	dev_add_pack - add packet handler
 375 *	@pt: packet type declaration
 376 *
 377 *	Add a protocol handler to the networking stack. The passed &packet_type
 378 *	is linked into kernel lists and may not be freed until it has been
 379 *	removed from the kernel lists.
 380 *
 381 *	This call does not sleep therefore it can not
 382 *	guarantee all CPU's that are in middle of receiving packets
 383 *	will see the new packet type (until the next received packet).
 384 */
 385
 386void dev_add_pack(struct packet_type *pt)
 387{
 388	struct list_head *head = ptype_head(pt);
 389
 390	spin_lock(&ptype_lock);
 391	list_add_rcu(&pt->list, head);
 392	spin_unlock(&ptype_lock);
 393}
 394EXPORT_SYMBOL(dev_add_pack);
 395
 396/**
 397 *	__dev_remove_pack	 - remove packet handler
 398 *	@pt: packet type declaration
 399 *
 400 *	Remove a protocol handler that was previously added to the kernel
 401 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 402 *	from the kernel lists and can be freed or reused once this function
 403 *	returns.
 404 *
 405 *      The packet type might still be in use by receivers
 406 *	and must not be freed until after all the CPU's have gone
 407 *	through a quiescent state.
 408 */
 409void __dev_remove_pack(struct packet_type *pt)
 410{
 411	struct list_head *head = ptype_head(pt);
 412	struct packet_type *pt1;
 413
 414	spin_lock(&ptype_lock);
 415
 416	list_for_each_entry(pt1, head, list) {
 417		if (pt == pt1) {
 418			list_del_rcu(&pt->list);
 419			goto out;
 420		}
 421	}
 422
 423	pr_warn("dev_remove_pack: %p not found\n", pt);
 424out:
 425	spin_unlock(&ptype_lock);
 426}
 427EXPORT_SYMBOL(__dev_remove_pack);
 428
 429/**
 430 *	dev_remove_pack	 - remove packet handler
 431 *	@pt: packet type declaration
 432 *
 433 *	Remove a protocol handler that was previously added to the kernel
 434 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 435 *	from the kernel lists and can be freed or reused once this function
 436 *	returns.
 437 *
 438 *	This call sleeps to guarantee that no CPU is looking at the packet
 439 *	type after return.
 440 */
 441void dev_remove_pack(struct packet_type *pt)
 442{
 443	__dev_remove_pack(pt);
 444
 445	synchronize_net();
 446}
 447EXPORT_SYMBOL(dev_remove_pack);
 448
 449
 450/**
 451 *	dev_add_offload - register offload handlers
 452 *	@po: protocol offload declaration
 453 *
 454 *	Add protocol offload handlers to the networking stack. The passed
 455 *	&proto_offload is linked into kernel lists and may not be freed until
 456 *	it has been removed from the kernel lists.
 457 *
 458 *	This call does not sleep therefore it can not
 459 *	guarantee all CPU's that are in middle of receiving packets
 460 *	will see the new offload handlers (until the next received packet).
 461 */
 462void dev_add_offload(struct packet_offload *po)
 463{
 464	struct list_head *head = &offload_base;
 465
 466	spin_lock(&offload_lock);
 467	list_add_rcu(&po->list, head);
 468	spin_unlock(&offload_lock);
 469}
 470EXPORT_SYMBOL(dev_add_offload);
 471
 472/**
 473 *	__dev_remove_offload	 - remove offload handler
 474 *	@po: packet offload declaration
 475 *
 476 *	Remove a protocol offload handler that was previously added to the
 477 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 478 *	is removed from the kernel lists and can be freed or reused once this
 479 *	function returns.
 480 *
 481 *      The packet type might still be in use by receivers
 482 *	and must not be freed until after all the CPU's have gone
 483 *	through a quiescent state.
 484 */
 485static void __dev_remove_offload(struct packet_offload *po)
 486{
 487	struct list_head *head = &offload_base;
 488	struct packet_offload *po1;
 489
 490	spin_lock(&offload_lock);
 491
 492	list_for_each_entry(po1, head, list) {
 493		if (po == po1) {
 494			list_del_rcu(&po->list);
 495			goto out;
 496		}
 497	}
 498
 499	pr_warn("dev_remove_offload: %p not found\n", po);
 500out:
 501	spin_unlock(&offload_lock);
 502}
 503
 504/**
 505 *	dev_remove_offload	 - remove packet offload handler
 506 *	@po: packet offload declaration
 507 *
 508 *	Remove a packet offload handler that was previously added to the kernel
 509 *	offload handlers by dev_add_offload(). The passed &offload_type is
 510 *	removed from the kernel lists and can be freed or reused once this
 511 *	function returns.
 512 *
 513 *	This call sleeps to guarantee that no CPU is looking at the packet
 514 *	type after return.
 515 */
 516void dev_remove_offload(struct packet_offload *po)
 517{
 518	__dev_remove_offload(po);
 519
 520	synchronize_net();
 521}
 522EXPORT_SYMBOL(dev_remove_offload);
 523
 524/******************************************************************************
 525
 526		      Device Boot-time Settings Routines
 527
 528*******************************************************************************/
 529
/*
 * Boot time configuration table.
 * A slot whose name starts with '\0' or ' ' is treated as free by
 * netdev_boot_setup_add() and skipped by netdev_boot_setup_check().
 */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 532
 533/**
 534 *	netdev_boot_setup_add	- add new setup entry
 535 *	@name: name of the device
 536 *	@map: configured settings for the device
 537 *
 538 *	Adds new setup entry to the dev_boot_setup list.  The function
 539 *	returns 0 on error and 1 on success.  This is a generic routine to
 540 *	all netdevices.
 541 */
 542static int netdev_boot_setup_add(char *name, struct ifmap *map)
 543{
 544	struct netdev_boot_setup *s;
 545	int i;
 546
 547	s = dev_boot_setup;
 548	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 549		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 550			memset(s[i].name, 0, sizeof(s[i].name));
 551			strlcpy(s[i].name, name, IFNAMSIZ);
 552			memcpy(&s[i].map, map, sizeof(s[i].map));
 553			break;
 554		}
 555	}
 556
 557	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 558}
 559
 560/**
 561 *	netdev_boot_setup_check	- check boot time settings
 562 *	@dev: the netdevice
 563 *
 564 * 	Check boot time settings for the device.
 565 *	The found settings are set for the device to be used
 566 *	later in the device probing.
 567 *	Returns 0 if no settings found, 1 if they are.
 568 */
 569int netdev_boot_setup_check(struct net_device *dev)
 570{
 571	struct netdev_boot_setup *s = dev_boot_setup;
 572	int i;
 573
 574	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 575		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 576		    !strcmp(dev->name, s[i].name)) {
 577			dev->irq 	= s[i].map.irq;
 578			dev->base_addr 	= s[i].map.base_addr;
 579			dev->mem_start 	= s[i].map.mem_start;
 580			dev->mem_end 	= s[i].map.mem_end;
 581			return 1;
 582		}
 583	}
 584	return 0;
 585}
 586EXPORT_SYMBOL(netdev_boot_setup_check);
 587
 588
 589/**
 590 *	netdev_boot_base	- get address from boot time settings
 591 *	@prefix: prefix for network device
 592 *	@unit: id for network device
 593 *
 594 * 	Check boot time settings for the base address of device.
 595 *	The found settings are set for the device to be used
 596 *	later in the device probing.
 597 *	Returns 0 if no settings found.
 598 */
 599unsigned long netdev_boot_base(const char *prefix, int unit)
 600{
 601	const struct netdev_boot_setup *s = dev_boot_setup;
 602	char name[IFNAMSIZ];
 603	int i;
 604
 605	sprintf(name, "%s%d", prefix, unit);
 606
 607	/*
 608	 * If device already registered then return base of 1
 609	 * to indicate not to probe for this interface
 610	 */
 611	if (__dev_get_by_name(&init_net, name))
 612		return 1;
 613
 614	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 615		if (!strcmp(name, s[i].name))
 616			return s[i].map.base_addr;
 617	return 0;
 618}
 619
 620/*
 621 * Saves at boot time configured settings for any netdevice.
 622 */
 623int __init netdev_boot_setup(char *str)
 624{
 625	int ints[5];
 626	struct ifmap map;
 627
 628	str = get_options(str, ARRAY_SIZE(ints), ints);
 629	if (!str || !*str)
 630		return 0;
 631
 632	/* Save settings */
 633	memset(&map, 0, sizeof(map));
 634	if (ints[0] > 0)
 635		map.irq = ints[1];
 636	if (ints[0] > 1)
 637		map.base_addr = ints[2];
 638	if (ints[0] > 2)
 639		map.mem_start = ints[3];
 640	if (ints[0] > 3)
 641		map.mem_end = ints[4];
 642
 643	/* Add new entry to the list */
 644	return netdev_boot_setup_add(str, &map);
 645}
 646
 647__setup("netdev=", netdev_boot_setup);
 648
 649/*******************************************************************************
 650
 651			    Device Interface Subroutines
 652
 653*******************************************************************************/
 654
 655/**
 656 *	__dev_get_by_name	- find a device by its name
 657 *	@net: the applicable net namespace
 658 *	@name: name to find
 659 *
 660 *	Find an interface by name. Must be called under RTNL semaphore
 661 *	or @dev_base_lock. If the name is found a pointer to the device
 662 *	is returned. If the name is not found then %NULL is returned. The
 663 *	reference counters are not incremented so the caller must be
 664 *	careful with locks.
 665 */
 666
 667struct net_device *__dev_get_by_name(struct net *net, const char *name)
 668{
 669	struct net_device *dev;
 670	struct hlist_head *head = dev_name_hash(net, name);
 671
 672	hlist_for_each_entry(dev, head, name_hlist)
 673		if (!strncmp(dev->name, name, IFNAMSIZ))
 674			return dev;
 675
 676	return NULL;
 677}
 678EXPORT_SYMBOL(__dev_get_by_name);
 679
 680/**
 681 *	dev_get_by_name_rcu	- find a device by its name
 682 *	@net: the applicable net namespace
 683 *	@name: name to find
 684 *
 685 *	Find an interface by name.
 686 *	If the name is found a pointer to the device is returned.
 687 * 	If the name is not found then %NULL is returned.
 688 *	The reference counters are not incremented so the caller must be
 689 *	careful with locks. The caller must hold RCU lock.
 690 */
 691
 692struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 693{
 694	struct net_device *dev;
 695	struct hlist_head *head = dev_name_hash(net, name);
 696
 697	hlist_for_each_entry_rcu(dev, head, name_hlist)
 698		if (!strncmp(dev->name, name, IFNAMSIZ))
 699			return dev;
 700
 701	return NULL;
 702}
 703EXPORT_SYMBOL(dev_get_by_name_rcu);
 704
 705/**
 706 *	dev_get_by_name		- find a device by its name
 707 *	@net: the applicable net namespace
 708 *	@name: name to find
 709 *
 710 *	Find an interface by name. This can be called from any
 711 *	context and does its own locking. The returned handle has
 712 *	the usage count incremented and the caller must use dev_put() to
 713 *	release it when it is no longer needed. %NULL is returned if no
 714 *	matching device is found.
 715 */
 716
 717struct net_device *dev_get_by_name(struct net *net, const char *name)
 718{
 719	struct net_device *dev;
 720
 721	rcu_read_lock();
 722	dev = dev_get_by_name_rcu(net, name);
 723	if (dev)
 724		dev_hold(dev);
 725	rcu_read_unlock();
 726	return dev;
 727}
 728EXPORT_SYMBOL(dev_get_by_name);
 729
 730/**
 731 *	__dev_get_by_index - find a device by its ifindex
 732 *	@net: the applicable net namespace
 733 *	@ifindex: index of device
 734 *
 735 *	Search for an interface by index. Returns %NULL if the device
 736 *	is not found or a pointer to the device. The device has not
 737 *	had its reference counter increased so the caller must be careful
 738 *	about locking. The caller must hold either the RTNL semaphore
 739 *	or @dev_base_lock.
 740 */
 741
 742struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 743{
 744	struct net_device *dev;
 745	struct hlist_head *head = dev_index_hash(net, ifindex);
 746
 747	hlist_for_each_entry(dev, head, index_hlist)
 748		if (dev->ifindex == ifindex)
 749			return dev;
 750
 751	return NULL;
 752}
 753EXPORT_SYMBOL(__dev_get_by_index);
 754
 755/**
 756 *	dev_get_by_index_rcu - find a device by its ifindex
 757 *	@net: the applicable net namespace
 758 *	@ifindex: index of device
 759 *
 760 *	Search for an interface by index. Returns %NULL if the device
 761 *	is not found or a pointer to the device. The device has not
 762 *	had its reference counter increased so the caller must be careful
 763 *	about locking. The caller must hold RCU lock.
 764 */
 765
 766struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 767{
 768	struct net_device *dev;
 769	struct hlist_head *head = dev_index_hash(net, ifindex);
 770
 771	hlist_for_each_entry_rcu(dev, head, index_hlist)
 772		if (dev->ifindex == ifindex)
 773			return dev;
 774
 775	return NULL;
 776}
 777EXPORT_SYMBOL(dev_get_by_index_rcu);
 778
 779
 780/**
 781 *	dev_get_by_index - find a device by its ifindex
 782 *	@net: the applicable net namespace
 783 *	@ifindex: index of device
 784 *
 785 *	Search for an interface by index. Returns NULL if the device
 786 *	is not found or a pointer to the device. The device returned has
 787 *	had a reference added and the pointer is safe until the user calls
 788 *	dev_put to indicate they have finished with it.
 789 */
 790
 791struct net_device *dev_get_by_index(struct net *net, int ifindex)
 792{
 793	struct net_device *dev;
 794
 795	rcu_read_lock();
 796	dev = dev_get_by_index_rcu(net, ifindex);
 797	if (dev)
 798		dev_hold(dev);
 799	rcu_read_unlock();
 800	return dev;
 801}
 802EXPORT_SYMBOL(dev_get_by_index);
 803
 804/**
 805 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 806 *	@net: network namespace
 807 *	@name: a pointer to the buffer where the name will be stored.
 808 *	@ifindex: the ifindex of the interface to get the name from.
 809 *
 810 *	The use of raw_seqcount_begin() and cond_resched() before
 811 *	retrying is required as we want to give the writers a chance
 812 *	to complete when CONFIG_PREEMPT is not set.
 813 */
 814int netdev_get_name(struct net *net, char *name, int ifindex)
 815{
 816	struct net_device *dev;
 817	unsigned int seq;
 818
 819retry:
 820	seq = raw_seqcount_begin(&devnet_rename_seq);
 821	rcu_read_lock();
 822	dev = dev_get_by_index_rcu(net, ifindex);
 823	if (!dev) {
 824		rcu_read_unlock();
 825		return -ENODEV;
 826	}
 827
 828	strcpy(name, dev->name);
 829	rcu_read_unlock();
 830	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 831		cond_resched();
 832		goto retry;
 833	}
 834
 835	return 0;
 836}
 837
 838/**
 839 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 840 *	@net: the applicable net namespace
 841 *	@type: media type of device
 842 *	@ha: hardware address
 843 *
 844 *	Search for an interface by MAC address. Returns NULL if the device
 845 *	is not found or a pointer to the device.
 846 *	The caller must hold RCU or RTNL.
 847 *	The returned device has not had its ref count increased
 848 *	and the caller must therefore be careful about locking
 849 *
 850 */
 851
 852struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 853				       const char *ha)
 854{
 855	struct net_device *dev;
 856
 857	for_each_netdev_rcu(net, dev)
 858		if (dev->type == type &&
 859		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 860			return dev;
 861
 862	return NULL;
 863}
 864EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 865
 866struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 867{
 868	struct net_device *dev;
 869
 870	ASSERT_RTNL();
 871	for_each_netdev(net, dev)
 872		if (dev->type == type)
 873			return dev;
 874
 875	return NULL;
 876}
 877EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 878
 879struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 880{
 881	struct net_device *dev, *ret = NULL;
 882
 883	rcu_read_lock();
 884	for_each_netdev_rcu(net, dev)
 885		if (dev->type == type) {
 886			dev_hold(dev);
 887			ret = dev;
 888			break;
 889		}
 890	rcu_read_unlock();
 891	return ret;
 892}
 893EXPORT_SYMBOL(dev_getfirstbyhwtype);
 894
 895/**
 896 *	dev_get_by_flags_rcu - find any device with given flags
 897 *	@net: the applicable net namespace
 898 *	@if_flags: IFF_* values
 899 *	@mask: bitmask of bits in if_flags to check
 900 *
 901 *	Search for any interface with the given flags. Returns NULL if a device
 902 *	is not found or a pointer to the device. Must be called inside
 903 *	rcu_read_lock(), and result refcount is unchanged.
 904 */
 905
 906struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 907				    unsigned short mask)
 908{
 909	struct net_device *dev, *ret;
 910
 911	ret = NULL;
 912	for_each_netdev_rcu(net, dev) {
 913		if (((dev->flags ^ if_flags) & mask) == 0) {
 914			ret = dev;
 915			break;
 916		}
 917	}
 918	return ret;
 919}
 920EXPORT_SYMBOL(dev_get_by_flags_rcu);
 921
 922/**
 923 *	dev_valid_name - check if name is okay for network device
 924 *	@name: name string
 925 *
 926 *	Network device names need to be valid file names to
 927 *	to allow sysfs to work.  We also disallow any kind of
 928 *	whitespace.
 929 */
 930bool dev_valid_name(const char *name)
 931{
 932	if (*name == '\0')
 933		return false;
 934	if (strlen(name) >= IFNAMSIZ)
 935		return false;
 936	if (!strcmp(name, ".") || !strcmp(name, ".."))
 937		return false;
 938
 939	while (*name) {
 940		if (*name == '/' || isspace(*name))
 941			return false;
 942		name++;
 943	}
 944	return true;
 945}
 946EXPORT_SYMBOL(dev_valid_name);
 947
 948/**
 949 *	__dev_alloc_name - allocate a name for a device
 950 *	@net: network namespace to allocate the device name in
 951 *	@name: name format string
 952 *	@buf:  scratch buffer and result name string
 953 *
 954 *	Passed a format string - eg "lt%d" it will try and find a suitable
 955 *	id. It scans list of devices to build up a free map, then chooses
 956 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 957 *	while allocating the name and adding the device in order to avoid
 958 *	duplicates.
 959 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 960 *	Returns the number of the unit assigned or a negative errno code.
 961 */
 962
 963static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 964{
 965	int i = 0;
 966	const char *p;
 967	const int max_netdevices = 8*PAGE_SIZE;
 968	unsigned long *inuse;
 969	struct net_device *d;
 970
 971	p = strnchr(name, IFNAMSIZ-1, '%');
 972	if (p) {
 973		/*
 974		 * Verify the string as this thing may have come from
 975		 * the user.  There must be either one "%d" and no other "%"
 976		 * characters.
 977		 */
 978		if (p[1] != 'd' || strchr(p + 2, '%'))
 979			return -EINVAL;
 980
 981		/* Use one page as a bit array of possible slots */
 982		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 983		if (!inuse)
 984			return -ENOMEM;
 985
 986		for_each_netdev(net, d) {
 987			if (!sscanf(d->name, name, &i))
 988				continue;
 989			if (i < 0 || i >= max_netdevices)
 990				continue;
 991
 992			/*  avoid cases where sscanf is not exact inverse of printf */
 993			snprintf(buf, IFNAMSIZ, name, i);
 994			if (!strncmp(buf, d->name, IFNAMSIZ))
 995				set_bit(i, inuse);
 996		}
 997
 998		i = find_first_zero_bit(inuse, max_netdevices);
 999		free_page((unsigned long) inuse);
1000	}
1001
1002	if (buf != name)
1003		snprintf(buf, IFNAMSIZ, name, i);
1004	if (!__dev_get_by_name(net, buf))
1005		return i;
1006
1007	/* It is possible to run out of possible slots
1008	 * when the name is long and there isn't enough space left
1009	 * for the digits, or if all bits are used.
1010	 */
1011	return -ENFILE;
1012}
1013
1014/**
1015 *	dev_alloc_name - allocate a name for a device
1016 *	@dev: device
1017 *	@name: name format string
1018 *
1019 *	Passed a format string - eg "lt%d" it will try and find a suitable
1020 *	id. It scans list of devices to build up a free map, then chooses
1021 *	the first empty slot. The caller must hold the dev_base or rtnl lock
1022 *	while allocating the name and adding the device in order to avoid
1023 *	duplicates.
1024 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1025 *	Returns the number of the unit assigned or a negative errno code.
1026 */
1027
1028int dev_alloc_name(struct net_device *dev, const char *name)
1029{
1030	char buf[IFNAMSIZ];
1031	struct net *net;
1032	int ret;
1033
1034	BUG_ON(!dev_net(dev));
1035	net = dev_net(dev);
1036	ret = __dev_alloc_name(net, name, buf);
1037	if (ret >= 0)
1038		strlcpy(dev->name, buf, IFNAMSIZ);
1039	return ret;
1040}
1041EXPORT_SYMBOL(dev_alloc_name);
1042
1043static int dev_alloc_name_ns(struct net *net,
1044			     struct net_device *dev,
1045			     const char *name)
1046{
1047	char buf[IFNAMSIZ];
1048	int ret;
1049
1050	ret = __dev_alloc_name(net, name, buf);
1051	if (ret >= 0)
1052		strlcpy(dev->name, buf, IFNAMSIZ);
1053	return ret;
1054}
1055
1056static int dev_get_valid_name(struct net *net,
1057			      struct net_device *dev,
1058			      const char *name)
1059{
1060	BUG_ON(!net);
1061
1062	if (!dev_valid_name(name))
1063		return -EINVAL;
1064
1065	if (strchr(name, '%'))
1066		return dev_alloc_name_ns(net, dev, name);
1067	else if (__dev_get_by_name(net, name))
1068		return -EEXIST;
1069	else if (dev->name != name)
1070		strlcpy(dev->name, name, IFNAMSIZ);
1071
1072	return 0;
1073}
1074
1075/**
1076 *	dev_change_name - change name of a device
1077 *	@dev: device
1078 *	@newname: name (or format string) must be at least IFNAMSIZ
1079 *
1080 *	Change name of a device, can pass format strings "eth%d".
1081 *	for wildcarding.
1082 */
1083int dev_change_name(struct net_device *dev, const char *newname)
1084{
1085	char oldname[IFNAMSIZ];
1086	int err = 0;
1087	int ret;
1088	struct net *net;
1089
1090	ASSERT_RTNL();
1091	BUG_ON(!dev_net(dev));
1092
1093	net = dev_net(dev);
1094	if (dev->flags & IFF_UP)
1095		return -EBUSY;
1096
1097	write_seqcount_begin(&devnet_rename_seq);
1098
1099	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1100		write_seqcount_end(&devnet_rename_seq);
1101		return 0;
1102	}
1103
1104	memcpy(oldname, dev->name, IFNAMSIZ);
1105
1106	err = dev_get_valid_name(net, dev, newname);
1107	if (err < 0) {
1108		write_seqcount_end(&devnet_rename_seq);
1109		return err;
1110	}
1111
1112rollback:
1113	ret = device_rename(&dev->dev, dev->name);
1114	if (ret) {
1115		memcpy(dev->name, oldname, IFNAMSIZ);
1116		write_seqcount_end(&devnet_rename_seq);
1117		return ret;
1118	}
1119
1120	write_seqcount_end(&devnet_rename_seq);
1121
1122	netdev_adjacent_rename_links(dev, oldname);
1123
1124	write_lock_bh(&dev_base_lock);
1125	hlist_del_rcu(&dev->name_hlist);
1126	write_unlock_bh(&dev_base_lock);
1127
1128	synchronize_rcu();
1129
1130	write_lock_bh(&dev_base_lock);
1131	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1132	write_unlock_bh(&dev_base_lock);
1133
1134	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1135	ret = notifier_to_errno(ret);
1136
1137	if (ret) {
1138		/* err >= 0 after dev_alloc_name() or stores the first errno */
1139		if (err >= 0) {
1140			err = ret;
1141			write_seqcount_begin(&devnet_rename_seq);
1142			memcpy(dev->name, oldname, IFNAMSIZ);
1143			memcpy(oldname, newname, IFNAMSIZ);
1144			goto rollback;
1145		} else {
1146			pr_err("%s: name change rollback failed: %d\n",
1147			       dev->name, ret);
1148		}
1149	}
1150
1151	return err;
1152}
1153
1154/**
1155 *	dev_set_alias - change ifalias of a device
1156 *	@dev: device
1157 *	@alias: name up to IFALIASZ
1158 *	@len: limit of bytes to copy from info
1159 *
1160 *	Set ifalias for a device,
1161 */
1162int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1163{
1164	char *new_ifalias;
1165
1166	ASSERT_RTNL();
1167
1168	if (len >= IFALIASZ)
1169		return -EINVAL;
1170
1171	if (!len) {
1172		kfree(dev->ifalias);
1173		dev->ifalias = NULL;
1174		return 0;
1175	}
1176
1177	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1178	if (!new_ifalias)
1179		return -ENOMEM;
1180	dev->ifalias = new_ifalias;
1181
1182	strlcpy(dev->ifalias, alias, len+1);
1183	return len;
1184}
1185
1186
1187/**
1188 *	netdev_features_change - device changes features
1189 *	@dev: device to cause notification
1190 *
1191 *	Called to indicate a device has changed features.
1192 */
1193void netdev_features_change(struct net_device *dev)
1194{
1195	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1196}
1197EXPORT_SYMBOL(netdev_features_change);
1198
1199/**
1200 *	netdev_state_change - device changes state
1201 *	@dev: device to cause notification
1202 *
1203 *	Called to indicate a device has changed state. This function calls
1204 *	the notifier chains for netdev_chain and sends a NEWLINK message
1205 *	to the routing socket.
1206 */
1207void netdev_state_change(struct net_device *dev)
1208{
1209	if (dev->flags & IFF_UP) {
1210		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1211		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1212	}
1213}
1214EXPORT_SYMBOL(netdev_state_change);
1215
1216/**
1217 * 	netdev_notify_peers - notify network peers about existence of @dev
1218 * 	@dev: network device
1219 *
1220 * Generate traffic such that interested network peers are aware of
1221 * @dev, such as by generating a gratuitous ARP. This may be used when
1222 * a device wants to inform the rest of the network about some sort of
1223 * reconfiguration such as a failover event or virtual machine
1224 * migration.
1225 */
1226void netdev_notify_peers(struct net_device *dev)
1227{
1228	rtnl_lock();
1229	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1230	rtnl_unlock();
1231}
1232EXPORT_SYMBOL(netdev_notify_peers);
1233
1234static int __dev_open(struct net_device *dev)
1235{
1236	const struct net_device_ops *ops = dev->netdev_ops;
1237	int ret;
1238
1239	ASSERT_RTNL();
1240
1241	if (!netif_device_present(dev))
1242		return -ENODEV;
1243
1244	/* Block netpoll from trying to do any rx path servicing.
1245	 * If we don't do this there is a chance ndo_poll_controller
1246	 * or ndo_poll may be running while we open the device
1247	 */
1248	netpoll_rx_disable(dev);
1249
1250	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1251	ret = notifier_to_errno(ret);
1252	if (ret)
1253		return ret;
1254
1255	set_bit(__LINK_STATE_START, &dev->state);
1256
1257	if (ops->ndo_validate_addr)
1258		ret = ops->ndo_validate_addr(dev);
1259
1260	if (!ret && ops->ndo_open)
1261		ret = ops->ndo_open(dev);
1262
1263	netpoll_rx_enable(dev);
1264
1265	if (ret)
1266		clear_bit(__LINK_STATE_START, &dev->state);
1267	else {
1268		dev->flags |= IFF_UP;
1269		net_dmaengine_get();
1270		dev_set_rx_mode(dev);
1271		dev_activate(dev);
1272		add_device_randomness(dev->dev_addr, dev->addr_len);
1273	}
1274
1275	return ret;
1276}
1277
1278/**
1279 *	dev_open	- prepare an interface for use.
1280 *	@dev:	device to open
1281 *
1282 *	Takes a device from down to up state. The device's private open
1283 *	function is invoked and then the multicast lists are loaded. Finally
1284 *	the device is moved into the up state and a %NETDEV_UP message is
1285 *	sent to the netdev notifier chain.
1286 *
1287 *	Calling this function on an active interface is a nop. On a failure
1288 *	a negative errno code is returned.
1289 */
1290int dev_open(struct net_device *dev)
1291{
1292	int ret;
1293
1294	if (dev->flags & IFF_UP)
1295		return 0;
1296
1297	ret = __dev_open(dev);
1298	if (ret < 0)
1299		return ret;
1300
1301	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1302	call_netdevice_notifiers(NETDEV_UP, dev);
1303
1304	return ret;
1305}
1306EXPORT_SYMBOL(dev_open);
1307
1308static int __dev_close_many(struct list_head *head)
1309{
1310	struct net_device *dev;
1311
1312	ASSERT_RTNL();
1313	might_sleep();
1314
1315	list_for_each_entry(dev, head, close_list) {
1316		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1317
1318		clear_bit(__LINK_STATE_START, &dev->state);
1319
1320		/* Synchronize to scheduled poll. We cannot touch poll list, it
1321		 * can be even on different cpu. So just clear netif_running().
1322		 *
1323		 * dev->stop() will invoke napi_disable() on all of it's
1324		 * napi_struct instances on this device.
1325		 */
1326		smp_mb__after_clear_bit(); /* Commit netif_running(). */
1327	}
1328
1329	dev_deactivate_many(head);
1330
1331	list_for_each_entry(dev, head, close_list) {
1332		const struct net_device_ops *ops = dev->netdev_ops;
1333
1334		/*
1335		 *	Call the device specific close. This cannot fail.
1336		 *	Only if device is UP
1337		 *
1338		 *	We allow it to be called even after a DETACH hot-plug
1339		 *	event.
1340		 */
1341		if (ops->ndo_stop)
1342			ops->ndo_stop(dev);
1343
1344		dev->flags &= ~IFF_UP;
1345		net_dmaengine_put();
1346	}
1347
1348	return 0;
1349}
1350
1351static int __dev_close(struct net_device *dev)
1352{
1353	int retval;
1354	LIST_HEAD(single);
1355
1356	/* Temporarily disable netpoll until the interface is down */
1357	netpoll_rx_disable(dev);
1358
1359	list_add(&dev->close_list, &single);
1360	retval = __dev_close_many(&single);
1361	list_del(&single);
1362
1363	netpoll_rx_enable(dev);
1364	return retval;
1365}
1366
1367static int dev_close_many(struct list_head *head)
1368{
1369	struct net_device *dev, *tmp;
1370
1371	/* Remove the devices that don't need to be closed */
1372	list_for_each_entry_safe(dev, tmp, head, close_list)
1373		if (!(dev->flags & IFF_UP))
1374			list_del_init(&dev->close_list);
1375
1376	__dev_close_many(head);
1377
1378	list_for_each_entry_safe(dev, tmp, head, close_list) {
1379		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1380		call_netdevice_notifiers(NETDEV_DOWN, dev);
1381		list_del_init(&dev->close_list);
1382	}
1383
1384	return 0;
1385}
1386
1387/**
1388 *	dev_close - shutdown an interface.
1389 *	@dev: device to shutdown
1390 *
1391 *	This function moves an active device into down state. A
1392 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1393 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1394 *	chain.
1395 */
1396int dev_close(struct net_device *dev)
1397{
1398	if (dev->flags & IFF_UP) {
1399		LIST_HEAD(single);
1400
1401		/* Block netpoll rx while the interface is going down */
1402		netpoll_rx_disable(dev);
1403
1404		list_add(&dev->close_list, &single);
1405		dev_close_many(&single);
1406		list_del(&single);
1407
1408		netpoll_rx_enable(dev);
1409	}
1410	return 0;
1411}
1412EXPORT_SYMBOL(dev_close);
1413
1414
1415/**
1416 *	dev_disable_lro - disable Large Receive Offload on a device
1417 *	@dev: device
1418 *
1419 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1420 *	called under RTNL.  This is needed if received packets may be
1421 *	forwarded to another interface.
1422 */
1423void dev_disable_lro(struct net_device *dev)
1424{
1425	/*
1426	 * If we're trying to disable lro on a vlan device
1427	 * use the underlying physical device instead
1428	 */
1429	if (is_vlan_dev(dev))
1430		dev = vlan_dev_real_dev(dev);
1431
1432	/* the same for macvlan devices */
1433	if (netif_is_macvlan(dev))
1434		dev = macvlan_dev_real_dev(dev);
1435
1436	dev->wanted_features &= ~NETIF_F_LRO;
1437	netdev_update_features(dev);
1438
1439	if (unlikely(dev->features & NETIF_F_LRO))
1440		netdev_WARN(dev, "failed to disable LRO!\n");
1441}
1442EXPORT_SYMBOL(dev_disable_lro);
1443
1444static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1445				   struct net_device *dev)
1446{
1447	struct netdev_notifier_info info;
1448
1449	netdev_notifier_info_init(&info, dev);
1450	return nb->notifier_call(nb, val, &info);
1451}
1452
1453static int dev_boot_phase = 1;
1454
1455/**
1456 *	register_netdevice_notifier - register a network notifier block
1457 *	@nb: notifier
1458 *
1459 *	Register a notifier to be called when network device events occur.
1460 *	The notifier passed is linked into the kernel structures and must
1461 *	not be reused until it has been unregistered. A negative errno code
1462 *	is returned on a failure.
1463 *
1464 * 	When registered all registration and up events are replayed
1465 *	to the new notifier to allow device to have a race free
1466 *	view of the network device list.
1467 */
1468
1469int register_netdevice_notifier(struct notifier_block *nb)
1470{
1471	struct net_device *dev;
1472	struct net_device *last;
1473	struct net *net;
1474	int err;
1475
1476	rtnl_lock();
1477	err = raw_notifier_chain_register(&netdev_chain, nb);
1478	if (err)
1479		goto unlock;
1480	if (dev_boot_phase)
1481		goto unlock;
1482	for_each_net(net) {
1483		for_each_netdev(net, dev) {
1484			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1485			err = notifier_to_errno(err);
1486			if (err)
1487				goto rollback;
1488
1489			if (!(dev->flags & IFF_UP))
1490				continue;
1491
1492			call_netdevice_notifier(nb, NETDEV_UP, dev);
1493		}
1494	}
1495
1496unlock:
1497	rtnl_unlock();
1498	return err;
1499
1500rollback:
1501	last = dev;
1502	for_each_net(net) {
1503		for_each_netdev(net, dev) {
1504			if (dev == last)
1505				goto outroll;
1506
1507			if (dev->flags & IFF_UP) {
1508				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1509							dev);
1510				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1511			}
1512			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1513		}
1514	}
1515
1516outroll:
1517	raw_notifier_chain_unregister(&netdev_chain, nb);
1518	goto unlock;
1519}
1520EXPORT_SYMBOL(register_netdevice_notifier);
1521
1522/**
1523 *	unregister_netdevice_notifier - unregister a network notifier block
1524 *	@nb: notifier
1525 *
1526 *	Unregister a notifier previously registered by
1527 *	register_netdevice_notifier(). The notifier is unlinked into the
1528 *	kernel structures and may then be reused. A negative errno code
1529 *	is returned on a failure.
1530 *
1531 * 	After unregistering unregister and down device events are synthesized
1532 *	for all devices on the device list to the removed notifier to remove
1533 *	the need for special case cleanup code.
1534 */
1535
1536int unregister_netdevice_notifier(struct notifier_block *nb)
1537{
1538	struct net_device *dev;
1539	struct net *net;
1540	int err;
1541
1542	rtnl_lock();
1543	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1544	if (err)
1545		goto unlock;
1546
1547	for_each_net(net) {
1548		for_each_netdev(net, dev) {
1549			if (dev->flags & IFF_UP) {
1550				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1551							dev);
1552				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1553			}
1554			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1555		}
1556	}
1557unlock:
1558	rtnl_unlock();
1559	return err;
1560}
1561EXPORT_SYMBOL(unregister_netdevice_notifier);
1562
1563/**
1564 *	call_netdevice_notifiers_info - call all network notifier blocks
1565 *	@val: value passed unmodified to notifier function
1566 *	@dev: net_device pointer passed unmodified to notifier function
1567 *	@info: notifier information data
1568 *
1569 *	Call all network notifier blocks.  Parameters and return value
1570 *	are as for raw_notifier_call_chain().
1571 */
1572
1573static int call_netdevice_notifiers_info(unsigned long val,
1574					 struct net_device *dev,
1575					 struct netdev_notifier_info *info)
1576{
1577	ASSERT_RTNL();
1578	netdev_notifier_info_init(info, dev);
1579	return raw_notifier_call_chain(&netdev_chain, val, info);
1580}
1581
1582/**
1583 *	call_netdevice_notifiers - call all network notifier blocks
1584 *      @val: value passed unmodified to notifier function
1585 *      @dev: net_device pointer passed unmodified to notifier function
1586 *
1587 *	Call all network notifier blocks.  Parameters and return value
1588 *	are as for raw_notifier_call_chain().
1589 */
1590
1591int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1592{
1593	struct netdev_notifier_info info;
1594
1595	return call_netdevice_notifiers_info(val, dev, &info);
1596}
1597EXPORT_SYMBOL(call_netdevice_notifiers);
1598
1599static struct static_key netstamp_needed __read_mostly;
1600#ifdef HAVE_JUMP_LABEL
1601/* We are not allowed to call static_key_slow_dec() from irq context
1602 * If net_disable_timestamp() is called from irq context, defer the
1603 * static_key_slow_dec() calls.
1604 */
1605static atomic_t netstamp_needed_deferred;
1606#endif
1607
1608void net_enable_timestamp(void)
1609{
1610#ifdef HAVE_JUMP_LABEL
1611	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1612
1613	if (deferred) {
1614		while (--deferred)
1615			static_key_slow_dec(&netstamp_needed);
1616		return;
1617	}
1618#endif
1619	static_key_slow_inc(&netstamp_needed);
1620}
1621EXPORT_SYMBOL(net_enable_timestamp);
1622
1623void net_disable_timestamp(void)
1624{
1625#ifdef HAVE_JUMP_LABEL
1626	if (in_interrupt()) {
1627		atomic_inc(&netstamp_needed_deferred);
1628		return;
1629	}
1630#endif
1631	static_key_slow_dec(&netstamp_needed);
1632}
1633EXPORT_SYMBOL(net_disable_timestamp);
1634
1635static inline void net_timestamp_set(struct sk_buff *skb)
1636{
1637	skb->tstamp.tv64 = 0;
1638	if (static_key_false(&netstamp_needed))
1639		__net_timestamp(skb);
1640}
1641
/* Timestamp @SKB when timestamping is requested, @COND holds and the skb
 * does not carry a timestamp yet.  Wrapped in do { } while (0) so that the
 * macro expands to exactly one statement and cannot capture a following
 * "else"; invoke it with a trailing semicolon.
 */
#define net_timestamp_check(COND, SKB)				\
	do {							\
		if (static_key_false(&netstamp_needed)) {	\
			if ((COND) && !(SKB)->tstamp.tv64)	\
				__net_timestamp(SKB);		\
		}						\
	} while (0)

1648static inline bool is_skb_forwardable(struct net_device *dev,
1649				      struct sk_buff *skb)
1650{
1651	unsigned int len;
1652
1653	if (!(dev->flags & IFF_UP))
1654		return false;
1655
1656	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1657	if (skb->len <= len)
1658		return true;
1659
1660	/* if TSO is enabled, we don't care about the length as the packet
1661	 * could be forwarded without being segmented before
1662	 */
1663	if (skb_is_gso(skb))
1664		return true;
1665
1666	return false;
1667}
1668
1669/**
1670 * dev_forward_skb - loopback an skb to another netif
1671 *
1672 * @dev: destination network device
1673 * @skb: buffer to forward
1674 *
1675 * return values:
1676 *	NET_RX_SUCCESS	(no congestion)
1677 *	NET_RX_DROP     (packet was dropped, but freed)
1678 *
1679 * dev_forward_skb can be used for injecting an skb from the
1680 * start_xmit function of one device into the receive queue
1681 * of another device.
1682 *
1683 * The receiving device may be in another namespace, so
1684 * we have to clear all information in the skb that could
1685 * impact namespace isolation.
1686 */
1687int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1688{
1689	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1690		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1691			atomic_long_inc(&dev->rx_dropped);
1692			kfree_skb(skb);
1693			return NET_RX_DROP;
1694		}
1695	}
1696
1697	if (unlikely(!is_skb_forwardable(dev, skb))) {
1698		atomic_long_inc(&dev->rx_dropped);
1699		kfree_skb(skb);
1700		return NET_RX_DROP;
1701	}
1702
1703	skb_scrub_packet(skb, true);
1704	skb->protocol = eth_type_trans(skb, dev);
1705
1706	return netif_rx_internal(skb);
1707}
1708EXPORT_SYMBOL_GPL(dev_forward_skb);
1709
1710static inline int deliver_skb(struct sk_buff *skb,
1711			      struct packet_type *pt_prev,
1712			      struct net_device *orig_dev)
1713{
1714	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1715		return -ENOMEM;
1716	atomic_inc(&skb->users);
1717	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1718}
1719
1720static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1721{
1722	if (!ptype->af_packet_priv || !skb->sk)
1723		return false;
1724
1725	if (ptype->id_match)
1726		return ptype->id_match(ptype, skb->sk);
1727	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1728		return true;
1729
1730	return false;
1731}
1732
1733/*
1734 *	Support routine. Sends outgoing frames to any network
1735 *	taps currently in use.
1736 */
1737
1738static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1739{
1740	struct packet_type *ptype;
1741	struct sk_buff *skb2 = NULL;
1742	struct packet_type *pt_prev = NULL;
1743
1744	rcu_read_lock();
1745	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1746		/* Never send packets back to the socket
1747		 * they originated from - MvS (miquels@drinkel.ow.org)
1748		 */
1749		if ((ptype->dev == dev || !ptype->dev) &&
1750		    (!skb_loop_sk(ptype, skb))) {
1751			if (pt_prev) {
1752				deliver_skb(skb2, pt_prev, skb->dev);
1753				pt_prev = ptype;
1754				continue;
1755			}
1756
1757			skb2 = skb_clone(skb, GFP_ATOMIC);
1758			if (!skb2)
1759				break;
1760
1761			net_timestamp_set(skb2);
1762
1763			/* skb->nh should be correctly
1764			   set by sender, so that the second statement is
1765			   just protection against buggy protocols.
1766			 */
1767			skb_reset_mac_header(skb2);
1768
1769			if (skb_network_header(skb2) < skb2->data ||
1770			    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1771				net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1772						     ntohs(skb2->protocol),
1773						     dev->name);
1774				skb_reset_network_header(skb2);
1775			}
1776
1777			skb2->transport_header = skb2->network_header;
1778			skb2->pkt_type = PACKET_OUTGOING;
1779			pt_prev = ptype;
1780		}
1781	}
1782	if (pt_prev)
1783		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1784	rcu_read_unlock();
1785}
1786
1787/**
1788 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1789 * @dev: Network device
1790 * @txq: number of queues available
1791 *
1792 * If real_num_tx_queues is changed the tc mappings may no longer be
1793 * valid. To resolve this verify the tc mapping remains valid and if
1794 * not NULL the mapping. With no priorities mapping to this
1795 * offset/count pair it will no longer be used. In the worst case TC0
1796 * is invalid nothing can be done so disable priority mappings. If is
1797 * expected that drivers will fix this mapping if they can before
1798 * calling netif_set_real_num_tx_queues.
1799 */
1800static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1801{
1802	int i;
1803	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1804
1805	/* If TC0 is invalidated disable TC mapping */
1806	if (tc->offset + tc->count > txq) {
1807		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1808		dev->num_tc = 0;
1809		return;
1810	}
1811
1812	/* Invalidated prio to tc mappings set to TC0 */
1813	for (i = 1; i < TC_BITMASK + 1; i++) {
1814		int q = netdev_get_prio_tc_map(dev, i);
1815
1816		tc = &dev->tc_to_txq[q];
1817		if (tc->offset + tc->count > txq) {
1818			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1819				i, q);
1820			netdev_set_prio_tc_map(dev, i, 0);
1821		}
1822	}
1823}
1824
1825#ifdef CONFIG_XPS
1826static DEFINE_MUTEX(xps_map_mutex);
1827#define xmap_dereference(P)		\
1828	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1829
1830static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1831					int cpu, u16 index)
1832{
1833	struct xps_map *map = NULL;
1834	int pos;
1835
1836	if (dev_maps)
1837		map = xmap_dereference(dev_maps->cpu_map[cpu]);
1838
1839	for (pos = 0; map && pos < map->len; pos++) {
1840		if (map->queues[pos] == index) {
1841			if (map->len > 1) {
1842				map->queues[pos] = map->queues[--map->len];
1843			} else {
1844				RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1845				kfree_rcu(map, rcu);
1846				map = NULL;
1847			}
1848			break;
1849		}
1850	}
1851
1852	return map;
1853}
1854
1855static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1856{
1857	struct xps_dev_maps *dev_maps;
1858	int cpu, i;
1859	bool active = false;
1860
1861	mutex_lock(&xps_map_mutex);
1862	dev_maps = xmap_dereference(dev->xps_maps);
1863
1864	if (!dev_maps)
1865		goto out_no_maps;
1866
1867	for_each_possible_cpu(cpu) {
1868		for (i = index; i < dev->num_tx_queues; i++) {
1869			if (!remove_xps_queue(dev_maps, cpu, i))
1870				break;
1871		}
1872		if (i == dev->num_tx_queues)
1873			active = true;
1874	}
1875
1876	if (!active) {
1877		RCU_INIT_POINTER(dev->xps_maps, NULL);
1878		kfree_rcu(dev_maps, rcu);
1879	}
1880
1881	for (i = index; i < dev->num_tx_queues; i++)
1882		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1883					     NUMA_NO_NODE);
1884
1885out_no_maps:
1886	mutex_unlock(&xps_map_mutex);
1887}
1888
1889static struct xps_map *expand_xps_map(struct xps_map *map,
1890				      int cpu, u16 index)
1891{
1892	struct xps_map *new_map;
1893	int alloc_len = XPS_MIN_MAP_ALLOC;
1894	int i, pos;
1895
1896	for (pos = 0; map && pos < map->len; pos++) {
1897		if (map->queues[pos] != index)
1898			continue;
1899		return map;
1900	}
1901
1902	/* Need to add queue to this CPU's existing map */
1903	if (map) {
1904		if (pos < map->alloc_len)
1905			return map;
1906
1907		alloc_len = map->alloc_len * 2;
1908	}
1909
1910	/* Need to allocate new map to store queue on this CPU's map */
1911	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1912			       cpu_to_node(cpu));
1913	if (!new_map)
1914		return NULL;
1915
1916	for (i = 0; i < pos; i++)
1917		new_map->queues[i] = map->queues[i];
1918	new_map->alloc_len = alloc_len;
1919	new_map->len = pos;
1920
1921	return new_map;
1922}
1923
1924int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1925			u16 index)
1926{
1927	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1928	struct xps_map *map, *new_map;
1929	int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1930	int cpu, numa_node_id = -2;
1931	bool active = false;
1932
1933	mutex_lock(&xps_map_mutex);
1934
1935	dev_maps = xmap_dereference(dev->xps_maps);
1936
1937	/* allocate memory for queue storage */
1938	for_each_online_cpu(cpu) {
1939		if (!cpumask_test_cpu(cpu, mask))
1940			continue;
1941
1942		if (!new_dev_maps)
1943			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1944		if (!new_dev_maps) {
1945			mutex_unlock(&xps_map_mutex);
1946			return -ENOMEM;
1947		}
1948
1949		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1950				 NULL;
1951
1952		map = expand_xps_map(map, cpu, index);
1953		if (!map)
1954			goto error;
1955
1956		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1957	}
1958
1959	if (!new_dev_maps)
1960		goto out_no_new_maps;
1961
1962	for_each_possible_cpu(cpu) {
1963		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1964			/* add queue to CPU maps */
1965			int pos = 0;
1966
1967			map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1968			while ((pos < map->len) && (map->queues[pos] != index))
1969				pos++;
1970
1971			if (pos == map->len)
1972				map->queues[map->len++] = index;
1973#ifdef CONFIG_NUMA
1974			if (numa_node_id == -2)
1975				numa_node_id = cpu_to_node(cpu);
1976			else if (numa_node_id != cpu_to_node(cpu))
1977				numa_node_id = -1;
1978#endif
1979		} else if (dev_maps) {
1980			/* fill in the new device map from the old device map */
1981			map = xmap_dereference(dev_maps->cpu_map[cpu]);
1982			RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1983		}
1984
1985	}
1986
1987	rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1988
1989	/* Cleanup old maps */
1990	if (dev_maps) {
1991		for_each_possible_cpu(cpu) {
1992			new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1993			map = xmap_dereference(dev_maps->cpu_map[cpu]);
1994			if (map && map != new_map)
1995				kfree_rcu(map, rcu);
1996		}
1997
1998		kfree_rcu(dev_maps, rcu);
1999	}
2000
2001	dev_maps = new_dev_maps;
2002	active = true;
2003
2004out_no_new_maps:
2005	/* update Tx queue numa node */
2006	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2007				     (numa_node_id >= 0) ? numa_node_id :
2008				     NUMA_NO_NODE);
2009
2010	if (!dev_maps)
2011		goto out_no_maps;
2012
2013	/* removes queue from unused CPUs */
2014	for_each_possible_cpu(cpu) {
2015		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2016			continue;
2017
2018		if (remove_xps_queue(dev_maps, cpu, index))
2019			active = true;
2020	}
2021
2022	/* free map if not active */
2023	if (!active) {
2024		RCU_INIT_POINTER(dev->xps_maps, NULL);
2025		kfree_rcu(dev_maps, rcu);
2026	}
2027
2028out_no_maps:
2029	mutex_unlock(&xps_map_mutex);
2030
2031	return 0;
2032error:
2033	/* remove any maps that we added */
2034	for_each_possible_cpu(cpu) {
2035		new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2036		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2037				 NULL;
2038		if (new_map && new_map != map)
2039			kfree(new_map);
2040	}
2041
2042	mutex_unlock(&xps_map_mutex);
2043
2044	kfree(new_dev_maps);
2045	return -ENOMEM;
2046}
2047EXPORT_SYMBOL(netif_set_xps_queue);
2048
2049#endif
2050/*
2051 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2052 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2053 */
2054int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2055{
2056	int rc;
2057
2058	if (txq < 1 || txq > dev->num_tx_queues)
2059		return -EINVAL;
2060
2061	if (dev->reg_state == NETREG_REGISTERED ||
2062	    dev->reg_state == NETREG_UNREGISTERING) {
2063		ASSERT_RTNL();
2064
2065		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2066						  txq);
2067		if (rc)
2068			return rc;
2069
2070		if (dev->num_tc)
2071			netif_setup_tc(dev, txq);
2072
2073		if (txq < dev->real_num_tx_queues) {
2074			qdisc_reset_all_tx_gt(dev, txq);
2075#ifdef CONFIG_XPS
2076			netif_reset_xps_queues_gt(dev, txq);
2077#endif
2078		}
2079	}
2080
2081	dev->real_num_tx_queues = txq;
2082	return 0;
2083}
2084EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2085
2086#ifdef CONFIG_SYSFS
2087/**
2088 *	netif_set_real_num_rx_queues - set actual number of RX queues used
2089 *	@dev: Network device
2090 *	@rxq: Actual number of RX queues
2091 *
2092 *	This must be called either with the rtnl_lock held or before
2093 *	registration of the net device.  Returns 0 on success, or a
2094 *	negative error code.  If called before registration, it always
2095 *	succeeds.
2096 */
2097int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2098{
2099	int rc;
2100
2101	if (rxq < 1 || rxq > dev->num_rx_queues)
2102		return -EINVAL;
2103
2104	if (dev->reg_state == NETREG_REGISTERED) {
2105		ASSERT_RTNL();
2106
2107		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2108						  rxq);
2109		if (rc)
2110			return rc;
2111	}
2112
2113	dev->real_num_rx_queues = rxq;
2114	return 0;
2115}
2116EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2117#endif
2118
2119/**
2120 * netif_get_num_default_rss_queues - default number of RSS queues
2121 *
2122 * This routine should set an upper limit on the number of RSS queues
2123 * used by default by multiqueue devices.
2124 */
2125int netif_get_num_default_rss_queues(void)
2126{
2127	return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2128}
2129EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2130
2131static inline void __netif_reschedule(struct Qdisc *q)
2132{
2133	struct softnet_data *sd;
2134	unsigned long flags;
2135
2136	local_irq_save(flags);
2137	sd = &__get_cpu_var(softnet_data);
2138	q->next_sched = NULL;
2139	*sd->output_queue_tailp = q;
2140	sd->output_queue_tailp = &q->next_sched;
2141	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2142	local_irq_restore(flags);
2143}
2144
2145void __netif_schedule(struct Qdisc *q)
2146{
2147	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2148		__netif_reschedule(q);
2149}
2150EXPORT_SYMBOL(__netif_schedule);
2151
2152struct dev_kfree_skb_cb {
2153	enum skb_free_reason reason;
2154};
2155
2156static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2157{
2158	return (struct dev_kfree_skb_cb *)skb->cb;
2159}
2160
/*
 * Release one reference on @skb.  If it was the last one, store @reason in
 * skb->cb, push the skb onto this CPU's softnet completion queue and raise
 * NET_TX_SOFTIRQ (presumably the softirq handler does the actual free).
 * If other references remain, nothing else is done.
 */
void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
{
	unsigned long flags;

	/* Fast path: sole owner, no atomic read-modify-write needed */
	if (likely(atomic_read(&skb->users) == 1)) {
		smp_rmb();
		atomic_set(&skb->users, 0);
	} else if (likely(!atomic_dec_and_test(&skb->users))) {
		/* Still referenced elsewhere */
		return;
	}
	get_kfree_skb_cb(skb)->reason = reason;
	/* IRQs stay off while the per-CPU completion list is modified */
	local_irq_save(flags);
	skb->next = __this_cpu_read(softnet_data.completion_queue);
	__this_cpu_write(softnet_data.completion_queue, skb);
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(__dev_kfree_skb_irq);
2179
2180void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2181{
2182	if (in_irq() || irqs_disabled())
2183		__dev_kfree_skb_irq(skb, reason);
2184	else
2185		dev_kfree_skb(skb);
2186}
2187EXPORT_SYMBOL(__dev_kfree_skb_any);
2188
2189
2190/**
2191 * netif_device_detach - mark device as removed
2192 * @dev: network device
2193 *
2194 * Mark device as removed from system and therefore no longer available.
2195 */
2196void netif_device_detach(struct net_device *dev)
2197{
2198	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2199	    netif_running(dev)) {
2200		netif_tx_stop_all_queues(dev);
2201	}
2202}
2203EXPORT_SYMBOL(netif_device_detach);
2204
2205/**
2206 * netif_device_attach - mark device as attached
2207 * @dev: network device
2208 *
2209 * Mark device as attached from system and restart if needed.
2210 */
2211void netif_device_attach(struct net_device *dev)
2212{
2213	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2214	    netif_running(dev)) {
2215		netif_tx_wake_all_queues(dev);
2216		__netdev_watchdog_up(dev);
2217	}
2218}
2219EXPORT_SYMBOL(netif_device_attach);
2220
2221static void skb_warn_bad_offload(const struct sk_buff *skb)
2222{
2223	static const netdev_features_t null_features = 0;
2224	struct net_device *dev = skb->dev;
2225	const char *driver = "";
2226
2227	if (!net_ratelimit())
2228		return;
2229
2230	if (dev && dev->dev.parent)
2231		driver = dev_driver_string(dev->dev.parent);
2232
2233	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2234	     "gso_type=%d ip_summed=%d\n",
2235	     driver, dev ? &dev->features : &null_features,
2236	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
2237	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2238	     skb_shinfo(skb)->gso_type, skb->ip_summed);
2239}
2240
2241/*
2242 * Invalidate hardware checksum when packet is to be mangled, and
2243 * complete checksum manually on outgoing path.
2244 */
2245int skb_checksum_help(struct sk_buff *skb)
2246{
2247	__wsum csum;
2248	int ret = 0, offset;
2249
2250	if (skb->ip_summed == CHECKSUM_COMPLETE)
2251		goto out_set_summed;
2252
2253	if (unlikely(skb_shinfo(skb)->gso_size)) {
2254		skb_warn_bad_offload(skb);
2255		return -EINVAL;
2256	}
2257
2258	/* Before computing a checksum, we should make sure no frag could
2259	 * be modified by an external entity : checksum could be wrong.
2260	 */
2261	if (skb_has_shared_frag(skb)) {
2262		ret = __skb_linearize(skb);
2263		if (ret)
2264			goto out;
2265	}
2266
2267	offset = skb_checksum_start_offset(skb);
2268	BUG_ON(offset >= skb_headlen(skb));
2269	csum = skb_checksum(skb, offset, skb->len - offset, 0);
2270
2271	offset += skb->csum_offset;
2272	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2273
2274	if (skb_cloned(skb) &&
2275	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2276		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2277		if (ret)
2278			goto out;
2279	}
2280
2281	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
2282out_set_summed:
2283	skb->ip_summed = CHECKSUM_NONE;
2284out:
2285	return ret;
2286}
2287EXPORT_SYMBOL(skb_checksum_help);
2288
2289__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2290{
2291	__be16 type = skb->protocol;
2292	int vlan_depth = ETH_HLEN;
2293
2294	/* Tunnel gso handlers can set protocol to ethernet. */
2295	if (type == htons(ETH_P_TEB)) {
2296		struct ethhdr *eth;
2297
2298		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2299			return 0;
2300
2301		eth = (struct ethhdr *)skb_mac_header(skb);
2302		type = eth->h_proto;
2303	}
2304
2305	while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2306		struct vlan_hdr *vh;
2307
2308		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2309			return 0;
2310
2311		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2312		type = vh->h_vlan_encapsulated_proto;
2313		vlan_depth += VLAN_HLEN;
2314	}
2315
2316	*depth = vlan_depth;
2317
2318	return type;
2319}
2320
/**
 *	skb_mac_gso_segment - mac layer segmentation handler.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *
 *	Looks up the packet_offload registered for the skb's network protocol
 *	and returns what its gso_segment callback produces.  Returns
 *	ERR_PTR(-EINVAL) if the protocol cannot be determined and
 *	ERR_PTR(-EPROTONOSUPPORT) if no offload with a gso_segment callback
 *	matches.  When skb->ip_summed is not CHECKSUM_PARTIAL the
 *	gso_send_check callback runs first; its error, or NULL if the skb
 *	already passes skb_gso_ok(), is returned without segmenting.
 */
struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
				    netdev_features_t features)
{
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_offload *ptype;
	int vlan_depth = skb->mac_len;
	__be16 type = skb_network_protocol(skb, &vlan_depth);

	if (unlikely(!type))
		return ERR_PTR(-EINVAL);

	/* Step over the mac header and any VLAN tags */
	__skb_pull(skb, vlan_depth);

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, &offload_base, list) {
		if (ptype->type == type && ptype->callbacks.gso_segment) {
			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
				int err;

				err = ptype->callbacks.gso_send_check(skb);
				segs = ERR_PTR(err);
				if (err || skb_gso_ok(skb, features))
					break;
				/* Restore skb->data to the network header */
				__skb_push(skb, (skb->data -
						 skb_network_header(skb)));
			}
			segs = ptype->callbacks.gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	/* Undo the pull: put skb->data back at the mac header */
	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}
EXPORT_SYMBOL(skb_mac_gso_segment);
2363
2364
2365/* openvswitch calls this on rx path, so we need a different check.
2366 */
2367static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2368{
2369	if (tx_path)
2370		return skb->ip_summed != CHECKSUM_PARTIAL;
2371	else
2372		return skb->ip_summed == CHECKSUM_NONE;
2373}
2374
2375/**
2376 *	__skb_gso_segment - Perform segmentation on skb.
2377 *	@skb: buffer to segment
2378 *	@features: features for the output path (see dev->features)
2379 *	@tx_path: whether it is called in TX path
2380 *
2381 *	This function segments the given skb and returns a list of segments.
2382 *
2383 *	It may return NULL if the skb requires no segmentation.  This is
2384 *	only possible when GSO is used for verifying header integrity.
2385 */
2386struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2387				  netdev_features_t features, bool tx_path)
2388{
2389	if (unlikely(skb_needs_check(skb, tx_path))) {
2390		int err;
2391
2392		skb_warn_bad_offload(skb);
2393
2394		if (skb_header_cloned(skb) &&
2395		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2396			return ERR_PTR(err);
2397	}
2398
2399	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2400	SKB_GSO_CB(skb)->encap_level = 0;
2401
2402	skb_reset_mac_header(skb);
2403	skb_reset_mac_len(skb);
2404
2405	return skb_mac_gso_segment(skb, features);
2406}
2407EXPORT_SYMBOL(__skb_gso_segment);
2408
/* Take action when hardware reception checksum errors are detected:
 * log the device name (rate limited) and dump the stack.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2420
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 if some page fragment of @skb cannot be handed to @dev for
 * DMA (highmem page without NETIF_F_HIGHDMA, or page beyond the parent
 * device's dma_mask), 0 otherwise.  Always 0 without CONFIG_HIGHMEM.
 */

static int illegal_highdma(const struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct device *pdev;
	int idx;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[idx];

			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	if (!(PCI_DMA_BUS_IS_PHYS))
		return 0;

	pdev = dev->dev.parent;
	if (!pdev)
		return 0;

	for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[idx];
		dma_addr_t addr = page_to_phys(skb_frag_page(frag));

		/* Last byte of the page must be reachable by the device */
		if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
			return 1;
	}
#endif
	return 0;
}
2453
/* Layout of skb->cb for an skb that carries a list of GSO segments */
struct dev_gso_cb {
	/* Original skb destructor, saved while dev_gso_skb_destructor is installed */
	void (*destructor)(struct sk_buff *skb);
};

/* View the control buffer of @skb as a struct dev_gso_cb */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2459
2460static void dev_gso_skb_destructor(struct sk_buff *skb)
2461{
2462	struct dev_gso_cb *cb;
2463
2464	kfree_skb_list(skb->next);
2465	skb->next = NULL;
2466
2467	cb = DEV_GSO_CB(skb);
2468	if (cb->destructor)
2469		cb->destructor(skb);
2470}
2471
2472/**
2473 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
2474 *	@skb: buffer to segment
2475 *	@features: device features as applicable to this skb
2476 *
2477 *	This function segments the given skb and stores the list of segments
2478 *	in skb->next.
2479 */
2480static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2481{
2482	struct sk_buff *segs;
2483
2484	segs = skb_gso_segment(skb, features);
2485
2486	/* Verifying header integrity only. */
2487	if (!segs)
2488		return 0;
2489
2490	if (IS_ERR(segs))
2491		return PTR_ERR(segs);
2492
2493	skb->next = segs;
2494	DEV_GSO_CB(skb)->destructor = skb->destructor;
2495	skb->destructor = dev_gso_skb_destructor;
2496
2497	return 0;
2498}
2499
2500static netdev_features_t harmonize_features(struct sk_buff *skb,
2501					    const struct net_device *dev,
2502					    netdev_features_t features)
2503{
2504	int tmp;
2505
2506	if (skb->ip_summed != CHECKSUM_NONE &&
2507	    !can_checksum_protocol(features, skb_network_protocol(skb, &tmp))) {
2508		features &= ~NETIF_F_ALL_CSUM;
2509	} else if (illegal_highdma(dev, skb)) {
2510		features &= ~NETIF_F_SG;
2511	}
2512
2513	return features;
2514}
2515
2516netdev_features_t netif_skb_dev_features(struct sk_buff *skb,
2517					 const struct net_device *dev)
2518{
2519	__be16 protocol = skb->protocol;
2520	netdev_features_t features = dev->features;
2521
2522	if (skb_shinfo(skb)->gso_segs > dev->gso_max_segs)
2523		features &= ~NETIF_F_GSO_MASK;
2524
2525	if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2526		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2527		protocol = veh->h_vlan_encapsulated_proto;
2528	} else if (!vlan_tx_tag_present(skb)) {
2529		return harmonize_features(skb, dev, features);
2530	}
2531
2532	features &= (dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2533					       NETIF_F_HW_VLAN_STAG_TX);
2534
2535	if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2536		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2537				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2538				NETIF_F_HW_VLAN_STAG_TX;
2539
2540	return harmonize_features(skb, dev, features);
2541}
2542EXPORT_SYMBOL(netif_skb_dev_features);
2543
/*
 * Hand @skb to the driver of @dev via ndo_start_xmit() on queue @txq.
 *
 * A single skb (skb->next == NULL) first gets software fallbacks applied
 * as required by the device features: VLAN tag insertion, GSO
 * segmentation, linearization and checksum completion.  An skb with
 * segments chained on skb->next (either on entry or after segmentation
 * here) is transmitted one segment at a time.
 *
 * Returns the driver's return code.  If a fallback fails the skb is freed
 * and the initial NETDEV_TX_OK is returned.  When the driver refuses a
 * segment with a NETDEV_TX_MASK code, the segment is put back on the
 * list and the code returned so the remaining segments can be retried.
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
			struct netdev_queue *txq)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int rc = NETDEV_TX_OK;
	unsigned int skb_len;

	if (likely(!skb->next)) {
		netdev_features_t features;

		/*
		 * If device doesn't need skb->dst, release it right now while
		 * its hot in this cpu cache
		 */
		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
			skb_dst_drop(skb);

		features = netif_skb_features(skb);

		/* No hardware tag insertion: put the tag into the packet */
		if (vlan_tx_tag_present(skb) &&
		    !vlan_hw_offload_capable(features, skb->vlan_proto)) {
			skb = __vlan_put_tag(skb, skb->vlan_proto,
					     vlan_tx_tag_get(skb));
			if (unlikely(!skb))
				goto out;

			skb->vlan_tci = 0;
		}

		/* If encapsulation offload request, verify we are testing
		 * hardware encapsulation features instead of standard
		 * features for the netdev
		 */
		if (skb->encapsulation)
			features &= dev->hw_enc_features;

		if (netif_needs_gso(skb, features)) {
			if (unlikely(dev_gso_segment(skb, features)))
				goto out_kfree_skb;
			/* Segments were produced: send them one by one */
			if (skb->next)
				goto gso;
		} else {
			if (skb_needs_linearize(skb, features) &&
			    __skb_linearize(skb))
				goto out_kfree_skb;

			/* If packet is not checksummed and device does not
			 * support checksumming for this protocol, complete
			 * checksumming here.
			 */
			if (skb->ip_summed == CHECKSUM_PARTIAL) {
				if (skb->encapsulation)
					skb_set_inner_transport_header(skb,
						skb_checksum_start_offset(skb));
				else
					skb_set_transport_header(skb,
						skb_checksum_start_offset(skb));
				if (!(features & NETIF_F_ALL_CSUM) &&
				     skb_checksum_help(skb))
					goto out_kfree_skb;
			}
		}

		/* Deliver a copy to taps registered for all protocols */
		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(skb, dev);

		/* Save the length: the driver may free the skb */
		skb_len = skb->len;
		trace_net_dev_start_xmit(skb, dev);
		rc = ops->ndo_start_xmit(skb, dev);
		trace_net_dev_xmit(skb, rc, dev, skb_len);
		if (rc == NETDEV_TX_OK)
			txq_trans_update(txq);
		return rc;
	}

gso:
	do {
		/* Detach the first segment from the list */
		struct sk_buff *nskb = skb->next;

		skb->next = nskb->next;
		nskb->next = NULL;

		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(nskb, dev);

		skb_len = nskb->len;
		trace_net_dev_start_xmit(nskb, dev);
		rc = ops->ndo_start_xmit(nskb, dev);
		trace_net_dev_xmit(nskb, rc, dev, skb_len);
		if (unlikely(rc != NETDEV_TX_OK)) {
			if (rc & ~NETDEV_TX_MASK)
				goto out_kfree_gso_skb;
			/* Put the segment back at the head for a retry */
			nskb->next = skb->next;
			skb->next = nskb;
			return rc;
		}
		txq_trans_update(txq);
		if (unlikely(netif_xmit_stopped(txq) && skb->next))
			return NETDEV_TX_BUSY;
	} while (skb->next);

out_kfree_gso_skb:
	if (likely(skb->next == NULL)) {
		/* All segments sent: restore the saved destructor */
		skb->destructor = DEV_GSO_CB(skb)->destructor;
		consume_skb(skb);
		return rc;
	}
out_kfree_skb:
	kfree_skb(skb);
out:
	return rc;
}
EXPORT_SYMBOL_GPL(dev_hard_start_xmit);
2657
2658static void qdisc_pkt_len_init(struct sk_buff *skb)
2659{
2660	const struct skb_shared_info *shinfo = skb_shinfo(skb);
2661
2662	qdisc_skb_cb(skb)->pkt_len = skb->len;
2663
2664	/* To get more precise estimation of bytes sent on wire,
2665	 * we add to pkt_len the headers size of all segments
2666	 */
2667	if (shinfo->gso_size)  {
2668		unsigned int hdr_len;
2669		u16 gso_segs = shinfo->gso_segs;
2670
2671		/* mac layer + network layer */
2672		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2673
2674		/* + transport layer */
2675		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2676			hdr_len += tcp_hdrlen(skb);
2677		else
2678			hdr_len += sizeof(struct udphdr);
2679
2680		if (shinfo->gso_type & SKB_GSO_DODGY)
2681			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2682						shinfo->gso_size);
2683
2684		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2685	}
2686}
2687
/*
 * Send @skb through qdisc @q of @txq under the qdisc root lock.
 *
 * Returns NET_XMIT_DROP (skb freed) if the qdisc is deactivated,
 * NET_XMIT_SUCCESS if the skb bypassed the queue and was handed to
 * sch_direct_xmit(), otherwise the qdisc's enqueue result masked with
 * NET_XMIT_MASK.
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	bool contended;
	int rc;

	qdisc_pkt_len_init(skb);
	qdisc_calculate_pkt_len(skb, q);
	/*
	 * Heuristic to force contended enqueues to serialize on a
	 * separate lock before trying to get qdisc main lock.
	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
	 * and dequeue packets faster.
	 */
	contended = qdisc_is_running(q);
	if (unlikely(contended))
		spin_lock(&q->busylock);

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		kfree_skb(skb);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   qdisc_run_begin(q)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */
		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
			skb_dst_force(skb);

		qdisc_bstats_update(q, skb);

		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
			/* Release busylock before running the qdisc */
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		} else
			qdisc_run_end(q);

		rc = NET_XMIT_SUCCESS;
	} else {
		/* Regular path: enqueue, then run the qdisc if it is idle */
		skb_dst_force(skb);
		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
		if (qdisc_run_begin(q)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		}
	}
	spin_unlock(root_lock);
	/* busylock is still held if it was not dropped above */
	if (unlikely(contended))
		spin_unlock(&q->busylock);
	return rc;
}
2750
2751#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2752static void skb_update_prio(struct sk_buff *skb)
2753{
2754	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2755
2756	if (!skb->priority && skb->sk && map) {
2757		unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2758
2759		if (prioidx < map->priomap_len)
2760			skb->priority = map->priomap[prioidx];
2761	}
2762}
2763#else
2764#define skb_update_prio(skb)
2765#endif
2766
/* Per-CPU nesting depth of dev_hard_start_xmit() calls made from
 * __dev_queue_xmit() for queueless devices.
 */
static DEFINE_PER_CPU(int, xmit_recursion);
/* __dev_queue_xmit() refuses to transmit once the depth exceeds this */
#define RECURSION_LIMIT 10
2769
/**
 *	dev_loopback_xmit - loop back @skb
 *	@skb: buffer to transmit
 *
 *	Marks @skb as PACKET_LOOPBACK with CHECKSUM_UNNECESSARY, strips
 *	everything in front of the network header and feeds the skb back
 *	into the receive path with netif_rx_ni().  Always returns 0.
 */
int dev_loopback_xmit(struct sk_buff *skb)
{
	skb_reset_mac_header(skb);
	/* Advance skb->data to the network header */
	__skb_pull(skb, skb_network_offset(skb));
	skb->pkt_type = PACKET_LOOPBACK;
	skb->ip_summed = CHECKSUM_UNNECESSARY;
	/* A looped back skb is expected to carry a dst */
	WARN_ON(!skb_dst(skb));
	skb_dst_force(skb);
	netif_rx_ni(skb);
	return 0;
}
EXPORT_SYMBOL(dev_loopback_xmit);
2786
/**
 *	__dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *	@accel_priv: private data used for L2 forwarding offload
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 */
static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;

	skb_reset_mac_header(skb);

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	skb_update_prio(skb);

	txq = netdev_pick_tx(dev, skb, accel_priv);
	q = rcu_dereference_bh(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
	trace_net_dev_queue(skb);
	/* The qdisc has an enqueue method: go through the queue */
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all the sorts of tunnels...

	   Really, it is unlikely that netif_tx_lock protection is necessary
	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	   counters.)
	   However, it is possible, that they rely on protection
	   made by us here.

	   Check this and shot the lock. It is not prone from deadlocks.
	   Either shot noqueue qdisc, it is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		/* Owning the xmit lock already means we re-entered on this CPU */
		if (txq->xmit_lock_owner != cpu) {

			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
				goto recursion_alert;

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_xmit_stopped(txq)) {
				__this_cpu_inc(xmit_recursion);
				rc = dev_hard_start_xmit(skb, dev, txq);
				__this_cpu_dec(xmit_recursion);
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
					     dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
recursion_alert:
			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
					     dev->name);
		}
	}

	/* Device down, transmit not completed, or recursion: drop the skb */
	rc = -ENETDOWN;
	rcu_read_unlock_bh();

	kfree_skb(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
2894
/* Queue @skb for transmission on skb->dev without L2 forwarding offload
 * data; see __dev_queue_xmit() for the return value and skb ownership.
 */
int dev_queue_xmit(struct sk_buff *skb)
{
	return __dev_queue_xmit(skb, NULL);
}
EXPORT_SYMBOL(dev_queue_xmit);
2900
/* Queue @skb for transmission on skb->dev, passing @accel_priv (private
 * data for L2 forwarding offload) on to TX queue selection; see
 * __dev_queue_xmit() for the return value and skb ownership.
 */
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
	return __dev_queue_xmit(skb, accel_priv);
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
2906
2907
2908/*=======================================================================
2909			Receiver routines
2910  =======================================================================*/
2911
/* Limit on the length of a per-CPU input_pkt_queue; enqueue_to_backlog()
 * drops packets once the queue is longer than this.
 */
int netdev_max_backlog __read_mostly = 1000;
EXPORT_SYMBOL(netdev_max_backlog);

/* Passed to net_timestamp_check() in netif_rx_internal() before queueing */
int netdev_tstamp_prequeue __read_mostly = 1;
/* NOTE(review): presumably the per-softirq RX packet budget; not used in this part of the file */
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */
2918
/* Called with irq disabled.
 * Add @napi to the tail of @sd's poll list and raise NET_RX_SOFTIRQ.
 */
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
2926
2927#ifdef CONFIG_RPS
2928
/* One global table that all flow-based protocols share.
 * RCU protected; get_rps_cpu() reads the desired CPU of a flow from it.
 */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);

/* Static key tested in netif_rx_internal() before doing any RPS work */
struct static_key rps_needed __read_mostly;
2934
/*
 * Switch the flow described by @rflow over to @next_cpu.
 *
 * With CONFIG_RFS_ACCEL, and if the device supports it, the driver is
 * also asked through ndo_rx_flow_steer() to steer the flow to the RX
 * queue that cpu_rmap_lookup_index() returns for @next_cpu; on success
 * the flow entry of that queue's table replaces @rflow.
 *
 * Returns the flow entry that now describes the flow (either @rflow or
 * the entry in the new queue's flow table).
 */
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
	    struct rps_dev_flow *rflow, u16 next_cpu)
{
	if (next_cpu != RPS_NO_CPU) {
#ifdef CONFIG_RFS_ACCEL
		struct netdev_rx_queue *rxqueue;
		struct rps_dev_flow_table *flow_table;
		struct rps_dev_flow *old_rflow;
		u32 flow_id;
		u16 rxq_index;
		int rc;

		/* Should we steer this flow to a different hardware queue? */
		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
		    !(dev->features & NETIF_F_NTUPLE))
			goto out;
		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
		/* Already arriving on the right queue */
		if (rxq_index == skb_get_rx_queue(skb))
			goto out;

		rxqueue = dev->_rx + rxq_index;
		flow_table = rcu_dereference(rxqueue->rps_flow_table);
		if (!flow_table)
			goto out;
		flow_id = skb->rxhash & flow_table->mask;
		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
							rxq_index, flow_id);
		if (rc < 0)
			goto out;
		/* A non-negative return value is the new filter id */
		old_rflow = rflow;
		rflow = &flow_table->flows[flow_id];
		rflow->filter = rc;
		if (old_rflow->filter == rflow->filter)
			old_rflow->filter = RPS_NO_FILTER;
	out:
#endif
		/* Remember where the target CPU's backlog head is right now */
		rflow->last_qtail =
			per_cpu(softnet_data, next_cpu).input_queue_head;
	}

	rflow->cpu = next_cpu;
	return rflow;
}
2979
/*
 * get_rps_cpu is called from netif_receive_skb and returns the target
 * CPU from the RPS map of the receiving queue for a given skb.
 * rcu_read_lock must be held on entry.
 *
 * Returns -1 if no target CPU was selected.  *@rflowp is only written
 * when the CPU comes from the flow tables.
 */
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
		       struct rps_dev_flow **rflowp)
{
	struct netdev_rx_queue *rxqueue;
	struct rps_map *map;
	struct rps_dev_flow_table *flow_table;
	struct rps_sock_flow_table *sock_flow_table;
	int cpu = -1;
	u16 tcpu;

	/* Find the RX queue the skb arrived on (queue 0 if not recorded) */
	if (skb_rx_queue_recorded(skb)) {
		u16 index = skb_get_rx_queue(skb);
		if (unlikely(index >= dev->real_num_rx_queues)) {
			WARN_ONCE(dev->real_num_rx_queues > 1,
				  "%s received packet on queue %u, but number "
				  "of RX queues is %u\n",
				  dev->name, index, dev->real_num_rx_queues);
			goto done;
		}
		rxqueue = dev->_rx + index;
	} else
		rxqueue = dev->_rx;

	map = rcu_dereference(rxqueue->rps_map);
	if (map) {
		/* Single CPU map and no flow table: no hash needed */
		if (map->len == 1 &&
		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
			tcpu = map->cpus[0];
			if (cpu_online(tcpu))
				cpu = tcpu;
			goto done;
		}
	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
		/* Neither a map nor a flow table configured */
		goto done;
	}

	skb_reset_network_header(skb);
	if (!skb_get_hash(skb))
		goto done;

	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	sock_flow_table = rcu_dereference(rps_sock_flow_table);
	if (flow_table && sock_flow_table) {
		u16 next_cpu;
		struct rps_dev_flow *rflow;

		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
		tcpu = rflow->cpu;

		next_cpu = sock_flow_table->ents[skb->rxhash &
		    sock_flow_table->mask];

		/*
		 * If the desired CPU (where last recvmsg was done) is
		 * different from current CPU (one in the rx-queue flow
		 * table entry), switch if one of the following holds:
		 *   - Current CPU is unset (equal to RPS_NO_CPU).
		 *   - Current CPU is offline.
		 *   - The current CPU's queue tail has advanced beyond the
		 *     last packet that was enqueued using this table entry.
		 *     This guarantees that all previous packets for the flow
		 *     have been dequeued, thus preserving in order delivery.
		 */
		if (unlikely(tcpu != next_cpu) &&
		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
		      rflow->last_qtail)) >= 0)) {
			tcpu = next_cpu;
			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
		}

		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
			*rflowp = rflow;
			cpu = tcpu;
			goto done;
		}
	}

	/* Fall back to the RPS map: scale the 32-bit hash to [0, map->len) */
	if (map) {
		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];

		if (cpu_online(tcpu)) {
			cpu = tcpu;
			goto done;
		}
	}

done:
	return cpu;
}
3075
3076#ifdef CONFIG_RFS_ACCEL
3077
3078/**
3079 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3080 * @dev: Device on which the filter was set
3081 * @rxq_index: RX queue index
3082 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3083 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3084 *
3085 * Drivers that implement ndo_rx_flow_steer() should periodically call
3086 * this function for each installed filter and remove the filters for
3087 * which it returns %true.
3088 */
3089bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3090			 u32 flow_id, u16 filter_id)
3091{
3092	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3093	struct rps_dev_flow_table *flow_table;
3094	struct rps_dev_flow *rflow;
3095	bool expire = true;
3096	int cpu;
3097
3098	rcu_read_lock();
3099	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3100	if (flow_table && flow_id <= flow_table->mask) {
3101		rflow = &flow_table->flows[flow_id];
3102		cpu = ACCESS_ONCE(rflow->cpu);
3103		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3104		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3105			   rflow->last_qtail) <
3106		     (int)(10 * flow_table->mask)))
3107			expire = false;
3108	}
3109	rcu_read_unlock();
3110	return expire;
3111}
3112EXPORT_SYMBOL(rps_may_expire_flow);
3113
3114#endif /* CONFIG_RFS_ACCEL */
3115
/* Called from hardirq (IPI) context.
 * @data is the softnet_data whose backlog NAPI is to be scheduled; the
 * received_rps counter of that structure is incremented.
 */
static void rps_trigger_softirq(void *data)
{
	struct softnet_data *sd = data;

	____napi_schedule(sd, &sd->backlog);
	sd->received_rps++;
}
3124
3125#endif /* CONFIG_RPS */
3126
/*
 * Check whether @sd is the softnet_data of another CPU.
 * If so, chain it on this CPU's rps_ipi_list, raise NET_RX_SOFTIRQ and
 * return 1.  Otherwise (or without CONFIG_RPS) return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local_sd = &__get_cpu_var(softnet_data);

	if (sd != local_sd) {
		/* Push sd onto the head of our pending-IPI list */
		sd->rps_ipi_next = local_sd->rps_ipi_list;
		local_sd->rps_ipi_list = sd;

		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
		return 1;
	}
#endif /* CONFIG_RPS */
	return 0;
}
3147
#ifdef CONFIG_NET_FLOW_LIMIT
/* NOTE(review): presumably the number of buckets used when a flow limit
 * table is allocated; not referenced in this part of the file.
 */
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif
3151
3152static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3153{
3154#ifdef CONFIG_NET_FLOW_LIMIT
3155	struct sd_flow_limit *fl;
3156	struct softnet_data *sd;
3157	unsigned int old_flow, new_flow;
3158
3159	if (qlen < (netdev_max_backlog >> 1))
3160		return false;
3161
3162	sd = &__get_cpu_var(softnet_data);
3163
3164	rcu_read_lock();
3165	fl = rcu_dereference(sd->flow_limit);
3166	if (fl) {
3167		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3168		old_flow = fl->history[fl->history_head];
3169		fl->history[fl->history_head] = new_flow;
3170
3171		fl->history_head++;
3172		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3173
3174		if (likely(fl->buckets[old_flow]))
3175			fl->buckets[old_flow]--;
3176
3177		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3178			fl->count++;
3179			rcu_read_unlock();
3180			return true;
3181		}
3182	}
3183	rcu_read_unlock();
3184#endif
3185	return false;
3186}
3187
3188/*
3189 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3190 * queue (may be a remote CPU queue).
3191 */
3192static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3193			      unsigned int *qtail)
3194{
3195	struct softnet_data *sd;
3196	unsigned long flags;
3197	unsigned int qlen;
3198
3199	sd = &per_cpu(softnet_data, cpu);
3200
3201	local_irq_save(flags);
3202
3203	rps_lock(sd);
3204	qlen = skb_queue_len(&sd->input_pkt_queue);
3205	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3206		if (skb_queue_len(&sd->input_pkt_queue)) {
3207enqueue:
3208			__skb_queue_tail(&sd->input_pkt_queue, skb);
3209			input_queue_tail_incr_save(sd, qtail);
3210			rps_unlock(sd);
3211			local_irq_restore(flags);
3212			return NET_RX_SUCCESS;
3213		}
3214
3215		/* Schedule NAPI for backlog device
3216		 * We can use non atomic operation since we own the queue lock
3217		 */
3218		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3219			if (!rps_ipi_queued(sd))
3220				____napi_schedule(sd, &sd->backlog);
3221		}
3222		goto enqueue;
3223	}
3224
3225	sd->dropped++;
3226	rps_unlock(sd);
3227
3228	local_irq_restore(flags);
3229
3230	atomic_long_inc(&skb->dev->rx_dropped);
3231	kfree_skb(skb);
3232	return NET_RX_DROP;
3233}
3234
3235static int netif_rx_internal(struct sk_buff *skb)
3236{
3237	int ret;
3238
3239	/* if netpoll wants it, pretend we never saw it */
3240	if (netpoll_rx(skb))
3241		return NET_RX_DROP;
3242
3243	net_timestamp_check(netdev_tstamp_prequeue, skb);
3244
3245	trace_netif_rx(skb);
3246#ifdef CONFIG_RPS
3247	if (static_key_false(&rps_needed)) {
3248		struct rps_dev_flow voidflow, *rflow = &voidflow;
3249		int cpu;
3250
3251		preempt_disable();
3252		rcu_read_lock();
3253
3254		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3255		if (cpu < 0)
3256			cpu = smp_processor_id();
3257
3258		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3259
3260		rcu_read_unlock();
3261		preempt_enable();
3262	} else
3263#endif
3264	{
3265		unsigned int qtail;
3266		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3267		put_cpu();
3268	}
3269	return ret;
3270}
3271
/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	Takes a packet handed over by a device driver and queues it for
 *	processing by the upper (protocol) levels.  The call itself always
 *	succeeds; the buffer may still be dropped for congestion control
 *	or later on by the protocol layers.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
	int ret;

	trace_netif_rx_entry(skb);
	ret = netif_rx_internal(skb);

	return ret;
}
EXPORT_SYMBOL(netif_rx);
3294
/*
 * Variant of netif_rx() that, with preemption disabled, queues the packet
 * and then runs any softirq that became pending on this cpu.
 * Returns the value of netif_rx_internal().
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();
	ret = netif_rx_internal(skb);
	if (local_softirq_pending())
		do_softirq();
	preempt_enable();

	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
3310
/*
 * Transmit softirq handler for the current cpu.
 *
 * First frees every skb parked on softnet_data.completion_queue, then
 * walks the Qdiscs linked on softnet_data.output_queue and runs each one
 * whose root lock can be taken without waiting.
 */
static void net_tx_action(struct softirq_action *h)
{
	struct softnet_data *sd = &__get_cpu_var(softnet_data);

	if (sd->completion_queue) {
		struct sk_buff *clist;

		/* detach the whole list with irqs off, free it with irqs on */
		local_irq_disable();
		clist = sd->completion_queue;
		sd->completion_queue = NULL;
		local_irq_enable();

		while (clist) {
			struct sk_buff *skb = clist;
			clist = clist->next;

			/* warn if the skb still has a non zero users count */
			WARN_ON(atomic_read(&skb->users));
			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
				trace_consume_skb(skb);
			else
				trace_kfree_skb(skb, net_tx_action);
			__kfree_skb(skb);
		}
	}

	if (sd->output_queue) {
		struct Qdisc *head;

		/* detach the list and reset the tail pointer to the empty state */
		local_irq_disable();
		head = sd->output_queue;
		sd->output_queue = NULL;
		sd->output_queue_tailp = &sd->output_queue;
		local_irq_enable();

		while (head) {
			struct Qdisc *q = head;
			spinlock_t *root_lock;

			/* read the link before q may be rescheduled below */
			head = head->next_sched;

			root_lock = qdisc_lock(q);
			if (spin_trylock(root_lock)) {
				/* got the lock: clear SCHED, then run the qdisc */
				smp_mb__before_clear_bit();
				clear_bit(__QDISC_STATE_SCHED,
					  &q->state);
				qdisc_run(q);
				spin_unlock(root_lock);
			} else {
				/* lock is contended: retry later unless the
				 * qdisc has been deactivated, in which case
				 * only the SCHED bit is dropped
				 */
				if (!test_bit(__QDISC_STATE_DEACTIVATED,
					      &q->state)) {
					__netif_reschedule(q);
				} else {
					smp_mb__before_clear_bit();
					clear_bit(__QDISC_STATE_SCHED,
						  &q->state);
				}
			}
		}
	}
}
3371
#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/* This hook is defined here for ATM LANE.
 * Only built when both bridge and ATM LANE support are configured.
 * NOTE(review): presumably set by the bridge code and called by LANE to
 * test whether @addr is known on @dev - confirm against both users.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
3379
3380#ifdef CONFIG_NET_CLS_ACT
3381/* TODO: Maybe we should just force sch_ingress to be compiled in
3382 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3383 * a compare and 2 stores extra right now if we dont have it on
3384 * but have CONFIG_NET_CLS_ACT
3385 * NOTE: This doesn't stop any functionality; if you dont have
3386 * the ingress scheduler, you just can't add policies on ingress.
3387 *
3388 */
3389static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3390{
3391	struct net_device *dev = skb->dev;
3392	u32 ttl = G_TC_RTTL(skb->tc_verd);
3393	int result = TC_ACT_OK;
3394	struct Qdisc *q;
3395
3396	if (unlikely(MAX_RED_LOOP < ttl++)) {
3397		net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3398				     skb->skb_iif, dev->ifindex);
3399		return TC_ACT_SHOT;
3400	}
3401
3402	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3403	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3404
3405	q = rxq->qdisc;
3406	if (q != &noop_qdisc) {
3407		spin_lock(qdisc_lock(q));
3408		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3409			result = qdisc_enqueue_root(skb, q);
3410		spin_unlock(qdisc_lock(q));
3411	}
3412
3413	return result;
3414}
3415
3416static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3417					 struct packet_type **pt_prev,
3418					 int *ret, struct net_device *orig_dev)
3419{
3420	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3421
3422	if (!rxq || rxq->qdisc == &noop_qdisc)
3423		goto out;
3424
3425	if (*pt_prev) {
3426		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3427		*pt_prev = NULL;
3428	}
3429
3430	switch (ing_filter(skb, rxq)) {
3431	case TC_ACT_SHOT:
3432	case TC_ACT_STOLEN:
3433		kfree_skb(skb);
3434		return NULL;
3435	}
3436
3437out:
3438	skb->tc_verd = 0;
3439	return skb;
3440}
3441#endif
3442
3443/**
3444 *	netdev_rx_handler_register - register receive handler
3445 *	@dev: device to register a handler for
3446 *	@rx_handler: receive handler to register
3447 *	@rx_handler_data: data pointer that is used by rx handler
3448 *
3449 *	Register a receive hander for a device. This handler will then be
3450 *	called from __netif_receive_skb. A negative errno code is returned
3451 *	on a failure.
3452 *
3453 *	The caller must hold the rtnl_mutex.
3454 *
3455 *	For a general description of rx_handler, see enum rx_handler_result.
3456 */
3457int netdev_rx_handler_register(struct net_device *dev,
3458			       rx_handler_func_t *rx_handler,
3459			       void *rx_handler_data)
3460{
3461	ASSERT_RTNL();
3462
3463	if (dev->rx_handler)
3464		return -EBUSY;
3465
3466	/* Note: rx_handler_data must be set before rx_handler */
3467	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3468	rcu_assign_pointer(dev->rx_handler, rx_handler);
3469
3470	return 0;
3471}
3472EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3473
/**
 *	netdev_rx_handler_unregister - unregister receive handler
 *	@dev: device to unregister a handler from
 *
 *	Unregister a receive handler from a device: the handler pointer is
 *	cleared first, and the handler data pointer is cleared only after
 *	synchronize_net() has returned.
 *
 *	The caller must hold the rtnl_mutex.
 */
void netdev_rx_handler_unregister(struct net_device *dev)
{

	ASSERT_RTNL();
	RCU_INIT_POINTER(dev->rx_handler, NULL);
	/* A reader that sees a non-NULL rx_handler inside an
	 * rcu_read_lock() section is guaranteed to see a non-NULL
	 * rx_handler_data as well, hence the wait before clearing it.
	 */
	synchronize_net();
	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3495
3496/*
3497 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3498 * the special handling of PFMEMALLOC skbs.
3499 */
3500static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3501{
3502	switch (skb->protocol) {
3503	case __constant_htons(ETH_P_ARP):
3504	case __constant_htons(ETH_P_IP):
3505	case __constant_htons(ETH_P_IPV6):
3506	case __constant_htons(ETH_P_8021Q):
3507	case __constant_htons(ETH_P_8021AD):
3508		return true;
3509	default:
3510		return false;
3511	}
3512}
3513
/*
 * Core of the receive path: deliver @skb to the taps on ptype_all, run
 * ingress classification, vlan processing and the device rx_handler, and
 * finally hand the packet to the protocol handlers in ptype_base.
 *
 * @pfmemalloc: when true the ptype_all taps are skipped and the packet is
 * dropped unless skb_pfmemalloc_protocol() accepts its protocol.
 *
 * Delivery is done one step behind: pt_prev remembers the last matching
 * handler and is only invoked through deliver_skb() once another match
 * (or a processing stage that needs the list flushed) is found.  The
 * final handler is called directly through pt_prev->func().
 *
 * Returns the result of the last handler called, NET_RX_SUCCESS when the
 * rx_handler consumed the packet, or NET_RX_DROP.
 */
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct net_device *orig_dev;
	struct net_device *null_or_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;

	net_timestamp_check(!netdev_tstamp_prequeue, skb);

	trace_netif_receive_skb(skb);

	/* if we've gotten here through NAPI, check netpoll */
	if (netpoll_receive_skb(skb))
		goto out;

	/* remember the receiving device; skb->dev may change below */
	orig_dev = skb->dev;

	skb_reset_network_header(skb);
	if (!skb_transport_header_was_set(skb))
		skb_reset_transport_header(skb);
	skb_reset_mac_len(skb);

	pt_prev = NULL;

	rcu_read_lock();

	/* re-entered after vlan_do_receive() or an rx_handler asked for
	 * another pass
	 */
another_round:
	skb->skb_iif = skb->dev->ifindex;

	__this_cpu_inc(softnet_data.processed);

	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
		skb = vlan_untag(skb);
		if (unlikely(!skb))
			goto unlock;
	}

#ifdef CONFIG_NET_CLS_ACT
	/* TC_NCLS set: clear it and skip both taps and ingress filtering */
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	if (pfmemalloc)
		goto skip_taps;

	/* handlers on ptype_all: unbound ones and those bound to skb->dev */
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (!ptype->dev || ptype->dev == skb->dev) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

skip_taps:
#ifdef CONFIG_NET_CLS_ACT
	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto unlock;
ncls:
#endif

	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
		goto drop;

	if (vlan_tx_tag_present(skb)) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		if (vlan_do_receive(&skb))
			goto another_round;
		else if (unlikely(!skb))
			goto unlock;
	}

	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (rx_handler) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		switch (rx_handler(&skb)) {
		case RX_HANDLER_CONSUMED:
			ret = NET_RX_SUCCESS;
			goto unlock;
		case RX_HANDLER_ANOTHER:
			goto another_round;
		case RX_HANDLER_EXACT:
			deliver_exact = true;
			/* fall through */
		case RX_HANDLER_PASS:
			break;
		default:
			BUG();
		}
	}

	if (unlikely(vlan_tx_tag_present(skb))) {
		/* a tag is still present: a non zero vlan id marks the
		 * packet as PACKET_OTHERHOST
		 */
		if (vlan_tx_tag_get_id(skb))
			skb->pkt_type = PACKET_OTHERHOST;
		/* Note: we might in the future use prio bits
		 * and set skb->priority like in vlan_do_receive()
		 * For the time being, just ignore Priority Code Point
		 */
		skb->vlan_tci = 0;
	}

	/* deliver only exact match when indicated */
	null_or_dev = deliver_exact ? skb->dev : NULL;

	type = skb->protocol;
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type &&
		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
		     ptype->dev == orig_dev)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	if (pt_prev) {
		/* the last handler gets the skb itself */
		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
			goto drop;
		else
			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
drop:
		atomic_long_inc(&skb->dev->rx_dropped);
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

unlock:
	rcu_read_unlock();
out:
	return ret;
}
3661
3662static int __netif_receive_skb(struct sk_buff *skb)
3663{
3664	int ret;
3665
3666	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3667		unsigned long pflags = current->flags;
3668
3669		/*
3670		 * PFMEMALLOC skbs are special, they should
3671		 * - be delivered to SOCK_MEMALLOC sockets only
3672		 * - stay away from userspace
3673		 * - have bounded memory usage
3674		 *
3675		 * Use PF_MEMALLOC as this saves us from propagating the allocation
3676		 * context down to all allocation sites.
3677		 */
3678		current->flags |= PF_MEMALLOC;
3679		ret = __netif_receive_skb_core(skb, true);
3680		tsk_restore_flags(current, pflags, PF_MEMALLOC);
3681	} else
3682		ret = __netif_receive_skb_core(skb, false);
3683
3684	return ret;
3685}
3686
3687static int netif_receive_skb_internal(struct sk_buff *skb)
3688{
3689	net_timestamp_check(netdev_tstamp_prequeue, skb);
3690
3691	if (skb_defer_rx_timestamp(skb))
3692		return NET_RX_SUCCESS;
3693
3694#ifdef CONFIG_RPS
3695	if (static_key_false(&rps_needed)) {
3696		struct rps_dev_flow voidflow, *rflow = &voidflow;
3697		int cpu, ret;
3698
3699		rcu_read_lock();
3700
3701		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3702
3703		if (cpu >= 0) {
3704			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3705			rcu_read_unlock();
3706			return ret;
3707		}
3708		rcu_read_unlock();
3709	}
3710#endif
3711	return __netif_receive_skb(skb);
3712}
3713
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	netif_receive_skb() is the main receive data processing function.
 *	The call itself always succeeds; the buffer may still be dropped
 *	during processing for congestion control or by the protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	int ret;

	trace_netif_receive_skb_entry(skb);
	ret = netif_receive_skb_internal(skb);

	return ret;
}
EXPORT_SYMBOL(netif_receive_skb);
3736
3737/* Network device is going away, flush any packets still pending
3738 * Called with irqs disabled.
3739 */
3740static void flush_backlog(void *arg)
3741{
3742	struct net_device *dev = arg;
3743	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3744	struct sk_buff *skb, *tmp;
3745
3746	rps_lock(sd);
3747	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3748		if (skb->dev == dev) {
3749			__skb_unlink(skb, &sd->input_pkt_queue);
3750			kfree_skb(skb);
3751			input_queue_head_incr(sd);
3752		}
3753	}
3754	rps_unlock(sd);
3755
3756	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3757		if (skb->dev == dev) {
3758			__skb_unlink(skb, &sd->process_queue);
3759			kfree_skb(skb);
3760			input_queue_head_incr(sd);
3761		}
3762	}
3763}
3764
3765static int napi_gro_complete(struct sk_buff *skb)
3766{
3767	struct packet_offload *ptype;
3768	__be16 type = skb->protocol;
3769	struct list_head *head = &offload_base;
3770	int err = -ENOENT;
3771
3772	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3773
3774	if (NAPI_GRO_CB(skb)->count == 1) {
3775		skb_shinfo(skb)->gso_size = 0;
3776		goto out;
3777	}
3778
3779	rcu_read_lock();
3780	list_for_each_entry_rcu(ptype, head, list) {
3781		if (ptype->type != type || !ptype->callbacks.gro_complete)
3782			continue;
3783
3784		err = ptype->callbacks.gro_complete(skb, 0);
3785		break;
3786	}
3787	rcu_read_unlock();
3788
3789	if (err) {
3790		WARN_ON(&ptype->list == head);
3791		kfree_skb(skb);
3792		return NET_RX_SUCCESS;
3793	}
3794
3795out:
3796	return netif_receive_skb_internal(skb);
3797}
3798
3799/* napi->gro_list contains packets ordered by age.
3800 * youngest packets at the head of it.
3801 * Complete skbs in reverse order to reduce latencies.
3802 */
3803void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3804{
3805	struct sk_buff *skb, *prev = NULL;
3806
3807	/* scan list and build reverse chain */
3808	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3809		skb->prev = prev;
3810		prev = skb;
3811	}
3812
3813	for (skb = prev; skb; skb = prev) {
3814		skb->next = NULL;
3815
3816		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3817			return;
3818
3819		prev = skb->prev;
3820		napi_gro_complete(skb);
3821		napi->gro_count--;
3822	}
3823
3824	napi->gro_list = NULL;
3825}
3826EXPORT_SYMBOL(napi_gro_flush);
3827
3828static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3829{
3830	struct sk_buff *p;
3831	unsigned int maclen = skb->dev->hard_header_len;
3832	u32 hash = skb_get_hash_raw(skb);
3833
3834	for (p = napi->gro_list; p; p = p->next) {
3835		unsigned long diffs;
3836
3837		NAPI_GRO_CB(p)->flush = 0;
3838
3839		if (hash != skb_get_hash_raw(p)) {
3840			NAPI_GRO_CB(p)->same_flow = 0;
3841			continue;
3842		}
3843
3844		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3845		diffs |= p->vlan_tci ^ skb->vlan_tci;
3846		if (maclen == ETH_HLEN)
3847			diffs |= compare_ether_header(skb_mac_header(p),
3848						      skb_gro_mac_header(skb));
3849		else if (!diffs)
3850			diffs = memcmp(skb_mac_header(p),
3851				       skb_gro_mac_header(skb),
3852				       maclen);
3853		NAPI_GRO_CB(p)->same_flow = !diffs;
3854	}
3855}
3856
3857static void skb_gro_reset_offset(struct sk_buff *skb)
3858{
3859	const struct skb_shared_info *pinfo = skb_shinfo(skb);
3860	const skb_frag_t *frag0 = &pinfo->frags[0];
3861
3862	NAPI_GRO_CB(skb)->data_offset = 0;
3863	NAPI_GRO_CB(skb)->frag0 = NULL;
3864	NAPI_GRO_CB(skb)->frag0_len = 0;
3865
3866	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3867	    pinfo->nr_frags &&
3868	    !PageHighMem(skb_frag_page(frag0))) {
3869		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3870		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3871	}
3872}
3873
/*
 * Try to merge @skb into one of the packets held on napi->gro_list.
 *
 * Returns
 *  GRO_NORMAL      - GRO is not possible (device feature off, netpoll
 *                    active, skb already gso or with a frag list, no
 *                    offload handler for the protocol, or the handler
 *                    asked for a flush); the caller passes the skb up
 *  GRO_MERGED      - skb was merged into a held packet (same_flow set)
 *  GRO_MERGED_FREE - as above, and the handler set the free flag
 *  GRO_HELD        - skb was put at the head of napi->gro_list
 */
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct packet_offload *ptype;
	__be16 type = skb->protocol;
	struct list_head *head = &offload_base;
	int same_flow;
	enum gro_result ret;

	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
		goto normal;

	if (skb_is_gso(skb) || skb_has_frag_list(skb))
		goto normal;

	skb_gro_reset_offset(skb);
	gro_list_prepare(napi, skb);
	NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */

	/* run the gro_receive callback of the first matching offload */
	rcu_read_lock();
	list_for_each_entry_rcu(ptype, head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_receive)
			continue;

		skb_set_network_header(skb, skb_gro_offset(skb));
		skb_reset_mac_len(skb);
		NAPI_GRO_CB(skb)->same_flow = 0;
		NAPI_GRO_CB(skb)->flush = 0;
		NAPI_GRO_CB(skb)->free = 0;
		NAPI_GRO_CB(skb)->udp_mark = 0;

		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
		break;
	}
	rcu_read_unlock();

	/* the walk reached the list head again: no handler was found */
	if (&ptype->list == head)
		goto normal;

	same_flow = NAPI_GRO_CB(skb)->same_flow;
	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

	/* the handler returned a held packet: unlink and complete it */
	if (pp) {
		struct sk_buff *nskb = *pp;

		*pp = nskb->next;
		nskb->next = NULL;
		napi_gro_complete(nskb);
		napi->gro_count--;
	}

	if (same_flow)
		goto ok;

	if (NAPI_GRO_CB(skb)->flush)
		goto normal;

	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
		struct sk_buff *nskb = napi->gro_list;

		/* locate the end of the list to select the 'oldest' flow */
		while (nskb->next) {
			pp = &nskb->next;
			nskb = *pp;
		}
		*pp = NULL;
		nskb->next = NULL;
		napi_gro_complete(nskb);
	} else {
		napi->gro_count++;
	}
	/* hold the skb as the start of a new flow, at the list head */
	NAPI_GRO_CB(skb)->count = 1;
	NAPI_GRO_CB(skb)->age = jiffies;
	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
	skb->next = napi->gro_list;
	napi->gro_list = skb;
	ret = GRO_HELD;

pull:
	/* the GRO offset lies beyond the linear area: copy the missing
	 * bytes from frag0 to the tail and shrink the first fragment
	 */
	if (skb_headlen(skb) < skb_gro_offset(skb)) {
		int grow = skb_gro_offset(skb) - skb_headlen(skb);

		BUG_ON(skb->end - skb->tail < grow);

		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);

		skb->tail += grow;
		skb->data_len -= grow;

		skb_shinfo(skb)->frags[0].page_offset += grow;
		skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);

		/* first fragment is now empty: release it and close the gap */
		if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
			skb_frag_unref(skb, 0);
			memmove(skb_shinfo(skb)->frags,
				skb_shinfo(skb)->frags + 1,
				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
		}
	}

ok:
	return ret;

normal:
	ret = GRO_NORMAL;
	goto pull;
}
3981
3982struct packet_offload *gro_find_receive_by_type(__be16 type)
3983{
3984	struct list_head *offload_head = &offload_base;
3985	struct packet_offload *ptype;
3986
3987	list_for_each_entry_rcu(ptype, offload_head, list) {
3988		if (ptype->type != type || !ptype->callbacks.gro_receive)
3989			continue;
3990		return ptype;
3991	}
3992	return NULL;
3993}
3994EXPORT_SYMBOL(gro_find_receive_by_type);
3995
3996struct packet_offload *gro_find_complete_by_type(__be16 type)
3997{
3998	struct list_head *offload_head = &offload_base;
3999	struct packet_offload *ptype;
4000
4001	list_for_each_entry_rcu(ptype, offload_head, list) {
4002		if (ptype->type != type || !ptype->callbacks.gro_complete)
4003			continue;
4004		return ptype;
4005	}
4006	return NULL;
4007}
4008EXPORT_SYMBOL(gro_find_complete_by_type);
4009
4010static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4011{
4012	switch (ret) {
4013	case GRO_NORMAL:
4014		if (netif_receive_skb_internal(skb))
4015			ret = GRO_DROP;
4016		break;
4017
4018	case GRO_DROP:
4019		kfree_skb(skb);
4020		break;
4021
4022	case GRO_MERGED_FREE:
4023		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4024			kmem_cache_free(skbuff_head_cache, skb);
4025		else
4026			__kfree_skb(skb);
4027		break;
4028
4029	case GRO_HELD:
4030	case GRO_MERGED:
4031		break;
4032	}
4033
4034	return ret;
4035}
4036
4037gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4038{
4039	trace_napi_gro_receive_entry(skb);
4040
4041	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4042}
4043EXPORT_SYMBOL(napi_gro_receive);
4044
4045static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4046{
4047	__skb_pull(skb, skb_headlen(skb));
4048	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
4049	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4050	skb->vlan_tci = 0;
4051	skb->dev = napi->dev;
4052	skb->skb_iif = 0;
4053
4054	napi->skb = skb;
4055}
4056
4057struct sk_buff *napi_get_frags(struct napi_struct *napi)
4058{
4059	struct sk_buff *skb = napi->skb;
4060
4061	if (!skb) {
4062		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
4063		napi->skb = skb;
4064	}
4065	return skb;
4066}
4067EXPORT_SYMBOL(napi_get_frags);
4068
4069static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
4070			       gro_result_t ret)
4071{
4072	switch (ret) {
4073	case GRO_NORMAL:
4074		if (netif_receive_skb_internal(skb))
4075			ret = GRO_DROP;
4076		break;
4077
4078	case GRO_DROP:
4079	case GRO_MERGED_FREE:
4080		napi_reuse_skb(napi, skb);
4081		break;
4082
4083	case GRO_HELD:
4084	case GRO_MERGED:
4085		break;
4086	}
4087
4088	return ret;
4089}
4090
4091static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4092{
4093	struct sk_buff *skb = napi->skb;
4094
4095	napi->skb = NULL;
4096
4097	if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) {
4098		napi_reuse_skb(napi, skb);
4099		return NULL;
4100	}
4101	skb->protocol = eth_type_trans(skb, skb->dev);
4102
4103	return skb;
4104}
4105
4106gro_result_t napi_gro_frags(struct napi_struct *napi)
4107{
4108	struct sk_buff *skb = napi_frags_skb(napi);
4109
4110	if (!skb)
4111		return GRO_DROP;
4112
4113	trace_napi_gro_frags_entry(skb);
4114
4115	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4116}
4117EXPORT_SYMBOL(napi_gro_frags);
4118
/*
 * net_rps_action_and_irq_enable - send the IPIs pending for RPS.
 *
 * The rps_ipi_list of @sd is detached, local irqs are enabled, and every
 * softnet_data on the detached list whose cpu is online gets an IPI.
 * Note: called with local irq disabled, but exits with local irq enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *pending = sd->rps_ipi_list;
	struct softnet_data *next;

	if (!pending) {
		local_irq_enable();
		return;
	}

	sd->rps_ipi_list = NULL;

	local_irq_enable();

	/* Kick RPS processing on the remote cpus. */
	for (; pending; pending = next) {
		/* read the link before the IPI is sent */
		next = pending->rps_ipi_next;

		if (cpu_online(pending->cpu))
			__smp_call_function_single(pending->cpu,
						   &pending->csd, 0);
	}
#else
	local_irq_enable();
#endif
}
4146
/*
 * Poll function of the per cpu backlog NAPI.
 *
 * Processes up to @quota packets: skbs are taken from process_queue and
 * passed to __netif_receive_skb() with irqs enabled; when process_queue
 * runs empty, input_pkt_queue is spliced onto it under the rps lock.
 * Returns the number of packets processed.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
	int work = 0;
	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);

#ifdef CONFIG_RPS
	/* If IPIs are pending it is better to send them now than to wait
	 * for the end of net_rx_action().
	 */
	if (sd->rps_ipi_list) {
		local_irq_disable();
		net_rps_action_and_irq_enable(sd);
	}
#endif
	napi->weight = weight_p;
	local_irq_disable();
	while (work < quota) {
		struct sk_buff *skb;
		unsigned int qlen;

		while ((skb = __skb_dequeue(&sd->process_queue))) {
			/* irqs are only enabled while the packet is processed */
			local_irq_enable();
			__netif_receive_skb(skb);
			local_irq_disable();
			input_queue_head_incr(sd);
			if (++work >= quota) {
				local_irq_enable();
				return work;
			}
		}

		/* process_queue is empty: refill it from input_pkt_queue */
		rps_lock(sd);
		qlen = skb_queue_len(&sd->input_pkt_queue);
		if (qlen)
			skb_queue_splice_tail_init(&sd->input_pkt_queue,
						   &sd->process_queue);

		if (qlen < quota - work) {
			/*
			 * Inline a custom version of __napi_complete().
			 * Only the current cpu owns and manipulates this napi,
			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
			 * We can use a plain write instead of clear_bit(),
			 * and we don't need an smp_mb() memory barrier.
			 */
			list_del(&napi->poll_list);
			napi->state = 0;

			/* stop once the packets just spliced are done */
			quota = work + qlen;
		}
		rps_unlock(sd);
	}
	local_irq_enable();

	return work;
}
4203
4204/**
4205 * __napi_schedule - schedule for receive
4206 * @n: entry to schedule
4207 *
4208 * The entry's receive function will be scheduled to run
4209 */
4210void __napi_schedule(struct napi_struct *n)
4211{
4212	unsigned long flags;
4213
4214	local_irq_save(flags);
4215	____napi_schedule(&__get_cpu_var(softnet_data), n);
4216	local_irq_restore(flags);
4217}
4218EXPORT_SYMBOL(__napi_schedule);
4219
/*
 * Take @n off its poll list and clear NAPI_STATE_SCHED.
 *
 * It is a bug to call this for an entry that is not scheduled or that
 * still holds packets on its gro_list.  napi_complete() in this file
 * calls it with local irqs disabled.
 */
void __napi_complete(struct napi_struct *n)
{
	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
	BUG_ON(n->gro_list);

	list_del(&n->poll_list);
	/* barrier is issued before the SCHED bit is cleared */
	smp_mb__before_clear_bit();
	clear_bit(NAPI_STATE_SCHED, &n->state);
}
EXPORT_SYMBOL(__napi_complete);
4230
4231void napi_complete(struct napi_struct *n)
4232{
4233	unsigned long flags;
4234
4235	/*
4236	 * don't let napi dequeue from the cpu poll list
4237	 * just in case its running on a different cpu
4238	 */
4239	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4240		return;
4241
4242	napi_gro_flush(n, false);
4243	local_irq_save(flags);
4244	__napi_complete(n);
4245	local_irq_restore(flags);
4246}
4247EXPORT_SYMBOL(napi_complete);
4248
4249/* must be called under rcu_read_lock(), as we dont take a reference */
4250struct napi_struct *napi_by_id(unsigned int napi_id)
4251{
4252	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4253	struct napi_struct *napi;
4254
4255	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4256		if (napi->napi_id == napi_id)
4257			return napi;
4258
4259	return NULL;
4260}
4261EXPORT_SYMBOL_GPL(napi_by_id);
4262
4263void napi_hash_add(struct napi_struct *napi)
4264{
4265	if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4266
4267		spin_lock(&napi_hash_lock);
4268
4269		/* 0 is not a valid id, we also skip an id that is taken
4270		 * we expect both events to be extremely rare
4271		 */
4272		napi->napi_id = 0;
4273		while (!napi->napi_id) {
4274			napi->napi_id = ++napi_gen_id;
4275			if (napi_by_id(napi->napi_id))
4276				napi->napi_id = 0;
4277		}
4278
4279		hlist_add_head_rcu(&napi->napi_hash_node,
4280			&napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4281
4282		spin_unlock(&napi_hash_lock);
4283	}
4284}
4285EXPORT_SYMBOL_GPL(napi_hash_add);
4286
4287/* Warning : caller is responsible to make sure rcu grace period
4288 * is respected before freeing memory containing @napi
4289 */
4290void napi_hash_del(struct napi_struct *napi)
4291{
4292	spin_lock(&napi_hash_lock);
4293
4294	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4295		hlist_del_rcu(&napi->napi_hash_node);
4296
4297	spin_unlock(&napi_hash_lock);
4298}
4299EXPORT_SYMBOL_GPL(napi_hash_del);
4300
4301void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4302		    int (*poll)(struct napi_struct *, int), int weight)
4303{
4304	INIT_LIST_HEAD(&napi->poll_list);
4305	napi->gro_count = 0;
4306	napi->gro_list = NULL;
4307	napi->skb = NULL;
4308	napi->poll = poll;
4309	if (weight > NAPI_POLL_WEIGHT)
4310		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4311			    weight, dev->name);
4312	napi->weight = weight;
4313	list_add(&napi->dev_list, &dev->napi_list);
4314	napi->dev = dev;
4315#ifdef CONFIG_NETPOLL
4316	spin_lock_init(&napi->poll_lock);
4317	napi->poll_owner = -1;
4318#endif
4319	set_bit(NAPI_STATE_SCHED, &napi->state);
4320}
4321EXPORT_SYMBOL(netif_napi_add);
4322
4323void netif_napi_del(struct napi_struct *napi)
4324{
4325	list_del_init(&napi->dev_list);
4326	napi_free_frags(napi);
4327
4328	kfree_skb_list(napi->gro_list);
4329	napi->gro_list = NULL;
4330	napi->gro_count = 0;
4331}
4332EXPORT_SYMBOL(netif_napi_del);
4333
/*
 * Receive softirq handler for the current cpu.
 *
 * Polls the NAPI instances on softnet_data.poll_list until the list is
 * empty, the packet budget (netdev_budget) is used up or the time limit
 * of 2 jiffies is reached.  In the last two cases time_squeeze is
 * incremented and the softirq is raised again.  Pending RPS IPIs are sent
 * on the way out.
 */
static void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = &__get_cpu_var(softnet_data);
	unsigned long time_limit = jiffies + 2;
	int budget = netdev_budget;
	void *have;

	local_irq_disable();

	while (!list_empty(&sd->poll_list)) {
		struct napi_struct *n;
		int work, weight;

		/* If the softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies, which will allow
		 * an average latency of 1.5/HZ.
		 */
		if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
			goto softnet_break;

		local_irq_enable();

		/* Even though interrupts have been re-enabled, this
		 * access is safe because interrupts can only add new
		 * entries to the tail of this list, and only ->poll()
		 * calls can remove this head entry from the list.
		 */
		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);

		have = netpoll_poll_lock(n);

		weight = n->weight;

		/* This NAPI_STATE_SCHED test is for avoiding a race
		 * with netpoll's poll_napi().  Only the entity which
		 * obtains the lock and sees NAPI_STATE_SCHED set will
		 * actually make the ->poll() call.  Therefore we avoid
		 * accidentally calling ->poll() when NAPI is not scheduled.
		 */
		work = 0;
		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
			work = n->poll(n, weight);
			trace_napi_poll(n);
		}

		/* a poll function must not report more work than its weight */
		WARN_ON_ONCE(work > weight);

		budget -= work;

		local_irq_disable();

		/* Drivers must not modify the NAPI state if they
		 * consume the entire weight.  In such cases this code
		 * still "owns" the NAPI instance and therefore can
		 * move the instance around on the list at-will.
		 */
		if (unlikely(work == weight)) {
			if (unlikely(napi_disable_pending(n))) {
				local_irq_enable();
				napi_complete(n);
				local_irq_disable();
			} else {
				if (n->gro_list) {
					/* Flush packets that are too old.
					 * If HZ < 1000, flush all packets.
					 */
					local_irq_enable();
					napi_gro_flush(n, HZ >= 1000);
					local_irq_disable();
				}
				/* requeue at the tail so the other entries get their turn */
				list_move_tail(&n->poll_list, &sd->poll_list);
			}
		}

		netpoll_poll_unlock(have);
	}
out:
	/* sends pending RPS IPIs and re-enables local irqs */
	net_rps_action_and_irq_enable(sd);

#ifdef CONFIG_NET_DMA
	/*
	 * There may not be any more sk_buffs coming right now, so push
	 * any pending DMA copies to hardware
	 */
	dma_issue_pending_all();
#endif

	return;

softnet_break:
	sd->time_squeeze++;
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	goto out;
}
4428
/*
 * One entry in a device's upper/lower adjacency list.  Entries are
 * created by __netdev_adjacent_dev_insert() and released with
 * kfree_rcu() by __netdev_adjacent_dev_remove().
 */
struct netdev_adjacent {
	/* the adjacent device this entry refers to (a reference is held) */
	struct net_device *dev;

	/* upper master flag, there can only be one master device per list */
	bool master;

	/* counter for the number of times this device was added to us */
	u16 ref_nr;

	/* private field for the users */
	void *private;

	/* linkage into the owning adjacency list */
	struct list_head list;
	/* used by kfree_rcu() when the entry is released */
	struct rcu_head rcu;
};
4444
4445static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4446						 struct net_device *adj_dev,
4447						 struct list_head *adj_list)
4448{
4449	struct netdev_adjacent *adj;
4450
4451	list_for_each_entry(adj, adj_list, list) {
4452		if (adj->dev == adj_dev)
4453			return adj;
4454	}
4455	return NULL;
4456}
4457
4458/**
4459 * netdev_has_upper_dev - Check if device is linked to an upper device
4460 * @dev: device
4461 * @upper_dev: upper device to check
4462 *
4463 * Find out if a device is linked to specified upper device and return true
4464 * in case it is. Note that this checks only immediate upper device,
4465 * not through a complete stack of devices. The caller must hold the RTNL lock.
4466 */
4467bool netdev_has_upper_dev(struct net_device *dev,
4468			  struct net_device *upper_dev)
4469{
4470	ASSERT_RTNL();
4471
4472	return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4473}
4474EXPORT_SYMBOL(netdev_has_upper_dev);
4475
4476/**
4477 * netdev_has_any_upper_dev - Check if device is linked to some device
4478 * @dev: device
4479 *
4480 * Find out if a device is linked to an upper device and return true in case
4481 * it is. The caller must hold the RTNL lock.
4482 */
4483static bool netdev_has_any_upper_dev(struct net_device *dev)
4484{
4485	ASSERT_RTNL();
4486
4487	return !list_empty(&dev->all_adj_list.upper);
4488}
4489
4490/**
4491 * netdev_master_upper_dev_get - Get master upper device
4492 * @dev: device
4493 *
4494 * Find a master upper device and return pointer to it or NULL in case
4495 * it's not there. The caller must hold the RTNL lock.
4496 */
4497struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4498{
4499	struct netdev_adjacent *upper;
4500
4501	ASSERT_RTNL();
4502
4503	if (list_empty(&dev->adj_list.upper))
4504		return NULL;
4505
4506	upper = list_first_entry(&dev->adj_list.upper,
4507				 struct netdev_adjacent, list);
4508	if (likely(upper->master))
4509		return upper->dev;
4510	return NULL;
4511}
4512EXPORT_SYMBOL(netdev_master_upper_dev_get);
4513
4514void *netdev_adjacent_get_private(struct list_head *adj_list)
4515{
4516	struct netdev_adjacent *adj;
4517
4518	adj = list_entry(adj_list, struct netdev_adjacent, list);
4519
4520	return adj->private;
4521}
4522EXPORT_SYMBOL(netdev_adjacent_get_private);
4523
4524/**
4525 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4526 * @dev: device
4527 * @iter: list_head ** of the current position
4528 *
4529 * Gets the next device from the dev's upper list, starting from iter
4530 * position. The caller must hold RCU read lock.
4531 */
4532struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4533						     struct list_head **iter)
4534{
4535	struct netdev_adjacent *upper;
4536
4537	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4538
4539	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4540
4541	if (&upper->list == &dev->all_adj_list.upper)
4542		return NULL;
4543
4544	*iter = &upper->list;
4545
4546	return upper->dev;
4547}
4548EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4549
4550/**
4551 * netdev_lower_get_next_private - Get the next ->private from the
4552 *				   lower neighbour list
4553 * @dev: device
4554 * @iter: list_head ** of the current position
4555 *
4556 * Gets the next netdev_adjacent->private from the dev's lower neighbour
4557 * list, starting from iter position. The caller must hold either hold the
4558 * RTNL lock or its own locking that guarantees that the neighbour lower
4559 * list will remain unchainged.
4560 */
4561void *netdev_lower_get_next_private(struct net_device *dev,
4562				    struct list_head **iter)
4563{
4564	struct netdev_adjacent *lower;
4565
4566	lower = list_entry(*iter, struct netdev_adjacent, list);
4567
4568	if (&lower->list == &dev->adj_list.lower)
4569		return NULL;
4570
4571	if (iter)
4572		*iter = lower->list.next;
4573
4574	return lower->private;
4575}
4576EXPORT_SYMBOL(netdev_lower_get_next_private);
4577
4578/**
4579 * netdev_lower_get_next_private_rcu - Get the next ->private from the
4580 *				       lower neighbour list, RCU
4581 *				       variant
4582 * @dev: device
4583 * @iter: list_head ** of the current position
4584 *
4585 * Gets the next netdev_adjacent->private from the dev's lower neighbour
4586 * list, starting from iter position. The caller must hold RCU read lock.
4587 */
4588void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4589					struct list_head **iter)
4590{
4591	struct netdev_adjacent *lower;
4592
4593	WARN_ON_ONCE(!rcu_read_lock_held());
4594
4595	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4596
4597	if (&lower->list == &dev->adj_list.lower)
4598		return NULL;
4599
4600	if (iter)
4601		*iter = &lower->list;
4602
4603	return lower->private;
4604}
4605EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4606
4607/**
4608 * netdev_lower_get_first_private_rcu - Get the first ->private from the
4609 *				       lower neighbour list, RCU
4610 *				       variant
4611 * @dev: device
4612 *
4613 * Gets the first netdev_adjacent->private from the dev's lower neighbour
4614 * list. The caller must hold RCU read lock.
4615 */
4616void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4617{
4618	struct netdev_adjacent *lower;
4619
4620	lower = list_first_or_null_rcu(&dev->adj_list.lower,
4621			struct netdev_adjacent, list);
4622	if (lower)
4623		return lower->private;
4624	return NULL;
4625}
4626EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4627
4628/**
4629 * netdev_master_upper_dev_get_rcu - Get master upper device
4630 * @dev: device
4631 *
4632 * Find a master upper device and return pointer to it or NULL in case
4633 * it's not there. The caller must hold the RCU read lock.
4634 */
4635struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4636{
4637	struct netdev_adjacent *upper;
4638
4639	upper = list_first_or_null_rcu(&dev->adj_list.upper,
4640				       struct netdev_adjacent, list);
4641	if (upper && likely(upper->master))
4642		return upper->dev;
4643	return NULL;
4644}
4645EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4646
4647static int netdev_adjacent_sysfs_add(struct net_device *dev,
4648			      struct net_device *adj_dev,
4649			      struct list_head *dev_list)
4650{
4651	char linkname[IFNAMSIZ+7];
4652	sprintf(linkname, dev_list == &dev->adj_list.upper ?
4653		"upper_%s" : "lower_%s", adj_dev->name);
4654	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4655				 linkname);
4656}
4657static void netdev_adjacent_sysfs_del(struct net_device *dev,
4658			       char *name,
4659			       struct list_head *dev_list)
4660{
4661	char linkname[IFNAMSIZ+7];
4662	sprintf(linkname, dev_list == &dev->adj_list.upper ?
4663		"upper_%s" : "lower_%s", name);
4664	sysfs_remove_link(&(dev->dev.kobj), linkname);
4665}
4666
/*
 * True when @dev_list is one of @dev's neighbour lists, i.e. its
 * adj_list.upper or adj_list.lower.  Both arguments are parenthesised so
 * that arbitrary pointer expressions expand correctly; note that each
 * argument is evaluated twice.
 */
#define netdev_adjacent_is_neigh_list(dev, dev_list) \
		((dev_list) == &(dev)->adj_list.upper || \
		 (dev_list) == &(dev)->adj_list.lower)
4670
4671static int __netdev_adjacent_dev_insert(struct net_device *dev,
4672					struct net_device *adj_dev,
4673					struct list_head *dev_list,
4674					void *private, bool master)
4675{
4676	struct netdev_adjacent *adj;
4677	int ret;
4678
4679	adj = __netdev_find_adj(dev, adj_dev, dev_list);
4680
4681	if (adj) {
4682		adj->ref_nr++;
4683		return 0;
4684	}
4685
4686	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4687	if (!adj)
4688		return -ENOMEM;
4689
4690	adj->dev = adj_dev;
4691	adj->master = master;
4692	adj->ref_nr = 1;
4693	adj->private = private;
4694	dev_hold(adj_dev);
4695
4696	pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4697		 adj_dev->name, dev->name, adj_dev->name);
4698
4699	if (netdev_adjacent_is_neigh_list(dev, dev_list)) {
4700		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
4701		if (ret)
4702			goto free_adj;
4703	}
4704
4705	/* Ensure that master link is always the first item in list. */
4706	if (master) {
4707		ret = sysfs_create_link(&(dev->dev.kobj),
4708					&(adj_dev->dev.kobj), "master");
4709		if (ret)
4710			goto remove_symlinks;
4711
4712		list_add_rcu(&adj->list, dev_list);
4713	} else {
4714		list_add_tail_rcu(&adj->list, dev_list);
4715	}
4716
4717	return 0;
4718
4719remove_symlinks:
4720	if (netdev_adjacent_is_neigh_list(dev, dev_list))
4721		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4722free_adj:
4723	kfree(adj);
4724	dev_put(adj_dev);
4725
4726	return ret;
4727}
4728
4729static void __netdev_adjacent_dev_remove(struct net_device *dev,
4730					 struct net_device *adj_dev,
4731					 struct list_head *dev_list)
4732{
4733	struct netdev_adjacent *adj;
4734
4735	adj = __netdev_find_adj(dev, adj_dev, dev_list);
4736
4737	if (!adj) {
4738		pr_err("tried to remove device %s from %s\n",
4739		       dev->name, adj_dev->name);
4740		BUG();
4741	}
4742
4743	if (adj->ref_nr > 1) {
4744		pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
4745			 adj->ref_nr-1);
4746		adj->ref_nr--;
4747		return;
4748	}
4749
4750	if (adj->master)
4751		sysfs_remove_link(&(dev->dev.kobj), "master");
4752
4753	if (netdev_adjacent_is_neigh_list(dev, dev_list))
4754		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4755
4756	list_del_rcu(&adj->list);
4757	pr_debug("dev_put for %s, because link removed from %s to %s\n",
4758		 adj_dev->name, dev->name, adj_dev->name);
4759	dev_put(adj_dev);
4760	kfree_rcu(adj, rcu);
4761}
4762
4763static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
4764					    struct net_device *upper_dev,
4765					    struct list_head *up_list,
4766					    struct list_head *down_list,
4767					    void *private, bool master)
4768{
4769	int ret;
4770
4771	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
4772					   master);
4773	if (ret)
4774		return ret;
4775
4776	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
4777					   false);
4778	if (ret) {
4779		__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
4780		return ret;
4781	}
4782
4783	return 0;
4784}
4785
4786static int __netdev_adjacent_dev_link(struct net_device *dev,
4787				      struct net_device *upper_dev)
4788{
4789	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
4790						&dev->all_adj_list.upper,
4791						&upper_dev->all_adj_list.lower,
4792						NULL, false);
4793}
4794
/*
 * Undo __netdev_adjacent_dev_link_lists(): drop @upper_dev from @up_list
 * and then @dev from @down_list.
 */
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
					       struct net_device *upper_dev,
					       struct list_head *up_list,
					       struct list_head *down_list)
{
	/* upward direction first ... */
	__netdev_adjacent_dev_remove(dev, upper_dev, up_list);

	/* ... then the matching downward link */
	__netdev_adjacent_dev_remove(upper_dev, dev, down_list);
}
4803
4804static void __netdev_adjacent_dev_unlink(struct net_device *dev,
4805					 struct net_device *upper_dev)
4806{
4807	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4808					   &dev->all_adj_list.upper,
4809					   &upper_dev->all_adj_list.lower);
4810}
4811
4812static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
4813						struct net_device *upper_dev,
4814						void *private, bool master)
4815{
4816	int ret = __netdev_adjacent_dev_link(dev, upper_dev);
4817
4818	if (ret)
4819		return ret;
4820
4821	ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
4822					       &dev->adj_list.upper,
4823					       &upper_dev->adj_list.lower,
4824					       private, master);
4825	if (ret) {
4826		__netdev_adjacent_dev_unlink(dev, upper_dev);
4827		return ret;
4828	}
4829
4830	return 0;
4831}
4832
4833static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
4834						   struct net_device *upper_dev)
4835{
4836	__netdev_adjacent_dev_unlink(dev, upper_dev);
4837	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4838					   &dev->adj_list.upper,
4839					   &upper_dev->adj_list.lower);
4840}
4841
/*
 * Make @upper_dev an upper device of @dev and propagate the new
 * relationship through both devices' all_adj_list lists.
 *
 * Returns -EBUSY when the two devices are the same, when @dev is already
 * an upper device of @upper_dev (which would form a loop) or when a
 * master is requested while @dev already has one; -EEXIST when the link
 * is already present; otherwise 0 or the error of a failing link step.
 * On failure every link created here is removed again.  On success
 * NETDEV_CHANGEUPPER is sent for @dev.  Requires the RTNL lock.
 */
static int __netdev_upper_dev_link(struct net_device *dev,
				   struct net_device *upper_dev, bool master,
				   void *private)
{
	struct netdev_adjacent *i, *j, *to_i, *to_j;
	int ret = 0;

	ASSERT_RTNL();

	if (dev == upper_dev)
		return -EBUSY;

	/* To prevent loops, check if dev is not upper device to upper_dev. */
	if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
		return -EBUSY;

	if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
		return -EEXIST;

	if (master && netdev_master_upper_dev_get(dev))
		return -EBUSY;

	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
						   master);
	if (ret)
		return ret;

	/* Now that we linked these devs, make all the upper_dev's
	 * all_adj_list.upper visible to every dev's all_adj_list.lower and
	 * vice versa, and don't forget the devices themselves. All of these
	 * links are non-neighbours.
	 */
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
			pr_debug("Interlinking %s with %s, non-neighbour\n",
				 i->dev->name, j->dev->name);
			ret = __netdev_adjacent_dev_link(i->dev, j->dev);
			if (ret)
				goto rollback_mesh;
		}
	}

	/* add dev to every upper_dev's upper device */
	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
		pr_debug("linking %s's upper device %s with %s\n",
			 upper_dev->name, i->dev->name, dev->name);
		ret = __netdev_adjacent_dev_link(dev, i->dev);
		if (ret)
			goto rollback_upper_mesh;
	}

	/* add upper_dev to every dev's lower device */
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		pr_debug("linking %s's lower device %s with %s\n", dev->name,
			 i->dev->name, upper_dev->name);
		ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
		if (ret)
			goto rollback_lower_mesh;
	}

	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
	return 0;

	/* Each rollback stage replays its loop and stops at the entry where
	 * linking failed (to_i/to_j); a NULL cursor never matches, so the
	 * stages reached by fall-through undo their loop completely.
	 */
rollback_lower_mesh:
	to_i = i;
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		if (i == to_i)
			break;
		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
	}

	i = NULL;

rollback_upper_mesh:
	to_i = i;
	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
		if (i == to_i)
			break;
		__netdev_adjacent_dev_unlink(dev, i->dev);
	}

	i = j = NULL;

rollback_mesh:
	to_i = i;
	to_j = j;
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
			if (i == to_i && j == to_j)
				break;
			__netdev_adjacent_dev_unlink(i->dev, j->dev);
		}
		if (i == to_i)
			break;
	}

	/* Finally drop the direct neighbour link created at the start. */
	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);

	return ret;
}
4942
4943/**
4944 * netdev_upper_dev_link - Add a link to the upper device
4945 * @dev: device
4946 * @upper_dev: new upper device
4947 *
4948 * Adds a link to device which is upper to this one. The caller must hold
4949 * the RTNL lock. On a failure a negative errno code is returned.
4950 * On success the reference counts are adjusted and the function
4951 * returns zero.
4952 */
4953int netdev_upper_dev_link(struct net_device *dev,
4954			  struct net_device *upper_dev)
4955{
4956	return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
4957}
4958EXPORT_SYMBOL(netdev_upper_dev_link);
4959
4960/**
4961 * netdev_master_upper_dev_link - Add a master link to the upper device
4962 * @dev: device
4963 * @upper_dev: new upper device
4964 *
4965 * Adds a link to device which is upper to this one. In this case, only
4966 * one master upper device can be linked, although other non-master devices
4967 * might be linked as well. The caller must hold the RTNL lock.
4968 * On a failure a negative errno code is returned. On success the reference
4969 * counts are adjusted and the function returns zero.
4970 */
4971int netdev_master_upper_dev_link(struct net_device *dev,
4972				 struct net_device *upper_dev)
4973{
4974	return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
4975}
4976EXPORT_SYMBOL(netdev_master_upper_dev_link);
4977
4978int netdev_master_upper_dev_link_private(struct net_device *dev,
4979					 struct net_device *upper_dev,
4980					 void *private)
4981{
4982	return __netdev_upper_dev_link(dev, upper_dev, true, private);
4983}
4984EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
4985
4986/**
4987 * netdev_upper_dev_unlink - Removes a link to upper device
4988 * @dev: device
4989 * @upper_dev: new upper device
4990 *
4991 * Removes a link to device which is upper to this one. The caller must hold
4992 * the RTNL lock.
4993 */
4994void netdev_upper_dev_unlink(struct net_device *dev,
4995			     struct net_device *upper_dev)
4996{
4997	struct netdev_adjacent *i, *j;
4998	ASSERT_RTNL();
4999
5000	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5001
5002	/* Here is the tricky part. We must remove all dev's lower
5003	 * devices from all upper_dev's upper devices and vice
5004	 * versa, to maintain the graph relationship.
5005	 */
5006	list_for_each_entry(i, &dev->all_adj_list.lower, list)
5007		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5008			__netdev_adjacent_dev_unlink(i->dev, j->dev);
5009
5010	/* remove also the devices itself from lower/upper device
5011	 * list
5012	 */
5013	list_for_each_entry(i, &dev->all_adj_list.lower, list)
5014		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
5015
5016	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5017		__netdev_adjacent_dev_unlink(dev, i->dev);
5018
5019	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5020}
5021EXPORT_SYMBOL(netdev_upper_dev_unlink);
5022
5023void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5024{
5025	struct netdev_adjacent *iter;
5026
5027	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5028		netdev_adjacent_sysfs_del(iter->dev, oldname,
5029					  &iter->dev->adj_list.lower);
5030		netdev_adjacent_sysfs_add(iter->dev, dev,
5031					  &iter->dev->adj_list.lower);
5032	}
5033
5034	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5035		netdev_adjacent_sysfs_del(iter->dev, oldname,
5036					  &iter->dev->adj_list.upper);
5037		netdev_adjacent_sysfs_add(iter->dev, dev,
5038					  &iter->dev->adj_list.upper);
5039	}
5040}
5041
5042void *netdev_lower_dev_get_private(struct net_device *dev,
5043				   struct net_device *lower_dev)
5044{
5045	struct netdev_adjacent *lower;
5046
5047	if (!lower_dev)
5048		return NULL;
5049	lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5050	if (!lower)
5051		return NULL;
5052
5053	return lower->private;
5054}
5055EXPORT_SYMBOL(netdev_lower_dev_get_private);
5056
5057static void dev_change_rx_flags(struct net_device *dev, int flags)
5058{
5059	const struct net_device_ops *ops = dev->netdev_ops;
5060
5061	if (ops->ndo_change_rx_flags)
5062		ops->ndo_change_rx_flags(dev, flags);
5063}
5064
5065static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5066{
5067	unsigned int old_flags = dev->flags;
5068	kuid_t uid;
5069	kgid_t gid;
5070
5071	ASSERT_RTNL();
5072
5073	dev->flags |= IFF_PROMISC;
5074	dev->promiscuity += inc;
5075	if (dev->promiscuity == 0) {
5076		/*
5077		 * Avoid overflow.
5078		 * If inc causes overflow, untouch promisc and return error.
5079		 */
5080		if (inc < 0)
5081			dev->flags &= ~IFF_PROMISC;
5082		else {
5083			dev->promiscuity -= inc;
5084			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5085				dev->name);
5086			return -EOVERFLOW;
5087		}
5088	}
5089	if (dev->flags != old_flags) {
5090		pr_info("device %s %s promiscuous mode\n",
5091			dev->name,
5092			dev->flags & IFF_PROMISC ? "entered" : "left");
5093		if (audit_enabled) {
5094			current_uid_gid(&uid, &gid);
5095			audit_log(current->audit_context, GFP_ATOMIC,
5096				AUDIT_ANOM_PROMISCUOUS,
5097				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5098				dev->name, (dev->flags & IFF_PROMISC),
5099				(old_flags & IFF_PROMISC),
5100				from_kuid(&init_user_ns, audit_get_loginuid(current)),
5101				from_kuid(&init_user_ns, uid),
5102				from_kgid(&init_user_ns, gid),
5103				audit_get_sessionid(current));
5104		}
5105
5106		dev_change_rx_flags(dev, IFF_PROMISC);
5107	}
5108	if (notify)
5109		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
5110	return 0;
5111}
5112
5113/**
5114 *	dev_set_promiscuity	- update promiscuity count on a device
5115 *	@dev: device
5116 *	@inc: modifier
5117 *
5118 *	Add or remove promiscuity from a device. While the count in the device
5119 *	remains above zero the interface remains promiscuous. Once it hits zero
5120 *	the device reverts back to normal filtering operation. A negative inc
5121 *	value is used to drop promiscuity on the device.
5122 *	Return 0 if successful or a negative errno code on error.
5123 */
5124int dev_set_promiscuity(struct net_device *dev, int inc)
5125{
5126	unsigned int old_flags = dev->flags;
5127	int err;
5128
5129	err = __dev_set_promiscuity(dev, inc, true);
5130	if (err < 0)
5131		return err;
5132	if (dev->flags != old_flags)
5133		dev_set_rx_mode(dev);
5134	return err;
5135}
5136EXPORT_SYMBOL(dev_set_promiscuity);
5137
5138static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5139{
5140	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5141
5142	ASSERT_RTNL();
5143
5144	dev->flags |= IFF_ALLMULTI;
5145	dev->allmulti += inc;
5146	if (dev->allmulti == 0) {
5147		/*
5148		 * Avoid overflow.
5149		 * If inc causes overflow, untouch allmulti and return error.
5150		 */
5151		if (inc < 0)
5152			dev->flags &= ~IFF_ALLMULTI;
5153		else {
5154			dev->allmulti -= inc;
5155			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5156				dev->name);
5157			return -EOVERFLOW;
5158		}
5159	}
5160	if (dev->flags ^ old_flags) {
5161		dev_change_rx_flags(dev, IFF_ALLMULTI);
5162		dev_set_rx_mode(dev);
5163		if (notify)
5164			__dev_notify_flags(dev, old_flags,
5165					   dev->gflags ^ old_gflags);
5166	}
5167	return 0;
5168}
5169
5170/**
5171 *	dev_set_allmulti	- update allmulti count on a device
5172 *	@dev: device
5173 *	@inc: modifier
5174 *
5175 *	Add or remove reception of all multicast frames to a device. While the
5176 *	count in the device remains above zero the interface remains listening
5177 *	to all interfaces. Once it hits zero the device reverts back to normal
5178 *	filtering operation. A negative @inc value is used to drop the counter
5179 *	when releasing a resource needing all multicasts.
5180 *	Return 0 if successful or a negative errno code on error.
5181 */
5182
5183int dev_set_allmulti(struct net_device *dev, int inc)
5184{
5185	return __dev_set_allmulti(dev, inc, true);
5186}
5187EXPORT_SYMBOL(dev_set_allmulti);
5188
5189/*
5190 *	Upload unicast and multicast address lists to device and
5191 *	configure RX filtering. When the device doesn't support unicast
5192 *	filtering it is put in promiscuous mode while unicast addresses
5193 *	are present.
5194 */
5195void __dev_set_rx_mode(struct net_device *dev)
5196{
5197	const struct net_device_ops *ops = dev->netdev_ops;
5198
5199	/* dev_open will call this function so the list will stay sane. */
5200	if (!(dev->flags&IFF_UP))
5201		return;
5202
5203	if (!netif_device_present(dev))
5204		return;
5205
5206	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5207		/* Unicast addresses changes may only happen under the rtnl,
5208		 * therefore calling __dev_set_promiscuity here is safe.
5209		 */
5210		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5211			__dev_set_promiscuity(dev, 1, false);
5212			dev->uc_promisc = true;
5213		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5214			__dev_set_promiscuity(dev, -1, false);
5215			dev->uc_promisc = false;
5216		}
5217	}
5218
5219	if (ops->ndo_set_rx_mode)
5220		ops->ndo_set_rx_mode(dev);
5221}
5222
/*
 * Run __dev_set_rx_mode() on @dev while holding the device's address
 * list lock with bottom halves disabled.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
5229
5230/**
5231 *	dev_get_flags - get flags reported to userspace
5232 *	@dev: device
5233 *
5234 *	Get the combination of flag bits exported through APIs to userspace.
5235 */
5236unsigned int dev_get_flags(const struct net_device *dev)
5237{
5238	unsigned int flags;
5239
5240	flags = (dev->flags & ~(IFF_PROMISC |
5241				IFF_ALLMULTI |
5242				IFF_RUNNING |
5243				IFF_LOWER_UP |
5244				IFF_DORMANT)) |
5245		(dev->gflags & (IFF_PROMISC |
5246				IFF_ALLMULTI));
5247
5248	if (netif_running(dev)) {
5249		if (netif_oper_up(dev))
5250			flags |= IFF_RUNNING;
5251		if (netif_carrier_ok(dev))
5252			flags |= IFF_LOWER_UP;
5253		if (netif_dormant(dev))
5254			flags |= IFF_DORMANT;
5255	}
5256
5257	return flags;
5258}
5259EXPORT_SYMBOL(dev_get_flags);
5260
5261int __dev_change_flags(struct net_device *dev, unsigned int flags)
5262{
5263	unsigned int old_flags = dev->flags;
5264	int ret;
5265
5266	ASSERT_RTNL();
5267
5268	/*
5269	 *	Set the flags on our device.
5270	 */
5271
5272	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5273			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5274			       IFF_AUTOMEDIA)) |
5275		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5276				    IFF_ALLMULTI));
5277
5278	/*
5279	 *	Load in the correct multicast list now the flags have changed.
5280	 */
5281
5282	if ((old_flags ^ flags) & IFF_MULTICAST)
5283		dev_change_rx_flags(dev, IFF_MULTICAST);
5284
5285	dev_set_rx_mode(dev);
5286
5287	/*
5288	 *	Have we downed the interface. We handle IFF_UP ourselves
5289	 *	according to user attempts to set it, rather than blindly
5290	 *	setting it.
5291	 */
5292
5293	ret = 0;
5294	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
5295		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5296
5297		if (!ret)
5298			dev_set_rx_mode(dev);
5299	}
5300
5301	if ((flags ^ dev->gflags) & IFF_PROMISC) {
5302		int inc = (flags & IFF_PROMISC) ? 1 : -1;
5303		unsigned int old_flags = dev->flags;
5304
5305		dev->gflags ^= IFF_PROMISC;
5306
5307		if (__dev_set_promiscuity(dev, inc, false) >= 0)
5308			if (dev->flags != old_flags)
5309				dev_set_rx_mode(dev);
5310	}
5311
5312	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5313	   is important. Some (broken) drivers set IFF_PROMISC, when
5314	   IFF_ALLMULTI is requested not asking us and not reporting.
5315	 */
5316	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5317		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5318
5319		dev->gflags ^= IFF_ALLMULTI;
5320		__dev_set_allmulti(dev, inc, false);
5321	}
5322
5323	return ret;
5324}
5325
5326void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5327			unsigned int gchanges)
5328{
5329	unsigned int changes = dev->flags ^ old_flags;
5330
5331	if (gchanges)
5332		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5333
5334	if (changes & IFF_UP) {
5335		if (dev->flags & IFF_UP)
5336			call_netdevice_notifiers(NETDEV_UP, dev);
5337		else
5338			call_netdevice_notifiers(NETDEV_DOWN, dev);
5339	}
5340
5341	if (dev->flags & IFF_UP &&
5342	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5343		struct netdev_notifier_change_info change_info;
5344
5345		change_info.flags_changed = changes;
5346		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5347					      &change_info.info);
5348	}
5349}
5350
5351/**
5352 *	dev_change_flags - change device settings
5353 *	@dev: device
5354 *	@flags: device state flags
5355 *
5356 *	Change settings on device based state flags. The flags are
5357 *	in the userspace exported format.
5358 */
5359int dev_change_flags(struct net_device *dev, unsigned int flags)
5360{
5361	int ret;
5362	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5363
5364	ret = __dev_change_flags(dev, flags);
5365	if (ret < 0)
5366		return ret;
5367
5368	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5369	__dev_notify_flags(dev, old_flags, changes);
5370	return ret;
5371}
5372EXPORT_SYMBOL(dev_change_flags);
5373
5374static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5375{
5376	const struct net_device_ops *ops = dev->netdev_ops;
5377
5378	if (ops->ndo_change_mtu)
5379		return ops->ndo_change_mtu(dev, new_mtu);
5380
5381	dev->mtu = new_mtu;
5382	return 0;
5383}
5384
5385/**
5386 *	dev_set_mtu - Change maximum transfer unit
5387 *	@dev: device
5388 *	@new_mtu: new transfer unit
5389 *
5390 *	Change the maximum transfer size of the network device.
5391 */
5392int dev_set_mtu(struct net_device *dev, int new_mtu)
5393{
5394	int err, orig_mtu;
5395
5396	if (new_mtu == dev->mtu)
5397		return 0;
5398
5399	/*	MTU must be positive.	 */
5400	if (new_mtu < 0)
5401		return -EINVAL;
5402
5403	if (!netif_device_present(dev))
5404		return -ENODEV;
5405
5406	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5407	err = notifier_to_errno(err);
5408	if (err)
5409		return err;
5410
5411	orig_mtu = dev->mtu;
5412	err = __dev_set_mtu(dev, new_mtu);
5413
5414	if (!err) {
5415		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5416		err = notifier_to_errno(err);
5417		if (err) {
5418			/* setting mtu back and notifying everyone again,
5419			 * so that they have a chance to revert changes.
5420			 */
5421			__dev_set_mtu(dev, orig_mtu);
5422			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5423		}
5424	}
5425	return err;
5426}
5427EXPORT_SYMBOL(dev_set_mtu);
5428
5429/**
5430 *	dev_set_group - Change group this device belongs to
5431 *	@dev: device
5432 *	@new_group: group this device should belong to
5433 */
5434void dev_set_group(struct net_device *dev, int new_group)
5435{
5436	dev->group = new_group;
5437}
5438EXPORT_SYMBOL(dev_set_group);
5439
5440/**
5441 *	dev_set_mac_address - Change Media Access Control Address
5442 *	@dev: device
5443 *	@sa: new address
5444 *
5445 *	Change the hardware (MAC) address of the device
5446 */
5447int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5448{
5449	const struct net_device_ops *ops = dev->netdev_ops;
5450	int err;
5451
5452	if (!ops->ndo_set_mac_address)
5453		return -EOPNOTSUPP;
5454	if (sa->sa_family != dev->type)
5455		return -EINVAL;
5456	if (!netif_device_present(dev))
5457		return -ENODEV;
5458	err = ops->ndo_set_mac_address(dev, sa);
5459	if (err)
5460		return err;
5461	dev->addr_assign_type = NET_ADDR_SET;
5462	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5463	add_device_randomness(dev->dev_addr, dev->addr_len);
5464	return 0;
5465}
5466EXPORT_SYMBOL(dev_set_mac_address);
5467
5468/**
5469 *	dev_change_carrier - Change device carrier
5470 *	@dev: device
5471 *	@new_carrier: new value
5472 *
5473 *	Change device carrier
5474 */
5475int dev_change_carrier(struct net_device *dev, bool new_carrier)
5476{
5477	const struct net_device_ops *ops = dev->netdev_ops;
5478
5479	if (!ops->ndo_change_carrier)
5480		return -EOPNOTSUPP;
5481	if (!netif_device_present(dev))
5482		return -ENODEV;
5483	return ops->ndo_change_carrier(dev, new_carrier);
5484}
5485EXPORT_SYMBOL(dev_change_carrier);
5486
5487/**
5488 *	dev_get_phys_port_id - Get device physical port ID
5489 *	@dev: device
5490 *	@ppid: port ID
5491 *
5492 *	Get device physical port ID
5493 */
5494int dev_get_phys_port_id(struct net_device *dev,
5495			 struct netdev_phys_port_id *ppid)
5496{
5497	const struct net_device_ops *ops = dev->netdev_ops;
5498
5499	if (!ops->ndo_get_phys_port_id)
5500		return -EOPNOTSUPP;
5501	return ops->ndo_get_phys_port_id(dev, ppid);
5502}
5503EXPORT_SYMBOL(dev_get_phys_port_id);
5504
5505/**
5506 *	dev_new_index	-	allocate an ifindex
5507 *	@net: the applicable net namespace
5508 *
5509 *	Returns a suitable unique value for a new device interface
5510 *	number.  The caller must hold the rtnl semaphore or the
5511 *	dev_base_lock to be sure it remains unique.
5512 */
5513static int dev_new_index(struct net *net)
5514{
5515	int ifindex = net->ifindex;
5516	for (;;) {
5517		if (++ifindex <= 0)
5518			ifindex = 1;
5519		if (!__dev_get_by_index(net, ifindex))
5520			return net->ifindex = ifindex;
5521	}
5522}
5523
5524/* Delayed registration/unregisteration */
5525static LIST_HEAD(net_todo_list);
5526static DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5527
5528static void net_set_todo(struct net_device *dev)
5529{
5530	list_add_tail(&dev->todo_list, &net_todo_list);
5531	dev_net(dev)->dev_unreg_count++;
5532}
5533
5534static void rollback_registered_many(struct list_head *head)
5535{
5536	struct net_device *dev, *tmp;
5537	LIST_HEAD(close_head);
5538
5539	BUG_ON(dev_boot_phase);
5540	ASSERT_RTNL();
5541
5542	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5543		/* Some devices call without registering
5544		 * for initialization unwind. Remove those
5545		 * devices and proceed with the remaining.
5546		 */
5547		if (dev->reg_state == NETREG_UNINITIALIZED) {
5548			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5549				 dev->name, dev);
5550
5551			WARN_ON(1);
5552			list_del(&dev->unreg_list);
5553			continue;
5554		}
5555		dev->dismantle = true;
5556		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5557	}
5558
5559	/* If device is running, close it first. */
5560	list_for_each_entry(dev, head, unreg_list)
5561		list_add_tail(&dev->close_list, &close_head);
5562	dev_close_many(&close_head);
5563
5564	list_for_each_entry(dev, head, unreg_list) {
5565		/* And unlink it from device chain. */
5566		unlist_netdevice(dev);
5567
5568		dev->reg_state = NETREG_UNREGISTERING;
5569	}
5570
5571	synchronize_net();
5572
5573	list_for_each_entry(dev, head, unreg_list) {
5574		/* Shutdown queueing discipline. */
5575		dev_shutdown(dev);
5576
5577
5578		/* Notify protocols, that we are about to destroy
5579		   this device. They should clean all the things.
5580		*/
5581		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5582
5583		if (!dev->rtnl_link_ops ||
5584		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5585			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
5586
5587		/*
5588		 *	Flush the unicast and multicast chains
5589		 */
5590		dev_uc_flush(dev);
5591		dev_mc_flush(dev);
5592
5593		if (dev->netdev_ops->ndo_uninit)
5594			dev->netdev_ops->ndo_uninit(dev);
5595
5596		/* Notifier chain MUST detach us all upper devices. */
5597		WARN_ON(netdev_has_any_upper_dev(dev));
5598
5599		/* Remove entries from kobject tree */
5600		netdev_unregister_kobject(dev);
5601#ifdef CONFIG_XPS
5602		/* Remove XPS queueing entries */
5603		netif_reset_xps_queues_gt(dev, 0);
5604#endif
5605	}
5606
5607	synchronize_net();
5608
5609	list_for_each_entry(dev, head, unreg_list)
5610		dev_put(dev);
5611}
5612
5613static void rollback_registered(struct net_device *dev)
5614{
5615	LIST_HEAD(single);
5616
5617	list_add(&dev->unreg_list, &single);
5618	rollback_registered_many(&single);
5619	list_del(&single);
5620}
5621
5622static netdev_features_t netdev_fix_features(struct net_device *dev,
5623	netdev_features_t features)
5624{
5625	/* Fix illegal checksum combinations */
5626	if ((features & NETIF_F_HW_CSUM) &&
5627	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5628		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5629		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5630	}
5631
5632	/* TSO requires that SG is present as well. */
5633	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5634		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5635		features &= ~NETIF_F_ALL_TSO;
5636	}
5637
5638	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5639					!(features & NETIF_F_IP_CSUM)) {
5640		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5641		features &= ~NETIF_F_TSO;
5642		features &= ~NETIF_F_TSO_ECN;
5643	}
5644
5645	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5646					 !(features & NETIF_F_IPV6_CSUM)) {
5647		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5648		features &= ~NETIF_F_TSO6;
5649	}
5650
5651	/* TSO ECN requires that TSO is present as well. */
5652	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5653		features &= ~NETIF_F_TSO_ECN;
5654
5655	/* Software GSO depends on SG. */
5656	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5657		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5658		features &= ~NETIF_F_GSO;
5659	}
5660
5661	/* UFO needs SG and checksumming */
5662	if (features & NETIF_F_UFO) {
5663		/* maybe split UFO into V4 and V6? */
5664		if (!((features & NETIF_F_GEN_CSUM) ||
5665		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5666			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5667			netdev_dbg(dev,
5668				"Dropping NETIF_F_UFO since no checksum offload features.\n");
5669			features &= ~NETIF_F_UFO;
5670		}
5671
5672		if (!(features & NETIF_F_SG)) {
5673			netdev_dbg(dev,
5674				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5675			features &= ~NETIF_F_UFO;
5676		}
5677	}
5678
5679	return features;
5680}
5681
5682int __netdev_update_features(struct net_device *dev)
5683{
5684	netdev_features_t features;
5685	int err = 0;
5686
5687	ASSERT_RTNL();
5688
5689	features = netdev_get_wanted_features(dev);
5690
5691	if (dev->netdev_ops->ndo_fix_features)
5692		features = dev->netdev_ops->ndo_fix_features(dev, features);
5693
5694	/* driver might be less strict about feature dependencies */
5695	features = netdev_fix_features(dev, features);
5696
5697	if (dev->features == features)
5698		return 0;
5699
5700	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5701		&dev->features, &features);
5702
5703	if (dev->netdev_ops->ndo_set_features)
5704		err = dev->netdev_ops->ndo_set_features(dev, features);
5705
5706	if (unlikely(err < 0)) {
5707		netdev_err(dev,
5708			"set_features() failed (%d); wanted %pNF, left %pNF\n",
5709			err, &features, &dev->features);
5710		return -1;
5711	}
5712
5713	if (!err)
5714		dev->features = features;
5715
5716	return 1;
5717}
5718
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications if it
 *	has changed. Should be called after driver or hardware dependent
 *	conditions might have changed that influence the features.
 *
 *	A notification is sent for any non-zero result of
 *	__netdev_update_features(), which includes its -1 failure value.
 */
void netdev_update_features(struct net_device *dev)
{
	int changed = __netdev_update_features(dev);

	if (changed != 0)
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5733
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications even
 *	if they have not changed. Should be called instead of
 *	netdev_update_features() if also dev->vlan_features might
 *	have changed to allow the changes to be propagated to stacked
 *	VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* The result is deliberately ignored: always notify. */
	__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5750
5751/**
5752 *	netif_stacked_transfer_operstate -	transfer operstate
5753 *	@rootdev: the root or lower level device to transfer state from
5754 *	@dev: the device to transfer operstate to
5755 *
5756 *	Transfer operational state from root to device. This is normally
5757 *	called when a stacking relationship exists between the root
5758 *	device and the device(a leaf device).
5759 */
5760void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5761					struct net_device *dev)
5762{
5763	if (rootdev->operstate == IF_OPER_DORMANT)
5764		netif_dormant_on(dev);
5765	else
5766		netif_dormant_off(dev);
5767
5768	if (netif_carrier_ok(rootdev)) {
5769		if (!netif_carrier_ok(dev))
5770			netif_carrier_on(dev);
5771	} else {
5772		if (netif_carrier_ok(dev))
5773			netif_carrier_off(dev);
5774	}
5775}
5776EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5777
#ifdef CONFIG_SYSFS
/* Allocate the zeroed dev->_rx array with dev->num_rx_queues entries and
 * point each entry back at @dev.  Returns 0 or -ENOMEM.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int nr = dev->num_rx_queues;
	struct netdev_rx_queue *queues;
	unsigned int idx;

	BUG_ON(nr < 1);

	queues = kcalloc(nr, sizeof(*queues), GFP_KERNEL);
	if (!queues)
		return -ENOMEM;

	dev->_rx = queues;

	for (idx = 0; idx < nr; idx++)
		queues[idx].dev = dev;

	return 0;
}
#endif
5797
5798static void netdev_init_one_queue(struct net_device *dev,
5799				  struct netdev_queue *queue, void *_unused)
5800{
5801	/* Initialize queue lock */
5802	spin_lock_init(&queue->_xmit_lock);
5803	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5804	queue->xmit_lock_owner = -1;
5805	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5806	queue->dev = dev;
5807#ifdef CONFIG_BQL
5808	dql_init(&queue->dql, HZ);
5809#endif
5810}
5811
5812static void netif_free_tx_queues(struct net_device *dev)
5813{
5814	if (is_vmalloc_addr(dev->_tx))
5815		vfree(dev->_tx);
5816	else
5817		kfree(dev->_tx);
5818}
5819
5820static int netif_alloc_netdev_queues(struct net_device *dev)
5821{
5822	unsigned int count = dev->num_tx_queues;
5823	struct netdev_queue *tx;
5824	size_t sz = count * sizeof(*tx);
5825
5826	BUG_ON(count < 1 || count > 0xffff);
5827
5828	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
5829	if (!tx) {
5830		tx = vzalloc(sz);
5831		if (!tx)
5832			return -ENOMEM;
5833	}
5834	dev->_tx = tx;
5835
5836	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5837	spin_lock_init(&dev->tx_global_lock);
5838
5839	return 0;
5840}
5841
5842/**
5843 *	register_netdevice	- register a network device
5844 *	@dev: device to register
5845 *
5846 *	Take a completed network device structure and add it to the kernel
5847 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5848 *	chain. 0 is returned on success. A negative errno code is returned
5849 *	on a failure to set up the device, or if the name is a duplicate.
5850 *
5851 *	Callers must hold the rtnl semaphore. You may want
5852 *	register_netdev() instead of this.
5853 *
5854 *	BUGS:
5855 *	The locking appears insufficient to guarantee two parallel registers
5856 *	will not get the same name.
5857 */
5858
5859int register_netdevice(struct net_device *dev)
5860{
5861	int ret;
5862	struct net *net = dev_net(dev);
5863
5864	BUG_ON(dev_boot_phase);
5865	ASSERT_RTNL();
5866
5867	might_sleep();
5868
5869	/* When net_device's are persistent, this will be fatal. */
5870	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5871	BUG_ON(!net);
5872
5873	spin_lock_init(&dev->addr_list_lock);
5874	netdev_set_addr_lockdep_class(dev);
5875
5876	dev->iflink = -1;
5877
5878	ret = dev_get_valid_name(net, dev, dev->name);
5879	if (ret < 0)
5880		goto out;
5881
5882	/* Init, if this function is available */
5883	if (dev->netdev_ops->ndo_init) {
5884		ret = dev->netdev_ops->ndo_init(dev);
5885		if (ret) {
5886			if (ret > 0)
5887				ret = -EIO;
5888			goto out;
5889		}
5890	}
5891
5892	if (((dev->hw_features | dev->features) &
5893	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
5894	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5895	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5896		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5897		ret = -EINVAL;
5898		goto err_uninit;
5899	}
5900
5901	ret = -EBUSY;
5902	if (!dev->ifindex)
5903		dev->ifindex = dev_new_index(net);
5904	else if (__dev_get_by_index(net, dev->ifindex))
5905		goto err_uninit;
5906
5907	if (dev->iflink == -1)
5908		dev->iflink = dev->ifindex;
5909
5910	/* Transfer changeable features to wanted_features and enable
5911	 * software offloads (GSO and GRO).
5912	 */
5913	dev->hw_features |= NETIF_F_SOFT_FEATURES;
5914	dev->features |= NETIF_F_SOFT_FEATURES;
5915	dev->wanted_features = dev->features & dev->hw_features;
5916
5917	if (!(dev->flags & IFF_LOOPBACK)) {
5918		dev->hw_features |= NETIF_F_NOCACHE_COPY;
5919	}
5920
5921	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5922	 */
5923	dev->vlan_features |= NETIF_F_HIGHDMA;
5924
5925	/* Make NETIF_F_SG inheritable to tunnel devices.
5926	 */
5927	dev->hw_enc_features |= NETIF_F_SG;
5928
5929	/* Make NETIF_F_SG inheritable to MPLS.
5930	 */
5931	dev->mpls_features |= NETIF_F_SG;
5932
5933	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5934	ret = notifier_to_errno(ret);
5935	if (ret)
5936		goto err_uninit;
5937
5938	ret = netdev_register_kobject(dev);
5939	if (ret)
5940		goto err_uninit;
5941	dev->reg_state = NETREG_REGISTERED;
5942
5943	__netdev_update_features(dev);
5944
5945	/*
5946	 *	Default initial state at registry is that the
5947	 *	device is present.
5948	 */
5949
5950	set_bit(__LINK_STATE_PRESENT, &dev->state);
5951
5952	linkwatch_init_dev(dev);
5953
5954	dev_init_scheduler(dev);
5955	dev_hold(dev);
5956	list_netdevice(dev);
5957	add_device_randomness(dev->dev_addr, dev->addr_len);
5958
5959	/* If the device has permanent device address, driver should
5960	 * set dev_addr and also addr_assign_type should be set to
5961	 * NET_ADDR_PERM (default value).
5962	 */
5963	if (dev->addr_assign_type == NET_ADDR_PERM)
5964		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5965
5966	/* Notify protocols, that a new device appeared. */
5967	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5968	ret = notifier_to_errno(ret);
5969	if (ret) {
5970		rollback_registered(dev);
5971		dev->reg_state = NETREG_UNREGISTERED;
5972	}
5973	/*
5974	 *	Prevent userspace races by waiting until the network
5975	 *	device is fully setup before sending notifications.
5976	 */
5977	if (!dev->rtnl_link_ops ||
5978	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5979		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
5980
5981out:
5982	return ret;
5983
5984err_uninit:
5985	if (dev->netdev_ops->ndo_uninit)
5986		dev->netdev_ops->ndo_uninit(dev);
5987	goto out;
5988}
5989EXPORT_SYMBOL(register_netdevice);
5990
5991/**
5992 *	init_dummy_netdev	- init a dummy network device for NAPI
5993 *	@dev: device to init
5994 *
5995 *	This takes a network device structure and initialize the minimum
5996 *	amount of fields so it can be used to schedule NAPI polls without
5997 *	registering a full blown interface. This is to be used by drivers
5998 *	that need to tie several hardware interfaces to a single NAPI
5999 *	poll scheduler due to HW limitations.
6000 */
6001int init_dummy_netdev(struct net_device *dev)
6002{
6003	/* Clear everything. Note we don't initialize spinlocks
6004	 * are they aren't supposed to be taken by any of the
6005	 * NAPI code and this dummy netdev is supposed to be
6006	 * only ever used for NAPI polls
6007	 */
6008	memset(dev, 0, sizeof(struct net_device));
6009
6010	/* make sure we BUG if trying to hit standard
6011	 * register/unregister code path
6012	 */
6013	dev->reg_state = NETREG_DUMMY;
6014
6015	/* NAPI wants this */
6016	INIT_LIST_HEAD(&dev->napi_list);
6017
6018	/* a dummy interface is started by default */
6019	set_bit(__LINK_STATE_PRESENT, &dev->state);
6020	set_bit(__LINK_STATE_START, &dev->state);
6021
6022	/* Note : We dont allocate pcpu_refcnt for dummy devices,
6023	 * because users of this 'device' dont need to change
6024	 * its refcount.
6025	 */
6026
6027	return 0;
6028}
6029EXPORT_SYMBOL_GPL(init_dummy_netdev);
6030
6031
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
 *	and expands the device name if you passed a format string to
 *	alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int rc;

	rtnl_lock();
	rc = register_netdevice(dev);
	rtnl_unlock();

	return rc;
}
EXPORT_SYMBOL(register_netdev);
6055
6056int netdev_refcnt_read(const struct net_device *dev)
6057{
6058	int i, refcnt = 0;
6059
6060	for_each_possible_cpu(i)
6061		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6062	return refcnt;
6063}
6064EXPORT_SYMBOL(netdev_refcnt_read);
6065
6066/**
6067 * netdev_wait_allrefs - wait until all references are gone.
6068 * @dev: target net_device
6069 *
6070 * This is called when unregistering network devices.
6071 *
6072 * Any protocol or device that holds a reference should register
6073 * for netdevice notification, and cleanup and put back the
6074 * reference if they receive an UNREGISTER event.
6075 * We can get stuck here if buggy protocols don't correctly
6076 * call dev_put.
6077 */
6078static void netdev_wait_allrefs(struct net_device *dev)
6079{
6080	unsigned long rebroadcast_time, warning_time;
6081	int refcnt;
6082
6083	linkwatch_forget_dev(dev);
6084
6085	rebroadcast_time = warning_time = jiffies;
6086	refcnt = netdev_refcnt_read(dev);
6087
6088	while (refcnt != 0) {
6089		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6090			rtnl_lock();
6091
6092			/* Rebroadcast unregister notification */
6093			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6094
6095			__rtnl_unlock();
6096			rcu_barrier();
6097			rtnl_lock();
6098
6099			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6100			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6101				     &dev->state)) {
6102				/* We must not have linkwatch events
6103				 * pending on unregister. If this
6104				 * happens, we simply run the queue
6105				 * unscheduled, resulting in a noop
6106				 * for this device.
6107				 */
6108				linkwatch_run_queue();
6109			}
6110
6111			__rtnl_unlock();
6112
6113			rebroadcast_time = jiffies;
6114		}
6115
6116		msleep(250);
6117
6118		refcnt = netdev_refcnt_read(dev);
6119
6120		if (time_after(jiffies, warning_time + 10 * HZ)) {
6121			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6122				 dev->name, refcnt);
6123			warning_time = jiffies;
6124		}
6125	}
6126}
6127
6128/* The sequence is:
6129 *
6130 *	rtnl_lock();
6131 *	...
6132 *	register_netdevice(x1);
6133 *	register_netdevice(x2);
6134 *	...
6135 *	unregister_netdevice(y1);
6136 *	unregister_netdevice(y2);
6137 *      ...
6138 *	rtnl_unlock();
6139 *	free_netdev(y1);
6140 *	free_netdev(y2);
6141 *
6142 * We are invoked by rtnl_unlock().
6143 * This allows us to deal with problems:
6144 * 1) We can delete sysfs objects which invoke hotplug
6145 *    without deadlocking with linkwatch via keventd.
6146 * 2) Since we run with the RTNL semaphore not held, we can sleep
6147 *    safely in order to wait for the netdev refcnt to drop to zero.
6148 *
6149 * We must not return until all unregister events added during
6150 * the interval the lock was held have been completed.
6151 */
6152void netdev_run_todo(void)
6153{
6154	struct list_head list;
6155
6156	/* Snapshot list, allow later requests */
6157	list_replace_init(&net_todo_list, &list);
6158
6159	__rtnl_unlock();
6160
6161
6162	/* Wait for rcu callbacks to finish before next phase */
6163	if (!list_empty(&list))
6164		rcu_barrier();
6165
6166	while (!list_empty(&list)) {
6167		struct net_device *dev
6168			= list_first_entry(&list, struct net_device, todo_list);
6169		list_del(&dev->todo_list);
6170
6171		rtnl_lock();
6172		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6173		__rtnl_unlock();
6174
6175		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6176			pr_err("network todo '%s' but state %d\n",
6177			       dev->name, dev->reg_state);
6178			dump_stack();
6179			continue;
6180		}
6181
6182		dev->reg_state = NETREG_UNREGISTERED;
6183
6184		on_each_cpu(flush_backlog, dev, 1);
6185
6186		netdev_wait_allrefs(dev);
6187
6188		/* paranoia */
6189		BUG_ON(netdev_refcnt_read(dev));
6190		WARN_ON(rcu_access_pointer(dev->ip_ptr));
6191		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6192		WARN_ON(dev->dn_ptr);
6193
6194		if (dev->destructor)
6195			dev->destructor(dev);
6196
6197		/* Report a network device has been unregistered */
6198		rtnl_lock();
6199		dev_net(dev)->dev_unreg_count--;
6200		__rtnl_unlock();
6201		wake_up(&netdev_unregistering_wq);
6202
6203		/* Free network device */
6204		kobject_put(&dev->dev.kobj);
6205	}
6206}
6207
6208/* Convert net_device_stats to rtnl_link_stats64.  They have the same
6209 * fields in the same order, with only the type differing.
6210 */
6211void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6212			     const struct net_device_stats *netdev_stats)
6213{
6214#if BITS_PER_LONG == 64
6215	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6216	memcpy(stats64, netdev_stats, sizeof(*stats64));
6217#else
6218	size_t i, n = sizeof(*stats64) / sizeof(u64);
6219	const unsigned long *src = (const unsigned long *)netdev_stats;
6220	u64 *dst = (u64 *)stats64;
6221
6222	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6223		     sizeof(*stats64) / sizeof(u64));
6224	for (i = 0; i < n; i++)
6225		dst[i] = src[i];
6226#endif
6227}
6228EXPORT_SYMBOL(netdev_stats_to_stats64);
6229
6230/**
6231 *	dev_get_stats	- get network device statistics
6232 *	@dev: device to get statistics from
6233 *	@storage: place to store stats
6234 *
6235 *	Get network statistics from device. Return @storage.
6236 *	The device driver may provide its own method by setting
6237 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6238 *	otherwise the internal statistics structure is used.
6239 */
6240struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6241					struct rtnl_link_stats64 *storage)
6242{
6243	const struct net_device_ops *ops = dev->netdev_ops;
6244
6245	if (ops->ndo_get_stats64) {
6246		memset(storage, 0, sizeof(*storage));
6247		ops->ndo_get_stats64(dev, storage);
6248	} else if (ops->ndo_get_stats) {
6249		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6250	} else {
6251		netdev_stats_to_stats64(storage, &dev->stats);
6252	}
6253	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6254	return storage;
6255}
6256EXPORT_SYMBOL(dev_get_stats);
6257
/* Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT a missing
 * queue is allocated, initialised with noop_qdisc and published in
 * dev->ingress_queue; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the result of dev_ingress_queue() is
 * returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			ingress->qdisc = &noop_qdisc;
			ingress->qdisc_sleeping = &noop_qdisc;
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
6275
6276static const struct ethtool_ops default_ethtool_ops;
6277
6278void netdev_set_default_ethtool_ops(struct net_device *dev,
6279				    const struct ethtool_ops *ops)
6280{
6281	if (dev->ethtool_ops == &default_ethtool_ops)
6282		dev->ethtool_ops = ops;
6283}
6284EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6285
6286void netdev_freemem(struct net_device *dev)
6287{
6288	char *addr = (char *)dev - dev->padded;
6289
6290	if (is_vmalloc_addr(addr))
6291		vfree(addr);
6292	else
6293		kfree(addr);
6294}
6295
6296/**
6297 *	alloc_netdev_mqs - allocate network device
6298 *	@sizeof_priv:	size of private data to allocate space for
6299 *	@name:		device name format string
6300 *	@setup:		callback to initialize device
6301 *	@txqs:		the number of TX subqueues to allocate
6302 *	@rxqs:		the number of RX subqueues to allocate
6303 *
6304 *	Allocates a struct net_device with private data area for driver use
6305 *	and performs basic initialization.  Also allocates subqueue structs
6306 *	for each queue on the device.
6307 */
6308struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6309		void (*setup)(struct net_device *),
6310		unsigned int txqs, unsigned int rxqs)
6311{
6312	struct net_device *dev;
6313	size_t alloc_size;
6314	struct net_device *p;
6315
6316	BUG_ON(strlen(name) >= sizeof(dev->name));
6317
6318	if (txqs < 1) {
6319		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6320		return NULL;
6321	}
6322
6323#ifdef CONFIG_SYSFS
6324	if (rxqs < 1) {
6325		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6326		return NULL;
6327	}
6328#endif
6329
6330	alloc_size = sizeof(struct net_device);
6331	if (sizeof_priv) {
6332		/* ensure 32-byte alignment of private area */
6333		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6334		alloc_size += sizeof_priv;
6335	}
6336	/* ensure 32-byte alignment of whole construct */
6337	alloc_size += NETDEV_ALIGN - 1;
6338
6339	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6340	if (!p)
6341		p = vzalloc(alloc_size);
6342	if (!p)
6343		return NULL;
6344
6345	dev = PTR_ALIGN(p, NETDEV_ALIGN);
6346	dev->padded = (char *)dev - (char *)p;
6347
6348	dev->pcpu_refcnt = alloc_percpu(int);
6349	if (!dev->pcpu_refcnt)
6350		goto free_dev;
6351
6352	if (dev_addr_init(dev))
6353		goto free_pcpu;
6354
6355	dev_mc_init(dev);
6356	dev_uc_init(dev);
6357
6358	dev_net_set(dev, &init_net);
6359
6360	dev->gso_max_size = GSO_MAX_SIZE;
6361	dev->gso_max_segs = GSO_MAX_SEGS;
6362
6363	INIT_LIST_HEAD(&dev->napi_list);
6364	INIT_LIST_HEAD(&dev->unreg_list);
6365	INIT_LIST_HEAD(&dev->close_list);
6366	INIT_LIST_HEAD(&dev->link_watch_list);
6367	INIT_LIST_HEAD(&dev->adj_list.upper);
6368	INIT_LIST_HEAD(&dev->adj_list.lower);
6369	INIT_LIST_HEAD(&dev->all_adj_list.upper);
6370	INIT_LIST_HEAD(&dev->all_adj_list.lower);
6371	dev->priv_flags = IFF_XMIT_DST_RELEASE;
6372	setup(dev);
6373
6374	dev->num_tx_queues = txqs;
6375	dev->real_num_tx_queues = txqs;
6376	if (netif_alloc_netdev_queues(dev))
6377		goto free_all;
6378
6379#ifdef CONFIG_SYSFS
6380	dev->num_rx_queues = rxqs;
6381	dev->real_num_rx_queues = rxqs;
6382	if (netif_alloc_rx_queues(dev))
6383		goto free_all;
6384#endif
6385
6386	strcpy(dev->name, name);
6387	dev->group = INIT_NETDEV_GROUP;
6388	if (!dev->ethtool_ops)
6389		dev->ethtool_ops = &default_ethtool_ops;
6390	return dev;
6391
6392free_all:
6393	free_netdev(dev);
6394	return NULL;
6395
6396free_pcpu:
6397	free_percpu(dev->pcpu_refcnt);
6398	netif_free_tx_queues(dev);
6399#ifdef CONFIG_SYSFS
6400	kfree(dev->_rx);
6401#endif
6402
6403free_dev:
6404	netdev_freemem(dev);
6405	return NULL;
6406}
6407EXPORT_SYMBOL(alloc_netdev_mqs);
6408
6409/**
6410 *	free_netdev - free network device
6411 *	@dev: device
6412 *
6413 *	This function does the last stage of destroying an allocated device
6414 * 	interface. The reference to the device object is released.
6415 *	If this is the last reference then it will be freed.
6416 */
6417void free_netdev(struct net_device *dev)
6418{
6419	struct napi_struct *p, *n;
6420
6421	release_net(dev_net(dev));
6422
6423	netif_free_tx_queues(dev);
6424#ifdef CONFIG_SYSFS
6425	kfree(dev->_rx);
6426#endif
6427
6428	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6429
6430	/* Flush device addresses */
6431	dev_addr_flush(dev);
6432
6433	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6434		netif_napi_del(p);
6435
6436	free_percpu(dev->pcpu_refcnt);
6437	dev->pcpu_refcnt = NULL;
6438
6439	/*  Compatibility with error handling in drivers */
6440	if (dev->reg_state == NETREG_UNINITIALIZED) {
6441		netdev_freemem(dev);
6442		return;
6443	}
6444
6445	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6446	dev->reg_state = NETREG_RELEASED;
6447
6448	/* will free via device release */
6449	put_device(&dev->dev);
6450}
6451EXPORT_SYMBOL(free_netdev);
6452
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	Uses the expedited RCU grace period when RTNL is locked and
 *	a normal one otherwise.  May sleep.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6468
6469/**
6470 *	unregister_netdevice_queue - remove device from the kernel
6471 *	@dev: device
6472 *	@head: list
6473 *
6474 *	This function shuts down a device interface and removes it
6475 *	from the kernel tables.
6476 *	If head not NULL, device is queued to be unregistered later.
6477 *
6478 *	Callers must hold the rtnl semaphore.  You may want
6479 *	unregister_netdev() instead of this.
6480 */
6481
6482void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6483{
6484	ASSERT_RTNL();
6485
6486	if (head) {
6487		list_move_tail(&dev->unreg_list, head);
6488	} else {
6489		rollback_registered(dev);
6490		/* Finish processing unregister after unlock */
6491		net_set_todo(dev);
6492	}
6493}
6494EXPORT_SYMBOL(unregister_netdevice_queue);
6495
6496/**
6497 *	unregister_netdevice_many - unregister many devices
6498 *	@head: list of devices
6499 */
6500void unregister_netdevice_many(struct list_head *head)
6501{
6502	struct net_device *dev;
6503
6504	if (!list_empty(head)) {
6505		rollback_registered_many(head);
6506		list_for_each_entry(dev, head, unreg_list)
6507			net_set_todo(dev);
6508	}
6509}
6510EXPORT_SYMBOL(unregister_netdevice_many);
6511
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore.  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	/* rtnl_lock()/rtnl_unlock() bracket the actual unregistration. */
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6530
6531/**
6532 *	dev_change_net_namespace - move device to different nethost namespace
6533 *	@dev: device
6534 *	@net: network namespace
6535 *	@pat: If not NULL name pattern to try if the current device name
6536 *	      is already taken in the destination network namespace.
6537 *
6538 *	This function shuts down a device interface and moves it
6539 *	to a new network namespace. On success 0 is returned, on
6540 *	a failure a netagive errno code is returned.
6541 *
6542 *	Callers must hold the rtnl semaphore.
6543 */
6544
6545int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6546{
6547	int err;
6548
6549	ASSERT_RTNL();
6550
6551	/* Don't allow namespace local devices to be moved. */
6552	err = -EINVAL;
6553	if (dev->features & NETIF_F_NETNS_LOCAL)
6554		goto out;
6555
6556	/* Ensure the device has been registrered */
6557	if (dev->reg_state != NETREG_REGISTERED)
6558		goto out;
6559
6560	/* Get out if there is nothing todo */
6561	err = 0;
6562	if (net_eq(dev_net(dev), net))
6563		goto out;
6564
6565	/* Pick the destination device name, and ensure
6566	 * we can use it in the destination network namespace.
6567	 */
6568	err = -EEXIST;
6569	if (__dev_get_by_name(net, dev->name)) {
6570		/* We get here if we can't use the current device name */
6571		if (!pat)
6572			goto out;
6573		if (dev_get_valid_name(net, dev, pat) < 0)
6574			goto out;
6575	}
6576
6577	/*
6578	 * And now a mini version of register_netdevice unregister_netdevice.
6579	 */
6580
6581	/* If device is running close it first. */
6582	dev_close(dev);
6583
6584	/* And unlink it from device chain */
6585	err = -ENODEV;
6586	unlist_netdevice(dev);
6587
6588	synchronize_net();
6589
6590	/* Shutdown queueing discipline. */
6591	dev_shutdown(dev);
6592
6593	/* Notify protocols, that we are about to destroy
6594	   this device. They should clean all the things.
6595
6596	   Note that dev->reg_state stays at NETREG_REGISTERED.
6597	   This is wanted because this way 8021q and macvlan know
6598	   the device is just moving and can keep their slaves up.
6599	*/
6600	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6601	rcu_barrier();
6602	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6603	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
6604
6605	/*
6606	 *	Flush the unicast and multicast chains
6607	 */
6608	dev_uc_flush(dev);
6609	dev_mc_flush(dev);
6610
6611	/* Send a netdev-removed uevent to the old namespace */
6612	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6613
6614	/* Actually switch the network namespace */
6615	dev_net_set(dev, net);
6616
6617	/* If there is an ifindex conflict assign a new one */
6618	if (__dev_get_by_index(net, dev->ifindex)) {
6619		int iflink = (dev->iflink == dev->ifindex);
6620		dev->ifindex = dev_new_index(net);
6621		if (iflink)
6622			dev->iflink = dev->ifindex;
6623	}
6624
6625	/* Send a netdev-add uevent to the new namespace */
6626	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6627
6628	/* Fixup kobjects */
6629	err = device_rename(&dev->dev, dev->name);
6630	WARN_ON(err);
6631
6632	/* Add the device back in the hashes */
6633	list_netdevice(dev);
6634
6635	/* Notify protocols, that a new device appeared. */
6636	call_netdevice_notifiers(NETDEV_REGISTER, dev);
6637
6638	/*
6639	 *	Prevent userspace races by waiting until the network
6640	 *	device is fully setup before sending notifications.
6641	 */
6642	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6643
6644	synchronize_net();
6645	err = 0;
6646out:
6647	return err;
6648}
6649EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6650
6651static int dev_cpu_callback(struct notifier_block *nfb,
6652			    unsigned long action,
6653			    void *ocpu)
6654{
6655	struct sk_buff **list_skb;
6656	struct sk_buff *skb;
6657	unsigned int cpu, oldcpu = (unsigned long)ocpu;
6658	struct softnet_data *sd, *oldsd;
6659
6660	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6661		return NOTIFY_OK;
6662
6663	local_irq_disable();
6664	cpu = smp_processor_id();
6665	sd = &per_cpu(softnet_data, cpu);
6666	oldsd = &per_cpu(softnet_data, oldcpu);
6667
6668	/* Find end of our completion_queue. */
6669	list_skb = &sd->completion_queue;
6670	while (*list_skb)
6671		list_skb = &(*list_skb)->next;
6672	/* Append completion queue from offline CPU. */
6673	*list_skb = oldsd->completion_queue;
6674	oldsd->completion_queue = NULL;
6675
6676	/* Append output queue from offline CPU. */
6677	if (oldsd->output_queue) {
6678		*sd->output_queue_tailp = oldsd->output_queue;
6679		sd->output_queue_tailp = oldsd->output_queue_tailp;
6680		oldsd->output_queue = NULL;
6681		oldsd->output_queue_tailp = &oldsd->output_queue;
6682	}
6683	/* Append NAPI poll list from offline CPU. */
6684	if (!list_empty(&oldsd->poll_list)) {
6685		list_splice_init(&oldsd->poll_list, &sd->poll_list);
6686		raise_softirq_irqoff(NET_RX_SOFTIRQ);
6687	}
6688
6689	raise_softirq_irqoff(NET_TX_SOFTIRQ);
6690	local_irq_enable();
6691
6692	/* Process offline CPU's input_pkt_queue */
6693	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6694		netif_rx_internal(skb);
6695		input_queue_head_incr(oldsd);
6696	}
6697	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6698		netif_rx_internal(skb);
6699		input_queue_head_incr(oldsd);
6700	}
6701
6702	return NOTIFY_OK;
6703}
6704
6705
6706/**
6707 *	netdev_increment_features - increment feature set by one
6708 *	@all: current feature set
6709 *	@one: new feature set
6710 *	@mask: mask feature set
6711 *
6712 *	Computes a new feature set after adding a device with feature set
6713 *	@one to the master device with current feature set @all.  Will not
6714 *	enable anything that is off in @mask. Returns the new feature set.
6715 */
6716netdev_features_t netdev_increment_features(netdev_features_t all,
6717	netdev_features_t one, netdev_features_t mask)
6718{
6719	if (mask & NETIF_F_GEN_CSUM)
6720		mask |= NETIF_F_ALL_CSUM;
6721	mask |= NETIF_F_VLAN_CHALLENGED;
6722
6723	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6724	all &= one | ~NETIF_F_ALL_FOR_ALL;
6725
6726	/* If one device supports hw checksumming, set for all. */
6727	if (all & NETIF_F_GEN_CSUM)
6728		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6729
6730	return all;
6731}
6732EXPORT_SYMBOL(netdev_increment_features);
6733
6734static struct hlist_head * __net_init netdev_create_hash(void)
6735{
6736	int i;
6737	struct hlist_head *hash;
6738
6739	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6740	if (hash != NULL)
6741		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6742			INIT_HLIST_HEAD(&hash[i]);
6743
6744	return hash;
6745}
6746
6747/* Initialize per network namespace state */
6748static int __net_init netdev_init(struct net *net)
6749{
6750	if (net != &init_net)
6751		INIT_LIST_HEAD(&net->dev_base_head);
6752
6753	net->dev_name_head = netdev_create_hash();
6754	if (net->dev_name_head == NULL)
6755		goto err_name;
6756
6757	net->dev_index_head = netdev_create_hash();
6758	if (net->dev_index_head == NULL)
6759		goto err_idx;
6760
6761	return 0;
6762
6763err_idx:
6764	kfree(net->dev_name_head);
6765err_name:
6766	return -ENOMEM;
6767}
6768
6769/**
6770 *	netdev_drivername - network driver for the device
6771 *	@dev: network device
6772 *
6773 *	Determine network driver for device.
6774 */
6775const char *netdev_drivername(const struct net_device *dev)
6776{
6777	const struct device_driver *driver;
6778	const struct device *parent;
6779	const char *empty = "";
6780
6781	parent = dev->dev.parent;
6782	if (!parent)
6783		return empty;
6784
6785	driver = parent->driver;
6786	if (driver && driver->name)
6787		return driver->name;
6788	return empty;
6789}
6790
6791static int __netdev_printk(const char *level, const struct net_device *dev,
6792			   struct va_format *vaf)
6793{
6794	int r;
6795
6796	if (dev && dev->dev.parent) {
6797		r = dev_printk_emit(level[1] - '0',
6798				    dev->dev.parent,
6799				    "%s %s %s: %pV",
6800				    dev_driver_string(dev->dev.parent),
6801				    dev_name(dev->dev.parent),
6802				    netdev_name(dev), vaf);
6803	} else if (dev) {
6804		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6805	} else {
6806		r = printk("%s(NULL net_device): %pV", level, vaf);
6807	}
6808
6809	return r;
6810}
6811
6812int netdev_printk(const char *level, const struct net_device *dev,
6813		  const char *format, ...)
6814{
6815	struct va_format vaf;
6816	va_list args;
6817	int r;
6818
6819	va_start(args, format);
6820
6821	vaf.fmt = format;
6822	vaf.va = &args;
6823
6824	r = __netdev_printk(level, dev, &vaf);
6825
6826	va_end(args);
6827
6828	return r;
6829}
6830EXPORT_SYMBOL(netdev_printk);
6831
6832#define define_netdev_printk_level(func, level)			\
6833int func(const struct net_device *dev, const char *fmt, ...)	\
6834{								\
6835	int r;							\
6836	struct va_format vaf;					\
6837	va_list args;						\
6838								\
6839	va_start(args, fmt);					\
6840								\
6841	vaf.fmt = fmt;						\
6842	vaf.va = &args;						\
6843								\
6844	r = __netdev_printk(level, dev, &vaf);			\
6845								\
6846	va_end(args);						\
6847								\
6848	return r;						\
6849}								\
6850EXPORT_SYMBOL(func);
6851
6852define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6853define_netdev_printk_level(netdev_alert, KERN_ALERT);
6854define_netdev_printk_level(netdev_crit, KERN_CRIT);
6855define_netdev_printk_level(netdev_err, KERN_ERR);
6856define_netdev_printk_level(netdev_warn, KERN_WARNING);
6857define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6858define_netdev_printk_level(netdev_info, KERN_INFO);
6859
6860static void __net_exit netdev_exit(struct net *net)
6861{
6862	kfree(net->dev_name_head);
6863	kfree(net->dev_index_head);
6864}
6865
6866static struct pernet_operations __net_initdata netdev_net_ops = {
6867	.init = netdev_init,
6868	.exit = netdev_exit,
6869};
6870
6871static void __net_exit default_device_exit(struct net *net)
6872{
6873	struct net_device *dev, *aux;
6874	/*
6875	 * Push all migratable network devices back to the
6876	 * initial network namespace
6877	 */
6878	rtnl_lock();
6879	for_each_netdev_safe(net, dev, aux) {
6880		int err;
6881		char fb_name[IFNAMSIZ];
6882
6883		/* Ignore unmoveable devices (i.e. loopback) */
6884		if (dev->features & NETIF_F_NETNS_LOCAL)
6885			continue;
6886
6887		/* Leave virtual devices for the generic cleanup */
6888		if (dev->rtnl_link_ops)
6889			continue;
6890
6891		/* Push remaining network devices to init_net */
6892		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6893		err = dev_change_net_namespace(dev, &init_net, fb_name);
6894		if (err) {
6895			pr_emerg("%s: failed to move %s to init_net: %d\n",
6896				 __func__, dev->name, err);
6897			BUG();
6898		}
6899	}
6900	rtnl_unlock();
6901}
6902
6903static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
6904{
6905	/* Return with the rtnl_lock held when there are no network
6906	 * devices unregistering in any network namespace in net_list.
6907	 */
6908	struct net *net;
6909	bool unregistering;
6910	DEFINE_WAIT(wait);
6911
6912	for (;;) {
6913		prepare_to_wait(&netdev_unregistering_wq, &wait,
6914				TASK_UNINTERRUPTIBLE);
6915		unregistering = false;
6916		rtnl_lock();
6917		list_for_each_entry(net, net_list, exit_list) {
6918			if (net->dev_unreg_count > 0) {
6919				unregistering = true;
6920				break;
6921			}
6922		}
6923		if (!unregistering)
6924			break;
6925		__rtnl_unlock();
6926		schedule();
6927	}
6928	finish_wait(&netdev_unregistering_wq, &wait);
6929}
6930
6931static void __net_exit default_device_exit_batch(struct list_head *net_list)
6932{
6933	/* At exit all network devices most be removed from a network
6934	 * namespace.  Do this in the reverse order of registration.
6935	 * Do this across as many network namespaces as possible to
6936	 * improve batching efficiency.
6937	 */
6938	struct net_device *dev;
6939	struct net *net;
6940	LIST_HEAD(dev_kill_list);
6941
6942	/* To prevent network device cleanup code from dereferencing
6943	 * loopback devices or network devices that have been freed
6944	 * wait here for all pending unregistrations to complete,
6945	 * before unregistring the loopback device and allowing the
6946	 * network namespace be freed.
6947	 *
6948	 * The netdev todo list containing all network devices
6949	 * unregistrations that happen in default_device_exit_batch
6950	 * will run in the rtnl_unlock() at the end of
6951	 * default_device_exit_batch.
6952	 */
6953	rtnl_lock_unregistering(net_list);
6954	list_for_each_entry(net, net_list, exit_list) {
6955		for_each_netdev_reverse(net, dev) {
6956			if (dev->rtnl_link_ops)
6957				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6958			else
6959				unregister_netdevice_queue(dev, &dev_kill_list);
6960		}
6961	}
6962	unregister_netdevice_many(&dev_kill_list);
6963	list_del(&dev_kill_list);
6964	rtnl_unlock();
6965}
6966
6967static struct pernet_operations __net_initdata default_device_ops = {
6968	.exit = default_device_exit,
6969	.exit_batch = default_device_exit_batch,
6970};
6971
6972/*
6973 *	Initialize the DEV module. At boot time this walks the device list and
6974 *	unhooks any devices that fail to initialise (normally hardware not
6975 *	present) and leaves us with a valid list of present and active devices.
6976 *
6977 */
6978
6979/*
6980 *       This is called single threaded during boot, so no need
6981 *       to take the rtnl semaphore.
6982 */
6983static int __init net_dev_init(void)
6984{
6985	int i, rc = -ENOMEM;
6986
6987	BUG_ON(!dev_boot_phase);
6988
6989	if (dev_proc_init())
6990		goto out;
6991
6992	if (netdev_kobject_init())
6993		goto out;
6994
6995	INIT_LIST_HEAD(&ptype_all);
6996	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6997		INIT_LIST_HEAD(&ptype_base[i]);
6998
6999	INIT_LIST_HEAD(&offload_base);
7000
7001	if (register_pernet_subsys(&netdev_net_ops))
7002		goto out;
7003
7004	/*
7005	 *	Initialise the packet receive queues.
7006	 */
7007
7008	for_each_possible_cpu(i) {
7009		struct softnet_data *sd = &per_cpu(softnet_data, i);
7010
7011		skb_queue_head_init(&sd->input_pkt_queue);
7012		skb_queue_head_init(&sd->process_queue);
7013		INIT_LIST_HEAD(&sd->poll_list);
7014		sd->output_queue_tailp = &sd->output_queue;
7015#ifdef CONFIG_RPS
7016		sd->csd.func = rps_trigger_softirq;
7017		sd->csd.info = sd;
7018		sd->cpu = i;
7019#endif
7020
7021		sd->backlog.poll = process_backlog;
7022		sd->backlog.weight = weight_p;
7023	}
7024
7025	dev_boot_phase = 0;
7026
7027	/* The loopback device is special if any other network devices
7028	 * is present in a network namespace the loopback device must
7029	 * be present. Since we now dynamically allocate and free the
7030	 * loopback device ensure this invariant is maintained by
7031	 * keeping the loopback device as the first device on the
7032	 * list of network devices.  Ensuring the loopback devices
7033	 * is the first device that appears and the last network device
7034	 * that disappears.
7035	 */
7036	if (register_pernet_device(&loopback_net_ops))
7037		goto out;
7038
7039	if (register_pernet_device(&default_device_ops))
7040		goto out;
7041
7042	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7043	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7044
7045	hotcpu_notifier(dev_cpu_callback, 0);
7046	dst_init();
7047	rc = 0;
7048out:
7049	return rc;
7050}
7051
7052subsys_initcall(net_dev_init);