net/core/dev.c at v2.6.39-rc2 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / core / dev.c
at v2.6.39-rc2 6475 lines 161 kB view raw
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <asm/system.h>
  77#include <linux/bitops.h>
  78#include <linux/capability.h>
  79#include <linux/cpu.h>
  80#include <linux/types.h>
  81#include <linux/kernel.h>
  82#include <linux/hash.h>
  83#include <linux/slab.h>
  84#include <linux/sched.h>
  85#include <linux/mutex.h>
  86#include <linux/string.h>
  87#include <linux/mm.h>
  88#include <linux/socket.h>
  89#include <linux/sockios.h>
  90#include <linux/errno.h>
  91#include <linux/interrupt.h>
  92#include <linux/if_ether.h>
  93#include <linux/netdevice.h>
  94#include <linux/etherdevice.h>
  95#include <linux/ethtool.h>
  96#include <linux/notifier.h>
  97#include <linux/skbuff.h>
  98#include <net/net_namespace.h>
  99#include <net/sock.h>
 100#include <linux/rtnetlink.h>
 101#include <linux/proc_fs.h>
 102#include <linux/seq_file.h>
 103#include <linux/stat.h>
 104#include <net/dst.h>
 105#include <net/pkt_sched.h>
 106#include <net/checksum.h>
 107#include <net/xfrm.h>
 108#include <linux/highmem.h>
 109#include <linux/init.h>
 110#include <linux/kmod.h>
 111#include <linux/module.h>
 112#include <linux/netpoll.h>
 113#include <linux/rcupdate.h>
 114#include <linux/delay.h>
 115#include <net/wext.h>
 116#include <net/iw_handler.h>
 117#include <asm/current.h>
 118#include <linux/audit.h>
 119#include <linux/dmaengine.h>
 120#include <linux/err.h>
 121#include <linux/ctype.h>
 122#include <linux/if_arp.h>
 123#include <linux/if_vlan.h>
 124#include <linux/ip.h>
 125#include <net/ip.h>
 126#include <linux/ipv6.h>
 127#include <linux/in.h>
 128#include <linux/jhash.h>
 129#include <linux/random.h>
 130#include <trace/events/napi.h>
 131#include <trace/events/net.h>
 132#include <trace/events/skb.h>
 133#include <linux/pci.h>
 134#include <linux/inetdevice.h>
 135#include <linux/cpu_rmap.h>
 136
 137#include "net-sysfs.h"
 138
 139/* Instead of increasing this, you should create a hash table. */
 140#define MAX_GRO_SKBS 8
 141
 142/* This should be increased if a protocol with a bigger head is added. */
 143#define GRO_MAX_HEAD (MAX_HEADER + 128)
 144
 145/*
 146 *	The list of packet types we will receive (as opposed to discard)
 147 *	and the routines to invoke.
 148 *
 149 *	Why 16. Because with 16 the only overlap we get on a hash of the
 150 *	low nibble of the protocol value is RARP/SNAP/X.25.
 151 *
 152 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 153 *             sure which should go first, but I bet it won't make much
 154 *             difference if we are running VLANs.  The good news is that
 155 *             this protocol won't be in the list unless compiled in, so
 156 *             the average user (w/out VLANs) will not be adversely affected.
 157 *             --BLG
 158 *
 159 *		0800	IP
 160 *		8100    802.1Q VLAN
 161 *		0001	802.3
 162 *		0002	AX.25
 163 *		0004	802.2
 164 *		8035	RARP
 165 *		0005	SNAP
 166 *		0805	X.25
 167 *		0806	ARP
 168 *		8137	IPX
 169 *		0009	Localtalk
 170 *		86DD	IPv6
 171 */
 172
 173#define PTYPE_HASH_SIZE	(16)
 174#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
 175
/* Taken by dev_add_pack()/__dev_remove_pack() around ptype list updates. */
static DEFINE_SPINLOCK(ptype_lock);
/* Per-protocol handler lists; the bucket is chosen by ptype_head(). */
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
static struct list_head ptype_all __read_mostly;	/* Taps */
 179
 180/*
 181 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 182 * semaphore.
 183 *
 184 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 185 *
 186 * Writers must hold the rtnl semaphore while they loop through the
 187 * dev_base_head list, and hold dev_base_lock for writing when they do the
 188 * actual updates.  This allows pure readers to access the list even
 189 * while a writer is preparing to update it.
 190 *
 191 * To put it another way, dev_base_lock is held for writing only to
 192 * protect against pure readers; the rtnl semaphore provides the
 193 * protection against other writers.
 194 *
 195 * See, for example usages, register_netdevice() and
 196 * unregister_netdevice(), which must be called with the rtnl
 197 * semaphore held.
 198 */
 199DEFINE_RWLOCK(dev_base_lock);
 200EXPORT_SYMBOL(dev_base_lock);
 201
 202static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 203{
 204	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 205	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 206}
 207
 208static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 209{
 210	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 211}
 212
 213static inline void rps_lock(struct softnet_data *sd)
 214{
 215#ifdef CONFIG_RPS
 216	spin_lock(&sd->input_pkt_queue.lock);
 217#endif
 218}
 219
 220static inline void rps_unlock(struct softnet_data *sd)
 221{
 222#ifdef CONFIG_RPS
 223	spin_unlock(&sd->input_pkt_queue.lock);
 224#endif
 225}
 226
 227/* Device list insertion */
 228static int list_netdevice(struct net_device *dev)
 229{
 230	struct net *net = dev_net(dev);
 231
 232	ASSERT_RTNL();
 233
 234	write_lock_bh(&dev_base_lock);
 235	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 236	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 237	hlist_add_head_rcu(&dev->index_hlist,
 238			   dev_index_hash(net, dev->ifindex));
 239	write_unlock_bh(&dev_base_lock);
 240	return 0;
 241}
 242
 243/* Device list removal
 244 * caller must respect a RCU grace period before freeing/reusing dev
 245 */
 246static void unlist_netdevice(struct net_device *dev)
 247{
 248	ASSERT_RTNL();
 249
 250	/* Unlink dev from the device chain */
 251	write_lock_bh(&dev_base_lock);
 252	list_del_rcu(&dev->dev_list);
 253	hlist_del_rcu(&dev->name_hlist);
 254	hlist_del_rcu(&dev->index_hlist);
 255	write_unlock_bh(&dev_base_lock);
 256}
 257
 258/*
 259 *	Our notifier list
 260 */
 261
 262static RAW_NOTIFIER_HEAD(netdev_chain);
 263
 264/*
 265 *	Device drivers call our routines to queue packets here. We empty the
 266 *	queue in the local softnet handler.
 267 */
 268
 269DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 270EXPORT_PER_CPU_SYMBOL(softnet_data);
 271
 272#ifdef CONFIG_LOCKDEP
 273/*
 274 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 275 * according to dev->type
 276 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
	 ARPHRD_VOID, ARPHRD_NONE};

/*
 * Lock class names.  The lockdep helpers below use the same index for
 * this table and for netdev_lock_type[], so the two must be kept in the
 * same order and of the same length.
 */
static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
	 "_xmit_VOID", "_xmit_NONE"};

/*
 * One lockdep class key per netdev_lock_type[] entry: the first array is
 * used for the xmit lock, the second for the address list lock.
 */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 315
 316static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 317{
 318	int i;
 319
 320	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 321		if (netdev_lock_type[i] == dev_type)
 322			return i;
 323	/* the last key is used by default */
 324	return ARRAY_SIZE(netdev_lock_type) - 1;
 325}
 326
 327static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 328						 unsigned short dev_type)
 329{
 330	int i;
 331
 332	i = netdev_lock_pos(dev_type);
 333	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 334				   netdev_lock_name[i]);
 335}
 336
 337static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 338{
 339	int i;
 340
 341	i = netdev_lock_pos(dev->type);
 342	lockdep_set_class_and_name(&dev->addr_list_lock,
 343				   &netdev_addr_lock_key[i],
 344				   netdev_lock_name[i]);
 345}
 346#else
/* Without CONFIG_LOCKDEP there are no lock classes to assign: no-op stubs. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
 354#endif
 355
 356/*******************************************************************************
 357
 358		Protocol management and registration routines
 359
 360*******************************************************************************/
 361
 362/*
 363 *	Add a protocol ID to the list. Now that the input handler is
 364 *	smarter we can dispense with all the messy stuff that used to be
 365 *	here.
 366 *
 367 *	BEWARE!!! Protocol handlers, mangling input packets,
 368 *	MUST BE last in hash buckets and checking protocol handlers
 369 *	MUST start from promiscuous ptype_all chain in net_bh.
 370 *	It is true now, do not change it.
 371 *	Explanation follows: if protocol handler, mangling packet, will
 372 *	be the first on list, it is not able to sense, that packet
 373 *	is cloned and should be copied-on-write, so that it will
 374 *	change it and subsequent readers will get broken packet.
 375 *							--ANK (980803)
 376 */
 377
 378static inline struct list_head *ptype_head(const struct packet_type *pt)
 379{
 380	if (pt->type == htons(ETH_P_ALL))
 381		return &ptype_all;
 382	else
 383		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 384}
 385
 386/**
 387 *	dev_add_pack - add packet handler
 388 *	@pt: packet type declaration
 389 *
 390 *	Add a protocol handler to the networking stack. The passed &packet_type
 391 *	is linked into kernel lists and may not be freed until it has been
 392 *	removed from the kernel lists.
 393 *
 394 *	This call does not sleep therefore it can not
 395 *	guarantee all CPU's that are in middle of receiving packets
 396 *	will see the new packet type (until the next received packet).
 397 */
 398
 399void dev_add_pack(struct packet_type *pt)
 400{
 401	struct list_head *head = ptype_head(pt);
 402
 403	spin_lock(&ptype_lock);
 404	list_add_rcu(&pt->list, head);
 405	spin_unlock(&ptype_lock);
 406}
 407EXPORT_SYMBOL(dev_add_pack);
 408
 409/**
 410 *	__dev_remove_pack	 - remove packet handler
 411 *	@pt: packet type declaration
 412 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is
 *	unlinked from the kernel lists, but this function does not wait
 *	for receivers that may already be using it.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 421 */
 422void __dev_remove_pack(struct packet_type *pt)
 423{
 424	struct list_head *head = ptype_head(pt);
 425	struct packet_type *pt1;
 426
 427	spin_lock(&ptype_lock);
 428
 429	list_for_each_entry(pt1, head, list) {
 430		if (pt == pt1) {
 431			list_del_rcu(&pt->list);
 432			goto out;
 433		}
 434	}
 435
 436	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 437out:
 438	spin_unlock(&ptype_lock);
 439}
 440EXPORT_SYMBOL(__dev_remove_pack);
 441
 442/**
 443 *	dev_remove_pack	 - remove packet handler
 444 *	@pt: packet type declaration
 445 *
 446 *	Remove a protocol handler that was previously added to the kernel
 447 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 448 *	from the kernel lists and can be freed or reused once this function
 449 *	returns.
 450 *
 451 *	This call sleeps to guarantee that no CPU is looking at the packet
 452 *	type after return.
 453 */
 454void dev_remove_pack(struct packet_type *pt)
 455{
 456	__dev_remove_pack(pt);
 457
 458	synchronize_net();
 459}
 460EXPORT_SYMBOL(dev_remove_pack);
 461
 462/******************************************************************************
 463
 464		      Device Boot-time Settings Routines
 465
 466*******************************************************************************/
 467
 468/* Boot time configuration table */
 469static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 470
 471/**
 472 *	netdev_boot_setup_add	- add new setup entry
 473 *	@name: name of the device
 474 *	@map: configured settings for the device
 475 *
 476 *	Adds new setup entry to the dev_boot_setup list.  The function
 477 *	returns 0 on error and 1 on success.  This is a generic routine to
 478 *	all netdevices.
 479 */
 480static int netdev_boot_setup_add(char *name, struct ifmap *map)
 481{
 482	struct netdev_boot_setup *s;
 483	int i;
 484
 485	s = dev_boot_setup;
 486	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 487		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 488			memset(s[i].name, 0, sizeof(s[i].name));
 489			strlcpy(s[i].name, name, IFNAMSIZ);
 490			memcpy(&s[i].map, map, sizeof(s[i].map));
 491			break;
 492		}
 493	}
 494
 495	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 496}
 497
 498/**
 499 *	netdev_boot_setup_check	- check boot time settings
 500 *	@dev: the netdevice
 501 *
 502 * 	Check boot time settings for the device.
 503 *	The found settings are set for the device to be used
 504 *	later in the device probing.
 505 *	Returns 0 if no settings found, 1 if they are.
 506 */
 507int netdev_boot_setup_check(struct net_device *dev)
 508{
 509	struct netdev_boot_setup *s = dev_boot_setup;
 510	int i;
 511
 512	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 513		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 514		    !strcmp(dev->name, s[i].name)) {
 515			dev->irq 	= s[i].map.irq;
 516			dev->base_addr 	= s[i].map.base_addr;
 517			dev->mem_start 	= s[i].map.mem_start;
 518			dev->mem_end 	= s[i].map.mem_end;
 519			return 1;
 520		}
 521	}
 522	return 0;
 523}
 524EXPORT_SYMBOL(netdev_boot_setup_check);
 525
 526
 527/**
 528 *	netdev_boot_base	- get address from boot time settings
 529 *	@prefix: prefix for network device
 530 *	@unit: id for network device
 531 *
 532 * 	Check boot time settings for the base address of device.
 533 *	The found settings are set for the device to be used
 534 *	later in the device probing.
 535 *	Returns 0 if no settings found.
 536 */
 537unsigned long netdev_boot_base(const char *prefix, int unit)
 538{
 539	const struct netdev_boot_setup *s = dev_boot_setup;
 540	char name[IFNAMSIZ];
 541	int i;
 542
 543	sprintf(name, "%s%d", prefix, unit);
 544
 545	/*
 546	 * If device already registered then return base of 1
 547	 * to indicate not to probe for this interface
 548	 */
 549	if (__dev_get_by_name(&init_net, name))
 550		return 1;
 551
 552	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 553		if (!strcmp(name, s[i].name))
 554			return s[i].map.base_addr;
 555	return 0;
 556}
 557
 558/*
 559 * Saves at boot time configured settings for any netdevice.
 560 */
 561int __init netdev_boot_setup(char *str)
 562{
 563	int ints[5];
 564	struct ifmap map;
 565
 566	str = get_options(str, ARRAY_SIZE(ints), ints);
 567	if (!str || !*str)
 568		return 0;
 569
 570	/* Save settings */
 571	memset(&map, 0, sizeof(map));
 572	if (ints[0] > 0)
 573		map.irq = ints[1];
 574	if (ints[0] > 1)
 575		map.base_addr = ints[2];
 576	if (ints[0] > 2)
 577		map.mem_start = ints[3];
 578	if (ints[0] > 3)
 579		map.mem_end = ints[4];
 580
 581	/* Add new entry to the list */
 582	return netdev_boot_setup_add(str, &map);
 583}
 584
 585__setup("netdev=", netdev_boot_setup);
 586
 587/*******************************************************************************
 588
 589			    Device Interface Subroutines
 590
 591*******************************************************************************/
 592
 593/**
 594 *	__dev_get_by_name	- find a device by its name
 595 *	@net: the applicable net namespace
 596 *	@name: name to find
 597 *
 598 *	Find an interface by name. Must be called under RTNL semaphore
 599 *	or @dev_base_lock. If the name is found a pointer to the device
 600 *	is returned. If the name is not found then %NULL is returned. The
 601 *	reference counters are not incremented so the caller must be
 602 *	careful with locks.
 603 */
 604
 605struct net_device *__dev_get_by_name(struct net *net, const char *name)
 606{
 607	struct hlist_node *p;
 608	struct net_device *dev;
 609	struct hlist_head *head = dev_name_hash(net, name);
 610
 611	hlist_for_each_entry(dev, p, head, name_hlist)
 612		if (!strncmp(dev->name, name, IFNAMSIZ))
 613			return dev;
 614
 615	return NULL;
 616}
 617EXPORT_SYMBOL(__dev_get_by_name);
 618
 619/**
 620 *	dev_get_by_name_rcu	- find a device by its name
 621 *	@net: the applicable net namespace
 622 *	@name: name to find
 623 *
 624 *	Find an interface by name.
 625 *	If the name is found a pointer to the device is returned.
 626 * 	If the name is not found then %NULL is returned.
 627 *	The reference counters are not incremented so the caller must be
 628 *	careful with locks. The caller must hold RCU lock.
 629 */
 630
 631struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 632{
 633	struct hlist_node *p;
 634	struct net_device *dev;
 635	struct hlist_head *head = dev_name_hash(net, name);
 636
 637	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 638		if (!strncmp(dev->name, name, IFNAMSIZ))
 639			return dev;
 640
 641	return NULL;
 642}
 643EXPORT_SYMBOL(dev_get_by_name_rcu);
 644
 645/**
 646 *	dev_get_by_name		- find a device by its name
 647 *	@net: the applicable net namespace
 648 *	@name: name to find
 649 *
 650 *	Find an interface by name. This can be called from any
 651 *	context and does its own locking. The returned handle has
 652 *	the usage count incremented and the caller must use dev_put() to
 653 *	release it when it is no longer needed. %NULL is returned if no
 654 *	matching device is found.
 655 */
 656
 657struct net_device *dev_get_by_name(struct net *net, const char *name)
 658{
 659	struct net_device *dev;
 660
 661	rcu_read_lock();
 662	dev = dev_get_by_name_rcu(net, name);
 663	if (dev)
 664		dev_hold(dev);
 665	rcu_read_unlock();
 666	return dev;
 667}
 668EXPORT_SYMBOL(dev_get_by_name);
 669
 670/**
 671 *	__dev_get_by_index - find a device by its ifindex
 672 *	@net: the applicable net namespace
 673 *	@ifindex: index of device
 674 *
 675 *	Search for an interface by index. Returns %NULL if the device
 676 *	is not found or a pointer to the device. The device has not
 677 *	had its reference counter increased so the caller must be careful
 678 *	about locking. The caller must hold either the RTNL semaphore
 679 *	or @dev_base_lock.
 680 */
 681
 682struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 683{
 684	struct hlist_node *p;
 685	struct net_device *dev;
 686	struct hlist_head *head = dev_index_hash(net, ifindex);
 687
 688	hlist_for_each_entry(dev, p, head, index_hlist)
 689		if (dev->ifindex == ifindex)
 690			return dev;
 691
 692	return NULL;
 693}
 694EXPORT_SYMBOL(__dev_get_by_index);
 695
 696/**
 697 *	dev_get_by_index_rcu - find a device by its ifindex
 698 *	@net: the applicable net namespace
 699 *	@ifindex: index of device
 700 *
 701 *	Search for an interface by index. Returns %NULL if the device
 702 *	is not found or a pointer to the device. The device has not
 703 *	had its reference counter increased so the caller must be careful
 704 *	about locking. The caller must hold RCU lock.
 705 */
 706
 707struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 708{
 709	struct hlist_node *p;
 710	struct net_device *dev;
 711	struct hlist_head *head = dev_index_hash(net, ifindex);
 712
 713	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 714		if (dev->ifindex == ifindex)
 715			return dev;
 716
 717	return NULL;
 718}
 719EXPORT_SYMBOL(dev_get_by_index_rcu);
 720
 721
 722/**
 723 *	dev_get_by_index - find a device by its ifindex
 724 *	@net: the applicable net namespace
 725 *	@ifindex: index of device
 726 *
 727 *	Search for an interface by index. Returns NULL if the device
 728 *	is not found or a pointer to the device. The device returned has
 729 *	had a reference added and the pointer is safe until the user calls
 730 *	dev_put to indicate they have finished with it.
 731 */
 732
 733struct net_device *dev_get_by_index(struct net *net, int ifindex)
 734{
 735	struct net_device *dev;
 736
 737	rcu_read_lock();
 738	dev = dev_get_by_index_rcu(net, ifindex);
 739	if (dev)
 740		dev_hold(dev);
 741	rcu_read_unlock();
 742	return dev;
 743}
 744EXPORT_SYMBOL(dev_get_by_index);
 745
 746/**
 747 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 748 *	@net: the applicable net namespace
 749 *	@type: media type of device
 750 *	@ha: hardware address
 751 *
 752 *	Search for an interface by MAC address. Returns NULL if the device
 753 *	is not found or a pointer to the device.
 754 *	The caller must hold RCU or RTNL.
 755 *	The returned device has not had its ref count increased
 756 *	and the caller must therefore be careful about locking
 757 *
 758 */
 759
 760struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 761				       const char *ha)
 762{
 763	struct net_device *dev;
 764
 765	for_each_netdev_rcu(net, dev)
 766		if (dev->type == type &&
 767		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 768			return dev;
 769
 770	return NULL;
 771}
 772EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 773
 774struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 775{
 776	struct net_device *dev;
 777
 778	ASSERT_RTNL();
 779	for_each_netdev(net, dev)
 780		if (dev->type == type)
 781			return dev;
 782
 783	return NULL;
 784}
 785EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 786
 787struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 788{
 789	struct net_device *dev, *ret = NULL;
 790
 791	rcu_read_lock();
 792	for_each_netdev_rcu(net, dev)
 793		if (dev->type == type) {
 794			dev_hold(dev);
 795			ret = dev;
 796			break;
 797		}
 798	rcu_read_unlock();
 799	return ret;
 800}
 801EXPORT_SYMBOL(dev_getfirstbyhwtype);
 802
 803/**
 804 *	dev_get_by_flags_rcu - find any device with given flags
 805 *	@net: the applicable net namespace
 806 *	@if_flags: IFF_* values
 807 *	@mask: bitmask of bits in if_flags to check
 808 *
 809 *	Search for any interface with the given flags. Returns NULL if a device
 810 *	is not found or a pointer to the device. Must be called inside
 811 *	rcu_read_lock(), and result refcount is unchanged.
 812 */
 813
 814struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 815				    unsigned short mask)
 816{
 817	struct net_device *dev, *ret;
 818
 819	ret = NULL;
 820	for_each_netdev_rcu(net, dev) {
 821		if (((dev->flags ^ if_flags) & mask) == 0) {
 822			ret = dev;
 823			break;
 824		}
 825	}
 826	return ret;
 827}
 828EXPORT_SYMBOL(dev_get_by_flags_rcu);
 829
 830/**
 831 *	dev_valid_name - check if name is okay for network device
 832 *	@name: name string
 833 *
 *	Network device names need to be valid file names
 *	to allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 837 */
 838int dev_valid_name(const char *name)
 839{
 840	if (*name == '\0')
 841		return 0;
 842	if (strlen(name) >= IFNAMSIZ)
 843		return 0;
 844	if (!strcmp(name, ".") || !strcmp(name, ".."))
 845		return 0;
 846
 847	while (*name) {
 848		if (*name == '/' || isspace(*name))
 849			return 0;
 850		name++;
 851	}
 852	return 1;
 853}
 854EXPORT_SYMBOL(dev_valid_name);
 855
 856/**
 857 *	__dev_alloc_name - allocate a name for a device
 858 *	@net: network namespace to allocate the device name in
 859 *	@name: name format string
 860 *	@buf:  scratch buffer and result name string
 861 *
 862 *	Passed a format string - eg "lt%d" it will try and find a suitable
 863 *	id. It scans list of devices to build up a free map, then chooses
 864 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 865 *	while allocating the name and adding the device in order to avoid
 866 *	duplicates.
 867 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 868 *	Returns the number of the unit assigned or a negative errno code.
 869 */
 870
 871static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 872{
 873	int i = 0;
 874	const char *p;
 875	const int max_netdevices = 8*PAGE_SIZE;
 876	unsigned long *inuse;
 877	struct net_device *d;
 878
 879	p = strnchr(name, IFNAMSIZ-1, '%');
 880	if (p) {
 881		/*
 882		 * Verify the string as this thing may have come from
 883		 * the user.  There must be either one "%d" and no other "%"
 884		 * characters.
 885		 */
 886		if (p[1] != 'd' || strchr(p + 2, '%'))
 887			return -EINVAL;
 888
 889		/* Use one page as a bit array of possible slots */
 890		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 891		if (!inuse)
 892			return -ENOMEM;
 893
 894		for_each_netdev(net, d) {
 895			if (!sscanf(d->name, name, &i))
 896				continue;
 897			if (i < 0 || i >= max_netdevices)
 898				continue;
 899
 900			/*  avoid cases where sscanf is not exact inverse of printf */
 901			snprintf(buf, IFNAMSIZ, name, i);
 902			if (!strncmp(buf, d->name, IFNAMSIZ))
 903				set_bit(i, inuse);
 904		}
 905
 906		i = find_first_zero_bit(inuse, max_netdevices);
 907		free_page((unsigned long) inuse);
 908	}
 909
 910	if (buf != name)
 911		snprintf(buf, IFNAMSIZ, name, i);
 912	if (!__dev_get_by_name(net, buf))
 913		return i;
 914
 915	/* It is possible to run out of possible slots
 916	 * when the name is long and there isn't enough space left
 917	 * for the digits, or if all bits are used.
 918	 */
 919	return -ENFILE;
 920}
 921
 922/**
 923 *	dev_alloc_name - allocate a name for a device
 924 *	@dev: device
 925 *	@name: name format string
 926 *
 927 *	Passed a format string - eg "lt%d" it will try and find a suitable
 928 *	id. It scans list of devices to build up a free map, then chooses
 929 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 930 *	while allocating the name and adding the device in order to avoid
 931 *	duplicates.
 932 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 933 *	Returns the number of the unit assigned or a negative errno code.
 934 */
 935
 936int dev_alloc_name(struct net_device *dev, const char *name)
 937{
 938	char buf[IFNAMSIZ];
 939	struct net *net;
 940	int ret;
 941
 942	BUG_ON(!dev_net(dev));
 943	net = dev_net(dev);
 944	ret = __dev_alloc_name(net, name, buf);
 945	if (ret >= 0)
 946		strlcpy(dev->name, buf, IFNAMSIZ);
 947	return ret;
 948}
 949EXPORT_SYMBOL(dev_alloc_name);
 950
 951static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
 952{
 953	struct net *net;
 954
 955	BUG_ON(!dev_net(dev));
 956	net = dev_net(dev);
 957
 958	if (!dev_valid_name(name))
 959		return -EINVAL;
 960
 961	if (fmt && strchr(name, '%'))
 962		return dev_alloc_name(dev, name);
 963	else if (__dev_get_by_name(net, name))
 964		return -EEXIST;
 965	else if (dev->name != name)
 966		strlcpy(dev->name, name, IFNAMSIZ);
 967
 968	return 0;
 969}
 970
/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device. Format strings such as "eth%d" can be
 *	passed for wildcarding.
 *
 *	Must be called with the RTNL held. Returns -EBUSY if the device is
 *	up, 0 if @newname already equals the current name, the negative
 *	errno from name validation or device_rename() on failure, and
 *	otherwise the non-negative result of dev_get_valid_name(). If a
 *	NETDEV_CHANGENAME notifier vetoes the change, the old name is
 *	restored and the notifier's errno is returned.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
		return 0;

	/* Keep the current name so it can be restored on failure. */
	memcpy(oldname, dev->name, IFNAMSIZ);

	/* On success dev->name already holds the new name. */
	err = dev_get_valid_name(dev, newname, 1);
	if (err < 0)
		return err;

	/*
	 * Entered once for the forward rename and, if a notifier rejects
	 * it, a second time with dev->name reset to oldname.
	 */
rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		return ret;
	}

	/* Unhash the device from its old name bucket ... */
	write_lock_bh(&dev_base_lock);
	hlist_del(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	/* ... wait for an RCU grace period before re-adding it ... */
	synchronize_rcu();

	/* ... and hash it again under the name now in dev->name. */
	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			/* First failure: remember it and undo the rename. */
			err = ret;
			memcpy(dev->name, oldname, IFNAMSIZ);
			goto rollback;
		} else {
			/* The rollback itself was rejected; only report it. */
			printk(KERN_ERR
			       "%s: name change rollback failed: %d.\n",
			       dev->name, ret);
		}
	}

	return err;
}
1037
1038/**
1039 *	dev_set_alias - change ifalias of a device
1040 *	@dev: device
1041 *	@alias: name up to IFALIASZ
1042 *	@len: limit of bytes to copy from info
1043 *
1044 *	Set ifalias for a device,
1045 */
1046int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1047{
1048	ASSERT_RTNL();
1049
1050	if (len >= IFALIASZ)
1051		return -EINVAL;
1052
1053	if (!len) {
1054		if (dev->ifalias) {
1055			kfree(dev->ifalias);
1056			dev->ifalias = NULL;
1057		}
1058		return 0;
1059	}
1060
1061	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1062	if (!dev->ifalias)
1063		return -ENOMEM;
1064
1065	strlcpy(dev->ifalias, alias, len+1);
1066	return len;
1067}
1068
1069
1070/**
1071 *	netdev_features_change - device changes features
1072 *	@dev: device to cause notification
1073 *
1074 *	Called to indicate a device has changed features.
1075 */
1076void netdev_features_change(struct net_device *dev)
1077{
1078	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1079}
1080EXPORT_SYMBOL(netdev_features_change);
1081
1082/**
1083 *	netdev_state_change - device changes state
1084 *	@dev: device to cause notification
1085 *
1086 *	Called to indicate a device has changed state. This function calls
1087 *	the notifier chains for netdev_chain and sends a NEWLINK message
1088 *	to the routing socket.
1089 */
1090void netdev_state_change(struct net_device *dev)
1091{
1092	if (dev->flags & IFF_UP) {
1093		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1094		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1095	}
1096}
1097EXPORT_SYMBOL(netdev_state_change);
1098
/*
 * Send @event for @dev through the netdev notifier chain and hand the
 * chain's return value back to the caller unchanged.
 */
int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	int chain_ret;

	chain_ret = call_netdevice_notifiers(event, dev);
	return chain_ret;
}
EXPORT_SYMBOL(netdev_bonding_change);
1104
1105/**
1106 *	dev_load 	- load a network module
1107 *	@net: the applicable net namespace
1108 *	@name: name of interface
1109 *
1110 *	If a network interface is not present and the process has suitable
1111 *	privileges this function loads the module. If module loading is not
1112 *	available in this kernel then it becomes a nop.
1113 */
1114
1115void dev_load(struct net *net, const char *name)
1116{
1117	struct net_device *dev;
1118	int no_module;
1119
1120	rcu_read_lock();
1121	dev = dev_get_by_name_rcu(net, name);
1122	rcu_read_unlock();
1123
1124	no_module = !dev;
1125	if (no_module && capable(CAP_NET_ADMIN))
1126		no_module = request_module("netdev-%s", name);
1127	if (no_module && capable(CAP_SYS_MODULE)) {
1128		if (!request_module("%s", name))
1129			pr_err("Loading kernel module for a network device "
1130"with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
1131"instead\n", name);
1132	}
1133}
1134EXPORT_SYMBOL(dev_load);
1135
1136static int __dev_open(struct net_device *dev)
1137{
1138	const struct net_device_ops *ops = dev->netdev_ops;
1139	int ret;
1140
1141	ASSERT_RTNL();
1142
1143	if (!netif_device_present(dev))
1144		return -ENODEV;
1145
1146	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1147	ret = notifier_to_errno(ret);
1148	if (ret)
1149		return ret;
1150
1151	set_bit(__LINK_STATE_START, &dev->state);
1152
1153	if (ops->ndo_validate_addr)
1154		ret = ops->ndo_validate_addr(dev);
1155
1156	if (!ret && ops->ndo_open)
1157		ret = ops->ndo_open(dev);
1158
1159	if (ret)
1160		clear_bit(__LINK_STATE_START, &dev->state);
1161	else {
1162		dev->flags |= IFF_UP;
1163		net_dmaengine_get();
1164		dev_set_rx_mode(dev);
1165		dev_activate(dev);
1166	}
1167
1168	return ret;
1169}
1170
1171/**
1172 *	dev_open	- prepare an interface for use.
1173 *	@dev:	device to open
1174 *
1175 *	Takes a device from down to up state. The device's private open
1176 *	function is invoked and then the multicast lists are loaded. Finally
1177 *	the device is moved into the up state and a %NETDEV_UP message is
1178 *	sent to the netdev notifier chain.
1179 *
1180 *	Calling this function on an active interface is a nop. On a failure
1181 *	a negative errno code is returned.
1182 */
1183int dev_open(struct net_device *dev)
1184{
1185	int ret;
1186
1187	if (dev->flags & IFF_UP)
1188		return 0;
1189
1190	ret = __dev_open(dev);
1191	if (ret < 0)
1192		return ret;
1193
1194	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1195	call_netdevice_notifiers(NETDEV_UP, dev);
1196
1197	return ret;
1198}
1199EXPORT_SYMBOL(dev_open);
1200
/*
 * Take every device on @head (linked through unreg_list) down, without
 * sending the NETDEV_DOWN / RTM_NEWLINK announcements. Requires the RTNL
 * and may sleep. Works in three passes: announce NETDEV_GOING_DOWN and
 * clear the started bit on each device, deactivate all of them together,
 * then call each driver's ndo_stop and clear IFF_UP. Always returns 0.
 */
static int __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	ASSERT_RTNL();
	might_sleep();

	list_for_each_entry(dev, head, unreg_list) {
		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * can be even on different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of its
		 * napi_struct instances on this device.
		 */
		smp_mb__after_clear_bit(); /* Commit netif_running(). */
	}

	dev_deactivate_many(head);

	list_for_each_entry(dev, head, unreg_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 *	Call the device specific close. This cannot fail.
		 *	Only if device is UP
		 *
		 *	We allow it to be called even after a DETACH hot-plug
		 *	event.
		 */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		dev->flags &= ~IFF_UP;
		/* Pairs with the net_dmaengine_get() done in __dev_open(). */
		net_dmaengine_put();
	}

	return 0;
}
1243
1244static int __dev_close(struct net_device *dev)
1245{
1246	int retval;
1247	LIST_HEAD(single);
1248
1249	list_add(&dev->unreg_list, &single);
1250	retval = __dev_close_many(&single);
1251	list_del(&single);
1252	return retval;
1253}
1254
1255static int dev_close_many(struct list_head *head)
1256{
1257	struct net_device *dev, *tmp;
1258	LIST_HEAD(tmp_list);
1259
1260	list_for_each_entry_safe(dev, tmp, head, unreg_list)
1261		if (!(dev->flags & IFF_UP))
1262			list_move(&dev->unreg_list, &tmp_list);
1263
1264	__dev_close_many(head);
1265
1266	list_for_each_entry(dev, head, unreg_list) {
1267		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1268		call_netdevice_notifiers(NETDEV_DOWN, dev);
1269	}
1270
1271	/* rollback_registered_many needs the complete original list */
1272	list_splice(&tmp_list, head);
1273	return 0;
1274}
1275
1276/**
1277 *	dev_close - shutdown an interface.
1278 *	@dev: device to shutdown
1279 *
1280 *	This function moves an active device into down state. A
1281 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1282 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1283 *	chain.
1284 */
1285int dev_close(struct net_device *dev)
1286{
1287	LIST_HEAD(single);
1288
1289	list_add(&dev->unreg_list, &single);
1290	dev_close_many(&single);
1291	list_del(&single);
1292	return 0;
1293}
1294EXPORT_SYMBOL(dev_close);
1295
1296
1297/**
1298 *	dev_disable_lro - disable Large Receive Offload on a device
1299 *	@dev: device
1300 *
1301 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1302 *	called under RTNL.  This is needed if received packets may be
1303 *	forwarded to another interface.
1304 */
1305void dev_disable_lro(struct net_device *dev)
1306{
1307	u32 flags;
1308
1309	if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
1310		flags = dev->ethtool_ops->get_flags(dev);
1311	else
1312		flags = ethtool_op_get_flags(dev);
1313
1314	if (!(flags & ETH_FLAG_LRO))
1315		return;
1316
1317	__ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
1318	WARN_ON(dev->features & NETIF_F_LRO);
1319}
1320EXPORT_SYMBOL(dev_disable_lro);
1321
1322
/*
 * Starts out as 1. While it is non-zero, register_netdevice_notifier()
 * adds the notifier to the chain but does not replay REGISTER/UP events
 * for existing devices. (Where it is cleared is outside this section.)
 */
static int dev_boot_phase = 1;
1324
1325/**
1326 *	register_netdevice_notifier - register a network notifier block
1327 *	@nb: notifier
1328 *
1329 *	Register a notifier to be called when network device events occur.
1330 *	The notifier passed is linked into the kernel structures and must
1331 *	not be reused until it has been unregistered. A negative errno code
1332 *	is returned on a failure.
1333 *
1334 * 	When registered all registration and up events are replayed
1335 *	to the new notifier to allow device to have a race free
1336 *	view of the network device list.
1337 */
1338
1339int register_netdevice_notifier(struct notifier_block *nb)
1340{
1341	struct net_device *dev;
1342	struct net_device *last;
1343	struct net *net;
1344	int err;
1345
1346	rtnl_lock();
1347	err = raw_notifier_chain_register(&netdev_chain, nb);
1348	if (err)
1349		goto unlock;
1350	if (dev_boot_phase)
1351		goto unlock;
1352	for_each_net(net) {
1353		for_each_netdev(net, dev) {
1354			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1355			err = notifier_to_errno(err);
1356			if (err)
1357				goto rollback;
1358
1359			if (!(dev->flags & IFF_UP))
1360				continue;
1361
1362			nb->notifier_call(nb, NETDEV_UP, dev);
1363		}
1364	}
1365
1366unlock:
1367	rtnl_unlock();
1368	return err;
1369
1370rollback:
1371	last = dev;
1372	for_each_net(net) {
1373		for_each_netdev(net, dev) {
1374			if (dev == last)
1375				break;
1376
1377			if (dev->flags & IFF_UP) {
1378				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1379				nb->notifier_call(nb, NETDEV_DOWN, dev);
1380			}
1381			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1382			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1383		}
1384	}
1385
1386	raw_notifier_chain_unregister(&netdev_chain, nb);
1387	goto unlock;
1388}
1389EXPORT_SYMBOL(register_netdevice_notifier);
1390
1391/**
1392 *	unregister_netdevice_notifier - unregister a network notifier block
1393 *	@nb: notifier
1394 *
1395 *	Unregister a notifier previously registered by
1396 *	register_netdevice_notifier(). The notifier is unlinked into the
1397 *	kernel structures and may then be reused. A negative errno code
1398 *	is returned on a failure.
1399 */
1400
1401int unregister_netdevice_notifier(struct notifier_block *nb)
1402{
1403	int err;
1404
1405	rtnl_lock();
1406	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1407	rtnl_unlock();
1408	return err;
1409}
1410EXPORT_SYMBOL(unregister_netdevice_notifier);
1411
1412/**
1413 *	call_netdevice_notifiers - call all network notifier blocks
1414 *      @val: value passed unmodified to notifier function
1415 *      @dev: net_device pointer passed unmodified to notifier function
1416 *
1417 *	Call all network notifier blocks.  Parameters and return value
1418 *	are as for raw_notifier_call_chain().
1419 */
1420
1421int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1422{
1423	ASSERT_RTNL();
1424	return raw_notifier_call_chain(&netdev_chain, val, dev);
1425}
1426EXPORT_SYMBOL(call_netdevice_notifiers);
1427
1428/* When > 0 there are consumers of rx skb time stamps */
1429static atomic_t netstamp_needed = ATOMIC_INIT(0);
1430
1431void net_enable_timestamp(void)
1432{
1433	atomic_inc(&netstamp_needed);
1434}
1435EXPORT_SYMBOL(net_enable_timestamp);
1436
1437void net_disable_timestamp(void)
1438{
1439	atomic_dec(&netstamp_needed);
1440}
1441EXPORT_SYMBOL(net_disable_timestamp);
1442
1443static inline void net_timestamp_set(struct sk_buff *skb)
1444{
1445	if (atomic_read(&netstamp_needed))
1446		__net_timestamp(skb);
1447	else
1448		skb->tstamp.tv64 = 0;
1449}
1450
1451static inline void net_timestamp_check(struct sk_buff *skb)
1452{
1453	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1454		__net_timestamp(skb);
1455}
1456
1457static inline bool is_skb_forwardable(struct net_device *dev,
1458				      struct sk_buff *skb)
1459{
1460	unsigned int len;
1461
1462	if (!(dev->flags & IFF_UP))
1463		return false;
1464
1465	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1466	if (skb->len <= len)
1467		return true;
1468
1469	/* if TSO is enabled, we don't care about the length as the packet
1470	 * could be forwarded without being segmented before
1471	 */
1472	if (skb_is_gso(skb))
1473		return true;
1474
1475	return false;
1476}
1477
1478/**
1479 * dev_forward_skb - loopback an skb to another netif
1480 *
1481 * @dev: destination network device
1482 * @skb: buffer to forward
1483 *
1484 * return values:
1485 *	NET_RX_SUCCESS	(no congestion)
1486 *	NET_RX_DROP     (packet was dropped, but freed)
1487 *
1488 * dev_forward_skb can be used for injecting an skb from the
1489 * start_xmit function of one device into the receive queue
1490 * of another device.
1491 *
1492 * The receiving device may be in another namespace, so
1493 * we have to clear all information in the skb that could
1494 * impact namespace isolation.
1495 */
1496int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1497{
1498	skb_orphan(skb);
1499	nf_reset(skb);
1500
1501	if (unlikely(!is_skb_forwardable(dev, skb))) {
1502		atomic_long_inc(&dev->rx_dropped);
1503		kfree_skb(skb);
1504		return NET_RX_DROP;
1505	}
1506	skb_set_dev(skb, dev);
1507	skb->tstamp.tv64 = 0;
1508	skb->pkt_type = PACKET_HOST;
1509	skb->protocol = eth_type_trans(skb, dev);
1510	return netif_rx(skb);
1511}
1512EXPORT_SYMBOL_GPL(dev_forward_skb);
1513
1514static inline int deliver_skb(struct sk_buff *skb,
1515			      struct packet_type *pt_prev,
1516			      struct net_device *orig_dev)
1517{
1518	atomic_inc(&skb->users);
1519	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1520}
1521
/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 *
 *	Walks ptype_all under rcu_read_lock(). The frame is cloned once,
 *	when the first matching tap is found, and that clone (skb2) is
 *	shared by all taps. Delivery runs one tap behind: each matching tap
 *	is remembered in pt_prev and delivered through deliver_skb() (which
 *	takes an extra reference) when the next match is found; the last
 *	one gets skb2 directly, without an extra reference.
 */

static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;
	struct sk_buff *skb2 = NULL;
	struct packet_type *pt_prev = NULL;

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if ((ptype->dev == dev || !ptype->dev) &&
		    (ptype->af_packet_priv == NULL ||
		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
			/* Clone already prepared: flush the previous tap. */
			if (pt_prev) {
				deliver_skb(skb2, pt_prev, skb->dev);
				pt_prev = ptype;
				continue;
			}

			/* First matching tap: create the shared clone. */
			skb2 = skb_clone(skb, GFP_ATOMIC);
			if (!skb2)
				break;

			net_timestamp_set(skb2);

			/* skb->nh should be correctly
			   set by sender, so that the second statement is
			   just protection against buggy protocols.
			 */
			skb_reset_mac_header(skb2);

			if (skb_network_header(skb2) < skb2->data ||
			    skb2->network_header > skb2->tail) {
				if (net_ratelimit())
					printk(KERN_CRIT "protocol %04x is "
					       "buggy, dev %s\n",
					       ntohs(skb2->protocol),
					       dev->name);
				skb_reset_network_header(skb2);
			}

			skb2->transport_header = skb2->network_header;
			skb2->pkt_type = PACKET_OUTGOING;
			pt_prev = ptype;
		}
	}
	/* Last matching tap, if any, gets the clone itself. */
	if (pt_prev)
		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
	rcu_read_unlock();
}
1578
1579/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1580 * @dev: Network device
1581 * @txq: number of queues available
1582 *
1583 * If real_num_tx_queues is changed the tc mappings may no longer be
1584 * valid. To resolve this verify the tc mapping remains valid and if
1585 * not NULL the mapping. With no priorities mapping to this
1586 * offset/count pair it will no longer be used. In the worst case TC0
1587 * is invalid nothing can be done so disable priority mappings. If is
1588 * expected that drivers will fix this mapping if they can before
1589 * calling netif_set_real_num_tx_queues.
1590 */
1591static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1592{
1593	int i;
1594	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1595
1596	/* If TC0 is invalidated disable TC mapping */
1597	if (tc->offset + tc->count > txq) {
1598		pr_warning("Number of in use tx queues changed "
1599			   "invalidating tc mappings. Priority "
1600			   "traffic classification disabled!\n");
1601		dev->num_tc = 0;
1602		return;
1603	}
1604
1605	/* Invalidated prio to tc mappings set to TC0 */
1606	for (i = 1; i < TC_BITMASK + 1; i++) {
1607		int q = netdev_get_prio_tc_map(dev, i);
1608
1609		tc = &dev->tc_to_txq[q];
1610		if (tc->offset + tc->count > txq) {
1611			pr_warning("Number of in use tx queues "
1612				   "changed. Priority %i to tc "
1613				   "mapping %i is no longer valid "
1614				   "setting map to 0\n",
1615				   i, q);
1616			netdev_set_prio_tc_map(dev, i, 0);
1617		}
1618	}
1619}
1620
1621/*
1622 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1623 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1624 */
1625int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1626{
1627	int rc;
1628
1629	if (txq < 1 || txq > dev->num_tx_queues)
1630		return -EINVAL;
1631
1632	if (dev->reg_state == NETREG_REGISTERED ||
1633	    dev->reg_state == NETREG_UNREGISTERING) {
1634		ASSERT_RTNL();
1635
1636		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1637						  txq);
1638		if (rc)
1639			return rc;
1640
1641		if (dev->num_tc)
1642			netif_setup_tc(dev, txq);
1643
1644		if (txq < dev->real_num_tx_queues)
1645			qdisc_reset_all_tx_gt(dev, txq);
1646	}
1647
1648	dev->real_num_tx_queues = txq;
1649	return 0;
1650}
1651EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1652
#ifdef CONFIG_RPS
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	This must be called either with the rtnl_lock held or before
 *	registration of the net device.  Returns 0 on success, or a
 *	negative error code: -EINVAL when @rxq is 0 or larger than
 *	dev->num_rx_queues, or the error from the RX queue kobject update
 *	on a registered device.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	if (rxq < 1 || rxq > dev->num_rx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED) {
		int err;

		ASSERT_RTNL();

		err = net_rx_queue_update_kobjects(dev,
						   dev->real_num_rx_queues,
						   rxq);
		if (err)
			return err;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
1685
1686static inline void __netif_reschedule(struct Qdisc *q)
1687{
1688	struct softnet_data *sd;
1689	unsigned long flags;
1690
1691	local_irq_save(flags);
1692	sd = &__get_cpu_var(softnet_data);
1693	q->next_sched = NULL;
1694	*sd->output_queue_tailp = q;
1695	sd->output_queue_tailp = &q->next_sched;
1696	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1697	local_irq_restore(flags);
1698}
1699
1700void __netif_schedule(struct Qdisc *q)
1701{
1702	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1703		__netif_reschedule(q);
1704}
1705EXPORT_SYMBOL(__netif_schedule);
1706
1707void dev_kfree_skb_irq(struct sk_buff *skb)
1708{
1709	if (atomic_dec_and_test(&skb->users)) {
1710		struct softnet_data *sd;
1711		unsigned long flags;
1712
1713		local_irq_save(flags);
1714		sd = &__get_cpu_var(softnet_data);
1715		skb->next = sd->completion_queue;
1716		sd->completion_queue = skb;
1717		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1718		local_irq_restore(flags);
1719	}
1720}
1721EXPORT_SYMBOL(dev_kfree_skb_irq);
1722
/*
 * Free @skb from any context: uses dev_kfree_skb_irq() in hard-irq
 * context or with interrupts disabled, dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (!in_irq() && !irqs_disabled())
		dev_kfree_skb(skb);
	else
		dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1731
1732
1733/**
1734 * netif_device_detach - mark device as removed
1735 * @dev: network device
1736 *
1737 * Mark device as removed from system and therefore no longer available.
1738 */
1739void netif_device_detach(struct net_device *dev)
1740{
1741	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1742	    netif_running(dev)) {
1743		netif_tx_stop_all_queues(dev);
1744	}
1745}
1746EXPORT_SYMBOL(netif_device_detach);
1747
1748/**
1749 * netif_device_attach - mark device as attached
1750 * @dev: network device
1751 *
1752 * Mark device as attached from system and restart if needed.
1753 */
1754void netif_device_attach(struct net_device *dev)
1755{
1756	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1757	    netif_running(dev)) {
1758		netif_tx_wake_all_queues(dev);
1759		__netdev_watchdog_up(dev);
1760	}
1761}
1762EXPORT_SYMBOL(netif_device_attach);
1763
/**
 * skb_set_dev -- assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * The dst reference of @skb is always dropped. If the skb already
 * belongs to a device in a different network namespace than @dev, the
 * namespace-private data it carries is reset as well before the new
 * device is assigned.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
	struct net_device *old_dev;

	skb_dst_drop(skb);

	old_dev = skb->dev;
	if (old_dev && !net_eq(dev_net(old_dev), dev_net(dev))) {
		/* Crossing namespaces: scrub per-namespace state. */
		secpath_reset(skb);
		nf_reset(skb);
		skb_init_secmark(skb);
		skb->mark = 0;
		skb->priority = 0;
		skb->nf_trace = 0;
		skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
		skb->tc_index = 0;
#endif
	}

	skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */
1793
/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 *
 * Returns 0 on success, or the error from pskb_expand_head() when a
 * writable copy of the header could not be obtained; in that case
 * skb->ip_summed is left unchanged.
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	/* Nothing to compute: just downgrade to CHECKSUM_NONE. */
	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	if (unlikely(skb_shinfo(skb)->gso_size)) {
		/* Let GSO fix up the checksum. */
		goto out_set_summed;
	}

	/* Sum everything from the checksum start to the end of the packet. */
	offset = skb_checksum_start_offset(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	/* From here on, offset is where the checksum field lives. */
	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	/* The field is about to be written: unshare the header if needed. */
	if (skb_cloned(skb) &&
	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}
EXPORT_SYMBOL(skb_checksum_help);
1832
/**
 *	skb_gso_segment - Perform segmentation on skb.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *
 *	This function segments the given skb and returns a list of segments.
 *
 *	It may return NULL if the skb requires no segmentation.  This is
 *	only possible when GSO is used for verifying header integrity.
 *
 *	On failure an ERR_PTR() is returned: -EINVAL if a VLAN header
 *	cannot be pulled, -EPROTONOSUPPORT if no protocol handler with a
 *	gso_segment hook matches, or the error reported by
 *	pskb_expand_head() or the handler.
 */
struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
{
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_type *ptype;
	__be16 type = skb->protocol;
	int vlan_depth = ETH_HLEN;
	int err;

	/* Step over any stacked 802.1Q tags to find the real protocol. */
	while (type == htons(ETH_P_8021Q)) {
		struct vlan_hdr *vh;

		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
			return ERR_PTR(-EINVAL);

		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
		type = vh->h_vlan_encapsulated_proto;
		vlan_depth += VLAN_HLEN;
	}

	/* Record the MAC header length and pull it off for the handler. */
	skb_reset_mac_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;
	__skb_pull(skb, skb->mac_len);

	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
		struct net_device *dev = skb->dev;
		struct ethtool_drvinfo info = {};

		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
			dev->ethtool_ops->get_drvinfo(dev, &info);

		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
		     info.driver, dev ? dev->features : 0L,
		     skb->sk ? skb->sk->sk_route_caps : 0L,
		     skb->len, skb->data_len, skb->ip_summed);

		if (skb_header_cloned(skb) &&
		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
			return ERR_PTR(err);
	}

	rcu_read_lock();
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		/* Only the first matching handler is used. */
		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
				err = ptype->gso_send_check(skb);
				segs = ERR_PTR(err);
				/*
				 * Stop on error, or with segs == NULL when
				 * the path can take the skb as it is.
				 */
				if (err || skb_gso_ok(skb, features))
					break;
				/* Move data back to the network header. */
				__skb_push(skb, (skb->data -
						 skb_network_header(skb)));
			}
			segs = ptype->gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	/* Restore skb->data to the MAC header before returning. */
	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}
EXPORT_SYMBOL(skb_gso_segment);
1906
/*
 * Take action when hardware reception checksum errors are detected:
 * log the device name (or "<unknown>" when @dev is NULL) and dump the
 * stack, subject to net_ratelimit().
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	printk(KERN_ERR "%s: hw csum failure.\n",
		dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1919
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 if @skb has a page fragment that @dev cannot DMA from:
 * either a highmem page while the device lacks NETIF_F_HIGHDMA, or
 * (when PCI_DMA_BUS_IS_PHYS) a page beyond the parent device's DMA
 * mask. Returns 0 otherwise, and always without CONFIG_HIGHMEM.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct device *pdev;
	int i;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			if (PageHighMem(skb_shinfo(skb)->frags[i].page))
				return 1;
		}
	}

	if (!PCI_DMA_BUS_IS_PHYS)
		return 0;

	pdev = dev->dev.parent;
	if (!pdev)
		return 0;

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);

		if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
			return 1;
	}
#endif
	return 0;
}
1949
/*
 * Per-skb state kept in skb->cb for a software-segmented skb: the
 * destructor the skb had before dev_gso_segment() installed
 * dev_gso_skb_destructor in its place.
 */
struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

/* View the control buffer of @skb as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1955
1956static void dev_gso_skb_destructor(struct sk_buff *skb)
1957{
1958	struct dev_gso_cb *cb;
1959
1960	do {
1961		struct sk_buff *nskb = skb->next;
1962
1963		skb->next = nskb->next;
1964		nskb->next = NULL;
1965		kfree_skb(nskb);
1966	} while (skb->next);
1967
1968	cb = DEV_GSO_CB(skb);
1969	if (cb->destructor)
1970		cb->destructor(skb);
1971}
1972
1973/**
1974 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1975 *	@skb: buffer to segment
1976 *	@features: device features as applicable to this skb
1977 *
1978 *	This function segments the given skb and stores the list of segments
1979 *	in skb->next.
1980 */
1981static int dev_gso_segment(struct sk_buff *skb, int features)
1982{
1983	struct sk_buff *segs;
1984
1985	segs = skb_gso_segment(skb, features);
1986
1987	/* Verifying header integrity only. */
1988	if (!segs)
1989		return 0;
1990
1991	if (IS_ERR(segs))
1992		return PTR_ERR(segs);
1993
1994	skb->next = segs;
1995	DEV_GSO_CB(skb)->destructor = skb->destructor;
1996	skb->destructor = dev_gso_skb_destructor;
1997
1998	return 0;
1999}
2000
2001/*
2002 * Try to orphan skb early, right before transmission by the device.
2003 * We cannot orphan skb if tx timestamp is requested or the sk-reference
2004 * is needed on driver level for other reasons, e.g. see net/can/raw.c
2005 */
2006static inline void skb_orphan_try(struct sk_buff *skb)
2007{
2008	struct sock *sk = skb->sk;
2009
2010	if (sk && !skb_shinfo(skb)->tx_flags) {
2011		/* skb_tx_hash() wont be able to get sk.
2012		 * We copy sk_hash into skb->rxhash
2013		 */
2014		if (!skb->rxhash)
2015			skb->rxhash = sk->sk_hash;
2016		skb_orphan(skb);
2017	}
2018}
2019
2020static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2021{
2022	return ((features & NETIF_F_GEN_CSUM) ||
2023		((features & NETIF_F_V4_CSUM) &&
2024		 protocol == htons(ETH_P_IP)) ||
2025		((features & NETIF_F_V6_CSUM) &&
2026		 protocol == htons(ETH_P_IPV6)) ||
2027		((features & NETIF_F_FCOE_CRC) &&
2028		 protocol == htons(ETH_P_FCOE)));
2029}
2030
2031static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2032{
2033	if (!can_checksum_protocol(features, protocol)) {
2034		features &= ~NETIF_F_ALL_CSUM;
2035		features &= ~NETIF_F_SG;
2036	} else if (illegal_highdma(skb->dev, skb)) {
2037		features &= ~NETIF_F_SG;
2038	}
2039
2040	return features;
2041}
2042
2043u32 netif_skb_features(struct sk_buff *skb)
2044{
2045	__be16 protocol = skb->protocol;
2046	u32 features = skb->dev->features;
2047
2048	if (protocol == htons(ETH_P_8021Q)) {
2049		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2050		protocol = veh->h_vlan_encapsulated_proto;
2051	} else if (!vlan_tx_tag_present(skb)) {
2052		return harmonize_features(skb, protocol, features);
2053	}
2054
2055	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2056
2057	if (protocol != htons(ETH_P_8021Q)) {
2058		return harmonize_features(skb, protocol, features);
2059	} else {
2060		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2061				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2062		return harmonize_features(skb, protocol, features);
2063	}
2064}
2065EXPORT_SYMBOL(netif_skb_features);
2066
2067/*
2068 * Returns true if either:
2069 *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2070 *	2. skb is fragmented and the device does not support SG, or if
2071 *	   at least one of fragments is in highmem and device does not
2072 *	   support DMA from it.
2073 */
2074static inline int skb_needs_linearize(struct sk_buff *skb,
2075				      int features)
2076{
2077	return skb_is_nonlinear(skb) &&
2078			((skb_has_frag_list(skb) &&
2079				!(features & NETIF_F_FRAGLIST)) ||
2080			(skb_shinfo(skb)->nr_frags &&
2081				!(features & NETIF_F_SG)));
2082}
2083
/*
 * dev_hard_start_xmit - pass an skb (or a list of GSO segments) to the driver
 * @skb: packet to send; if skb->next is set, skb heads a list of segments
 * @dev: device to transmit on
 * @txq: Tx queue the packet was mapped to
 *
 * For a single packet (skb->next == NULL) this prepares the skb according to
 * the features returned by netif_skb_features() (software VLAN tag insertion,
 * GSO segmentation, linearization, checksum completion) and then calls the
 * driver's ndo_start_xmit().  For a segment list it sends the segments one
 * by one.
 *
 * Returns the value of the last ndo_start_xmit() call, NETDEV_TX_BUSY if the
 * queue got stopped with segments still pending, or the initial
 * NETDEV_TX_OK on the local failure paths that jump to out/out_kfree_skb.
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
			struct netdev_queue *txq)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int rc = NETDEV_TX_OK;

	if (likely(!skb->next)) {
		u32 features;

		/*
		 * If device doesnt need skb->dst, release it right now while
		 * its hot in this cpu cache
		 */
		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
			skb_dst_drop(skb);

		/* Give a copy to registered ptype_all handlers, if any. */
		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(skb, dev);

		skb_orphan_try(skb);

		features = netif_skb_features(skb);

		/*
		 * Out-of-band VLAN tag but no hardware insertion for this
		 * skb: put the tag into the packet data in software.
		 */
		if (vlan_tx_tag_present(skb) &&
		    !(features & NETIF_F_HW_VLAN_TX)) {
			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
			/* No skb left to free; rc is still NETDEV_TX_OK. */
			if (unlikely(!skb))
				goto out;

			skb->vlan_tci = 0;
		}

		if (netif_needs_gso(skb, features)) {
			/* Segmentation failed: free skb, rc stays NETDEV_TX_OK. */
			if (unlikely(dev_gso_segment(skb, features)))
				goto out_kfree_skb;
			/* Segments were chained on skb->next: send them below. */
			if (skb->next)
				goto gso;
		} else {
			if (skb_needs_linearize(skb, features) &&
			    __skb_linearize(skb))
				goto out_kfree_skb;

			/* If packet is not checksummed and device does not
			 * support checksumming for this protocol, complete
			 * checksumming here.
			 */
			if (skb->ip_summed == CHECKSUM_PARTIAL) {
				skb_set_transport_header(skb,
					skb_checksum_start_offset(skb));
				if (!(features & NETIF_F_ALL_CSUM) &&
				     skb_checksum_help(skb))
					goto out_kfree_skb;
			}
		}

		rc = ops->ndo_start_xmit(skb, dev);
		trace_net_dev_xmit(skb, rc);
		if (rc == NETDEV_TX_OK)
			txq_trans_update(txq);
		return rc;
	}

gso:
	do {
		/* Detach the next segment from the list headed by skb. */
		struct sk_buff *nskb = skb->next;

		skb->next = nskb->next;
		nskb->next = NULL;

		/*
		 * If device doesnt need nskb->dst, release it right now while
		 * its hot in this cpu cache
		 */
		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
			skb_dst_drop(nskb);

		rc = ops->ndo_start_xmit(nskb, dev);
		trace_net_dev_xmit(nskb, rc);
		if (unlikely(rc != NETDEV_TX_OK)) {
			/* rc has bits outside NETDEV_TX_MASK: drop the rest. */
			if (rc & ~NETDEV_TX_MASK)
				goto out_kfree_gso_skb;
			/* Otherwise put the segment back at the list head. */
			nskb->next = skb->next;
			skb->next = nskb;
			return rc;
		}
		txq_trans_update(txq);
		/* Queue stopped with segments left: report busy to caller. */
		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
			return NETDEV_TX_BUSY;
	} while (skb->next);

out_kfree_gso_skb:
	/* All segments gone: restore the destructor saved in DEV_GSO_CB. */
	if (likely(skb->next == NULL))
		skb->destructor = DEV_GSO_CB(skb)->destructor;
out_kfree_skb:
	kfree_skb(skb);
out:
	return rc;
}
2182
/*
 * Seed passed as the initval argument to the jhash calls in this file
 * (Tx queue hashing, XPS queue selection and Rx flow hashing).
 */
static u32 hashrnd __read_mostly;
2184
2185/*
2186 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2187 * to be used as a distribution range.
2188 */
2189u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2190		  unsigned int num_tx_queues)
2191{
2192	u32 hash;
2193	u16 qoffset = 0;
2194	u16 qcount = num_tx_queues;
2195
2196	if (skb_rx_queue_recorded(skb)) {
2197		hash = skb_get_rx_queue(skb);
2198		while (unlikely(hash >= num_tx_queues))
2199			hash -= num_tx_queues;
2200		return hash;
2201	}
2202
2203	if (dev->num_tc) {
2204		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2205		qoffset = dev->tc_to_txq[tc].offset;
2206		qcount = dev->tc_to_txq[tc].count;
2207	}
2208
2209	if (skb->sk && skb->sk->sk_hash)
2210		hash = skb->sk->sk_hash;
2211	else
2212		hash = (__force u16) skb->protocol ^ skb->rxhash;
2213	hash = jhash_1word(hash, hashrnd);
2214
2215	return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2216}
2217EXPORT_SYMBOL(__skb_tx_hash);
2218
2219static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2220{
2221	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2222		if (net_ratelimit()) {
2223			pr_warning("%s selects TX queue %d, but "
2224				"real number of TX queues is %d\n",
2225				dev->name, queue_index, dev->real_num_tx_queues);
2226		}
2227		return 0;
2228	}
2229	return queue_index;
2230}
2231
/*
 * get_xps_queue - pick a Tx queue from the XPS map of the running CPU
 * @dev: transmitting device
 * @skb: packet being sent
 *
 * Returns a queue index taken from dev->xps_maps for the current CPU, or -1
 * when XPS is not compiled in, no map exists for this CPU, or the selected
 * index is not below dev->real_num_tx_queues.  When the map holds several
 * queues, one is chosen by hashing the socket hash (or protocol ^ rxhash).
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *maps;
	struct xps_map *cpu_map;
	int queue_index = -1;

	rcu_read_lock();

	maps = rcu_dereference(dev->xps_maps);
	if (!maps)
		goto unlock;

	cpu_map = rcu_dereference(maps->cpu_map[raw_smp_processor_id()]);
	if (!cpu_map)
		goto unlock;

	if (cpu_map->len == 1) {
		queue_index = cpu_map->queues[0];
	} else {
		u32 hash;

		if (skb->sk && skb->sk->sk_hash)
			hash = skb->sk->sk_hash;
		else
			hash = (__force u16) skb->protocol ^ skb->rxhash;
		hash = jhash_1word(hash, hashrnd);

		/* Scale the hash onto the entries of this CPU's map. */
		queue_index = cpu_map->queues[((u64)hash * cpu_map->len) >> 32];
	}

	/* Reject indices not covered by the queues currently in use. */
	if (unlikely(queue_index >= dev->real_num_tx_queues))
		queue_index = -1;

unlock:
	rcu_read_unlock();

	return queue_index;
#else
	return -1;
#endif
}
2269
2270static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2271					struct sk_buff *skb)
2272{
2273	int queue_index;
2274	const struct net_device_ops *ops = dev->netdev_ops;
2275
2276	if (dev->real_num_tx_queues == 1)
2277		queue_index = 0;
2278	else if (ops->ndo_select_queue) {
2279		queue_index = ops->ndo_select_queue(dev, skb);
2280		queue_index = dev_cap_txqueue(dev, queue_index);
2281	} else {
2282		struct sock *sk = skb->sk;
2283		queue_index = sk_tx_queue_get(sk);
2284
2285		if (queue_index < 0 || skb->ooo_okay ||
2286		    queue_index >= dev->real_num_tx_queues) {
2287			int old_index = queue_index;
2288
2289			queue_index = get_xps_queue(dev, skb);
2290			if (queue_index < 0)
2291				queue_index = skb_tx_hash(dev, skb);
2292
2293			if (queue_index != old_index && sk) {
2294				struct dst_entry *dst =
2295				    rcu_dereference_check(sk->sk_dst_cache, 1);
2296
2297				if (dst && skb_dst(skb) == dst)
2298					sk_tx_queue_set(sk, queue_index);
2299			}
2300		}
2301	}
2302
2303	skb_set_queue_mapping(skb, queue_index);
2304	return netdev_get_tx_queue(dev, queue_index);
2305}
2306
/*
 * __dev_xmit_skb - send an skb through a qdisc that has an enqueue method
 * @skb: packet to send
 * @q: qdisc attached to @txq
 * @dev: transmitting device
 * @txq: Tx queue selected for @skb
 *
 * Under the qdisc root lock this either drops the packet (qdisc
 * deactivated), transmits it directly (bypass-capable, empty and idle
 * qdisc), or enqueues it and runs the qdisc if nobody else is running it.
 *
 * Returns NET_XMIT_DROP, NET_XMIT_SUCCESS, or the enqueue result masked
 * with NET_XMIT_MASK.
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	bool contended;
	int rc;

	qdisc_skb_cb(skb)->pkt_len = skb->len;
	qdisc_calculate_pkt_len(skb, q);
	/*
	 * Heuristic to force contended enqueues to serialize on a
	 * separate lock before trying to get qdisc main lock.
	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
	 * and dequeue packets faster.
	 */
	contended = qdisc_is_running(q);
	if (unlikely(contended))
		spin_lock(&q->busylock);

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		kfree_skb(skb);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   qdisc_run_begin(q)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */
		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
			skb_dst_force(skb);

		qdisc_bstats_update(q, skb);

		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
			/* Drop busylock before running the qdisc. */
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		} else
			qdisc_run_end(q);

		rc = NET_XMIT_SUCCESS;
	} else {
		skb_dst_force(skb);
		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
		if (qdisc_run_begin(q)) {
			/* Drop busylock before running the qdisc. */
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		}
	}
	spin_unlock(root_lock);
	/* busylock is still held if neither run path above released it. */
	if (unlikely(contended))
		spin_unlock(&q->busylock);
	return rc;
}
2369
/*
 * Per-cpu nesting depth of dev_hard_start_xmit() calls made from the
 * queueless path of dev_queue_xmit(); transmission is refused once the
 * depth exceeds RECURSION_LIMIT.
 */
static DEFINE_PER_CPU(int, xmit_recursion);
#define RECURSION_LIMIT 10

/**
 *	dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 */
int dev_queue_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	txq = dev_pick_tx(dev, skb);
	q = rcu_dereference_bh(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
	trace_net_dev_queue(skb);
	/* A qdisc with an enqueue method handles the packet entirely. */
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all sorts of tunnels...

	   Really, it is unlikely that netif_tx_lock protection is necessary
	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	   counters.)
	   However, it is possible that they rely on the protection
	   provided by us here.

	   Check this and shoot the lock. It is not prone to deadlocks.
	   Or shoot the noqueue qdisc, it is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		/* Owner == this cpu means we re-entered while holding the lock. */
		if (txq->xmit_lock_owner != cpu) {

			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
				goto recursion_alert;

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_tx_queue_stopped(txq)) {
				__this_cpu_inc(xmit_recursion);
				rc = dev_hard_start_xmit(skb, dev, txq);
				__this_cpu_dec(xmit_recursion);
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			if (net_ratelimit())
				printk(KERN_CRIT "Virtual device %s asks to "
				       "queue packet!\n", dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
recursion_alert:
			if (net_ratelimit())
				printk(KERN_CRIT "Dead loop on virtual device "
				       "%s, fix it urgently!\n", dev->name);
		}
	}

	/*
	 * Reached when the device is down, the queue was stopped, the
	 * transmit did not complete, or recursion was detected: the skb
	 * is freed here and -ENETDOWN is reported.
	 */
	rc = -ENETDOWN;
	rcu_read_unlock_bh();

	kfree_skb(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
EXPORT_SYMBOL(dev_queue_xmit);
2478
2479
2480/*=======================================================================
2481			Receiver routines
2482  =======================================================================*/
2483
/*
 * enqueue_to_backlog() drops a packet once the per-cpu input_pkt_queue
 * length exceeds this value.
 */
int netdev_max_backlog __read_mostly = 1000;
/*
 * Non-zero: netif_rx()/netif_receive_skb() run net_timestamp_check() before
 * the packet may be queued to a backlog; zero: __netif_receive_skb() runs
 * it instead.
 */
int netdev_tstamp_prequeue __read_mostly = 1;
/* NOTE(review): not referenced in this part of the file; presumably the
 * overall per-softirq receive budget -- confirm against net_rx_action().
 */
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */
2488
2489/* Called with irq disabled */
2490static inline void ____napi_schedule(struct softnet_data *sd,
2491				     struct napi_struct *napi)
2492{
2493	list_add_tail(&napi->poll_list, &sd->poll_list);
2494	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2495}
2496
2497/*
2498 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2499 * and src/dst port numbers. Returns a non-zero hash number on success
2500 * and 0 on failure.
2501 */
2502__u32 __skb_get_rxhash(struct sk_buff *skb)
2503{
2504	int nhoff, hash = 0, poff;
2505	struct ipv6hdr *ip6;
2506	struct iphdr *ip;
2507	u8 ip_proto;
2508	u32 addr1, addr2, ihl;
2509	union {
2510		u32 v32;
2511		u16 v16[2];
2512	} ports;
2513
2514	nhoff = skb_network_offset(skb);
2515
2516	switch (skb->protocol) {
2517	case __constant_htons(ETH_P_IP):
2518		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2519			goto done;
2520
2521		ip = (struct iphdr *) (skb->data + nhoff);
2522		if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2523			ip_proto = 0;
2524		else
2525			ip_proto = ip->protocol;
2526		addr1 = (__force u32) ip->saddr;
2527		addr2 = (__force u32) ip->daddr;
2528		ihl = ip->ihl;
2529		break;
2530	case __constant_htons(ETH_P_IPV6):
2531		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2532			goto done;
2533
2534		ip6 = (struct ipv6hdr *) (skb->data + nhoff);
2535		ip_proto = ip6->nexthdr;
2536		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2537		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2538		ihl = (40 >> 2);
2539		break;
2540	default:
2541		goto done;
2542	}
2543
2544	ports.v32 = 0;
2545	poff = proto_ports_offset(ip_proto);
2546	if (poff >= 0) {
2547		nhoff += ihl * 4 + poff;
2548		if (pskb_may_pull(skb, nhoff + 4)) {
2549			ports.v32 = * (__force u32 *) (skb->data + nhoff);
2550			if (ports.v16[1] < ports.v16[0])
2551				swap(ports.v16[0], ports.v16[1]);
2552		}
2553	}
2554
2555	/* get a consistent hash (same value on both flow directions) */
2556	if (addr2 < addr1)
2557		swap(addr1, addr2);
2558
2559	hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2560	if (!hash)
2561		hash = 1;
2562
2563done:
2564	return hash;
2565}
2566EXPORT_SYMBOL(__skb_get_rxhash);
2567
2568#ifdef CONFIG_RPS
2569
/*
 * One global table that all flow-based protocols share.
 * get_rps_cpu() reads it under RCU and indexes it with
 * (skb->rxhash & mask) to find the CPU desired for a flow.
 */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
2573
2574static struct rps_dev_flow *
2575set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2576	    struct rps_dev_flow *rflow, u16 next_cpu)
2577{
2578	u16 tcpu;
2579
2580	tcpu = rflow->cpu = next_cpu;
2581	if (tcpu != RPS_NO_CPU) {
2582#ifdef CONFIG_RFS_ACCEL
2583		struct netdev_rx_queue *rxqueue;
2584		struct rps_dev_flow_table *flow_table;
2585		struct rps_dev_flow *old_rflow;
2586		u32 flow_id;
2587		u16 rxq_index;
2588		int rc;
2589
2590		/* Should we steer this flow to a different hardware queue? */
2591		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2592		    !(dev->features & NETIF_F_NTUPLE))
2593			goto out;
2594		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2595		if (rxq_index == skb_get_rx_queue(skb))
2596			goto out;
2597
2598		rxqueue = dev->_rx + rxq_index;
2599		flow_table = rcu_dereference(rxqueue->rps_flow_table);
2600		if (!flow_table)
2601			goto out;
2602		flow_id = skb->rxhash & flow_table->mask;
2603		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2604							rxq_index, flow_id);
2605		if (rc < 0)
2606			goto out;
2607		old_rflow = rflow;
2608		rflow = &flow_table->flows[flow_id];
2609		rflow->cpu = next_cpu;
2610		rflow->filter = rc;
2611		if (old_rflow->filter == rflow->filter)
2612			old_rflow->filter = RPS_NO_FILTER;
2613	out:
2614#endif
2615		rflow->last_qtail =
2616			per_cpu(softnet_data, tcpu).input_queue_head;
2617	}
2618
2619	return rflow;
2620}
2621
2622/*
2623 * get_rps_cpu is called from netif_receive_skb and returns the target
2624 * CPU from the RPS map of the receiving queue for a given skb.
2625 * rcu_read_lock must be held on entry.
2626 */
2627static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2628		       struct rps_dev_flow **rflowp)
2629{
2630	struct netdev_rx_queue *rxqueue;
2631	struct rps_map *map;
2632	struct rps_dev_flow_table *flow_table;
2633	struct rps_sock_flow_table *sock_flow_table;
2634	int cpu = -1;
2635	u16 tcpu;
2636
2637	if (skb_rx_queue_recorded(skb)) {
2638		u16 index = skb_get_rx_queue(skb);
2639		if (unlikely(index >= dev->real_num_rx_queues)) {
2640			WARN_ONCE(dev->real_num_rx_queues > 1,
2641				  "%s received packet on queue %u, but number "
2642				  "of RX queues is %u\n",
2643				  dev->name, index, dev->real_num_rx_queues);
2644			goto done;
2645		}
2646		rxqueue = dev->_rx + index;
2647	} else
2648		rxqueue = dev->_rx;
2649
2650	map = rcu_dereference(rxqueue->rps_map);
2651	if (map) {
2652		if (map->len == 1 &&
2653		    !rcu_dereference_raw(rxqueue->rps_flow_table)) {
2654			tcpu = map->cpus[0];
2655			if (cpu_online(tcpu))
2656				cpu = tcpu;
2657			goto done;
2658		}
2659	} else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2660		goto done;
2661	}
2662
2663	skb_reset_network_header(skb);
2664	if (!skb_get_rxhash(skb))
2665		goto done;
2666
2667	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2668	sock_flow_table = rcu_dereference(rps_sock_flow_table);
2669	if (flow_table && sock_flow_table) {
2670		u16 next_cpu;
2671		struct rps_dev_flow *rflow;
2672
2673		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2674		tcpu = rflow->cpu;
2675
2676		next_cpu = sock_flow_table->ents[skb->rxhash &
2677		    sock_flow_table->mask];
2678
2679		/*
2680		 * If the desired CPU (where last recvmsg was done) is
2681		 * different from current CPU (one in the rx-queue flow
2682		 * table entry), switch if one of the following holds:
2683		 *   - Current CPU is unset (equal to RPS_NO_CPU).
2684		 *   - Current CPU is offline.
2685		 *   - The current CPU's queue tail has advanced beyond the
2686		 *     last packet that was enqueued using this table entry.
2687		 *     This guarantees that all previous packets for the flow
2688		 *     have been dequeued, thus preserving in order delivery.
2689		 */
2690		if (unlikely(tcpu != next_cpu) &&
2691		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2692		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2693		      rflow->last_qtail)) >= 0))
2694			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2695
2696		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2697			*rflowp = rflow;
2698			cpu = tcpu;
2699			goto done;
2700		}
2701	}
2702
2703	if (map) {
2704		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2705
2706		if (cpu_online(tcpu)) {
2707			cpu = tcpu;
2708			goto done;
2709		}
2710	}
2711
2712done:
2713	return cpu;
2714}
2715
2716#ifdef CONFIG_RFS_ACCEL
2717
2718/**
2719 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2720 * @dev: Device on which the filter was set
2721 * @rxq_index: RX queue index
2722 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2723 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2724 *
2725 * Drivers that implement ndo_rx_flow_steer() should periodically call
2726 * this function for each installed filter and remove the filters for
2727 * which it returns %true.
2728 */
2729bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2730			 u32 flow_id, u16 filter_id)
2731{
2732	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2733	struct rps_dev_flow_table *flow_table;
2734	struct rps_dev_flow *rflow;
2735	bool expire = true;
2736	int cpu;
2737
2738	rcu_read_lock();
2739	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2740	if (flow_table && flow_id <= flow_table->mask) {
2741		rflow = &flow_table->flows[flow_id];
2742		cpu = ACCESS_ONCE(rflow->cpu);
2743		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2744		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2745			   rflow->last_qtail) <
2746		     (int)(10 * flow_table->mask)))
2747			expire = false;
2748	}
2749	rcu_read_unlock();
2750	return expire;
2751}
2752EXPORT_SYMBOL(rps_may_expire_flow);
2753
2754#endif /* CONFIG_RFS_ACCEL */
2755
2756/* Called from hardirq (IPI) context */
2757static void rps_trigger_softirq(void *data)
2758{
2759	struct softnet_data *sd = data;
2760
2761	____napi_schedule(sd, &sd->backlog);
2762	sd->received_rps++;
2763}
2764
2765#endif /* CONFIG_RPS */
2766
/*
 * Check whether @sd belongs to another CPU.
 * If so, chain it on this CPU's rps_ipi_list, raise NET_RX_SOFTIRQ and
 * return 1.  Return 0 when @sd is this CPU's own softnet_data (or when
 * RPS is not compiled in).
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local_sd = &__get_cpu_var(softnet_data);

	if (sd == local_sd)
		return 0;

	/* Push the remote softnet_data on the head of our IPI list. */
	sd->rps_ipi_next = local_sd->rps_ipi_list;
	local_sd->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
2787
2788/*
2789 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2790 * queue (may be a remote CPU queue).
2791 */
2792static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2793			      unsigned int *qtail)
2794{
2795	struct softnet_data *sd;
2796	unsigned long flags;
2797
2798	sd = &per_cpu(softnet_data, cpu);
2799
2800	local_irq_save(flags);
2801
2802	rps_lock(sd);
2803	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2804		if (skb_queue_len(&sd->input_pkt_queue)) {
2805enqueue:
2806			__skb_queue_tail(&sd->input_pkt_queue, skb);
2807			input_queue_tail_incr_save(sd, qtail);
2808			rps_unlock(sd);
2809			local_irq_restore(flags);
2810			return NET_RX_SUCCESS;
2811		}
2812
2813		/* Schedule NAPI for backlog device
2814		 * We can use non atomic operation since we own the queue lock
2815		 */
2816		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2817			if (!rps_ipi_queued(sd))
2818				____napi_schedule(sd, &sd->backlog);
2819		}
2820		goto enqueue;
2821	}
2822
2823	sd->dropped++;
2824	rps_unlock(sd);
2825
2826	local_irq_restore(flags);
2827
2828	atomic_long_inc(&skb->dev->rx_dropped);
2829	kfree_skb(skb);
2830	return NET_RX_DROP;
2831}
2832
2833/**
2834 *	netif_rx	-	post buffer to the network code
2835 *	@skb: buffer to post
2836 *
2837 *	This function receives a packet from a device driver and queues it for
2838 *	the upper (protocol) levels to process.  It always succeeds. The buffer
2839 *	may be dropped during processing for congestion control or by the
2840 *	protocol layers.
2841 *
2842 *	return values:
2843 *	NET_RX_SUCCESS	(no congestion)
2844 *	NET_RX_DROP     (packet was dropped)
2845 *
2846 */
2847
2848int netif_rx(struct sk_buff *skb)
2849{
2850	int ret;
2851
2852	/* if netpoll wants it, pretend we never saw it */
2853	if (netpoll_rx(skb))
2854		return NET_RX_DROP;
2855
2856	if (netdev_tstamp_prequeue)
2857		net_timestamp_check(skb);
2858
2859	trace_netif_rx(skb);
2860#ifdef CONFIG_RPS
2861	{
2862		struct rps_dev_flow voidflow, *rflow = &voidflow;
2863		int cpu;
2864
2865		preempt_disable();
2866		rcu_read_lock();
2867
2868		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2869		if (cpu < 0)
2870			cpu = smp_processor_id();
2871
2872		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2873
2874		rcu_read_unlock();
2875		preempt_enable();
2876	}
2877#else
2878	{
2879		unsigned int qtail;
2880		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2881		put_cpu();
2882	}
2883#endif
2884	return ret;
2885}
2886EXPORT_SYMBOL(netif_rx);
2887
/*
 * netif_rx_ni - netif_rx() followed by immediate softirq processing
 * @skb: buffer to post
 *
 * Calls netif_rx() with preemption disabled and then runs any pending
 * softirqs before re-enabling preemption.  Returns netif_rx()'s result.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	preempt_disable();

	ret = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
2901
2902static void net_tx_action(struct softirq_action *h)
2903{
2904	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2905
2906	if (sd->completion_queue) {
2907		struct sk_buff *clist;
2908
2909		local_irq_disable();
2910		clist = sd->completion_queue;
2911		sd->completion_queue = NULL;
2912		local_irq_enable();
2913
2914		while (clist) {
2915			struct sk_buff *skb = clist;
2916			clist = clist->next;
2917
2918			WARN_ON(atomic_read(&skb->users));
2919			trace_kfree_skb(skb, net_tx_action);
2920			__kfree_skb(skb);
2921		}
2922	}
2923
2924	if (sd->output_queue) {
2925		struct Qdisc *head;
2926
2927		local_irq_disable();
2928		head = sd->output_queue;
2929		sd->output_queue = NULL;
2930		sd->output_queue_tailp = &sd->output_queue;
2931		local_irq_enable();
2932
2933		while (head) {
2934			struct Qdisc *q = head;
2935			spinlock_t *root_lock;
2936
2937			head = head->next_sched;
2938
2939			root_lock = qdisc_lock(q);
2940			if (spin_trylock(root_lock)) {
2941				smp_mb__before_clear_bit();
2942				clear_bit(__QDISC_STATE_SCHED,
2943					  &q->state);
2944				qdisc_run(q);
2945				spin_unlock(root_lock);
2946			} else {
2947				if (!test_bit(__QDISC_STATE_DEACTIVATED,
2948					      &q->state)) {
2949					__netif_reschedule(q);
2950				} else {
2951					smp_mb__before_clear_bit();
2952					clear_bit(__QDISC_STATE_SCHED,
2953						  &q->state);
2954				}
2955			}
2956		}
2957	}
2958}
2959
#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/*
 * This hook is defined here for ATM LANE.
 * It is only built when both bridging and ATM LANE are enabled (built in
 * or as modules).  The pointer is not assigned in this part of the file.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
2967
2968#ifdef CONFIG_NET_CLS_ACT
2969/* TODO: Maybe we should just force sch_ingress to be compiled in
2970 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2971 * a compare and 2 stores extra right now if we dont have it on
2972 * but have CONFIG_NET_CLS_ACT
2973 * NOTE: This doesnt stop any functionality; if you dont have
2974 * the ingress scheduler, you just cant add policies on ingress.
2975 *
2976 */
2977static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2978{
2979	struct net_device *dev = skb->dev;
2980	u32 ttl = G_TC_RTTL(skb->tc_verd);
2981	int result = TC_ACT_OK;
2982	struct Qdisc *q;
2983
2984	if (unlikely(MAX_RED_LOOP < ttl++)) {
2985		if (net_ratelimit())
2986			pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
2987			       skb->skb_iif, dev->ifindex);
2988		return TC_ACT_SHOT;
2989	}
2990
2991	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2992	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2993
2994	q = rxq->qdisc;
2995	if (q != &noop_qdisc) {
2996		spin_lock(qdisc_lock(q));
2997		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2998			result = qdisc_enqueue_root(skb, q);
2999		spin_unlock(qdisc_lock(q));
3000	}
3001
3002	return result;
3003}
3004
3005static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3006					 struct packet_type **pt_prev,
3007					 int *ret, struct net_device *orig_dev)
3008{
3009	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3010
3011	if (!rxq || rxq->qdisc == &noop_qdisc)
3012		goto out;
3013
3014	if (*pt_prev) {
3015		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3016		*pt_prev = NULL;
3017	}
3018
3019	switch (ing_filter(skb, rxq)) {
3020	case TC_ACT_SHOT:
3021	case TC_ACT_STOLEN:
3022		kfree_skb(skb);
3023		return NULL;
3024	}
3025
3026out:
3027	skb->tc_verd = 0;
3028	return skb;
3029}
3030#endif
3031
3032/**
3033 *	netdev_rx_handler_register - register receive handler
3034 *	@dev: device to register a handler for
3035 *	@rx_handler: receive handler to register
3036 *	@rx_handler_data: data pointer that is used by rx handler
3037 *
3038 *	Register a receive hander for a device. This handler will then be
3039 *	called from __netif_receive_skb. A negative errno code is returned
3040 *	on a failure.
3041 *
3042 *	The caller must hold the rtnl_mutex.
3043 *
3044 *	For a general description of rx_handler, see enum rx_handler_result.
3045 */
3046int netdev_rx_handler_register(struct net_device *dev,
3047			       rx_handler_func_t *rx_handler,
3048			       void *rx_handler_data)
3049{
3050	ASSERT_RTNL();
3051
3052	if (dev->rx_handler)
3053		return -EBUSY;
3054
3055	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3056	rcu_assign_pointer(dev->rx_handler, rx_handler);
3057
3058	return 0;
3059}
3060EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3061
3062/**
3063 *	netdev_rx_handler_unregister - unregister receive handler
3064 *	@dev: device to unregister a handler from
3065 *
3066 *	Unregister a receive hander from a device.
3067 *
3068 *	The caller must hold the rtnl_mutex.
3069 */
3070void netdev_rx_handler_unregister(struct net_device *dev)
3071{
3072
3073	ASSERT_RTNL();
3074	rcu_assign_pointer(dev->rx_handler, NULL);
3075	rcu_assign_pointer(dev->rx_handler_data, NULL);
3076}
3077EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3078
3079static void vlan_on_bond_hook(struct sk_buff *skb)
3080{
3081	/*
3082	 * Make sure ARP frames received on VLAN interfaces stacked on
3083	 * bonding interfaces still make their way to any base bonding
3084	 * device that may have registered for a specific ptype.
3085	 */
3086	if (skb->dev->priv_flags & IFF_802_1Q_VLAN &&
3087	    vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING &&
3088	    skb->protocol == htons(ETH_P_ARP)) {
3089		struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
3090
3091		if (!skb2)
3092			return;
3093		skb2->dev = vlan_dev_real_dev(skb->dev);
3094		netif_rx(skb2);
3095	}
3096}
3097
/*
 * __netif_receive_skb - core receive processing for one packet
 * @skb: packet to process
 *
 * Delivers the packet, in order, to: all ptype_all handlers, the ingress
 * classifier (CONFIG_NET_CLS_ACT), the device's rx_handler, VLAN hwaccel
 * processing, and finally the protocol handlers matching skb->protocol.
 *
 * Delivery is deferred by one step through pt_prev: each matching handler
 * is only called (via deliver_skb()) once the next one is found, and the
 * last one is called directly with pt_prev->func().
 *
 * Returns the result of the last handler called, or NET_RX_DROP when
 * netpoll took the packet or no handler matched (the skb is then freed).
 */
static int __netif_receive_skb(struct sk_buff *skb)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct net_device *orig_dev;
	struct net_device *null_or_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;

	if (!netdev_tstamp_prequeue)
		net_timestamp_check(skb);

	trace_netif_receive_skb(skb);

	/* if we've gotten here through NAPI, check netpoll */
	if (netpoll_receive_skb(skb))
		return NET_RX_DROP;

	/* Record the incoming interface only the first time through. */
	if (!skb->skb_iif)
		skb->skb_iif = skb->dev->ifindex;
	orig_dev = skb->dev;

	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;

	pt_prev = NULL;

	rcu_read_lock();

	/* Re-entered when an rx_handler returns RX_HANDLER_ANOTHER. */
another_round:

	__this_cpu_inc(softnet_data.processed);

#ifdef CONFIG_NET_CLS_ACT
	/* TC_NCLS set: skip the taps and the ingress classifier once. */
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	/* Handlers registered for all protocols (ptype_all list). */
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (!ptype->dev || ptype->dev == skb->dev) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

#ifdef CONFIG_NET_CLS_ACT
	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
ncls:
#endif

	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (rx_handler) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		/* The handler may replace the skb through the pointer. */
		switch (rx_handler(&skb)) {
		case RX_HANDLER_CONSUMED:
			goto out;
		case RX_HANDLER_ANOTHER:
			goto another_round;
		case RX_HANDLER_EXACT:
			deliver_exact = true;
			/* fall through */
		case RX_HANDLER_PASS:
			break;
		default:
			BUG();
		}
	}

	if (vlan_tx_tag_present(skb)) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		/*
		 * Non-zero return: process the skb again from the top.
		 * Zero return with skb set to NULL: nothing left to do.
		 */
		if (vlan_hwaccel_do_receive(&skb)) {
			ret = __netif_receive_skb(skb);
			goto out;
		} else if (unlikely(!skb))
			goto out;
	}

	vlan_on_bond_hook(skb);

	/* deliver only exact match when indicated */
	null_or_dev = deliver_exact ? skb->dev : NULL;

	/* Protocol-specific handlers, hashed by protocol number. */
	type = skb->protocol;
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type &&
		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
		     ptype->dev == orig_dev)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	if (pt_prev) {
		/* Last handler is called directly, not via deliver_skb(). */
		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
		atomic_long_inc(&skb->dev->rx_dropped);
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

out:
	rcu_read_unlock();
	return ret;
}
3219
3220/**
3221 *	netif_receive_skb - process receive buffer from network
3222 *	@skb: buffer to process
3223 *
3224 *	netif_receive_skb() is the main receive data processing function.
3225 *	It always succeeds. The buffer may be dropped during processing
3226 *	for congestion control or by the protocol layers.
3227 *
3228 *	This function may only be called from softirq context and interrupts
3229 *	should be enabled.
3230 *
3231 *	Return values (usually ignored):
3232 *	NET_RX_SUCCESS: no congestion
3233 *	NET_RX_DROP: packet was dropped
3234 */
3235int netif_receive_skb(struct sk_buff *skb)
3236{
3237	if (netdev_tstamp_prequeue)
3238		net_timestamp_check(skb);
3239
3240	if (skb_defer_rx_timestamp(skb))
3241		return NET_RX_SUCCESS;
3242
3243#ifdef CONFIG_RPS
3244	{
3245		struct rps_dev_flow voidflow, *rflow = &voidflow;
3246		int cpu, ret;
3247
3248		rcu_read_lock();
3249
3250		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3251
3252		if (cpu >= 0) {
3253			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3254			rcu_read_unlock();
3255		} else {
3256			rcu_read_unlock();
3257			ret = __netif_receive_skb(skb);
3258		}
3259
3260		return ret;
3261	}
3262#else
3263	return __netif_receive_skb(skb);
3264#endif
3265}
3266EXPORT_SYMBOL(netif_receive_skb);
3267
3268/* Network device is going away, flush any packets still pending
3269 * Called with irqs disabled.
3270 */
3271static void flush_backlog(void *arg)
3272{
3273	struct net_device *dev = arg;
3274	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3275	struct sk_buff *skb, *tmp;
3276
3277	rps_lock(sd);
3278	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3279		if (skb->dev == dev) {
3280			__skb_unlink(skb, &sd->input_pkt_queue);
3281			kfree_skb(skb);
3282			input_queue_head_incr(sd);
3283		}
3284	}
3285	rps_unlock(sd);
3286
3287	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3288		if (skb->dev == dev) {
3289			__skb_unlink(skb, &sd->process_queue);
3290			kfree_skb(skb);
3291			input_queue_head_incr(sd);
3292		}
3293	}
3294}
3295
3296static int napi_gro_complete(struct sk_buff *skb)
3297{
3298	struct packet_type *ptype;
3299	__be16 type = skb->protocol;
3300	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3301	int err = -ENOENT;
3302
3303	if (NAPI_GRO_CB(skb)->count == 1) {
3304		skb_shinfo(skb)->gso_size = 0;
3305		goto out;
3306	}
3307
3308	rcu_read_lock();
3309	list_for_each_entry_rcu(ptype, head, list) {
3310		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3311			continue;
3312
3313		err = ptype->gro_complete(skb);
3314		break;
3315	}
3316	rcu_read_unlock();
3317
3318	if (err) {
3319		WARN_ON(&ptype->list == head);
3320		kfree_skb(skb);
3321		return NET_RX_SUCCESS;
3322	}
3323
3324out:
3325	return netif_receive_skb(skb);
3326}
3327
3328inline void napi_gro_flush(struct napi_struct *napi)
3329{
3330	struct sk_buff *skb, *next;
3331
3332	for (skb = napi->gro_list; skb; skb = next) {
3333		next = skb->next;
3334		skb->next = NULL;
3335		napi_gro_complete(skb);
3336	}
3337
3338	napi->gro_count = 0;
3339	napi->gro_list = NULL;
3340}
3341EXPORT_SYMBOL(napi_gro_flush);
3342
3343enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3344{
3345	struct sk_buff **pp = NULL;
3346	struct packet_type *ptype;
3347	__be16 type = skb->protocol;
3348	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3349	int same_flow;
3350	int mac_len;
3351	enum gro_result ret;
3352
3353	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3354		goto normal;
3355
3356	if (skb_is_gso(skb) || skb_has_frag_list(skb))
3357		goto normal;
3358
3359	rcu_read_lock();
3360	list_for_each_entry_rcu(ptype, head, list) {
3361		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3362			continue;
3363
3364		skb_set_network_header(skb, skb_gro_offset(skb));
3365		mac_len = skb->network_header - skb->mac_header;
3366		skb->mac_len = mac_len;
3367		NAPI_GRO_CB(skb)->same_flow = 0;
3368		NAPI_GRO_CB(skb)->flush = 0;
3369		NAPI_GRO_CB(skb)->free = 0;
3370
3371		pp = ptype->gro_receive(&napi->gro_list, skb);
3372		break;
3373	}
3374	rcu_read_unlock();
3375
3376	if (&ptype->list == head)
3377		goto normal;
3378
3379	same_flow = NAPI_GRO_CB(skb)->same_flow;
3380	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3381
3382	if (pp) {
3383		struct sk_buff *nskb = *pp;
3384
3385		*pp = nskb->next;
3386		nskb->next = NULL;
3387		napi_gro_complete(nskb);
3388		napi->gro_count--;
3389	}
3390
3391	if (same_flow)
3392		goto ok;
3393
3394	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3395		goto normal;
3396
3397	napi->gro_count++;
3398	NAPI_GRO_CB(skb)->count = 1;
3399	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3400	skb->next = napi->gro_list;
3401	napi->gro_list = skb;
3402	ret = GRO_HELD;
3403
3404pull:
3405	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3406		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3407
3408		BUG_ON(skb->end - skb->tail < grow);
3409
3410		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3411
3412		skb->tail += grow;
3413		skb->data_len -= grow;
3414
3415		skb_shinfo(skb)->frags[0].page_offset += grow;
3416		skb_shinfo(skb)->frags[0].size -= grow;
3417
3418		if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3419			put_page(skb_shinfo(skb)->frags[0].page);
3420			memmove(skb_shinfo(skb)->frags,
3421				skb_shinfo(skb)->frags + 1,
3422				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3423		}
3424	}
3425
3426ok:
3427	return ret;
3428
3429normal:
3430	ret = GRO_NORMAL;
3431	goto pull;
3432}
3433EXPORT_SYMBOL(dev_gro_receive);
3434
3435static inline gro_result_t
3436__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3437{
3438	struct sk_buff *p;
3439
3440	for (p = napi->gro_list; p; p = p->next) {
3441		unsigned long diffs;
3442
3443		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3444		diffs |= p->vlan_tci ^ skb->vlan_tci;
3445		diffs |= compare_ether_header(skb_mac_header(p),
3446					      skb_gro_mac_header(skb));
3447		NAPI_GRO_CB(p)->same_flow = !diffs;
3448		NAPI_GRO_CB(p)->flush = 0;
3449	}
3450
3451	return dev_gro_receive(napi, skb);
3452}
3453
3454gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3455{
3456	switch (ret) {
3457	case GRO_NORMAL:
3458		if (netif_receive_skb(skb))
3459			ret = GRO_DROP;
3460		break;
3461
3462	case GRO_DROP:
3463	case GRO_MERGED_FREE:
3464		kfree_skb(skb);
3465		break;
3466
3467	case GRO_HELD:
3468	case GRO_MERGED:
3469		break;
3470	}
3471
3472	return ret;
3473}
3474EXPORT_SYMBOL(napi_skb_finish);
3475
3476void skb_gro_reset_offset(struct sk_buff *skb)
3477{
3478	NAPI_GRO_CB(skb)->data_offset = 0;
3479	NAPI_GRO_CB(skb)->frag0 = NULL;
3480	NAPI_GRO_CB(skb)->frag0_len = 0;
3481
3482	if (skb->mac_header == skb->tail &&
3483	    !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3484		NAPI_GRO_CB(skb)->frag0 =
3485			page_address(skb_shinfo(skb)->frags[0].page) +
3486			skb_shinfo(skb)->frags[0].page_offset;
3487		NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3488	}
3489}
3490EXPORT_SYMBOL(skb_gro_reset_offset);
3491
3492gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3493{
3494	skb_gro_reset_offset(skb);
3495
3496	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3497}
3498EXPORT_SYMBOL(napi_gro_receive);
3499
3500static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3501{
3502	__skb_pull(skb, skb_headlen(skb));
3503	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3504	skb->vlan_tci = 0;
3505	skb->dev = napi->dev;
3506	skb->skb_iif = 0;
3507
3508	napi->skb = skb;
3509}
3510
3511struct sk_buff *napi_get_frags(struct napi_struct *napi)
3512{
3513	struct sk_buff *skb = napi->skb;
3514
3515	if (!skb) {
3516		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3517		if (skb)
3518			napi->skb = skb;
3519	}
3520	return skb;
3521}
3522EXPORT_SYMBOL(napi_get_frags);
3523
3524gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3525			       gro_result_t ret)
3526{
3527	switch (ret) {
3528	case GRO_NORMAL:
3529	case GRO_HELD:
3530		skb->protocol = eth_type_trans(skb, skb->dev);
3531
3532		if (ret == GRO_HELD)
3533			skb_gro_pull(skb, -ETH_HLEN);
3534		else if (netif_receive_skb(skb))
3535			ret = GRO_DROP;
3536		break;
3537
3538	case GRO_DROP:
3539	case GRO_MERGED_FREE:
3540		napi_reuse_skb(napi, skb);
3541		break;
3542
3543	case GRO_MERGED:
3544		break;
3545	}
3546
3547	return ret;
3548}
3549EXPORT_SYMBOL(napi_frags_finish);
3550
3551struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3552{
3553	struct sk_buff *skb = napi->skb;
3554	struct ethhdr *eth;
3555	unsigned int hlen;
3556	unsigned int off;
3557
3558	napi->skb = NULL;
3559
3560	skb_reset_mac_header(skb);
3561	skb_gro_reset_offset(skb);
3562
3563	off = skb_gro_offset(skb);
3564	hlen = off + sizeof(*eth);
3565	eth = skb_gro_header_fast(skb, off);
3566	if (skb_gro_header_hard(skb, hlen)) {
3567		eth = skb_gro_header_slow(skb, hlen, off);
3568		if (unlikely(!eth)) {
3569			napi_reuse_skb(napi, skb);
3570			skb = NULL;
3571			goto out;
3572		}
3573	}
3574
3575	skb_gro_pull(skb, sizeof(*eth));
3576
3577	/*
3578	 * This works because the only protocols we care about don't require
3579	 * special handling.  We'll fix it up properly at the end.
3580	 */
3581	skb->protocol = eth->h_proto;
3582
3583out:
3584	return skb;
3585}
3586EXPORT_SYMBOL(napi_frags_skb);
3587
3588gro_result_t napi_gro_frags(struct napi_struct *napi)
3589{
3590	struct sk_buff *skb = napi_frags_skb(napi);
3591
3592	if (!skb)
3593		return GRO_DROP;
3594
3595	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3596}
3597EXPORT_SYMBOL(napi_gro_frags);
3598
/*
 * net_rps_action sends any pending IPI's for rps.
 * Note: called with local irq disabled, but exits with local irq enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *pending = sd->rps_ipi_list;

	/* Detach the whole list while irqs are still off. */
	if (pending)
		sd->rps_ipi_list = NULL;
#endif

	local_irq_enable();

#ifdef CONFIG_RPS
	/* Send pending IPI's to kick RPS processing on remote cpus. */
	while (pending) {
		struct softnet_data *target = pending;

		/* Fetch the successor before the IPI is sent. */
		pending = target->rps_ipi_next;

		if (cpu_online(target->cpu))
			__smp_call_function_single(target->cpu,
						   &target->csd, 0);
	}
#endif
}
3626
3627static int process_backlog(struct napi_struct *napi, int quota)
3628{
3629	int work = 0;
3630	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3631
3632#ifdef CONFIG_RPS
3633	/* Check if we have pending ipi, its better to send them now,
3634	 * not waiting net_rx_action() end.
3635	 */
3636	if (sd->rps_ipi_list) {
3637		local_irq_disable();
3638		net_rps_action_and_irq_enable(sd);
3639	}
3640#endif
3641	napi->weight = weight_p;
3642	local_irq_disable();
3643	while (work < quota) {
3644		struct sk_buff *skb;
3645		unsigned int qlen;
3646
3647		while ((skb = __skb_dequeue(&sd->process_queue))) {
3648			local_irq_enable();
3649			__netif_receive_skb(skb);
3650			local_irq_disable();
3651			input_queue_head_incr(sd);
3652			if (++work >= quota) {
3653				local_irq_enable();
3654				return work;
3655			}
3656		}
3657
3658		rps_lock(sd);
3659		qlen = skb_queue_len(&sd->input_pkt_queue);
3660		if (qlen)
3661			skb_queue_splice_tail_init(&sd->input_pkt_queue,
3662						   &sd->process_queue);
3663
3664		if (qlen < quota - work) {
3665			/*
3666			 * Inline a custom version of __napi_complete().
3667			 * only current cpu owns and manipulates this napi,
3668			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3669			 * we can use a plain write instead of clear_bit(),
3670			 * and we dont need an smp_mb() memory barrier.
3671			 */
3672			list_del(&napi->poll_list);
3673			napi->state = 0;
3674
3675			quota = work + qlen;
3676		}
3677		rps_unlock(sd);
3678	}
3679	local_irq_enable();
3680
3681	return work;
3682}
3683
3684/**
3685 * __napi_schedule - schedule for receive
3686 * @n: entry to schedule
3687 *
3688 * The entry's receive function will be scheduled to run
3689 */
3690void __napi_schedule(struct napi_struct *n)
3691{
3692	unsigned long flags;
3693
3694	local_irq_save(flags);
3695	____napi_schedule(&__get_cpu_var(softnet_data), n);
3696	local_irq_restore(flags);
3697}
3698EXPORT_SYMBOL(__napi_schedule);
3699
3700void __napi_complete(struct napi_struct *n)
3701{
3702	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3703	BUG_ON(n->gro_list);
3704
3705	list_del(&n->poll_list);
3706	smp_mb__before_clear_bit();
3707	clear_bit(NAPI_STATE_SCHED, &n->state);
3708}
3709EXPORT_SYMBOL(__napi_complete);
3710
3711void napi_complete(struct napi_struct *n)
3712{
3713	unsigned long flags;
3714
3715	/*
3716	 * don't let napi dequeue from the cpu poll list
3717	 * just in case its running on a different cpu
3718	 */
3719	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3720		return;
3721
3722	napi_gro_flush(n);
3723	local_irq_save(flags);
3724	__napi_complete(n);
3725	local_irq_restore(flags);
3726}
3727EXPORT_SYMBOL(napi_complete);
3728
3729void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3730		    int (*poll)(struct napi_struct *, int), int weight)
3731{
3732	INIT_LIST_HEAD(&napi->poll_list);
3733	napi->gro_count = 0;
3734	napi->gro_list = NULL;
3735	napi->skb = NULL;
3736	napi->poll = poll;
3737	napi->weight = weight;
3738	list_add(&napi->dev_list, &dev->napi_list);
3739	napi->dev = dev;
3740#ifdef CONFIG_NETPOLL
3741	spin_lock_init(&napi->poll_lock);
3742	napi->poll_owner = -1;
3743#endif
3744	set_bit(NAPI_STATE_SCHED, &napi->state);
3745}
3746EXPORT_SYMBOL(netif_napi_add);
3747
3748void netif_napi_del(struct napi_struct *napi)
3749{
3750	struct sk_buff *skb, *next;
3751
3752	list_del_init(&napi->dev_list);
3753	napi_free_frags(napi);
3754
3755	for (skb = napi->gro_list; skb; skb = next) {
3756		next = skb->next;
3757		skb->next = NULL;
3758		kfree_skb(skb);
3759	}
3760
3761	napi->gro_list = NULL;
3762	napi->gro_count = 0;
3763}
3764EXPORT_SYMBOL(netif_napi_del);
3765
3766static void net_rx_action(struct softirq_action *h)
3767{
3768	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3769	unsigned long time_limit = jiffies + 2;
3770	int budget = netdev_budget;
3771	void *have;
3772
3773	local_irq_disable();
3774
3775	while (!list_empty(&sd->poll_list)) {
3776		struct napi_struct *n;
3777		int work, weight;
3778
3779		/* If softirq window is exhuasted then punt.
3780		 * Allow this to run for 2 jiffies since which will allow
3781		 * an average latency of 1.5/HZ.
3782		 */
3783		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3784			goto softnet_break;
3785
3786		local_irq_enable();
3787
3788		/* Even though interrupts have been re-enabled, this
3789		 * access is safe because interrupts can only add new
3790		 * entries to the tail of this list, and only ->poll()
3791		 * calls can remove this head entry from the list.
3792		 */
3793		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3794
3795		have = netpoll_poll_lock(n);
3796
3797		weight = n->weight;
3798
3799		/* This NAPI_STATE_SCHED test is for avoiding a race
3800		 * with netpoll's poll_napi().  Only the entity which
3801		 * obtains the lock and sees NAPI_STATE_SCHED set will
3802		 * actually make the ->poll() call.  Therefore we avoid
3803		 * accidently calling ->poll() when NAPI is not scheduled.
3804		 */
3805		work = 0;
3806		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3807			work = n->poll(n, weight);
3808			trace_napi_poll(n);
3809		}
3810
3811		WARN_ON_ONCE(work > weight);
3812
3813		budget -= work;
3814
3815		local_irq_disable();
3816
3817		/* Drivers must not modify the NAPI state if they
3818		 * consume the entire weight.  In such cases this code
3819		 * still "owns" the NAPI instance and therefore can
3820		 * move the instance around on the list at-will.
3821		 */
3822		if (unlikely(work == weight)) {
3823			if (unlikely(napi_disable_pending(n))) {
3824				local_irq_enable();
3825				napi_complete(n);
3826				local_irq_disable();
3827			} else
3828				list_move_tail(&n->poll_list, &sd->poll_list);
3829		}
3830
3831		netpoll_poll_unlock(have);
3832	}
3833out:
3834	net_rps_action_and_irq_enable(sd);
3835
3836#ifdef CONFIG_NET_DMA
3837	/*
3838	 * There may not be any more sk_buffs coming right now, so push
3839	 * any pending DMA copies to hardware
3840	 */
3841	dma_issue_pending_all();
3842#endif
3843
3844	return;
3845
3846softnet_break:
3847	sd->time_squeeze++;
3848	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3849	goto out;
3850}
3851
3852static gifconf_func_t *gifconf_list[NPROTO];
3853
3854/**
3855 *	register_gifconf	-	register a SIOCGIF handler
3856 *	@family: Address family
3857 *	@gifconf: Function handler
3858 *
3859 *	Register protocol dependent address dumping routines. The handler
3860 *	that is passed must not be freed or reused until it has been replaced
3861 *	by another handler.
3862 */
3863int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3864{
3865	if (family >= NPROTO)
3866		return -EINVAL;
3867	gifconf_list[family] = gifconf;
3868	return 0;
3869}
3870EXPORT_SYMBOL(register_gifconf);
3871
3872
3873/*
3874 *	Map an interface index to its name (SIOCGIFNAME)
3875 */
3876
3877/*
3878 *	We need this ioctl for efficient implementation of the
3879 *	if_indextoname() function required by the IPv6 API.  Without
3880 *	it, we would have to search all the interfaces to find a
3881 *	match.  --pb
3882 */
3883
3884static int dev_ifname(struct net *net, struct ifreq __user *arg)
3885{
3886	struct net_device *dev;
3887	struct ifreq ifr;
3888
3889	/*
3890	 *	Fetch the caller's info block.
3891	 */
3892
3893	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3894		return -EFAULT;
3895
3896	rcu_read_lock();
3897	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3898	if (!dev) {
3899		rcu_read_unlock();
3900		return -ENODEV;
3901	}
3902
3903	strcpy(ifr.ifr_name, dev->name);
3904	rcu_read_unlock();
3905
3906	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3907		return -EFAULT;
3908	return 0;
3909}
3910
3911/*
3912 *	Perform a SIOCGIFCONF call. This structure will change
3913 *	size eventually, and there is nothing I can do about it.
3914 *	Thus we will need a 'compatibility mode'.
3915 */
3916
3917static int dev_ifconf(struct net *net, char __user *arg)
3918{
3919	struct ifconf ifc;
3920	struct net_device *dev;
3921	char __user *pos;
3922	int len;
3923	int total;
3924	int i;
3925
3926	/*
3927	 *	Fetch the caller's info block.
3928	 */
3929
3930	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3931		return -EFAULT;
3932
3933	pos = ifc.ifc_buf;
3934	len = ifc.ifc_len;
3935
3936	/*
3937	 *	Loop over the interfaces, and write an info block for each.
3938	 */
3939
3940	total = 0;
3941	for_each_netdev(net, dev) {
3942		for (i = 0; i < NPROTO; i++) {
3943			if (gifconf_list[i]) {
3944				int done;
3945				if (!pos)
3946					done = gifconf_list[i](dev, NULL, 0);
3947				else
3948					done = gifconf_list[i](dev, pos + total,
3949							       len - total);
3950				if (done < 0)
3951					return -EFAULT;
3952				total += done;
3953			}
3954		}
3955	}
3956
3957	/*
3958	 *	All done.  Write the updated control block back to the caller.
3959	 */
3960	ifc.ifc_len = total;
3961
3962	/*
3963	 * 	Both BSD and Solaris return 0 here, so we do too.
3964	 */
3965	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3966}
3967
3968#ifdef CONFIG_PROC_FS
3969/*
3970 *	This is invoked by the /proc filesystem handler to display a device
3971 *	in detail.
3972 */
3973void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3974	__acquires(RCU)
3975{
3976	struct net *net = seq_file_net(seq);
3977	loff_t off;
3978	struct net_device *dev;
3979
3980	rcu_read_lock();
3981	if (!*pos)
3982		return SEQ_START_TOKEN;
3983
3984	off = 1;
3985	for_each_netdev_rcu(net, dev)
3986		if (off++ == *pos)
3987			return dev;
3988
3989	return NULL;
3990}
3991
3992void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3993{
3994	struct net_device *dev = v;
3995
3996	if (v == SEQ_START_TOKEN)
3997		dev = first_net_device_rcu(seq_file_net(seq));
3998	else
3999		dev = next_net_device_rcu(dev);
4000
4001	++*pos;
4002	return dev;
4003}
4004
4005void dev_seq_stop(struct seq_file *seq, void *v)
4006	__releases(RCU)
4007{
4008	rcu_read_unlock();
4009}
4010
4011static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4012{
4013	struct rtnl_link_stats64 temp;
4014	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4015
4016	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4017		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4018		   dev->name, stats->rx_bytes, stats->rx_packets,
4019		   stats->rx_errors,
4020		   stats->rx_dropped + stats->rx_missed_errors,
4021		   stats->rx_fifo_errors,
4022		   stats->rx_length_errors + stats->rx_over_errors +
4023		    stats->rx_crc_errors + stats->rx_frame_errors,
4024		   stats->rx_compressed, stats->multicast,
4025		   stats->tx_bytes, stats->tx_packets,
4026		   stats->tx_errors, stats->tx_dropped,
4027		   stats->tx_fifo_errors, stats->collisions,
4028		   stats->tx_carrier_errors +
4029		    stats->tx_aborted_errors +
4030		    stats->tx_window_errors +
4031		    stats->tx_heartbeat_errors,
4032		   stats->tx_compressed);
4033}
4034
4035/*
4036 *	Called from the PROCfs module. This now uses the new arbitrary sized
4037 *	/proc/net interface to create /proc/net/dev
4038 */
4039static int dev_seq_show(struct seq_file *seq, void *v)
4040{
4041	if (v == SEQ_START_TOKEN)
4042		seq_puts(seq, "Inter-|   Receive                            "
4043			      "                    |  Transmit\n"
4044			      " face |bytes    packets errs drop fifo frame "
4045			      "compressed multicast|bytes    packets errs "
4046			      "drop fifo colls carrier compressed\n");
4047	else
4048		dev_seq_printf_stats(seq, v);
4049	return 0;
4050}
4051
4052static struct softnet_data *softnet_get_online(loff_t *pos)
4053{
4054	struct softnet_data *sd = NULL;
4055
4056	while (*pos < nr_cpu_ids)
4057		if (cpu_online(*pos)) {
4058			sd = &per_cpu(softnet_data, *pos);
4059			break;
4060		} else
4061			++*pos;
4062	return sd;
4063}
4064
4065static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4066{
4067	return softnet_get_online(pos);
4068}
4069
4070static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4071{
4072	++*pos;
4073	return softnet_get_online(pos);
4074}
4075
4076static void softnet_seq_stop(struct seq_file *seq, void *v)
4077{
4078}
4079
4080static int softnet_seq_show(struct seq_file *seq, void *v)
4081{
4082	struct softnet_data *sd = v;
4083
4084	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4085		   sd->processed, sd->dropped, sd->time_squeeze, 0,
4086		   0, 0, 0, 0, /* was fastroute */
4087		   sd->cpu_collision, sd->received_rps);
4088	return 0;
4089}
4090
4091static const struct seq_operations dev_seq_ops = {
4092	.start = dev_seq_start,
4093	.next  = dev_seq_next,
4094	.stop  = dev_seq_stop,
4095	.show  = dev_seq_show,
4096};
4097
4098static int dev_seq_open(struct inode *inode, struct file *file)
4099{
4100	return seq_open_net(inode, file, &dev_seq_ops,
4101			    sizeof(struct seq_net_private));
4102}
4103
4104static const struct file_operations dev_seq_fops = {
4105	.owner	 = THIS_MODULE,
4106	.open    = dev_seq_open,
4107	.read    = seq_read,
4108	.llseek  = seq_lseek,
4109	.release = seq_release_net,
4110};
4111
4112static const struct seq_operations softnet_seq_ops = {
4113	.start = softnet_seq_start,
4114	.next  = softnet_seq_next,
4115	.stop  = softnet_seq_stop,
4116	.show  = softnet_seq_show,
4117};
4118
4119static int softnet_seq_open(struct inode *inode, struct file *file)
4120{
4121	return seq_open(file, &softnet_seq_ops);
4122}
4123
4124static const struct file_operations softnet_seq_fops = {
4125	.owner	 = THIS_MODULE,
4126	.open    = softnet_seq_open,
4127	.read    = seq_read,
4128	.llseek  = seq_lseek,
4129	.release = seq_release,
4130};
4131
4132static void *ptype_get_idx(loff_t pos)
4133{
4134	struct packet_type *pt = NULL;
4135	loff_t i = 0;
4136	int t;
4137
4138	list_for_each_entry_rcu(pt, &ptype_all, list) {
4139		if (i == pos)
4140			return pt;
4141		++i;
4142	}
4143
4144	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4145		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4146			if (i == pos)
4147				return pt;
4148			++i;
4149		}
4150	}
4151	return NULL;
4152}
4153
4154static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4155	__acquires(RCU)
4156{
4157	rcu_read_lock();
4158	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4159}
4160
4161static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4162{
4163	struct packet_type *pt;
4164	struct list_head *nxt;
4165	int hash;
4166
4167	++*pos;
4168	if (v == SEQ_START_TOKEN)
4169		return ptype_get_idx(0);
4170
4171	pt = v;
4172	nxt = pt->list.next;
4173	if (pt->type == htons(ETH_P_ALL)) {
4174		if (nxt != &ptype_all)
4175			goto found;
4176		hash = 0;
4177		nxt = ptype_base[0].next;
4178	} else
4179		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4180
4181	while (nxt == &ptype_base[hash]) {
4182		if (++hash >= PTYPE_HASH_SIZE)
4183			return NULL;
4184		nxt = ptype_base[hash].next;
4185	}
4186found:
4187	return list_entry(nxt, struct packet_type, list);
4188}
4189
4190static void ptype_seq_stop(struct seq_file *seq, void *v)
4191	__releases(RCU)
4192{
4193	rcu_read_unlock();
4194}
4195
4196static int ptype_seq_show(struct seq_file *seq, void *v)
4197{
4198	struct packet_type *pt = v;
4199
4200	if (v == SEQ_START_TOKEN)
4201		seq_puts(seq, "Type Device      Function\n");
4202	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4203		if (pt->type == htons(ETH_P_ALL))
4204			seq_puts(seq, "ALL ");
4205		else
4206			seq_printf(seq, "%04x", ntohs(pt->type));
4207
4208		seq_printf(seq, " %-8s %pF\n",
4209			   pt->dev ? pt->dev->name : "", pt->func);
4210	}
4211
4212	return 0;
4213}
4214
4215static const struct seq_operations ptype_seq_ops = {
4216	.start = ptype_seq_start,
4217	.next  = ptype_seq_next,
4218	.stop  = ptype_seq_stop,
4219	.show  = ptype_seq_show,
4220};
4221
4222static int ptype_seq_open(struct inode *inode, struct file *file)
4223{
4224	return seq_open_net(inode, file, &ptype_seq_ops,
4225			sizeof(struct seq_net_private));
4226}
4227
4228static const struct file_operations ptype_seq_fops = {
4229	.owner	 = THIS_MODULE,
4230	.open    = ptype_seq_open,
4231	.read    = seq_read,
4232	.llseek  = seq_lseek,
4233	.release = seq_release_net,
4234};
4235
4236
4237static int __net_init dev_proc_net_init(struct net *net)
4238{
4239	int rc = -ENOMEM;
4240
4241	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4242		goto out;
4243	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4244		goto out_dev;
4245	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4246		goto out_softnet;
4247
4248	if (wext_proc_init(net))
4249		goto out_ptype;
4250	rc = 0;
4251out:
4252	return rc;
4253out_ptype:
4254	proc_net_remove(net, "ptype");
4255out_softnet:
4256	proc_net_remove(net, "softnet_stat");
4257out_dev:
4258	proc_net_remove(net, "dev");
4259	goto out;
4260}
4261
4262static void __net_exit dev_proc_net_exit(struct net *net)
4263{
4264	wext_proc_exit(net);
4265
4266	proc_net_remove(net, "ptype");
4267	proc_net_remove(net, "softnet_stat");
4268	proc_net_remove(net, "dev");
4269}
4270
4271static struct pernet_operations __net_initdata dev_proc_ops = {
4272	.init = dev_proc_net_init,
4273	.exit = dev_proc_net_exit,
4274};
4275
4276static int __init dev_proc_init(void)
4277{
4278	return register_pernet_subsys(&dev_proc_ops);
4279}
4280#else
4281#define dev_proc_init() 0
4282#endif	/* CONFIG_PROC_FS */
4283
4284
4285/**
4286 *	netdev_set_master	-	set up master pointer
4287 *	@slave: slave device
4288 *	@master: new master device
4289 *
4290 *	Changes the master device of the slave. Pass %NULL to break the
4291 *	bonding. The caller must hold the RTNL semaphore. On a failure
4292 *	a negative errno code is returned. On success the reference counts
4293 *	are adjusted and the function returns zero.
4294 */
4295int netdev_set_master(struct net_device *slave, struct net_device *master)
4296{
4297	struct net_device *old = slave->master;
4298
4299	ASSERT_RTNL();
4300
4301	if (master) {
4302		if (old)
4303			return -EBUSY;
4304		dev_hold(master);
4305	}
4306
4307	slave->master = master;
4308
4309	if (old) {
4310		synchronize_net();
4311		dev_put(old);
4312	}
4313	return 0;
4314}
4315EXPORT_SYMBOL(netdev_set_master);
4316
4317/**
4318 *	netdev_set_bond_master	-	set up bonding master/slave pair
4319 *	@slave: slave device
4320 *	@master: new master device
4321 *
4322 *	Changes the master device of the slave. Pass %NULL to break the
4323 *	bonding. The caller must hold the RTNL semaphore. On a failure
4324 *	a negative errno code is returned. On success %RTM_NEWLINK is sent
4325 *	to the routing socket and the function returns zero.
4326 */
4327int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4328{
4329	int err;
4330
4331	ASSERT_RTNL();
4332
4333	err = netdev_set_master(slave, master);
4334	if (err)
4335		return err;
4336	if (master)
4337		slave->flags |= IFF_SLAVE;
4338	else
4339		slave->flags &= ~IFF_SLAVE;
4340
4341	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4342	return 0;
4343}
4344EXPORT_SYMBOL(netdev_set_bond_master);
4345
4346static void dev_change_rx_flags(struct net_device *dev, int flags)
4347{
4348	const struct net_device_ops *ops = dev->netdev_ops;
4349
4350	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4351		ops->ndo_change_rx_flags(dev, flags);
4352}
4353
4354static int __dev_set_promiscuity(struct net_device *dev, int inc)
4355{
4356	unsigned short old_flags = dev->flags;
4357	uid_t uid;
4358	gid_t gid;
4359
4360	ASSERT_RTNL();
4361
4362	dev->flags |= IFF_PROMISC;
4363	dev->promiscuity += inc;
4364	if (dev->promiscuity == 0) {
4365		/*
4366		 * Avoid overflow.
4367		 * If inc causes overflow, untouch promisc and return error.
4368		 */
4369		if (inc < 0)
4370			dev->flags &= ~IFF_PROMISC;
4371		else {
4372			dev->promiscuity -= inc;
4373			printk(KERN_WARNING "%s: promiscuity touches roof, "
4374				"set promiscuity failed, promiscuity feature "
4375				"of device might be broken.\n", dev->name);
4376			return -EOVERFLOW;
4377		}
4378	}
4379	if (dev->flags != old_flags) {
4380		printk(KERN_INFO "device %s %s promiscuous mode\n",
4381		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4382							       "left");
4383		if (audit_enabled) {
4384			current_uid_gid(&uid, &gid);
4385			audit_log(current->audit_context, GFP_ATOMIC,
4386				AUDIT_ANOM_PROMISCUOUS,
4387				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4388				dev->name, (dev->flags & IFF_PROMISC),
4389				(old_flags & IFF_PROMISC),
4390				audit_get_loginuid(current),
4391				uid, gid,
4392				audit_get_sessionid(current));
4393		}
4394
4395		dev_change_rx_flags(dev, IFF_PROMISC);
4396	}
4397	return 0;
4398}
4399
4400/**
4401 *	dev_set_promiscuity	- update promiscuity count on a device
4402 *	@dev: device
4403 *	@inc: modifier
4404 *
4405 *	Add or remove promiscuity from a device. While the count in the device
4406 *	remains above zero the interface remains promiscuous. Once it hits zero
4407 *	the device reverts back to normal filtering operation. A negative inc
4408 *	value is used to drop promiscuity on the device.
4409 *	Return 0 if successful or a negative errno code on error.
4410 */
4411int dev_set_promiscuity(struct net_device *dev, int inc)
4412{
4413	unsigned short old_flags = dev->flags;
4414	int err;
4415
4416	err = __dev_set_promiscuity(dev, inc);
4417	if (err < 0)
4418		return err;
4419	if (dev->flags != old_flags)
4420		dev_set_rx_mode(dev);
4421	return err;
4422}
4423EXPORT_SYMBOL(dev_set_promiscuity);
4424
4425/**
4426 *	dev_set_allmulti	- update allmulti count on a device
4427 *	@dev: device
4428 *	@inc: modifier
4429 *
4430 *	Add or remove reception of all multicast frames to a device. While the
4431 *	count in the device remains above zero the interface remains listening
4432 *	to all interfaces. Once it hits zero the device reverts back to normal
4433 *	filtering operation. A negative @inc value is used to drop the counter
4434 *	when releasing a resource needing all multicasts.
4435 *	Return 0 if successful or a negative errno code on error.
4436 */
4437
4438int dev_set_allmulti(struct net_device *dev, int inc)
4439{
4440	unsigned short old_flags = dev->flags;
4441
4442	ASSERT_RTNL();
4443
4444	dev->flags |= IFF_ALLMULTI;
4445	dev->allmulti += inc;
4446	if (dev->allmulti == 0) {
4447		/*
4448		 * Avoid overflow.
4449		 * If inc causes overflow, untouch allmulti and return error.
4450		 */
4451		if (inc < 0)
4452			dev->flags &= ~IFF_ALLMULTI;
4453		else {
4454			dev->allmulti -= inc;
4455			printk(KERN_WARNING "%s: allmulti touches roof, "
4456				"set allmulti failed, allmulti feature of "
4457				"device might be broken.\n", dev->name);
4458			return -EOVERFLOW;
4459		}
4460	}
4461	if (dev->flags ^ old_flags) {
4462		dev_change_rx_flags(dev, IFF_ALLMULTI);
4463		dev_set_rx_mode(dev);
4464	}
4465	return 0;
4466}
4467EXPORT_SYMBOL(dev_set_allmulti);
4468
4469/*
4470 *	Upload unicast and multicast address lists to device and
4471 *	configure RX filtering. When the device doesn't support unicast
4472 *	filtering it is put in promiscuous mode while unicast addresses
4473 *	are present.
4474 */
4475void __dev_set_rx_mode(struct net_device *dev)
4476{
4477	const struct net_device_ops *ops = dev->netdev_ops;
4478
4479	/* dev_open will call this function so the list will stay sane. */
4480	if (!(dev->flags&IFF_UP))
4481		return;
4482
4483	if (!netif_device_present(dev))
4484		return;
4485
4486	if (ops->ndo_set_rx_mode)
4487		ops->ndo_set_rx_mode(dev);
4488	else {
4489		/* Unicast addresses changes may only happen under the rtnl,
4490		 * therefore calling __dev_set_promiscuity here is safe.
4491		 */
4492		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4493			__dev_set_promiscuity(dev, 1);
4494			dev->uc_promisc = 1;
4495		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4496			__dev_set_promiscuity(dev, -1);
4497			dev->uc_promisc = 0;
4498		}
4499
4500		if (ops->ndo_set_multicast_list)
4501			ops->ndo_set_multicast_list(dev);
4502	}
4503}
4504
/*
 *	Locked wrapper around __dev_set_rx_mode(): the RX mode update is
 *	performed between netif_addr_lock_bh() and netif_addr_unlock_bh().
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
4511
4512/**
4513 *	dev_get_flags - get flags reported to userspace
4514 *	@dev: device
4515 *
4516 *	Get the combination of flag bits exported through APIs to userspace.
4517 */
4518unsigned dev_get_flags(const struct net_device *dev)
4519{
4520	unsigned flags;
4521
4522	flags = (dev->flags & ~(IFF_PROMISC |
4523				IFF_ALLMULTI |
4524				IFF_RUNNING |
4525				IFF_LOWER_UP |
4526				IFF_DORMANT)) |
4527		(dev->gflags & (IFF_PROMISC |
4528				IFF_ALLMULTI));
4529
4530	if (netif_running(dev)) {
4531		if (netif_oper_up(dev))
4532			flags |= IFF_RUNNING;
4533		if (netif_carrier_ok(dev))
4534			flags |= IFF_LOWER_UP;
4535		if (netif_dormant(dev))
4536			flags |= IFF_DORMANT;
4537	}
4538
4539	return flags;
4540}
4541EXPORT_SYMBOL(dev_get_flags);
4542
4543int __dev_change_flags(struct net_device *dev, unsigned int flags)
4544{
4545	int old_flags = dev->flags;
4546	int ret;
4547
4548	ASSERT_RTNL();
4549
4550	/*
4551	 *	Set the flags on our device.
4552	 */
4553
4554	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4555			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4556			       IFF_AUTOMEDIA)) |
4557		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4558				    IFF_ALLMULTI));
4559
4560	/*
4561	 *	Load in the correct multicast list now the flags have changed.
4562	 */
4563
4564	if ((old_flags ^ flags) & IFF_MULTICAST)
4565		dev_change_rx_flags(dev, IFF_MULTICAST);
4566
4567	dev_set_rx_mode(dev);
4568
4569	/*
4570	 *	Have we downed the interface. We handle IFF_UP ourselves
4571	 *	according to user attempts to set it, rather than blindly
4572	 *	setting it.
4573	 */
4574
4575	ret = 0;
4576	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4577		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4578
4579		if (!ret)
4580			dev_set_rx_mode(dev);
4581	}
4582
4583	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4584		int inc = (flags & IFF_PROMISC) ? 1 : -1;
4585
4586		dev->gflags ^= IFF_PROMISC;
4587		dev_set_promiscuity(dev, inc);
4588	}
4589
4590	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4591	   is important. Some (broken) drivers set IFF_PROMISC, when
4592	   IFF_ALLMULTI is requested not asking us and not reporting.
4593	 */
4594	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4595		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4596
4597		dev->gflags ^= IFF_ALLMULTI;
4598		dev_set_allmulti(dev, inc);
4599	}
4600
4601	return ret;
4602}
4603
4604void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4605{
4606	unsigned int changes = dev->flags ^ old_flags;
4607
4608	if (changes & IFF_UP) {
4609		if (dev->flags & IFF_UP)
4610			call_netdevice_notifiers(NETDEV_UP, dev);
4611		else
4612			call_netdevice_notifiers(NETDEV_DOWN, dev);
4613	}
4614
4615	if (dev->flags & IFF_UP &&
4616	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4617		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4618}
4619
4620/**
4621 *	dev_change_flags - change device settings
4622 *	@dev: device
4623 *	@flags: device state flags
4624 *
4625 *	Change settings on device based state flags. The flags are
4626 *	in the userspace exported format.
4627 */
4628int dev_change_flags(struct net_device *dev, unsigned flags)
4629{
4630	int ret, changes;
4631	int old_flags = dev->flags;
4632
4633	ret = __dev_change_flags(dev, flags);
4634	if (ret < 0)
4635		return ret;
4636
4637	changes = old_flags ^ dev->flags;
4638	if (changes)
4639		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4640
4641	__dev_notify_flags(dev, old_flags);
4642	return ret;
4643}
4644EXPORT_SYMBOL(dev_change_flags);
4645
4646/**
4647 *	dev_set_mtu - Change maximum transfer unit
4648 *	@dev: device
4649 *	@new_mtu: new transfer unit
4650 *
4651 *	Change the maximum transfer size of the network device.
4652 */
4653int dev_set_mtu(struct net_device *dev, int new_mtu)
4654{
4655	const struct net_device_ops *ops = dev->netdev_ops;
4656	int err;
4657
4658	if (new_mtu == dev->mtu)
4659		return 0;
4660
4661	/*	MTU must be positive.	 */
4662	if (new_mtu < 0)
4663		return -EINVAL;
4664
4665	if (!netif_device_present(dev))
4666		return -ENODEV;
4667
4668	err = 0;
4669	if (ops->ndo_change_mtu)
4670		err = ops->ndo_change_mtu(dev, new_mtu);
4671	else
4672		dev->mtu = new_mtu;
4673
4674	if (!err && dev->flags & IFF_UP)
4675		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4676	return err;
4677}
4678EXPORT_SYMBOL(dev_set_mtu);
4679
4680/**
4681 *	dev_set_group - Change group this device belongs to
4682 *	@dev: device
4683 *	@new_group: group this device should belong to
4684 */
4685void dev_set_group(struct net_device *dev, int new_group)
4686{
4687	dev->group = new_group;
4688}
4689EXPORT_SYMBOL(dev_set_group);
4690
4691/**
4692 *	dev_set_mac_address - Change Media Access Control Address
4693 *	@dev: device
4694 *	@sa: new address
4695 *
4696 *	Change the hardware (MAC) address of the device
4697 */
4698int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4699{
4700	const struct net_device_ops *ops = dev->netdev_ops;
4701	int err;
4702
4703	if (!ops->ndo_set_mac_address)
4704		return -EOPNOTSUPP;
4705	if (sa->sa_family != dev->type)
4706		return -EINVAL;
4707	if (!netif_device_present(dev))
4708		return -ENODEV;
4709	err = ops->ndo_set_mac_address(dev, sa);
4710	if (!err)
4711		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4712	return err;
4713}
4714EXPORT_SYMBOL(dev_set_mac_address);
4715
4716/*
4717 *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4718 */
4719static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4720{
4721	int err;
4722	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4723
4724	if (!dev)
4725		return -ENODEV;
4726
4727	switch (cmd) {
4728	case SIOCGIFFLAGS:	/* Get interface flags */
4729		ifr->ifr_flags = (short) dev_get_flags(dev);
4730		return 0;
4731
4732	case SIOCGIFMETRIC:	/* Get the metric on the interface
4733				   (currently unused) */
4734		ifr->ifr_metric = 0;
4735		return 0;
4736
4737	case SIOCGIFMTU:	/* Get the MTU of a device */
4738		ifr->ifr_mtu = dev->mtu;
4739		return 0;
4740
4741	case SIOCGIFHWADDR:
4742		if (!dev->addr_len)
4743			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4744		else
4745			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4746			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4747		ifr->ifr_hwaddr.sa_family = dev->type;
4748		return 0;
4749
4750	case SIOCGIFSLAVE:
4751		err = -EINVAL;
4752		break;
4753
4754	case SIOCGIFMAP:
4755		ifr->ifr_map.mem_start = dev->mem_start;
4756		ifr->ifr_map.mem_end   = dev->mem_end;
4757		ifr->ifr_map.base_addr = dev->base_addr;
4758		ifr->ifr_map.irq       = dev->irq;
4759		ifr->ifr_map.dma       = dev->dma;
4760		ifr->ifr_map.port      = dev->if_port;
4761		return 0;
4762
4763	case SIOCGIFINDEX:
4764		ifr->ifr_ifindex = dev->ifindex;
4765		return 0;
4766
4767	case SIOCGIFTXQLEN:
4768		ifr->ifr_qlen = dev->tx_queue_len;
4769		return 0;
4770
4771	default:
4772		/* dev_ioctl() should ensure this case
4773		 * is never reached
4774		 */
4775		WARN_ON(1);
4776		err = -EINVAL;
4777		break;
4778
4779	}
4780	return err;
4781}
4782
4783/*
4784 *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
4785 */
4786static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4787{
4788	int err;
4789	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4790	const struct net_device_ops *ops;
4791
4792	if (!dev)
4793		return -ENODEV;
4794
4795	ops = dev->netdev_ops;
4796
4797	switch (cmd) {
4798	case SIOCSIFFLAGS:	/* Set interface flags */
4799		return dev_change_flags(dev, ifr->ifr_flags);
4800
4801	case SIOCSIFMETRIC:	/* Set the metric on the interface
4802				   (currently unused) */
4803		return -EOPNOTSUPP;
4804
4805	case SIOCSIFMTU:	/* Set the MTU of a device */
4806		return dev_set_mtu(dev, ifr->ifr_mtu);
4807
4808	case SIOCSIFHWADDR:
4809		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4810
4811	case SIOCSIFHWBROADCAST:
4812		if (ifr->ifr_hwaddr.sa_family != dev->type)
4813			return -EINVAL;
4814		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4815		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4816		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4817		return 0;
4818
4819	case SIOCSIFMAP:
4820		if (ops->ndo_set_config) {
4821			if (!netif_device_present(dev))
4822				return -ENODEV;
4823			return ops->ndo_set_config(dev, &ifr->ifr_map);
4824		}
4825		return -EOPNOTSUPP;
4826
4827	case SIOCADDMULTI:
4828		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4829		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4830			return -EINVAL;
4831		if (!netif_device_present(dev))
4832			return -ENODEV;
4833		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4834
4835	case SIOCDELMULTI:
4836		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4837		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4838			return -EINVAL;
4839		if (!netif_device_present(dev))
4840			return -ENODEV;
4841		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4842
4843	case SIOCSIFTXQLEN:
4844		if (ifr->ifr_qlen < 0)
4845			return -EINVAL;
4846		dev->tx_queue_len = ifr->ifr_qlen;
4847		return 0;
4848
4849	case SIOCSIFNAME:
4850		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4851		return dev_change_name(dev, ifr->ifr_newname);
4852
4853	/*
4854	 *	Unknown or private ioctl
4855	 */
4856	default:
4857		if ((cmd >= SIOCDEVPRIVATE &&
4858		    cmd <= SIOCDEVPRIVATE + 15) ||
4859		    cmd == SIOCBONDENSLAVE ||
4860		    cmd == SIOCBONDRELEASE ||
4861		    cmd == SIOCBONDSETHWADDR ||
4862		    cmd == SIOCBONDSLAVEINFOQUERY ||
4863		    cmd == SIOCBONDINFOQUERY ||
4864		    cmd == SIOCBONDCHANGEACTIVE ||
4865		    cmd == SIOCGMIIPHY ||
4866		    cmd == SIOCGMIIREG ||
4867		    cmd == SIOCSMIIREG ||
4868		    cmd == SIOCBRADDIF ||
4869		    cmd == SIOCBRDELIF ||
4870		    cmd == SIOCSHWTSTAMP ||
4871		    cmd == SIOCWANDEV) {
4872			err = -EOPNOTSUPP;
4873			if (ops->ndo_do_ioctl) {
4874				if (netif_device_present(dev))
4875					err = ops->ndo_do_ioctl(dev, ifr, cmd);
4876				else
4877					err = -ENODEV;
4878			}
4879		} else
4880			err = -EINVAL;
4881
4882	}
4883	return err;
4884}
4885
4886/*
4887 *	This function handles all "interface"-type I/O control requests. The actual
4888 *	'doing' part of this is dev_ifsioc above.
4889 */
4890
4891/**
4892 *	dev_ioctl	-	network device ioctl
4893 *	@net: the applicable net namespace
4894 *	@cmd: command to issue
4895 *	@arg: pointer to a struct ifreq in user space
4896 *
4897 *	Issue ioctl functions to devices. This is normally called by the
4898 *	user space syscall interfaces but can sometimes be useful for
4899 *	other purposes. The return value is the return from the syscall if
4900 *	positive or a negative errno code on error.
4901 */
4902
4903int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4904{
4905	struct ifreq ifr;
4906	int ret;
4907	char *colon;
4908
4909	/* One special case: SIOCGIFCONF takes ifconf argument
4910	   and requires shared lock, because it sleeps writing
4911	   to user space.
4912	 */
4913
4914	if (cmd == SIOCGIFCONF) {
4915		rtnl_lock();
4916		ret = dev_ifconf(net, (char __user *) arg);
4917		rtnl_unlock();
4918		return ret;
4919	}
4920	if (cmd == SIOCGIFNAME)
4921		return dev_ifname(net, (struct ifreq __user *)arg);
4922
4923	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4924		return -EFAULT;
4925
4926	ifr.ifr_name[IFNAMSIZ-1] = 0;
4927
4928	colon = strchr(ifr.ifr_name, ':');
4929	if (colon)
4930		*colon = 0;
4931
4932	/*
4933	 *	See which interface the caller is talking about.
4934	 */
4935
4936	switch (cmd) {
4937	/*
4938	 *	These ioctl calls:
4939	 *	- can be done by all.
4940	 *	- atomic and do not require locking.
4941	 *	- return a value
4942	 */
4943	case SIOCGIFFLAGS:
4944	case SIOCGIFMETRIC:
4945	case SIOCGIFMTU:
4946	case SIOCGIFHWADDR:
4947	case SIOCGIFSLAVE:
4948	case SIOCGIFMAP:
4949	case SIOCGIFINDEX:
4950	case SIOCGIFTXQLEN:
4951		dev_load(net, ifr.ifr_name);
4952		rcu_read_lock();
4953		ret = dev_ifsioc_locked(net, &ifr, cmd);
4954		rcu_read_unlock();
4955		if (!ret) {
4956			if (colon)
4957				*colon = ':';
4958			if (copy_to_user(arg, &ifr,
4959					 sizeof(struct ifreq)))
4960				ret = -EFAULT;
4961		}
4962		return ret;
4963
4964	case SIOCETHTOOL:
4965		dev_load(net, ifr.ifr_name);
4966		rtnl_lock();
4967		ret = dev_ethtool(net, &ifr);
4968		rtnl_unlock();
4969		if (!ret) {
4970			if (colon)
4971				*colon = ':';
4972			if (copy_to_user(arg, &ifr,
4973					 sizeof(struct ifreq)))
4974				ret = -EFAULT;
4975		}
4976		return ret;
4977
4978	/*
4979	 *	These ioctl calls:
4980	 *	- require superuser power.
4981	 *	- require strict serialization.
4982	 *	- return a value
4983	 */
4984	case SIOCGMIIPHY:
4985	case SIOCGMIIREG:
4986	case SIOCSIFNAME:
4987		if (!capable(CAP_NET_ADMIN))
4988			return -EPERM;
4989		dev_load(net, ifr.ifr_name);
4990		rtnl_lock();
4991		ret = dev_ifsioc(net, &ifr, cmd);
4992		rtnl_unlock();
4993		if (!ret) {
4994			if (colon)
4995				*colon = ':';
4996			if (copy_to_user(arg, &ifr,
4997					 sizeof(struct ifreq)))
4998				ret = -EFAULT;
4999		}
5000		return ret;
5001
5002	/*
5003	 *	These ioctl calls:
5004	 *	- require superuser power.
5005	 *	- require strict serialization.
5006	 *	- do not return a value
5007	 */
5008	case SIOCSIFFLAGS:
5009	case SIOCSIFMETRIC:
5010	case SIOCSIFMTU:
5011	case SIOCSIFMAP:
5012	case SIOCSIFHWADDR:
5013	case SIOCSIFSLAVE:
5014	case SIOCADDMULTI:
5015	case SIOCDELMULTI:
5016	case SIOCSIFHWBROADCAST:
5017	case SIOCSIFTXQLEN:
5018	case SIOCSMIIREG:
5019	case SIOCBONDENSLAVE:
5020	case SIOCBONDRELEASE:
5021	case SIOCBONDSETHWADDR:
5022	case SIOCBONDCHANGEACTIVE:
5023	case SIOCBRADDIF:
5024	case SIOCBRDELIF:
5025	case SIOCSHWTSTAMP:
5026		if (!capable(CAP_NET_ADMIN))
5027			return -EPERM;
5028		/* fall through */
5029	case SIOCBONDSLAVEINFOQUERY:
5030	case SIOCBONDINFOQUERY:
5031		dev_load(net, ifr.ifr_name);
5032		rtnl_lock();
5033		ret = dev_ifsioc(net, &ifr, cmd);
5034		rtnl_unlock();
5035		return ret;
5036
5037	case SIOCGIFMEM:
5038		/* Get the per device memory space. We can add this but
5039		 * currently do not support it */
5040	case SIOCSIFMEM:
5041		/* Set the per device memory buffer space.
5042		 * Not applicable in our case */
5043	case SIOCSIFLINK:
5044		return -EINVAL;
5045
5046	/*
5047	 *	Unknown or private ioctl.
5048	 */
5049	default:
5050		if (cmd == SIOCWANDEV ||
5051		    (cmd >= SIOCDEVPRIVATE &&
5052		     cmd <= SIOCDEVPRIVATE + 15)) {
5053			dev_load(net, ifr.ifr_name);
5054			rtnl_lock();
5055			ret = dev_ifsioc(net, &ifr, cmd);
5056			rtnl_unlock();
5057			if (!ret && copy_to_user(arg, &ifr,
5058						 sizeof(struct ifreq)))
5059				ret = -EFAULT;
5060			return ret;
5061		}
5062		/* Take care of Wireless Extensions */
5063		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5064			return wext_handle_ioctl(net, &ifr, cmd, arg);
5065		return -EINVAL;
5066	}
5067}
5068
5069
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns a suitable unique value for a new device interface
 *	number.  The caller must hold the rtnl semaphore or the
 *	dev_base_lock to be sure it remains unique.
 *
 *	A function-static counter is advanced (restarting at 1 whenever
 *	it is no longer positive) until a value is found that
 *	__dev_get_by_index() does not resolve in @net.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	do {
		++ifindex;
		if (ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
5088
5089/* Delayed registration/unregisteration */
5090static LIST_HEAD(net_todo_list);
5091
5092static void net_set_todo(struct net_device *dev)
5093{
5094	list_add_tail(&dev->todo_list, &net_todo_list);
5095}
5096
5097static void rollback_registered_many(struct list_head *head)
5098{
5099	struct net_device *dev, *tmp;
5100
5101	BUG_ON(dev_boot_phase);
5102	ASSERT_RTNL();
5103
5104	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5105		/* Some devices call without registering
5106		 * for initialization unwind. Remove those
5107		 * devices and proceed with the remaining.
5108		 */
5109		if (dev->reg_state == NETREG_UNINITIALIZED) {
5110			pr_debug("unregister_netdevice: device %s/%p never "
5111				 "was registered\n", dev->name, dev);
5112
5113			WARN_ON(1);
5114			list_del(&dev->unreg_list);
5115			continue;
5116		}
5117
5118		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5119	}
5120
5121	/* If device is running, close it first. */
5122	dev_close_many(head);
5123
5124	list_for_each_entry(dev, head, unreg_list) {
5125		/* And unlink it from device chain. */
5126		unlist_netdevice(dev);
5127
5128		dev->reg_state = NETREG_UNREGISTERING;
5129	}
5130
5131	synchronize_net();
5132
5133	list_for_each_entry(dev, head, unreg_list) {
5134		/* Shutdown queueing discipline. */
5135		dev_shutdown(dev);
5136
5137
5138		/* Notify protocols, that we are about to destroy
5139		   this device. They should clean all the things.
5140		*/
5141		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5142
5143		if (!dev->rtnl_link_ops ||
5144		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5145			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5146
5147		/*
5148		 *	Flush the unicast and multicast chains
5149		 */
5150		dev_uc_flush(dev);
5151		dev_mc_flush(dev);
5152
5153		if (dev->netdev_ops->ndo_uninit)
5154			dev->netdev_ops->ndo_uninit(dev);
5155
5156		/* Notifier chain MUST detach us from master device. */
5157		WARN_ON(dev->master);
5158
5159		/* Remove entries from kobject tree */
5160		netdev_unregister_kobject(dev);
5161	}
5162
5163	/* Process any work delayed until the end of the batch */
5164	dev = list_first_entry(head, struct net_device, unreg_list);
5165	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5166
5167	rcu_barrier();
5168
5169	list_for_each_entry(dev, head, unreg_list)
5170		dev_put(dev);
5171}
5172
5173static void rollback_registered(struct net_device *dev)
5174{
5175	LIST_HEAD(single);
5176
5177	list_add(&dev->unreg_list, &single);
5178	rollback_registered_many(&single);
5179	list_del(&single);
5180}
5181
5182u32 netdev_fix_features(struct net_device *dev, u32 features)
5183{
5184	/* Fix illegal checksum combinations */
5185	if ((features & NETIF_F_HW_CSUM) &&
5186	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5187		netdev_info(dev, "mixed HW and IP checksum settings.\n");
5188		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5189	}
5190
5191	if ((features & NETIF_F_NO_CSUM) &&
5192	    (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5193		netdev_info(dev, "mixed no checksumming and other settings.\n");
5194		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5195	}
5196
5197	/* Fix illegal SG+CSUM combinations. */
5198	if ((features & NETIF_F_SG) &&
5199	    !(features & NETIF_F_ALL_CSUM)) {
5200		netdev_info(dev,
5201			    "Dropping NETIF_F_SG since no checksum feature.\n");
5202		features &= ~NETIF_F_SG;
5203	}
5204
5205	/* TSO requires that SG is present as well. */
5206	if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
5207		netdev_info(dev, "Dropping NETIF_F_TSO since no SG feature.\n");
5208		features &= ~NETIF_F_TSO;
5209	}
5210
5211	/* Software GSO depends on SG. */
5212	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5213		netdev_info(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5214		features &= ~NETIF_F_GSO;
5215	}
5216
5217	/* UFO needs SG and checksumming */
5218	if (features & NETIF_F_UFO) {
5219		/* maybe split UFO into V4 and V6? */
5220		if (!((features & NETIF_F_GEN_CSUM) ||
5221		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5222			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5223			netdev_info(dev,
5224				"Dropping NETIF_F_UFO since no checksum offload features.\n");
5225			features &= ~NETIF_F_UFO;
5226		}
5227
5228		if (!(features & NETIF_F_SG)) {
5229			netdev_info(dev,
5230				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5231			features &= ~NETIF_F_UFO;
5232		}
5233	}
5234
5235	return features;
5236}
5237EXPORT_SYMBOL(netdev_fix_features);
5238
5239void netdev_update_features(struct net_device *dev)
5240{
5241	u32 features;
5242	int err = 0;
5243
5244	features = netdev_get_wanted_features(dev);
5245
5246	if (dev->netdev_ops->ndo_fix_features)
5247		features = dev->netdev_ops->ndo_fix_features(dev, features);
5248
5249	/* driver might be less strict about feature dependencies */
5250	features = netdev_fix_features(dev, features);
5251
5252	if (dev->features == features)
5253		return;
5254
5255	netdev_info(dev, "Features changed: 0x%08x -> 0x%08x\n",
5256		dev->features, features);
5257
5258	if (dev->netdev_ops->ndo_set_features)
5259		err = dev->netdev_ops->ndo_set_features(dev, features);
5260
5261	if (!err)
5262		dev->features = features;
5263	else if (err < 0)
5264		netdev_err(dev,
5265			"set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
5266			err, features, dev->features);
5267}
5268EXPORT_SYMBOL(netdev_update_features);
5269
5270/**
5271 *	netif_stacked_transfer_operstate -	transfer operstate
5272 *	@rootdev: the root or lower level device to transfer state from
5273 *	@dev: the device to transfer operstate to
5274 *
5275 *	Transfer operational state from root to device. This is normally
5276 *	called when a stacking relationship exists between the root
5277 *	device and the device(a leaf device).
5278 */
5279void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5280					struct net_device *dev)
5281{
5282	if (rootdev->operstate == IF_OPER_DORMANT)
5283		netif_dormant_on(dev);
5284	else
5285		netif_dormant_off(dev);
5286
5287	if (netif_carrier_ok(rootdev)) {
5288		if (!netif_carrier_ok(dev))
5289			netif_carrier_on(dev);
5290	} else {
5291		if (netif_carrier_ok(dev))
5292			netif_carrier_off(dev);
5293	}
5294}
5295EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5296
#ifdef CONFIG_RPS
/*
 *	Allocate the zeroed array of dev->num_rx_queues RX queue
 *	structures, store it in dev->_rx and point each entry back at
 *	@dev. It is a BUG for the queue count to be zero.
 *	Returns 0 on success or -ENOMEM.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int count = dev->num_rx_queues;
	struct netdev_rx_queue *queues;
	struct netdev_rx_queue *q;

	BUG_ON(count < 1);

	queues = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
	if (!queues) {
		pr_err("netdev: Unable to allocate %u rx queues.\n", count);
		return -ENOMEM;
	}
	dev->_rx = queues;

	for (q = queues; q < queues + count; q++)
		q->dev = dev;

	return 0;
}
#endif
5317
5318static void netdev_init_one_queue(struct net_device *dev,
5319				  struct netdev_queue *queue, void *_unused)
5320{
5321	/* Initialize queue lock */
5322	spin_lock_init(&queue->_xmit_lock);
5323	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5324	queue->xmit_lock_owner = -1;
5325	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5326	queue->dev = dev;
5327}
5328
5329static int netif_alloc_netdev_queues(struct net_device *dev)
5330{
5331	unsigned int count = dev->num_tx_queues;
5332	struct netdev_queue *tx;
5333
5334	BUG_ON(count < 1);
5335
5336	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5337	if (!tx) {
5338		pr_err("netdev: Unable to allocate %u tx queues.\n",
5339		       count);
5340		return -ENOMEM;
5341	}
5342	dev->_tx = tx;
5343
5344	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5345	spin_lock_init(&dev->tx_global_lock);
5346
5347	return 0;
5348}
5349
5350/**
5351 *	register_netdevice	- register a network device
5352 *	@dev: device to register
5353 *
5354 *	Take a completed network device structure and add it to the kernel
5355 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5356 *	chain. 0 is returned on success. A negative errno code is returned
5357 *	on a failure to set up the device, or if the name is a duplicate.
5358 *
5359 *	Callers must hold the rtnl semaphore. You may want
5360 *	register_netdev() instead of this.
5361 *
5362 *	BUGS:
5363 *	The locking appears insufficient to guarantee two parallel registers
5364 *	will not get the same name.
5365 */
5366
5367int register_netdevice(struct net_device *dev)
5368{
5369	int ret;
5370	struct net *net = dev_net(dev);
5371
5372	BUG_ON(dev_boot_phase);
5373	ASSERT_RTNL();
5374
5375	might_sleep();
5376
5377	/* When net_device's are persistent, this will be fatal. */
5378	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5379	BUG_ON(!net);
5380
5381	spin_lock_init(&dev->addr_list_lock);
5382	netdev_set_addr_lockdep_class(dev);
5383
5384	dev->iflink = -1;
5385
5386	/* Init, if this function is available */
5387	if (dev->netdev_ops->ndo_init) {
5388		ret = dev->netdev_ops->ndo_init(dev);
5389		if (ret) {
5390			if (ret > 0)
5391				ret = -EIO;
5392			goto out;
5393		}
5394	}
5395
5396	ret = dev_get_valid_name(dev, dev->name, 0);
5397	if (ret)
5398		goto err_uninit;
5399
5400	dev->ifindex = dev_new_index(net);
5401	if (dev->iflink == -1)
5402		dev->iflink = dev->ifindex;
5403
5404	/* Transfer changeable features to wanted_features and enable
5405	 * software offloads (GSO and GRO).
5406	 */
5407	dev->hw_features |= NETIF_F_SOFT_FEATURES;
5408	dev->features |= NETIF_F_SOFT_FEATURES;
5409	dev->wanted_features = dev->features & dev->hw_features;
5410
5411	/* Avoid warning from netdev_fix_features() for GSO without SG */
5412	if (!(dev->wanted_features & NETIF_F_SG)) {
5413		dev->wanted_features &= ~NETIF_F_GSO;
5414		dev->features &= ~NETIF_F_GSO;
5415	}
5416
5417	/* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
5418	 * vlan_dev_init() will do the dev->features check, so these features
5419	 * are enabled only if supported by underlying device.
5420	 */
5421	dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5422
5423	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5424	ret = notifier_to_errno(ret);
5425	if (ret)
5426		goto err_uninit;
5427
5428	ret = netdev_register_kobject(dev);
5429	if (ret)
5430		goto err_uninit;
5431	dev->reg_state = NETREG_REGISTERED;
5432
5433	netdev_update_features(dev);
5434
5435	/*
5436	 *	Default initial state at registry is that the
5437	 *	device is present.
5438	 */
5439
5440	set_bit(__LINK_STATE_PRESENT, &dev->state);
5441
5442	dev_init_scheduler(dev);
5443	dev_hold(dev);
5444	list_netdevice(dev);
5445
5446	/* Notify protocols, that a new device appeared. */
5447	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5448	ret = notifier_to_errno(ret);
5449	if (ret) {
5450		rollback_registered(dev);
5451		dev->reg_state = NETREG_UNREGISTERED;
5452	}
5453	/*
5454	 *	Prevent userspace races by waiting until the network
5455	 *	device is fully setup before sending notifications.
5456	 */
5457	if (!dev->rtnl_link_ops ||
5458	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5459		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5460
5461out:
5462	return ret;
5463
5464err_uninit:
5465	if (dev->netdev_ops->ndo_uninit)
5466		dev->netdev_ops->ndo_uninit(dev);
5467	goto out;
5468}
5469EXPORT_SYMBOL(register_netdevice);
5470
5471/**
5472 *	init_dummy_netdev	- init a dummy network device for NAPI
5473 *	@dev: device to init
5474 *
5475 *	This takes a network device structure and initialize the minimum
5476 *	amount of fields so it can be used to schedule NAPI polls without
5477 *	registering a full blown interface. This is to be used by drivers
5478 *	that need to tie several hardware interfaces to a single NAPI
5479 *	poll scheduler due to HW limitations.
5480 */
5481int init_dummy_netdev(struct net_device *dev)
5482{
5483	/* Clear everything. Note we don't initialize spinlocks
5484	 * are they aren't supposed to be taken by any of the
5485	 * NAPI code and this dummy netdev is supposed to be
5486	 * only ever used for NAPI polls
5487	 */
5488	memset(dev, 0, sizeof(struct net_device));
5489
5490	/* make sure we BUG if trying to hit standard
5491	 * register/unregister code path
5492	 */
5493	dev->reg_state = NETREG_DUMMY;
5494
5495	/* NAPI wants this */
5496	INIT_LIST_HEAD(&dev->napi_list);
5497
5498	/* a dummy interface is started by default */
5499	set_bit(__LINK_STATE_PRESENT, &dev->state);
5500	set_bit(__LINK_STATE_START, &dev->state);
5501
5502	/* Note : We dont allocate pcpu_refcnt for dummy devices,
5503	 * because users of this 'device' dont need to change
5504	 * its refcount.
5505	 */
5506
5507	return 0;
5508}
5509EXPORT_SYMBOL_GPL(init_dummy_netdev);
5510
5511
5512/**
5513 *	register_netdev	- register a network device
5514 *	@dev: device to register
5515 *
5516 *	Take a completed network device structure and add it to the kernel
5517 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5518 *	chain. 0 is returned on success. A negative errno code is returned
5519 *	on a failure to set up the device, or if the name is a duplicate.
5520 *
5521 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
5522 *	and expands the device name if you passed a format string to
5523 *	alloc_netdev.
5524 */
5525int register_netdev(struct net_device *dev)
5526{
5527	int err;
5528
5529	rtnl_lock();
5530
5531	/*
5532	 * If the name is a format string the caller wants us to do a
5533	 * name allocation.
5534	 */
5535	if (strchr(dev->name, '%')) {
5536		err = dev_alloc_name(dev, dev->name);
5537		if (err < 0)
5538			goto out;
5539	}
5540
5541	err = register_netdevice(dev);
5542out:
5543	rtnl_unlock();
5544	return err;
5545}
5546EXPORT_SYMBOL(register_netdev);
5547
5548int netdev_refcnt_read(const struct net_device *dev)
5549{
5550	int i, refcnt = 0;
5551
5552	for_each_possible_cpu(i)
5553		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5554	return refcnt;
5555}
5556EXPORT_SYMBOL(netdev_refcnt_read);
5557
5558/*
5559 * netdev_wait_allrefs - wait until all references are gone.
5560 *
5561 * This is called when unregistering network devices.
5562 *
5563 * Any protocol or device that holds a reference should register
5564 * for netdevice notification, and cleanup and put back the
5565 * reference if they receive an UNREGISTER event.
5566 * We can get stuck here if buggy protocols don't correctly
5567 * call dev_put.
5568 */
5569static void netdev_wait_allrefs(struct net_device *dev)
5570{
5571	unsigned long rebroadcast_time, warning_time;
5572	int refcnt;
5573
5574	linkwatch_forget_dev(dev);
5575
5576	rebroadcast_time = warning_time = jiffies;
5577	refcnt = netdev_refcnt_read(dev);
5578
5579	while (refcnt != 0) {
5580		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5581			rtnl_lock();
5582
5583			/* Rebroadcast unregister notification */
5584			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5585			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5586			 * should have already handle it the first time */
5587
5588			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5589				     &dev->state)) {
5590				/* We must not have linkwatch events
5591				 * pending on unregister. If this
5592				 * happens, we simply run the queue
5593				 * unscheduled, resulting in a noop
5594				 * for this device.
5595				 */
5596				linkwatch_run_queue();
5597			}
5598
5599			__rtnl_unlock();
5600
5601			rebroadcast_time = jiffies;
5602		}
5603
5604		msleep(250);
5605
5606		refcnt = netdev_refcnt_read(dev);
5607
5608		if (time_after(jiffies, warning_time + 10 * HZ)) {
5609			printk(KERN_EMERG "unregister_netdevice: "
5610			       "waiting for %s to become free. Usage "
5611			       "count = %d\n",
5612			       dev->name, refcnt);
5613			warning_time = jiffies;
5614		}
5615	}
5616}
5617
5618/* The sequence is:
5619 *
5620 *	rtnl_lock();
5621 *	...
5622 *	register_netdevice(x1);
5623 *	register_netdevice(x2);
5624 *	...
5625 *	unregister_netdevice(y1);
5626 *	unregister_netdevice(y2);
5627 *      ...
5628 *	rtnl_unlock();
5629 *	free_netdev(y1);
5630 *	free_netdev(y2);
5631 *
5632 * We are invoked by rtnl_unlock().
5633 * This allows us to deal with problems:
5634 * 1) We can delete sysfs objects which invoke hotplug
5635 *    without deadlocking with linkwatch via keventd.
5636 * 2) Since we run with the RTNL semaphore not held, we can sleep
5637 *    safely in order to wait for the netdev refcnt to drop to zero.
5638 *
5639 * We must not return until all unregister events added during
5640 * the interval the lock was held have been completed.
5641 */
5642void netdev_run_todo(void)
5643{
5644	struct list_head list;
5645
5646	/* Snapshot list, allow later requests */
5647	list_replace_init(&net_todo_list, &list);
5648
5649	__rtnl_unlock();
5650
5651	while (!list_empty(&list)) {
5652		struct net_device *dev
5653			= list_first_entry(&list, struct net_device, todo_list);
5654		list_del(&dev->todo_list);
5655
5656		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5657			printk(KERN_ERR "network todo '%s' but state %d\n",
5658			       dev->name, dev->reg_state);
5659			dump_stack();
5660			continue;
5661		}
5662
5663		dev->reg_state = NETREG_UNREGISTERED;
5664
5665		on_each_cpu(flush_backlog, dev, 1);
5666
5667		netdev_wait_allrefs(dev);
5668
5669		/* paranoia */
5670		BUG_ON(netdev_refcnt_read(dev));
5671		WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5672		WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5673		WARN_ON(dev->dn_ptr);
5674
5675		if (dev->destructor)
5676			dev->destructor(dev);
5677
5678		/* Free network device */
5679		kobject_put(&dev->dev.kobj);
5680	}
5681}
5682
5683/* Convert net_device_stats to rtnl_link_stats64.  They have the same
5684 * fields in the same order, with only the type differing.
5685 */
5686static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5687				    const struct net_device_stats *netdev_stats)
5688{
5689#if BITS_PER_LONG == 64
5690        BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5691        memcpy(stats64, netdev_stats, sizeof(*stats64));
5692#else
5693	size_t i, n = sizeof(*stats64) / sizeof(u64);
5694	const unsigned long *src = (const unsigned long *)netdev_stats;
5695	u64 *dst = (u64 *)stats64;
5696
5697	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5698		     sizeof(*stats64) / sizeof(u64));
5699	for (i = 0; i < n; i++)
5700		dst[i] = src[i];
5701#endif
5702}
5703
5704/**
5705 *	dev_get_stats	- get network device statistics
5706 *	@dev: device to get statistics from
5707 *	@storage: place to store stats
5708 *
5709 *	Get network statistics from device. Return @storage.
5710 *	The device driver may provide its own method by setting
5711 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5712 *	otherwise the internal statistics structure is used.
5713 */
5714struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5715					struct rtnl_link_stats64 *storage)
5716{
5717	const struct net_device_ops *ops = dev->netdev_ops;
5718
5719	if (ops->ndo_get_stats64) {
5720		memset(storage, 0, sizeof(*storage));
5721		ops->ndo_get_stats64(dev, storage);
5722	} else if (ops->ndo_get_stats) {
5723		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5724	} else {
5725		netdev_stats_to_stats64(storage, &dev->stats);
5726	}
5727	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5728	return storage;
5729}
5730EXPORT_SYMBOL(dev_get_stats);
5731
/* Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT, a missing
 * queue is allocated, initialized with the noop qdisc and published in
 * dev->ingress_queue; NULL is returned if that allocation fails.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!queue) {
		queue = kzalloc(sizeof(*queue), GFP_KERNEL);
		if (queue) {
			netdev_init_one_queue(dev, queue, NULL);
			queue->qdisc = &noop_qdisc;
			queue->qdisc_sleeping = &noop_qdisc;
			/* Publish only once the queue is fully set up */
			rcu_assign_pointer(dev->ingress_queue, queue);
		}
	}
#endif
	return queue;
}
5749
5750/**
5751 *	alloc_netdev_mqs - allocate network device
5752 *	@sizeof_priv:	size of private data to allocate space for
5753 *	@name:		device name format string
5754 *	@setup:		callback to initialize device
5755 *	@txqs:		the number of TX subqueues to allocate
5756 *	@rxqs:		the number of RX subqueues to allocate
5757 *
5758 *	Allocates a struct net_device with private data area for driver use
5759 *	and performs basic initialization.  Also allocates subquue structs
5760 *	for each queue on the device.
5761 */
5762struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5763		void (*setup)(struct net_device *),
5764		unsigned int txqs, unsigned int rxqs)
5765{
5766	struct net_device *dev;
5767	size_t alloc_size;
5768	struct net_device *p;
5769
5770	BUG_ON(strlen(name) >= sizeof(dev->name));
5771
5772	if (txqs < 1) {
5773		pr_err("alloc_netdev: Unable to allocate device "
5774		       "with zero queues.\n");
5775		return NULL;
5776	}
5777
5778#ifdef CONFIG_RPS
5779	if (rxqs < 1) {
5780		pr_err("alloc_netdev: Unable to allocate device "
5781		       "with zero RX queues.\n");
5782		return NULL;
5783	}
5784#endif
5785
5786	alloc_size = sizeof(struct net_device);
5787	if (sizeof_priv) {
5788		/* ensure 32-byte alignment of private area */
5789		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5790		alloc_size += sizeof_priv;
5791	}
5792	/* ensure 32-byte alignment of whole construct */
5793	alloc_size += NETDEV_ALIGN - 1;
5794
5795	p = kzalloc(alloc_size, GFP_KERNEL);
5796	if (!p) {
5797		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5798		return NULL;
5799	}
5800
5801	dev = PTR_ALIGN(p, NETDEV_ALIGN);
5802	dev->padded = (char *)dev - (char *)p;
5803
5804	dev->pcpu_refcnt = alloc_percpu(int);
5805	if (!dev->pcpu_refcnt)
5806		goto free_p;
5807
5808	if (dev_addr_init(dev))
5809		goto free_pcpu;
5810
5811	dev_mc_init(dev);
5812	dev_uc_init(dev);
5813
5814	dev_net_set(dev, &init_net);
5815
5816	dev->gso_max_size = GSO_MAX_SIZE;
5817
5818	INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5819	dev->ethtool_ntuple_list.count = 0;
5820	INIT_LIST_HEAD(&dev->napi_list);
5821	INIT_LIST_HEAD(&dev->unreg_list);
5822	INIT_LIST_HEAD(&dev->link_watch_list);
5823	dev->priv_flags = IFF_XMIT_DST_RELEASE;
5824	setup(dev);
5825
5826	dev->num_tx_queues = txqs;
5827	dev->real_num_tx_queues = txqs;
5828	if (netif_alloc_netdev_queues(dev))
5829		goto free_all;
5830
5831#ifdef CONFIG_RPS
5832	dev->num_rx_queues = rxqs;
5833	dev->real_num_rx_queues = rxqs;
5834	if (netif_alloc_rx_queues(dev))
5835		goto free_all;
5836#endif
5837
5838	strcpy(dev->name, name);
5839	dev->group = INIT_NETDEV_GROUP;
5840	return dev;
5841
5842free_all:
5843	free_netdev(dev);
5844	return NULL;
5845
5846free_pcpu:
5847	free_percpu(dev->pcpu_refcnt);
5848	kfree(dev->_tx);
5849#ifdef CONFIG_RPS
5850	kfree(dev->_rx);
5851#endif
5852
5853free_p:
5854	kfree(p);
5855	return NULL;
5856}
5857EXPORT_SYMBOL(alloc_netdev_mqs);
5858
5859/**
5860 *	free_netdev - free network device
5861 *	@dev: device
5862 *
5863 *	This function does the last stage of destroying an allocated device
5864 * 	interface. The reference to the device object is released.
5865 *	If this is the last reference then it will be freed.
5866 */
5867void free_netdev(struct net_device *dev)
5868{
5869	struct napi_struct *p, *n;
5870
5871	release_net(dev_net(dev));
5872
5873	kfree(dev->_tx);
5874#ifdef CONFIG_RPS
5875	kfree(dev->_rx);
5876#endif
5877
5878	kfree(rcu_dereference_raw(dev->ingress_queue));
5879
5880	/* Flush device addresses */
5881	dev_addr_flush(dev);
5882
5883	/* Clear ethtool n-tuple list */
5884	ethtool_ntuple_flush(dev);
5885
5886	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5887		netif_napi_del(p);
5888
5889	free_percpu(dev->pcpu_refcnt);
5890	dev->pcpu_refcnt = NULL;
5891
5892	/*  Compatibility with error handling in drivers */
5893	if (dev->reg_state == NETREG_UNINITIALIZED) {
5894		kfree((char *)dev - dev->padded);
5895		return;
5896	}
5897
5898	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5899	dev->reg_state = NETREG_RELEASED;
5900
5901	/* will free via device release */
5902	put_device(&dev->dev);
5903}
5904EXPORT_SYMBOL(free_netdev);
5905
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Waits, by means of synchronize_rcu(), until packets that are
 *	currently being received have been processed.  Packets that start
 *	being received later are not held back.  May sleep.
 */
void synchronize_net(void)
{
	/* Flag callers that are not allowed to sleep before blocking */
	might_sleep();

	synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
5918
5919/**
5920 *	unregister_netdevice_queue - remove device from the kernel
5921 *	@dev: device
5922 *	@head: list
5923 *
5924 *	This function shuts down a device interface and removes it
5925 *	from the kernel tables.
5926 *	If head not NULL, device is queued to be unregistered later.
5927 *
5928 *	Callers must hold the rtnl semaphore.  You may want
5929 *	unregister_netdev() instead of this.
5930 */
5931
5932void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5933{
5934	ASSERT_RTNL();
5935
5936	if (head) {
5937		list_move_tail(&dev->unreg_list, head);
5938	} else {
5939		rollback_registered(dev);
5940		/* Finish processing unregister after unlock */
5941		net_set_todo(dev);
5942	}
5943}
5944EXPORT_SYMBOL(unregister_netdevice_queue);
5945
5946/**
5947 *	unregister_netdevice_many - unregister many devices
5948 *	@head: list of devices
5949 */
5950void unregister_netdevice_many(struct list_head *head)
5951{
5952	struct net_device *dev;
5953
5954	if (!list_empty(head)) {
5955		rollback_registered_many(head);
5956		list_for_each_entry(dev, head, unreg_list)
5957			net_set_todo(dev);
5958	}
5959}
5960EXPORT_SYMBOL(unregister_netdevice_many);
5961
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	It is a thin wrapper that calls unregister_netdevice() with the
 *	rtnl semaphore held.  In general this is the function to use,
 *	rather than unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
5980
5981/**
5982 *	dev_change_net_namespace - move device to different nethost namespace
5983 *	@dev: device
5984 *	@net: network namespace
5985 *	@pat: If not NULL name pattern to try if the current device name
5986 *	      is already taken in the destination network namespace.
5987 *
5988 *	This function shuts down a device interface and moves it
5989 *	to a new network namespace. On success 0 is returned, on
5990 *	a failure a netagive errno code is returned.
5991 *
5992 *	Callers must hold the rtnl semaphore.
5993 */
5994
5995int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5996{
5997	int err;
5998
5999	ASSERT_RTNL();
6000
6001	/* Don't allow namespace local devices to be moved. */
6002	err = -EINVAL;
6003	if (dev->features & NETIF_F_NETNS_LOCAL)
6004		goto out;
6005
6006	/* Ensure the device has been registrered */
6007	err = -EINVAL;
6008	if (dev->reg_state != NETREG_REGISTERED)
6009		goto out;
6010
6011	/* Get out if there is nothing todo */
6012	err = 0;
6013	if (net_eq(dev_net(dev), net))
6014		goto out;
6015
6016	/* Pick the destination device name, and ensure
6017	 * we can use it in the destination network namespace.
6018	 */
6019	err = -EEXIST;
6020	if (__dev_get_by_name(net, dev->name)) {
6021		/* We get here if we can't use the current device name */
6022		if (!pat)
6023			goto out;
6024		if (dev_get_valid_name(dev, pat, 1))
6025			goto out;
6026	}
6027
6028	/*
6029	 * And now a mini version of register_netdevice unregister_netdevice.
6030	 */
6031
6032	/* If device is running close it first. */
6033	dev_close(dev);
6034
6035	/* And unlink it from device chain */
6036	err = -ENODEV;
6037	unlist_netdevice(dev);
6038
6039	synchronize_net();
6040
6041	/* Shutdown queueing discipline. */
6042	dev_shutdown(dev);
6043
6044	/* Notify protocols, that we are about to destroy
6045	   this device. They should clean all the things.
6046
6047	   Note that dev->reg_state stays at NETREG_REGISTERED.
6048	   This is wanted because this way 8021q and macvlan know
6049	   the device is just moving and can keep their slaves up.
6050	*/
6051	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6052	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6053
6054	/*
6055	 *	Flush the unicast and multicast chains
6056	 */
6057	dev_uc_flush(dev);
6058	dev_mc_flush(dev);
6059
6060	/* Actually switch the network namespace */
6061	dev_net_set(dev, net);
6062
6063	/* If there is an ifindex conflict assign a new one */
6064	if (__dev_get_by_index(net, dev->ifindex)) {
6065		int iflink = (dev->iflink == dev->ifindex);
6066		dev->ifindex = dev_new_index(net);
6067		if (iflink)
6068			dev->iflink = dev->ifindex;
6069	}
6070
6071	/* Fixup kobjects */
6072	err = device_rename(&dev->dev, dev->name);
6073	WARN_ON(err);
6074
6075	/* Add the device back in the hashes */
6076	list_netdevice(dev);
6077
6078	/* Notify protocols, that a new device appeared. */
6079	call_netdevice_notifiers(NETDEV_REGISTER, dev);
6080
6081	/*
6082	 *	Prevent userspace races by waiting until the network
6083	 *	device is fully setup before sending notifications.
6084	 */
6085	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6086
6087	synchronize_net();
6088	err = 0;
6089out:
6090	return err;
6091}
6092EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6093
6094static int dev_cpu_callback(struct notifier_block *nfb,
6095			    unsigned long action,
6096			    void *ocpu)
6097{
6098	struct sk_buff **list_skb;
6099	struct sk_buff *skb;
6100	unsigned int cpu, oldcpu = (unsigned long)ocpu;
6101	struct softnet_data *sd, *oldsd;
6102
6103	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6104		return NOTIFY_OK;
6105
6106	local_irq_disable();
6107	cpu = smp_processor_id();
6108	sd = &per_cpu(softnet_data, cpu);
6109	oldsd = &per_cpu(softnet_data, oldcpu);
6110
6111	/* Find end of our completion_queue. */
6112	list_skb = &sd->completion_queue;
6113	while (*list_skb)
6114		list_skb = &(*list_skb)->next;
6115	/* Append completion queue from offline CPU. */
6116	*list_skb = oldsd->completion_queue;
6117	oldsd->completion_queue = NULL;
6118
6119	/* Append output queue from offline CPU. */
6120	if (oldsd->output_queue) {
6121		*sd->output_queue_tailp = oldsd->output_queue;
6122		sd->output_queue_tailp = oldsd->output_queue_tailp;
6123		oldsd->output_queue = NULL;
6124		oldsd->output_queue_tailp = &oldsd->output_queue;
6125	}
6126
6127	raise_softirq_irqoff(NET_TX_SOFTIRQ);
6128	local_irq_enable();
6129
6130	/* Process offline CPU's input_pkt_queue */
6131	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6132		netif_rx(skb);
6133		input_queue_head_incr(oldsd);
6134	}
6135	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6136		netif_rx(skb);
6137		input_queue_head_incr(oldsd);
6138	}
6139
6140	return NOTIFY_OK;
6141}
6142
6143
6144/**
6145 *	netdev_increment_features - increment feature set by one
6146 *	@all: current feature set
6147 *	@one: new feature set
6148 *	@mask: mask feature set
6149 *
6150 *	Computes a new feature set after adding a device with feature set
6151 *	@one to the master device with current feature set @all.  Will not
6152 *	enable anything that is off in @mask. Returns the new feature set.
6153 */
6154u32 netdev_increment_features(u32 all, u32 one, u32 mask)
6155{
6156	/* If device needs checksumming, downgrade to it. */
6157	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
6158		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
6159	else if (mask & NETIF_F_ALL_CSUM) {
6160		/* If one device supports v4/v6 checksumming, set for all. */
6161		if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
6162		    !(all & NETIF_F_GEN_CSUM)) {
6163			all &= ~NETIF_F_ALL_CSUM;
6164			all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
6165		}
6166
6167		/* If one device supports hw checksumming, set for all. */
6168		if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
6169			all &= ~NETIF_F_ALL_CSUM;
6170			all |= NETIF_F_HW_CSUM;
6171		}
6172	}
6173
6174	one |= NETIF_F_ALL_CSUM;
6175
6176	one |= all & NETIF_F_ONE_FOR_ALL;
6177	all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
6178	all |= one & mask & NETIF_F_ONE_FOR_ALL;
6179
6180	return all;
6181}
6182EXPORT_SYMBOL(netdev_increment_features);
6183
6184static struct hlist_head *netdev_create_hash(void)
6185{
6186	int i;
6187	struct hlist_head *hash;
6188
6189	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6190	if (hash != NULL)
6191		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6192			INIT_HLIST_HEAD(&hash[i]);
6193
6194	return hash;
6195}
6196
6197/* Initialize per network namespace state */
6198static int __net_init netdev_init(struct net *net)
6199{
6200	INIT_LIST_HEAD(&net->dev_base_head);
6201
6202	net->dev_name_head = netdev_create_hash();
6203	if (net->dev_name_head == NULL)
6204		goto err_name;
6205
6206	net->dev_index_head = netdev_create_hash();
6207	if (net->dev_index_head == NULL)
6208		goto err_idx;
6209
6210	return 0;
6211
6212err_idx:
6213	kfree(net->dev_name_head);
6214err_name:
6215	return -ENOMEM;
6216}
6217
6218/**
6219 *	netdev_drivername - network driver for the device
6220 *	@dev: network device
6221 *	@buffer: buffer for resulting name
6222 *	@len: size of buffer
6223 *
6224 *	Determine network driver for device.
6225 */
6226char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
6227{
6228	const struct device_driver *driver;
6229	const struct device *parent;
6230
6231	if (len <= 0 || !buffer)
6232		return buffer;
6233	buffer[0] = 0;
6234
6235	parent = dev->dev.parent;
6236
6237	if (!parent)
6238		return buffer;
6239
6240	driver = parent->driver;
6241	if (driver && driver->name)
6242		strlcpy(buffer, driver->name, len);
6243	return buffer;
6244}
6245
6246static int __netdev_printk(const char *level, const struct net_device *dev,
6247			   struct va_format *vaf)
6248{
6249	int r;
6250
6251	if (dev && dev->dev.parent)
6252		r = dev_printk(level, dev->dev.parent, "%s: %pV",
6253			       netdev_name(dev), vaf);
6254	else if (dev)
6255		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6256	else
6257		r = printk("%s(NULL net_device): %pV", level, vaf);
6258
6259	return r;
6260}
6261
6262int netdev_printk(const char *level, const struct net_device *dev,
6263		  const char *format, ...)
6264{
6265	struct va_format vaf;
6266	va_list args;
6267	int r;
6268
6269	va_start(args, format);
6270
6271	vaf.fmt = format;
6272	vaf.va = &args;
6273
6274	r = __netdev_printk(level, dev, &vaf);
6275	va_end(args);
6276
6277	return r;
6278}
6279EXPORT_SYMBOL(netdev_printk);
6280
6281#define define_netdev_printk_level(func, level)			\
6282int func(const struct net_device *dev, const char *fmt, ...)	\
6283{								\
6284	int r;							\
6285	struct va_format vaf;					\
6286	va_list args;						\
6287								\
6288	va_start(args, fmt);					\
6289								\
6290	vaf.fmt = fmt;						\
6291	vaf.va = &args;						\
6292								\
6293	r = __netdev_printk(level, dev, &vaf);			\
6294	va_end(args);						\
6295								\
6296	return r;						\
6297}								\
6298EXPORT_SYMBOL(func);
6299
6300define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6301define_netdev_printk_level(netdev_alert, KERN_ALERT);
6302define_netdev_printk_level(netdev_crit, KERN_CRIT);
6303define_netdev_printk_level(netdev_err, KERN_ERR);
6304define_netdev_printk_level(netdev_warn, KERN_WARNING);
6305define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6306define_netdev_printk_level(netdev_info, KERN_INFO);
6307
6308static void __net_exit netdev_exit(struct net *net)
6309{
6310	kfree(net->dev_name_head);
6311	kfree(net->dev_index_head);
6312}
6313
6314static struct pernet_operations __net_initdata netdev_net_ops = {
6315	.init = netdev_init,
6316	.exit = netdev_exit,
6317};
6318
6319static void __net_exit default_device_exit(struct net *net)
6320{
6321	struct net_device *dev, *aux;
6322	/*
6323	 * Push all migratable network devices back to the
6324	 * initial network namespace
6325	 */
6326	rtnl_lock();
6327	for_each_netdev_safe(net, dev, aux) {
6328		int err;
6329		char fb_name[IFNAMSIZ];
6330
6331		/* Ignore unmoveable devices (i.e. loopback) */
6332		if (dev->features & NETIF_F_NETNS_LOCAL)
6333			continue;
6334
6335		/* Leave virtual devices for the generic cleanup */
6336		if (dev->rtnl_link_ops)
6337			continue;
6338
6339		/* Push remaing network devices to init_net */
6340		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6341		err = dev_change_net_namespace(dev, &init_net, fb_name);
6342		if (err) {
6343			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6344				__func__, dev->name, err);
6345			BUG();
6346		}
6347	}
6348	rtnl_unlock();
6349}
6350
6351static void __net_exit default_device_exit_batch(struct list_head *net_list)
6352{
6353	/* At exit all network devices most be removed from a network
6354	 * namespace.  Do this in the reverse order of registration.
6355	 * Do this across as many network namespaces as possible to
6356	 * improve batching efficiency.
6357	 */
6358	struct net_device *dev;
6359	struct net *net;
6360	LIST_HEAD(dev_kill_list);
6361
6362	rtnl_lock();
6363	list_for_each_entry(net, net_list, exit_list) {
6364		for_each_netdev_reverse(net, dev) {
6365			if (dev->rtnl_link_ops)
6366				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6367			else
6368				unregister_netdevice_queue(dev, &dev_kill_list);
6369		}
6370	}
6371	unregister_netdevice_many(&dev_kill_list);
6372	list_del(&dev_kill_list);
6373	rtnl_unlock();
6374}
6375
6376static struct pernet_operations __net_initdata default_device_ops = {
6377	.exit = default_device_exit,
6378	.exit_batch = default_device_exit_batch,
6379};
6380
6381/*
6382 *	Initialize the DEV module. At boot time this walks the device list and
6383 *	unhooks any devices that fail to initialise (normally hardware not
6384 *	present) and leaves us with a valid list of present and active devices.
6385 *
6386 */
6387
6388/*
6389 *       This is called single threaded during boot, so no need
6390 *       to take the rtnl semaphore.
6391 */
6392static int __init net_dev_init(void)
6393{
6394	int i, rc = -ENOMEM;
6395
6396	BUG_ON(!dev_boot_phase);
6397
6398	if (dev_proc_init())
6399		goto out;
6400
6401	if (netdev_kobject_init())
6402		goto out;
6403
6404	INIT_LIST_HEAD(&ptype_all);
6405	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6406		INIT_LIST_HEAD(&ptype_base[i]);
6407
6408	if (register_pernet_subsys(&netdev_net_ops))
6409		goto out;
6410
6411	/*
6412	 *	Initialise the packet receive queues.
6413	 */
6414
6415	for_each_possible_cpu(i) {
6416		struct softnet_data *sd = &per_cpu(softnet_data, i);
6417
6418		memset(sd, 0, sizeof(*sd));
6419		skb_queue_head_init(&sd->input_pkt_queue);
6420		skb_queue_head_init(&sd->process_queue);
6421		sd->completion_queue = NULL;
6422		INIT_LIST_HEAD(&sd->poll_list);
6423		sd->output_queue = NULL;
6424		sd->output_queue_tailp = &sd->output_queue;
6425#ifdef CONFIG_RPS
6426		sd->csd.func = rps_trigger_softirq;
6427		sd->csd.info = sd;
6428		sd->csd.flags = 0;
6429		sd->cpu = i;
6430#endif
6431
6432		sd->backlog.poll = process_backlog;
6433		sd->backlog.weight = weight_p;
6434		sd->backlog.gro_list = NULL;
6435		sd->backlog.gro_count = 0;
6436	}
6437
6438	dev_boot_phase = 0;
6439
6440	/* The loopback device is special if any other network devices
6441	 * is present in a network namespace the loopback device must
6442	 * be present. Since we now dynamically allocate and free the
6443	 * loopback device ensure this invariant is maintained by
6444	 * keeping the loopback device as the first device on the
6445	 * list of network devices.  Ensuring the loopback devices
6446	 * is the first device that appears and the last network device
6447	 * that disappears.
6448	 */
6449	if (register_pernet_device(&loopback_net_ops))
6450		goto out;
6451
6452	if (register_pernet_device(&default_device_ops))
6453		goto out;
6454
6455	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6456	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6457
6458	hotcpu_notifier(dev_cpu_callback, 0);
6459	dst_init();
6460	dev_mcast_init();
6461	rc = 0;
6462out:
6463	return rc;
6464}
6465
6466subsys_initcall(net_dev_init);
6467
6468static int __init initialize_hashrnd(void)
6469{
6470	get_random_bytes(&hashrnd, sizeof(hashrnd));
6471	return 0;
6472}
6473
6474late_initcall_sync(initialize_hashrnd);
6475