net/core/dev.c at v2.6.35-rc6 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / core / dev.c
at v2.6.35-rc6 5983 lines 148 kB view raw
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <asm/system.h>
  77#include <linux/bitops.h>
  78#include <linux/capability.h>
  79#include <linux/cpu.h>
  80#include <linux/types.h>
  81#include <linux/kernel.h>
  82#include <linux/hash.h>
  83#include <linux/slab.h>
  84#include <linux/sched.h>
  85#include <linux/mutex.h>
  86#include <linux/string.h>
  87#include <linux/mm.h>
  88#include <linux/socket.h>
  89#include <linux/sockios.h>
  90#include <linux/errno.h>
  91#include <linux/interrupt.h>
  92#include <linux/if_ether.h>
  93#include <linux/netdevice.h>
  94#include <linux/etherdevice.h>
  95#include <linux/ethtool.h>
  96#include <linux/notifier.h>
  97#include <linux/skbuff.h>
  98#include <net/net_namespace.h>
  99#include <net/sock.h>
 100#include <linux/rtnetlink.h>
 101#include <linux/proc_fs.h>
 102#include <linux/seq_file.h>
 103#include <linux/stat.h>
 104#include <linux/if_bridge.h>
 105#include <linux/if_macvlan.h>
 106#include <net/dst.h>
 107#include <net/pkt_sched.h>
 108#include <net/checksum.h>
 109#include <net/xfrm.h>
 110#include <linux/highmem.h>
 111#include <linux/init.h>
 112#include <linux/kmod.h>
 113#include <linux/module.h>
 114#include <linux/netpoll.h>
 115#include <linux/rcupdate.h>
 116#include <linux/delay.h>
 117#include <net/wext.h>
 118#include <net/iw_handler.h>
 119#include <asm/current.h>
 120#include <linux/audit.h>
 121#include <linux/dmaengine.h>
 122#include <linux/err.h>
 123#include <linux/ctype.h>
 124#include <linux/if_arp.h>
 125#include <linux/if_vlan.h>
 126#include <linux/ip.h>
 127#include <net/ip.h>
 128#include <linux/ipv6.h>
 129#include <linux/in.h>
 130#include <linux/jhash.h>
 131#include <linux/random.h>
 132#include <trace/events/napi.h>
 133#include <linux/pci.h>
 134
 135#include "net-sysfs.h"
 136
 137/* Instead of increasing this, you should create a hash table. */
 138#define MAX_GRO_SKBS 8
 139
 140/* This should be increased if a protocol with a bigger head is added. */
 141#define GRO_MAX_HEAD (MAX_HEADER + 128)
 142
 143/*
 144 *	The list of packet types we will receive (as opposed to discard)
 145 *	and the routines to invoke.
 146 *
 147 *	Why 16. Because with 16 the only overlap we get on a hash of the
 148 *	low nibble of the protocol value is RARP/SNAP/X.25.
 149 *
 150 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 151 *             sure which should go first, but I bet it won't make much
 152 *             difference if we are running VLANs.  The good news is that
 153 *             this protocol won't be in the list unless compiled in, so
 154 *             the average user (w/out VLANs) will not be adversely affected.
 155 *             --BLG
 156 *
 157 *		0800	IP
 158 *		8100    802.1Q VLAN
 159 *		0001	802.3
 160 *		0002	AX.25
 161 *		0004	802.2
 162 *		8035	RARP
 163 *		0005	SNAP
 164 *		0805	X.25
 165 *		0806	ARP
 166 *		8137	IPX
 167 *		0009	Localtalk
 168 *		86DD	IPv6
 169 */
 170
/* Bucket count of ptype_base[], and the mask applied to the host-order
 * protocol id to select a bucket (see dev_add_pack()).
 */
#define PTYPE_HASH_SIZE	(16)
#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)

/* Taken by dev_add_pack()/__dev_remove_pack() while they modify the lists below */
static DEFINE_SPINLOCK(ptype_lock);
/* Handlers registered for one specific protocol, bucketed by protocol id */
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
/* Handlers registered with ETH_P_ALL */
static struct list_head ptype_all __read_mostly;	/* Taps */
 177
 178/*
 179 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 180 * semaphore.
 181 *
 182 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 183 *
 184 * Writers must hold the rtnl semaphore while they loop through the
 185 * dev_base_head list, and hold dev_base_lock for writing when they do the
 186 * actual updates.  This allows pure readers to access the list even
 187 * while a writer is preparing to update it.
 188 *
 189 * To put it another way, dev_base_lock is held for writing only to
 190 * protect against pure readers; the rtnl semaphore provides the
 191 * protection against other writers.
 192 *
 193 * See, for example usages, register_netdevice() and
 194 * unregister_netdevice(), which must be called with the rtnl
 195 * semaphore held.
 196 */
 197DEFINE_RWLOCK(dev_base_lock);
 198EXPORT_SYMBOL(dev_base_lock);
 199
 200static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 201{
 202	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 203	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 204}
 205
 206static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 207{
 208	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 209}
 210
 211static inline void rps_lock(struct softnet_data *sd)
 212{
 213#ifdef CONFIG_RPS
 214	spin_lock(&sd->input_pkt_queue.lock);
 215#endif
 216}
 217
 218static inline void rps_unlock(struct softnet_data *sd)
 219{
 220#ifdef CONFIG_RPS
 221	spin_unlock(&sd->input_pkt_queue.lock);
 222#endif
 223}
 224
 225/* Device list insertion */
 226static int list_netdevice(struct net_device *dev)
 227{
 228	struct net *net = dev_net(dev);
 229
 230	ASSERT_RTNL();
 231
 232	write_lock_bh(&dev_base_lock);
 233	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 234	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 235	hlist_add_head_rcu(&dev->index_hlist,
 236			   dev_index_hash(net, dev->ifindex));
 237	write_unlock_bh(&dev_base_lock);
 238	return 0;
 239}
 240
 241/* Device list removal
 242 * caller must respect a RCU grace period before freeing/reusing dev
 243 */
 244static void unlist_netdevice(struct net_device *dev)
 245{
 246	ASSERT_RTNL();
 247
 248	/* Unlink dev from the device chain */
 249	write_lock_bh(&dev_base_lock);
 250	list_del_rcu(&dev->dev_list);
 251	hlist_del_rcu(&dev->name_hlist);
 252	hlist_del_rcu(&dev->index_hlist);
 253	write_unlock_bh(&dev_base_lock);
 254}
 255
 256/*
 257 *	Our notifier list
 258 */
 259
 260static RAW_NOTIFIER_HEAD(netdev_chain);
 261
 262/*
 263 *	Device drivers call our routines to queue packets here. We empty the
 264 *	queue in the local softnet handler.
 265 */
 266
 267DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 268EXPORT_PER_CPU_SYMBOL(softnet_data);
 269
 270#ifdef CONFIG_LOCKDEP
 271/*
 272 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 273 * according to dev->type
 274 */
/*
 * Device types that get their own lockdep class. netdev_lock_pos() returns
 * an index into this table, and the same index is used for
 * netdev_lock_name[] and the lock_class_key arrays, so entries here and in
 * netdev_lock_name[] must stay in the same order. The final entry doubles
 * as the fallback for device types that are not listed.
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
	 ARPHRD_VOID, ARPHRD_NONE};
 292
/*
 * Lockdep class names, indexed by the value netdev_lock_pos() returns.
 * Entry N names the class used for device type netdev_lock_type[N], so
 * the two tables must be kept in the same order.
 */
static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
	 "_xmit_VOID", "_xmit_NONE"};
 310
/*
 * One lockdep class key per netdev_lock_type[] entry. The first array is
 * used by netdev_set_xmit_lockdep_class(), the second by
 * netdev_set_addr_lockdep_class() for dev->addr_list_lock.
 */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 313
 314static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 315{
 316	int i;
 317
 318	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 319		if (netdev_lock_type[i] == dev_type)
 320			return i;
 321	/* the last key is used by default */
 322	return ARRAY_SIZE(netdev_lock_type) - 1;
 323}
 324
 325static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 326						 unsigned short dev_type)
 327{
 328	int i;
 329
 330	i = netdev_lock_pos(dev_type);
 331	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 332				   netdev_lock_name[i]);
 333}
 334
 335static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 336{
 337	int i;
 338
 339	i = netdev_lock_pos(dev->type);
 340	lockdep_set_class_and_name(&dev->addr_list_lock,
 341				   &netdev_addr_lock_key[i],
 342				   netdev_lock_name[i]);
 343}
 344#else
/* !CONFIG_LOCKDEP: there is no lock class to set, so this is a no-op */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
/* !CONFIG_LOCKDEP: there is no lock class to set, so this is a no-op */
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
 352#endif
 353
 354/*******************************************************************************
 355
 356		Protocol management and registration routines
 357
 358*******************************************************************************/
 359
 360/*
 361 *	Add a protocol ID to the list. Now that the input handler is
 362 *	smarter we can dispense with all the messy stuff that used to be
 363 *	here.
 364 *
 365 *	BEWARE!!! Protocol handlers, mangling input packets,
 366 *	MUST BE last in hash buckets and checking protocol handlers
 367 *	MUST start from promiscuous ptype_all chain in net_bh.
 368 *	It is true now, do not change it.
 369 *	Explanation follows: if protocol handler, mangling packet, will
 370 *	be the first on list, it is not able to sense, that packet
 371 *	is cloned and should be copied-on-write, so that it will
 372 *	change it and subsequent readers will get broken packet.
 373 *							--ANK (980803)
 374 */
 375
 376/**
 377 *	dev_add_pack - add packet handler
 378 *	@pt: packet type declaration
 379 *
 380 *	Add a protocol handler to the networking stack. The passed &packet_type
 381 *	is linked into kernel lists and may not be freed until it has been
 382 *	removed from the kernel lists.
 383 *
 384 *	This call does not sleep therefore it can not
 385 *	guarantee all CPU's that are in middle of receiving packets
 386 *	will see the new packet type (until the next received packet).
 387 */
 388
 389void dev_add_pack(struct packet_type *pt)
 390{
 391	int hash;
 392
 393	spin_lock_bh(&ptype_lock);
 394	if (pt->type == htons(ETH_P_ALL))
 395		list_add_rcu(&pt->list, &ptype_all);
 396	else {
 397		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
 398		list_add_rcu(&pt->list, &ptype_base[hash]);
 399	}
 400	spin_unlock_bh(&ptype_lock);
 401}
 402EXPORT_SYMBOL(dev_add_pack);
 403
 404/**
 405 *	__dev_remove_pack	 - remove packet handler
 406 *	@pt: packet type declaration
 407 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is
 *	unlinked from the kernel lists, but this function does not wait
 *	for receivers that are already using it.
 *
 *	The packet type might therefore still be in use by receivers
 *	and must not be freed or reused until after all the CPUs have gone
 *	through a quiescent state.
 416 */
 417void __dev_remove_pack(struct packet_type *pt)
 418{
 419	struct list_head *head;
 420	struct packet_type *pt1;
 421
 422	spin_lock_bh(&ptype_lock);
 423
 424	if (pt->type == htons(ETH_P_ALL))
 425		head = &ptype_all;
 426	else
 427		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 428
 429	list_for_each_entry(pt1, head, list) {
 430		if (pt == pt1) {
 431			list_del_rcu(&pt->list);
 432			goto out;
 433		}
 434	}
 435
 436	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 437out:
 438	spin_unlock_bh(&ptype_lock);
 439}
 440EXPORT_SYMBOL(__dev_remove_pack);
 441
 442/**
 443 *	dev_remove_pack	 - remove packet handler
 444 *	@pt: packet type declaration
 445 *
 446 *	Remove a protocol handler that was previously added to the kernel
 447 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 448 *	from the kernel lists and can be freed or reused once this function
 449 *	returns.
 450 *
 451 *	This call sleeps to guarantee that no CPU is looking at the packet
 452 *	type after return.
 453 */
 454void dev_remove_pack(struct packet_type *pt)
 455{
 456	__dev_remove_pack(pt);
 457
 458	synchronize_net();
 459}
 460EXPORT_SYMBOL(dev_remove_pack);
 461
 462/******************************************************************************
 463
 464		      Device Boot-time Settings Routines
 465
 466*******************************************************************************/
 467
/*
 * Boot time configuration table.
 *
 * Filled in by netdev_boot_setup_add() and consulted by
 * netdev_boot_setup_check() and netdev_boot_base(). A slot whose name
 * starts with a NUL or a space character is treated as unused.
 */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 470
 471/**
 472 *	netdev_boot_setup_add	- add new setup entry
 473 *	@name: name of the device
 474 *	@map: configured settings for the device
 475 *
 476 *	Adds new setup entry to the dev_boot_setup list.  The function
 477 *	returns 0 on error and 1 on success.  This is a generic routine to
 478 *	all netdevices.
 479 */
 480static int netdev_boot_setup_add(char *name, struct ifmap *map)
 481{
 482	struct netdev_boot_setup *s;
 483	int i;
 484
 485	s = dev_boot_setup;
 486	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 487		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 488			memset(s[i].name, 0, sizeof(s[i].name));
 489			strlcpy(s[i].name, name, IFNAMSIZ);
 490			memcpy(&s[i].map, map, sizeof(s[i].map));
 491			break;
 492		}
 493	}
 494
 495	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 496}
 497
 498/**
 499 *	netdev_boot_setup_check	- check boot time settings
 500 *	@dev: the netdevice
 501 *
 502 * 	Check boot time settings for the device.
 503 *	The found settings are set for the device to be used
 504 *	later in the device probing.
 505 *	Returns 0 if no settings found, 1 if they are.
 506 */
 507int netdev_boot_setup_check(struct net_device *dev)
 508{
 509	struct netdev_boot_setup *s = dev_boot_setup;
 510	int i;
 511
 512	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 513		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 514		    !strcmp(dev->name, s[i].name)) {
 515			dev->irq 	= s[i].map.irq;
 516			dev->base_addr 	= s[i].map.base_addr;
 517			dev->mem_start 	= s[i].map.mem_start;
 518			dev->mem_end 	= s[i].map.mem_end;
 519			return 1;
 520		}
 521	}
 522	return 0;
 523}
 524EXPORT_SYMBOL(netdev_boot_setup_check);
 525
 526
 527/**
 528 *	netdev_boot_base	- get address from boot time settings
 529 *	@prefix: prefix for network device
 530 *	@unit: id for network device
 531 *
 532 * 	Check boot time settings for the base address of device.
 533 *	The found settings are set for the device to be used
 534 *	later in the device probing.
 535 *	Returns 0 if no settings found.
 536 */
 537unsigned long netdev_boot_base(const char *prefix, int unit)
 538{
 539	const struct netdev_boot_setup *s = dev_boot_setup;
 540	char name[IFNAMSIZ];
 541	int i;
 542
 543	sprintf(name, "%s%d", prefix, unit);
 544
 545	/*
 546	 * If device already registered then return base of 1
 547	 * to indicate not to probe for this interface
 548	 */
 549	if (__dev_get_by_name(&init_net, name))
 550		return 1;
 551
 552	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 553		if (!strcmp(name, s[i].name))
 554			return s[i].map.base_addr;
 555	return 0;
 556}
 557
 558/*
 559 * Saves at boot time configured settings for any netdevice.
 560 */
 561int __init netdev_boot_setup(char *str)
 562{
 563	int ints[5];
 564	struct ifmap map;
 565
 566	str = get_options(str, ARRAY_SIZE(ints), ints);
 567	if (!str || !*str)
 568		return 0;
 569
 570	/* Save settings */
 571	memset(&map, 0, sizeof(map));
 572	if (ints[0] > 0)
 573		map.irq = ints[1];
 574	if (ints[0] > 1)
 575		map.base_addr = ints[2];
 576	if (ints[0] > 2)
 577		map.mem_start = ints[3];
 578	if (ints[0] > 3)
 579		map.mem_end = ints[4];
 580
 581	/* Add new entry to the list */
 582	return netdev_boot_setup_add(str, &map);
 583}
 584
 585__setup("netdev=", netdev_boot_setup);
 586
 587/*******************************************************************************
 588
 589			    Device Interface Subroutines
 590
 591*******************************************************************************/
 592
 593/**
 594 *	__dev_get_by_name	- find a device by its name
 595 *	@net: the applicable net namespace
 596 *	@name: name to find
 597 *
 598 *	Find an interface by name. Must be called under RTNL semaphore
 599 *	or @dev_base_lock. If the name is found a pointer to the device
 600 *	is returned. If the name is not found then %NULL is returned. The
 601 *	reference counters are not incremented so the caller must be
 602 *	careful with locks.
 603 */
 604
 605struct net_device *__dev_get_by_name(struct net *net, const char *name)
 606{
 607	struct hlist_node *p;
 608	struct net_device *dev;
 609	struct hlist_head *head = dev_name_hash(net, name);
 610
 611	hlist_for_each_entry(dev, p, head, name_hlist)
 612		if (!strncmp(dev->name, name, IFNAMSIZ))
 613			return dev;
 614
 615	return NULL;
 616}
 617EXPORT_SYMBOL(__dev_get_by_name);
 618
 619/**
 620 *	dev_get_by_name_rcu	- find a device by its name
 621 *	@net: the applicable net namespace
 622 *	@name: name to find
 623 *
 624 *	Find an interface by name.
 625 *	If the name is found a pointer to the device is returned.
 626 * 	If the name is not found then %NULL is returned.
 627 *	The reference counters are not incremented so the caller must be
 628 *	careful with locks. The caller must hold RCU lock.
 629 */
 630
 631struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 632{
 633	struct hlist_node *p;
 634	struct net_device *dev;
 635	struct hlist_head *head = dev_name_hash(net, name);
 636
 637	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 638		if (!strncmp(dev->name, name, IFNAMSIZ))
 639			return dev;
 640
 641	return NULL;
 642}
 643EXPORT_SYMBOL(dev_get_by_name_rcu);
 644
 645/**
 646 *	dev_get_by_name		- find a device by its name
 647 *	@net: the applicable net namespace
 648 *	@name: name to find
 649 *
 650 *	Find an interface by name. This can be called from any
 651 *	context and does its own locking. The returned handle has
 652 *	the usage count incremented and the caller must use dev_put() to
 653 *	release it when it is no longer needed. %NULL is returned if no
 654 *	matching device is found.
 655 */
 656
 657struct net_device *dev_get_by_name(struct net *net, const char *name)
 658{
 659	struct net_device *dev;
 660
 661	rcu_read_lock();
 662	dev = dev_get_by_name_rcu(net, name);
 663	if (dev)
 664		dev_hold(dev);
 665	rcu_read_unlock();
 666	return dev;
 667}
 668EXPORT_SYMBOL(dev_get_by_name);
 669
 670/**
 671 *	__dev_get_by_index - find a device by its ifindex
 672 *	@net: the applicable net namespace
 673 *	@ifindex: index of device
 674 *
 675 *	Search for an interface by index. Returns %NULL if the device
 676 *	is not found or a pointer to the device. The device has not
 677 *	had its reference counter increased so the caller must be careful
 678 *	about locking. The caller must hold either the RTNL semaphore
 679 *	or @dev_base_lock.
 680 */
 681
 682struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 683{
 684	struct hlist_node *p;
 685	struct net_device *dev;
 686	struct hlist_head *head = dev_index_hash(net, ifindex);
 687
 688	hlist_for_each_entry(dev, p, head, index_hlist)
 689		if (dev->ifindex == ifindex)
 690			return dev;
 691
 692	return NULL;
 693}
 694EXPORT_SYMBOL(__dev_get_by_index);
 695
 696/**
 697 *	dev_get_by_index_rcu - find a device by its ifindex
 698 *	@net: the applicable net namespace
 699 *	@ifindex: index of device
 700 *
 701 *	Search for an interface by index. Returns %NULL if the device
 702 *	is not found or a pointer to the device. The device has not
 703 *	had its reference counter increased so the caller must be careful
 704 *	about locking. The caller must hold RCU lock.
 705 */
 706
 707struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 708{
 709	struct hlist_node *p;
 710	struct net_device *dev;
 711	struct hlist_head *head = dev_index_hash(net, ifindex);
 712
 713	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 714		if (dev->ifindex == ifindex)
 715			return dev;
 716
 717	return NULL;
 718}
 719EXPORT_SYMBOL(dev_get_by_index_rcu);
 720
 721
 722/**
 723 *	dev_get_by_index - find a device by its ifindex
 724 *	@net: the applicable net namespace
 725 *	@ifindex: index of device
 726 *
 727 *	Search for an interface by index. Returns NULL if the device
 728 *	is not found or a pointer to the device. The device returned has
 729 *	had a reference added and the pointer is safe until the user calls
 730 *	dev_put to indicate they have finished with it.
 731 */
 732
 733struct net_device *dev_get_by_index(struct net *net, int ifindex)
 734{
 735	struct net_device *dev;
 736
 737	rcu_read_lock();
 738	dev = dev_get_by_index_rcu(net, ifindex);
 739	if (dev)
 740		dev_hold(dev);
 741	rcu_read_unlock();
 742	return dev;
 743}
 744EXPORT_SYMBOL(dev_get_by_index);
 745
 746/**
 747 *	dev_getbyhwaddr - find a device by its hardware address
 748 *	@net: the applicable net namespace
 749 *	@type: media type of device
 750 *	@ha: hardware address
 751 *
 752 *	Search for an interface by MAC address. Returns NULL if the device
 753 *	is not found or a pointer to the device. The caller must hold the
 754 *	rtnl semaphore. The returned device has not had its ref count increased
 755 *	and the caller must therefore be careful about locking
 756 *
 757 *	BUGS:
 758 *	If the API was consistent this would be __dev_get_by_hwaddr
 759 */
 760
 761struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
 762{
 763	struct net_device *dev;
 764
 765	ASSERT_RTNL();
 766
 767	for_each_netdev(net, dev)
 768		if (dev->type == type &&
 769		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 770			return dev;
 771
 772	return NULL;
 773}
 774EXPORT_SYMBOL(dev_getbyhwaddr);
 775
 776struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 777{
 778	struct net_device *dev;
 779
 780	ASSERT_RTNL();
 781	for_each_netdev(net, dev)
 782		if (dev->type == type)
 783			return dev;
 784
 785	return NULL;
 786}
 787EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 788
 789struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 790{
 791	struct net_device *dev, *ret = NULL;
 792
 793	rcu_read_lock();
 794	for_each_netdev_rcu(net, dev)
 795		if (dev->type == type) {
 796			dev_hold(dev);
 797			ret = dev;
 798			break;
 799		}
 800	rcu_read_unlock();
 801	return ret;
 802}
 803EXPORT_SYMBOL(dev_getfirstbyhwtype);
 804
 805/**
 806 *	dev_get_by_flags - find any device with given flags
 807 *	@net: the applicable net namespace
 808 *	@if_flags: IFF_* values
 809 *	@mask: bitmask of bits in if_flags to check
 810 *
 811 *	Search for any interface with the given flags. Returns NULL if a device
 812 *	is not found or a pointer to the device. The device returned has
 813 *	had a reference added and the pointer is safe until the user calls
 814 *	dev_put to indicate they have finished with it.
 815 */
 816
 817struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags,
 818				    unsigned short mask)
 819{
 820	struct net_device *dev, *ret;
 821
 822	ret = NULL;
 823	rcu_read_lock();
 824	for_each_netdev_rcu(net, dev) {
 825		if (((dev->flags ^ if_flags) & mask) == 0) {
 826			dev_hold(dev);
 827			ret = dev;
 828			break;
 829		}
 830	}
 831	rcu_read_unlock();
 832	return ret;
 833}
 834EXPORT_SYMBOL(dev_get_by_flags);
 835
 836/**
 837 *	dev_valid_name - check if name is okay for network device
 838 *	@name: name string
 839 *
 *	Network device names need to be valid file names
 *	to allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 843 */
 844int dev_valid_name(const char *name)
 845{
 846	if (*name == '\0')
 847		return 0;
 848	if (strlen(name) >= IFNAMSIZ)
 849		return 0;
 850	if (!strcmp(name, ".") || !strcmp(name, ".."))
 851		return 0;
 852
 853	while (*name) {
 854		if (*name == '/' || isspace(*name))
 855			return 0;
 856		name++;
 857	}
 858	return 1;
 859}
 860EXPORT_SYMBOL(dev_valid_name);
 861
 862/**
 863 *	__dev_alloc_name - allocate a name for a device
 864 *	@net: network namespace to allocate the device name in
 865 *	@name: name format string
 866 *	@buf:  scratch buffer and result name string
 867 *
 868 *	Passed a format string - eg "lt%d" it will try and find a suitable
 869 *	id. It scans list of devices to build up a free map, then chooses
 870 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 871 *	while allocating the name and adding the device in order to avoid
 872 *	duplicates.
 873 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 874 *	Returns the number of the unit assigned or a negative errno code.
 875 */
 876
 877static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 878{
 879	int i = 0;
 880	const char *p;
 881	const int max_netdevices = 8*PAGE_SIZE;
 882	unsigned long *inuse;
 883	struct net_device *d;
 884
 885	p = strnchr(name, IFNAMSIZ-1, '%');
 886	if (p) {
 887		/*
 888		 * Verify the string as this thing may have come from
 889		 * the user.  There must be either one "%d" and no other "%"
 890		 * characters.
 891		 */
 892		if (p[1] != 'd' || strchr(p + 2, '%'))
 893			return -EINVAL;
 894
 895		/* Use one page as a bit array of possible slots */
 896		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 897		if (!inuse)
 898			return -ENOMEM;
 899
 900		for_each_netdev(net, d) {
 901			if (!sscanf(d->name, name, &i))
 902				continue;
 903			if (i < 0 || i >= max_netdevices)
 904				continue;
 905
 906			/*  avoid cases where sscanf is not exact inverse of printf */
 907			snprintf(buf, IFNAMSIZ, name, i);
 908			if (!strncmp(buf, d->name, IFNAMSIZ))
 909				set_bit(i, inuse);
 910		}
 911
 912		i = find_first_zero_bit(inuse, max_netdevices);
 913		free_page((unsigned long) inuse);
 914	}
 915
 916	if (buf != name)
 917		snprintf(buf, IFNAMSIZ, name, i);
 918	if (!__dev_get_by_name(net, buf))
 919		return i;
 920
 921	/* It is possible to run out of possible slots
 922	 * when the name is long and there isn't enough space left
 923	 * for the digits, or if all bits are used.
 924	 */
 925	return -ENFILE;
 926}
 927
 928/**
 929 *	dev_alloc_name - allocate a name for a device
 930 *	@dev: device
 931 *	@name: name format string
 932 *
 933 *	Passed a format string - eg "lt%d" it will try and find a suitable
 934 *	id. It scans list of devices to build up a free map, then chooses
 935 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 936 *	while allocating the name and adding the device in order to avoid
 937 *	duplicates.
 938 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 939 *	Returns the number of the unit assigned or a negative errno code.
 940 */
 941
 942int dev_alloc_name(struct net_device *dev, const char *name)
 943{
 944	char buf[IFNAMSIZ];
 945	struct net *net;
 946	int ret;
 947
 948	BUG_ON(!dev_net(dev));
 949	net = dev_net(dev);
 950	ret = __dev_alloc_name(net, name, buf);
 951	if (ret >= 0)
 952		strlcpy(dev->name, buf, IFNAMSIZ);
 953	return ret;
 954}
 955EXPORT_SYMBOL(dev_alloc_name);
 956
 957static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
 958{
 959	struct net *net;
 960
 961	BUG_ON(!dev_net(dev));
 962	net = dev_net(dev);
 963
 964	if (!dev_valid_name(name))
 965		return -EINVAL;
 966
 967	if (fmt && strchr(name, '%'))
 968		return dev_alloc_name(dev, name);
 969	else if (__dev_get_by_name(net, name))
 970		return -EEXIST;
 971	else if (dev->name != name)
 972		strlcpy(dev->name, name, IFNAMSIZ);
 973
 974	return 0;
 975}
 976
 977/**
 978 *	dev_change_name - change name of a device
 979 *	@dev: device
 980 *	@newname: name (or format string) must be at least IFNAMSIZ
 981 *
 982 *	Change name of a device, can pass format strings "eth%d".
 983 *	for wildcarding.
 984 */
 985int dev_change_name(struct net_device *dev, const char *newname)
 986{
 987	char oldname[IFNAMSIZ];
 988	int err = 0;
 989	int ret;
 990	struct net *net;
 991
 992	ASSERT_RTNL();
 993	BUG_ON(!dev_net(dev));
 994
 995	net = dev_net(dev);
 996	if (dev->flags & IFF_UP)
 997		return -EBUSY;
 998
 999	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1000		return 0;
1001
1002	memcpy(oldname, dev->name, IFNAMSIZ);
1003
1004	err = dev_get_valid_name(dev, newname, 1);
1005	if (err < 0)
1006		return err;
1007
1008rollback:
1009	ret = device_rename(&dev->dev, dev->name);
1010	if (ret) {
1011		memcpy(dev->name, oldname, IFNAMSIZ);
1012		return ret;
1013	}
1014
1015	write_lock_bh(&dev_base_lock);
1016	hlist_del(&dev->name_hlist);
1017	write_unlock_bh(&dev_base_lock);
1018
1019	synchronize_rcu();
1020
1021	write_lock_bh(&dev_base_lock);
1022	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1023	write_unlock_bh(&dev_base_lock);
1024
1025	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1026	ret = notifier_to_errno(ret);
1027
1028	if (ret) {
1029		/* err >= 0 after dev_alloc_name() or stores the first errno */
1030		if (err >= 0) {
1031			err = ret;
1032			memcpy(dev->name, oldname, IFNAMSIZ);
1033			goto rollback;
1034		} else {
1035			printk(KERN_ERR
1036			       "%s: name change rollback failed: %d.\n",
1037			       dev->name, ret);
1038		}
1039	}
1040
1041	return err;
1042}
1043
1044/**
1045 *	dev_set_alias - change ifalias of a device
1046 *	@dev: device
1047 *	@alias: name up to IFALIASZ
1048 *	@len: limit of bytes to copy from info
1049 *
1050 *	Set ifalias for a device,
1051 */
1052int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1053{
1054	ASSERT_RTNL();
1055
1056	if (len >= IFALIASZ)
1057		return -EINVAL;
1058
1059	if (!len) {
1060		if (dev->ifalias) {
1061			kfree(dev->ifalias);
1062			dev->ifalias = NULL;
1063		}
1064		return 0;
1065	}
1066
1067	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1068	if (!dev->ifalias)
1069		return -ENOMEM;
1070
1071	strlcpy(dev->ifalias, alias, len+1);
1072	return len;
1073}
1074
1075
1076/**
1077 *	netdev_features_change - device changes features
1078 *	@dev: device to cause notification
1079 *
1080 *	Called to indicate a device has changed features.
1081 */
1082void netdev_features_change(struct net_device *dev)
1083{
1084	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1085}
1086EXPORT_SYMBOL(netdev_features_change);
1087
1088/**
1089 *	netdev_state_change - device changes state
1090 *	@dev: device to cause notification
1091 *
1092 *	Called to indicate a device has changed state. This function calls
1093 *	the notifier chains for netdev_chain and sends a NEWLINK message
1094 *	to the routing socket.
1095 */
1096void netdev_state_change(struct net_device *dev)
1097{
1098	if (dev->flags & IFF_UP) {
1099		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1100		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1101	}
1102}
1103EXPORT_SYMBOL(netdev_state_change);
1104
/*
 * Forward @event for @dev to the netdev notifier chain and hand back the
 * chain's return value unchanged.
 */
int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	int chain_ret = call_netdevice_notifiers(event, dev);

	return chain_ret;
}
EXPORT_SYMBOL(netdev_bonding_change);
1110
1111/**
1112 *	dev_load 	- load a network module
1113 *	@net: the applicable net namespace
1114 *	@name: name of interface
1115 *
1116 *	If a network interface is not present and the process has suitable
1117 *	privileges this function loads the module. If module loading is not
1118 *	available in this kernel then it becomes a nop.
1119 */
1120
1121void dev_load(struct net *net, const char *name)
1122{
1123	struct net_device *dev;
1124
1125	rcu_read_lock();
1126	dev = dev_get_by_name_rcu(net, name);
1127	rcu_read_unlock();
1128
1129	if (!dev && capable(CAP_NET_ADMIN))
1130		request_module("%s", name);
1131}
1132EXPORT_SYMBOL(dev_load);
1133
1134static int __dev_open(struct net_device *dev)
1135{
1136	const struct net_device_ops *ops = dev->netdev_ops;
1137	int ret;
1138
1139	ASSERT_RTNL();
1140
1141	/*
1142	 *	Is it even present?
1143	 */
1144	if (!netif_device_present(dev))
1145		return -ENODEV;
1146
1147	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1148	ret = notifier_to_errno(ret);
1149	if (ret)
1150		return ret;
1151
1152	/*
1153	 *	Call device private open method
1154	 */
1155	set_bit(__LINK_STATE_START, &dev->state);
1156
1157	if (ops->ndo_validate_addr)
1158		ret = ops->ndo_validate_addr(dev);
1159
1160	if (!ret && ops->ndo_open)
1161		ret = ops->ndo_open(dev);
1162
1163	/*
1164	 *	If it went open OK then:
1165	 */
1166
1167	if (ret)
1168		clear_bit(__LINK_STATE_START, &dev->state);
1169	else {
1170		/*
1171		 *	Set the flags.
1172		 */
1173		dev->flags |= IFF_UP;
1174
1175		/*
1176		 *	Enable NET_DMA
1177		 */
1178		net_dmaengine_get();
1179
1180		/*
1181		 *	Initialize multicasting status
1182		 */
1183		dev_set_rx_mode(dev);
1184
1185		/*
1186		 *	Wakeup transmit queue engine
1187		 */
1188		dev_activate(dev);
1189	}
1190
1191	return ret;
1192}
1193
1194/**
1195 *	dev_open	- prepare an interface for use.
1196 *	@dev:	device to open
1197 *
1198 *	Takes a device from down to up state. The device's private open
1199 *	function is invoked and then the multicast lists are loaded. Finally
1200 *	the device is moved into the up state and a %NETDEV_UP message is
1201 *	sent to the netdev notifier chain.
1202 *
1203 *	Calling this function on an active interface is a nop. On a failure
1204 *	a negative errno code is returned.
1205 */
1206int dev_open(struct net_device *dev)
1207{
1208	int ret;
1209
1210	/*
1211	 *	Is it already up?
1212	 */
1213	if (dev->flags & IFF_UP)
1214		return 0;
1215
1216	/*
1217	 *	Open device
1218	 */
1219	ret = __dev_open(dev);
1220	if (ret < 0)
1221		return ret;
1222
1223	/*
1224	 *	... and announce new interface.
1225	 */
1226	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1227	call_netdevice_notifiers(NETDEV_UP, dev);
1228
1229	return ret;
1230}
1231EXPORT_SYMBOL(dev_open);
1232
1233static int __dev_close(struct net_device *dev)
1234{
1235	const struct net_device_ops *ops = dev->netdev_ops;
1236
1237	ASSERT_RTNL();
1238	might_sleep();
1239
1240	/*
1241	 *	Tell people we are going down, so that they can
1242	 *	prepare to death, when device is still operating.
1243	 */
1244	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1245
1246	clear_bit(__LINK_STATE_START, &dev->state);
1247
1248	/* Synchronize to scheduled poll. We cannot touch poll list,
1249	 * it can be even on different cpu. So just clear netif_running().
1250	 *
1251	 * dev->stop() will invoke napi_disable() on all of it's
1252	 * napi_struct instances on this device.
1253	 */
1254	smp_mb__after_clear_bit(); /* Commit netif_running(). */
1255
1256	dev_deactivate(dev);
1257
1258	/*
1259	 *	Call the device specific close. This cannot fail.
1260	 *	Only if device is UP
1261	 *
1262	 *	We allow it to be called even after a DETACH hot-plug
1263	 *	event.
1264	 */
1265	if (ops->ndo_stop)
1266		ops->ndo_stop(dev);
1267
1268	/*
1269	 *	Device is now down.
1270	 */
1271
1272	dev->flags &= ~IFF_UP;
1273
1274	/*
1275	 *	Shutdown NET_DMA
1276	 */
1277	net_dmaengine_put();
1278
1279	return 0;
1280}
1281
1282/**
1283 *	dev_close - shutdown an interface.
1284 *	@dev: device to shutdown
1285 *
1286 *	This function moves an active device into down state. A
1287 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1288 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1289 *	chain.
1290 */
1291int dev_close(struct net_device *dev)
1292{
1293	if (!(dev->flags & IFF_UP))
1294		return 0;
1295
1296	__dev_close(dev);
1297
1298	/*
1299	 * Tell people we are down
1300	 */
1301	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1302	call_netdevice_notifiers(NETDEV_DOWN, dev);
1303
1304	return 0;
1305}
1306EXPORT_SYMBOL(dev_close);
1307
1308
1309/**
1310 *	dev_disable_lro - disable Large Receive Offload on a device
1311 *	@dev: device
1312 *
1313 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1314 *	called under RTNL.  This is needed if received packets may be
1315 *	forwarded to another interface.
1316 */
1317void dev_disable_lro(struct net_device *dev)
1318{
1319	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1320	    dev->ethtool_ops->set_flags) {
1321		u32 flags = dev->ethtool_ops->get_flags(dev);
1322		if (flags & ETH_FLAG_LRO) {
1323			flags &= ~ETH_FLAG_LRO;
1324			dev->ethtool_ops->set_flags(dev, flags);
1325		}
1326	}
1327	WARN_ON(dev->features & NETIF_F_LRO);
1328}
1329EXPORT_SYMBOL(dev_disable_lro);
1330
1331
1332static int dev_boot_phase = 1;
1333
1334/*
1335 *	Device change register/unregister. These are not inline or static
1336 *	as we export them to the world.
1337 */
1338
1339/**
1340 *	register_netdevice_notifier - register a network notifier block
1341 *	@nb: notifier
1342 *
1343 *	Register a notifier to be called when network device events occur.
1344 *	The notifier passed is linked into the kernel structures and must
1345 *	not be reused until it has been unregistered. A negative errno code
1346 *	is returned on a failure.
1347 *
1348 * 	When registered all registration and up events are replayed
1349 *	to the new notifier to allow device to have a race free
1350 *	view of the network device list.
1351 */
1352
1353int register_netdevice_notifier(struct notifier_block *nb)
1354{
1355	struct net_device *dev;
1356	struct net_device *last;
1357	struct net *net;
1358	int err;
1359
1360	rtnl_lock();
1361	err = raw_notifier_chain_register(&netdev_chain, nb);
1362	if (err)
1363		goto unlock;
1364	if (dev_boot_phase)
1365		goto unlock;
1366	for_each_net(net) {
1367		for_each_netdev(net, dev) {
1368			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1369			err = notifier_to_errno(err);
1370			if (err)
1371				goto rollback;
1372
1373			if (!(dev->flags & IFF_UP))
1374				continue;
1375
1376			nb->notifier_call(nb, NETDEV_UP, dev);
1377		}
1378	}
1379
1380unlock:
1381	rtnl_unlock();
1382	return err;
1383
1384rollback:
1385	last = dev;
1386	for_each_net(net) {
1387		for_each_netdev(net, dev) {
1388			if (dev == last)
1389				break;
1390
1391			if (dev->flags & IFF_UP) {
1392				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1393				nb->notifier_call(nb, NETDEV_DOWN, dev);
1394			}
1395			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1396			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1397		}
1398	}
1399
1400	raw_notifier_chain_unregister(&netdev_chain, nb);
1401	goto unlock;
1402}
1403EXPORT_SYMBOL(register_netdevice_notifier);
1404
1405/**
1406 *	unregister_netdevice_notifier - unregister a network notifier block
1407 *	@nb: notifier
1408 *
1409 *	Unregister a notifier previously registered by
1410 *	register_netdevice_notifier(). The notifier is unlinked into the
1411 *	kernel structures and may then be reused. A negative errno code
1412 *	is returned on a failure.
1413 */
1414
1415int unregister_netdevice_notifier(struct notifier_block *nb)
1416{
1417	int err;
1418
1419	rtnl_lock();
1420	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1421	rtnl_unlock();
1422	return err;
1423}
1424EXPORT_SYMBOL(unregister_netdevice_notifier);
1425
1426/**
1427 *	call_netdevice_notifiers - call all network notifier blocks
1428 *      @val: value passed unmodified to notifier function
1429 *      @dev: net_device pointer passed unmodified to notifier function
1430 *
1431 *	Call all network notifier blocks.  Parameters and return value
1432 *	are as for raw_notifier_call_chain().
1433 */
1434
1435int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1436{
1437	ASSERT_RTNL();
1438	return raw_notifier_call_chain(&netdev_chain, val, dev);
1439}
1440
1441/* When > 0 there are consumers of rx skb time stamps */
1442static atomic_t netstamp_needed = ATOMIC_INIT(0);
1443
1444void net_enable_timestamp(void)
1445{
1446	atomic_inc(&netstamp_needed);
1447}
1448EXPORT_SYMBOL(net_enable_timestamp);
1449
1450void net_disable_timestamp(void)
1451{
1452	atomic_dec(&netstamp_needed);
1453}
1454EXPORT_SYMBOL(net_disable_timestamp);
1455
1456static inline void net_timestamp_set(struct sk_buff *skb)
1457{
1458	if (atomic_read(&netstamp_needed))
1459		__net_timestamp(skb);
1460	else
1461		skb->tstamp.tv64 = 0;
1462}
1463
1464static inline void net_timestamp_check(struct sk_buff *skb)
1465{
1466	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1467		__net_timestamp(skb);
1468}
1469
1470/**
1471 * dev_forward_skb - loopback an skb to another netif
1472 *
1473 * @dev: destination network device
1474 * @skb: buffer to forward
1475 *
1476 * return values:
1477 *	NET_RX_SUCCESS	(no congestion)
1478 *	NET_RX_DROP     (packet was dropped, but freed)
1479 *
1480 * dev_forward_skb can be used for injecting an skb from the
1481 * start_xmit function of one device into the receive queue
1482 * of another device.
1483 *
1484 * The receiving device may be in another namespace, so
1485 * we have to clear all information in the skb that could
1486 * impact namespace isolation.
1487 */
1488int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1489{
1490	skb_orphan(skb);
1491
1492	if (!(dev->flags & IFF_UP) ||
1493	    (skb->len > (dev->mtu + dev->hard_header_len))) {
1494		kfree_skb(skb);
1495		return NET_RX_DROP;
1496	}
1497	skb_set_dev(skb, dev);
1498	skb->tstamp.tv64 = 0;
1499	skb->pkt_type = PACKET_HOST;
1500	skb->protocol = eth_type_trans(skb, dev);
1501	return netif_rx(skb);
1502}
1503EXPORT_SYMBOL_GPL(dev_forward_skb);
1504
1505/*
1506 *	Support routine. Sends outgoing frames to any network
1507 *	taps currently in use.
1508 */
1509
1510static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1511{
1512	struct packet_type *ptype;
1513
1514#ifdef CONFIG_NET_CLS_ACT
1515	if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
1516		net_timestamp_set(skb);
1517#else
1518	net_timestamp_set(skb);
1519#endif
1520
1521	rcu_read_lock();
1522	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1523		/* Never send packets back to the socket
1524		 * they originated from - MvS (miquels@drinkel.ow.org)
1525		 */
1526		if ((ptype->dev == dev || !ptype->dev) &&
1527		    (ptype->af_packet_priv == NULL ||
1528		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1529			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1530			if (!skb2)
1531				break;
1532
1533			/* skb->nh should be correctly
1534			   set by sender, so that the second statement is
1535			   just protection against buggy protocols.
1536			 */
1537			skb_reset_mac_header(skb2);
1538
1539			if (skb_network_header(skb2) < skb2->data ||
1540			    skb2->network_header > skb2->tail) {
1541				if (net_ratelimit())
1542					printk(KERN_CRIT "protocol %04x is "
1543					       "buggy, dev %s\n",
1544					       skb2->protocol, dev->name);
1545				skb_reset_network_header(skb2);
1546			}
1547
1548			skb2->transport_header = skb2->network_header;
1549			skb2->pkt_type = PACKET_OUTGOING;
1550			ptype->func(skb2, skb->dev, ptype, skb->dev);
1551		}
1552	}
1553	rcu_read_unlock();
1554}
1555
1556/*
1557 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1558 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1559 */
1560void netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1561{
1562	unsigned int real_num = dev->real_num_tx_queues;
1563
1564	if (unlikely(txq > dev->num_tx_queues))
1565		;
1566	else if (txq > real_num)
1567		dev->real_num_tx_queues = txq;
1568	else if (txq < real_num) {
1569		dev->real_num_tx_queues = txq;
1570		qdisc_reset_all_tx_gt(dev, txq);
1571	}
1572}
1573EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1574
1575static inline void __netif_reschedule(struct Qdisc *q)
1576{
1577	struct softnet_data *sd;
1578	unsigned long flags;
1579
1580	local_irq_save(flags);
1581	sd = &__get_cpu_var(softnet_data);
1582	q->next_sched = NULL;
1583	*sd->output_queue_tailp = q;
1584	sd->output_queue_tailp = &q->next_sched;
1585	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1586	local_irq_restore(flags);
1587}
1588
1589void __netif_schedule(struct Qdisc *q)
1590{
1591	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1592		__netif_reschedule(q);
1593}
1594EXPORT_SYMBOL(__netif_schedule);
1595
1596void dev_kfree_skb_irq(struct sk_buff *skb)
1597{
1598	if (atomic_dec_and_test(&skb->users)) {
1599		struct softnet_data *sd;
1600		unsigned long flags;
1601
1602		local_irq_save(flags);
1603		sd = &__get_cpu_var(softnet_data);
1604		skb->next = sd->completion_queue;
1605		sd->completion_queue = skb;
1606		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1607		local_irq_restore(flags);
1608	}
1609}
1610EXPORT_SYMBOL(dev_kfree_skb_irq);
1611
/*
 * Free @skb from any context: use the deferred dev_kfree_skb_irq() path
 * in hard interrupt context or with interrupts disabled, and
 * dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (in_irq() || irqs_disabled()) {
		dev_kfree_skb_irq(skb);
		return;
	}

	dev_kfree_skb(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1620
1621
1622/**
1623 * netif_device_detach - mark device as removed
1624 * @dev: network device
1625 *
1626 * Mark device as removed from system and therefore no longer available.
1627 */
1628void netif_device_detach(struct net_device *dev)
1629{
1630	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1631	    netif_running(dev)) {
1632		netif_tx_stop_all_queues(dev);
1633	}
1634}
1635EXPORT_SYMBOL(netif_device_detach);
1636
1637/**
1638 * netif_device_attach - mark device as attached
1639 * @dev: network device
1640 *
1641 * Mark device as attached from system and restart if needed.
1642 */
1643void netif_device_attach(struct net_device *dev)
1644{
1645	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1646	    netif_running(dev)) {
1647		netif_tx_wake_all_queues(dev);
1648		__netdev_watchdog_up(dev);
1649	}
1650}
1651EXPORT_SYMBOL(netif_device_attach);
1652
1653static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1654{
1655	return ((features & NETIF_F_GEN_CSUM) ||
1656		((features & NETIF_F_IP_CSUM) &&
1657		 protocol == htons(ETH_P_IP)) ||
1658		((features & NETIF_F_IPV6_CSUM) &&
1659		 protocol == htons(ETH_P_IPV6)) ||
1660		((features & NETIF_F_FCOE_CRC) &&
1661		 protocol == htons(ETH_P_FCOE)));
1662}
1663
1664static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1665{
1666	if (can_checksum_protocol(dev->features, skb->protocol))
1667		return true;
1668
1669	if (skb->protocol == htons(ETH_P_8021Q)) {
1670		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1671		if (can_checksum_protocol(dev->features & dev->vlan_features,
1672					  veh->h_vlan_encapsulated_proto))
1673			return true;
1674	}
1675
1676	return false;
1677}
1678
/**
 * skb_set_dev -- assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * If an skb is owned by a device already, we have to reset
 * all data private to the namespace a device belongs to
 * before assigning it a new device.  The dst reference is dropped in
 * every case; the remaining state is only reset when the old and the
 * new device live in different namespaces.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
	struct net_device *prev = skb->dev;

	skb_dst_drop(skb);

	if (prev && !net_eq(dev_net(prev), dev_net(dev))) {
		secpath_reset(skb);
		nf_reset(skb);
		skb_init_secmark(skb);
		skb->mark = 0;
		skb->priority = 0;
		skb->nf_trace = 0;
		skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
		skb->tc_index = 0;
#endif
	}

	skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */
1708
1709/*
1710 * Invalidate hardware checksum when packet is to be mangled, and
1711 * complete checksum manually on outgoing path.
1712 */
1713int skb_checksum_help(struct sk_buff *skb)
1714{
1715	__wsum csum;
1716	int ret = 0, offset;
1717
1718	if (skb->ip_summed == CHECKSUM_COMPLETE)
1719		goto out_set_summed;
1720
1721	if (unlikely(skb_shinfo(skb)->gso_size)) {
1722		/* Let GSO fix up the checksum. */
1723		goto out_set_summed;
1724	}
1725
1726	offset = skb->csum_start - skb_headroom(skb);
1727	BUG_ON(offset >= skb_headlen(skb));
1728	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1729
1730	offset += skb->csum_offset;
1731	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1732
1733	if (skb_cloned(skb) &&
1734	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1735		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1736		if (ret)
1737			goto out;
1738	}
1739
1740	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1741out_set_summed:
1742	skb->ip_summed = CHECKSUM_NONE;
1743out:
1744	return ret;
1745}
1746EXPORT_SYMBOL(skb_checksum_help);
1747
1748/**
1749 *	skb_gso_segment - Perform segmentation on skb.
1750 *	@skb: buffer to segment
1751 *	@features: features for the output path (see dev->features)
1752 *
1753 *	This function segments the given skb and returns a list of segments.
1754 *
1755 *	It may return NULL if the skb requires no segmentation.  This is
1756 *	only possible when GSO is used for verifying header integrity.
1757 */
1758struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1759{
1760	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1761	struct packet_type *ptype;
1762	__be16 type = skb->protocol;
1763	int err;
1764
1765	skb_reset_mac_header(skb);
1766	skb->mac_len = skb->network_header - skb->mac_header;
1767	__skb_pull(skb, skb->mac_len);
1768
1769	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1770		struct net_device *dev = skb->dev;
1771		struct ethtool_drvinfo info = {};
1772
1773		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1774			dev->ethtool_ops->get_drvinfo(dev, &info);
1775
1776		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1777			"ip_summed=%d",
1778		     info.driver, dev ? dev->features : 0L,
1779		     skb->sk ? skb->sk->sk_route_caps : 0L,
1780		     skb->len, skb->data_len, skb->ip_summed);
1781
1782		if (skb_header_cloned(skb) &&
1783		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1784			return ERR_PTR(err);
1785	}
1786
1787	rcu_read_lock();
1788	list_for_each_entry_rcu(ptype,
1789			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1790		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1791			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1792				err = ptype->gso_send_check(skb);
1793				segs = ERR_PTR(err);
1794				if (err || skb_gso_ok(skb, features))
1795					break;
1796				__skb_push(skb, (skb->data -
1797						 skb_network_header(skb)));
1798			}
1799			segs = ptype->gso_segment(skb, features);
1800			break;
1801		}
1802	}
1803	rcu_read_unlock();
1804
1805	__skb_push(skb, skb->data - skb_mac_header(skb));
1806
1807	return segs;
1808}
1809EXPORT_SYMBOL(skb_gso_segment);
1810
/*
 * Take action when hardware reception checksum errors are detected:
 * log a rate-limited error naming the device (or "<unknown>" when @dev
 * is NULL) and dump the stack.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	printk(KERN_ERR "%s: hw csum failure.\n",
		dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1823
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 when a page fragment of @skb cannot be reached by @dev for
 * DMA, 0 otherwise.  Without CONFIG_HIGHMEM the answer is always 0.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int i;

	/* Highmem fragments need NETIF_F_HIGHDMA. */
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < shinfo->nr_frags; i++) {
			if (PageHighMem(shinfo->frags[i].page))
				return 1;
		}
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *parent = dev->dev.parent;

		if (!parent)
			return 0;

		/* Every fragment page must end below the parent's DMA mask. */
		for (i = 0; i < shinfo->nr_frags; i++) {
			dma_addr_t addr = page_to_phys(shinfo->frags[i].page);

			if (!parent->dma_mask ||
			    addr + PAGE_SIZE - 1 > *parent->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}
1853
/*
 * State kept in skb->cb while a list of GSO segments hangs off an skb:
 * the destructor the skb had before dev_gso_segment() replaced it.
 */
struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

/* View the control buffer of skb @s as a struct dev_gso_cb. */
#define DEV_GSO_CB(s) ((struct dev_gso_cb *)(s)->cb)
1859
1860static void dev_gso_skb_destructor(struct sk_buff *skb)
1861{
1862	struct dev_gso_cb *cb;
1863
1864	do {
1865		struct sk_buff *nskb = skb->next;
1866
1867		skb->next = nskb->next;
1868		nskb->next = NULL;
1869		kfree_skb(nskb);
1870	} while (skb->next);
1871
1872	cb = DEV_GSO_CB(skb);
1873	if (cb->destructor)
1874		cb->destructor(skb);
1875}
1876
1877/**
1878 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1879 *	@skb: buffer to segment
1880 *
1881 *	This function segments the given skb and stores the list of segments
1882 *	in skb->next.
1883 */
1884static int dev_gso_segment(struct sk_buff *skb)
1885{
1886	struct net_device *dev = skb->dev;
1887	struct sk_buff *segs;
1888	int features = dev->features & ~(illegal_highdma(dev, skb) ?
1889					 NETIF_F_SG : 0);
1890
1891	segs = skb_gso_segment(skb, features);
1892
1893	/* Verifying header integrity only. */
1894	if (!segs)
1895		return 0;
1896
1897	if (IS_ERR(segs))
1898		return PTR_ERR(segs);
1899
1900	skb->next = segs;
1901	DEV_GSO_CB(skb)->destructor = skb->destructor;
1902	skb->destructor = dev_gso_skb_destructor;
1903
1904	return 0;
1905}
1906
1907/*
1908 * Try to orphan skb early, right before transmission by the device.
1909 * We cannot orphan skb if tx timestamp is requested, since
1910 * drivers need to call skb_tstamp_tx() to send the timestamp.
1911 */
1912static inline void skb_orphan_try(struct sk_buff *skb)
1913{
1914	struct sock *sk = skb->sk;
1915
1916	if (sk && !skb_tx(skb)->flags) {
1917		/* skb_tx_hash() wont be able to get sk.
1918		 * We copy sk_hash into skb->rxhash
1919		 */
1920		if (!skb->rxhash)
1921			skb->rxhash = sk->sk_hash;
1922		skb_orphan(skb);
1923	}
1924}
1925
1926int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1927			struct netdev_queue *txq)
1928{
1929	const struct net_device_ops *ops = dev->netdev_ops;
1930	int rc = NETDEV_TX_OK;
1931
1932	if (likely(!skb->next)) {
1933		if (!list_empty(&ptype_all))
1934			dev_queue_xmit_nit(skb, dev);
1935
1936		/*
1937		 * If device doesnt need skb->dst, release it right now while
1938		 * its hot in this cpu cache
1939		 */
1940		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1941			skb_dst_drop(skb);
1942
1943		skb_orphan_try(skb);
1944
1945		if (netif_needs_gso(dev, skb)) {
1946			if (unlikely(dev_gso_segment(skb)))
1947				goto out_kfree_skb;
1948			if (skb->next)
1949				goto gso;
1950		}
1951
1952		rc = ops->ndo_start_xmit(skb, dev);
1953		if (rc == NETDEV_TX_OK)
1954			txq_trans_update(txq);
1955		return rc;
1956	}
1957
1958gso:
1959	do {
1960		struct sk_buff *nskb = skb->next;
1961
1962		skb->next = nskb->next;
1963		nskb->next = NULL;
1964
1965		/*
1966		 * If device doesnt need nskb->dst, release it right now while
1967		 * its hot in this cpu cache
1968		 */
1969		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1970			skb_dst_drop(nskb);
1971
1972		rc = ops->ndo_start_xmit(nskb, dev);
1973		if (unlikely(rc != NETDEV_TX_OK)) {
1974			if (rc & ~NETDEV_TX_MASK)
1975				goto out_kfree_gso_skb;
1976			nskb->next = skb->next;
1977			skb->next = nskb;
1978			return rc;
1979		}
1980		txq_trans_update(txq);
1981		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
1982			return NETDEV_TX_BUSY;
1983	} while (skb->next);
1984
1985out_kfree_gso_skb:
1986	if (likely(skb->next == NULL))
1987		skb->destructor = DEV_GSO_CB(skb)->destructor;
1988out_kfree_skb:
1989	kfree_skb(skb);
1990	return rc;
1991}
1992
1993static u32 hashrnd __read_mostly;
1994
1995u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
1996{
1997	u32 hash;
1998
1999	if (skb_rx_queue_recorded(skb)) {
2000		hash = skb_get_rx_queue(skb);
2001		while (unlikely(hash >= dev->real_num_tx_queues))
2002			hash -= dev->real_num_tx_queues;
2003		return hash;
2004	}
2005
2006	if (skb->sk && skb->sk->sk_hash)
2007		hash = skb->sk->sk_hash;
2008	else
2009		hash = (__force u16) skb->protocol ^ skb->rxhash;
2010	hash = jhash_1word(hash, hashrnd);
2011
2012	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
2013}
2014EXPORT_SYMBOL(skb_tx_hash);
2015
2016static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2017{
2018	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2019		if (net_ratelimit()) {
2020			pr_warning("%s selects TX queue %d, but "
2021				"real number of TX queues is %d\n",
2022				dev->name, queue_index, dev->real_num_tx_queues);
2023		}
2024		return 0;
2025	}
2026	return queue_index;
2027}
2028
2029static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2030					struct sk_buff *skb)
2031{
2032	int queue_index;
2033	struct sock *sk = skb->sk;
2034
2035	queue_index = sk_tx_queue_get(sk);
2036	if (queue_index < 0) {
2037		const struct net_device_ops *ops = dev->netdev_ops;
2038
2039		if (ops->ndo_select_queue) {
2040			queue_index = ops->ndo_select_queue(dev, skb);
2041			queue_index = dev_cap_txqueue(dev, queue_index);
2042		} else {
2043			queue_index = 0;
2044			if (dev->real_num_tx_queues > 1)
2045				queue_index = skb_tx_hash(dev, skb);
2046
2047			if (sk) {
2048				struct dst_entry *dst = rcu_dereference_check(sk->sk_dst_cache, 1);
2049
2050				if (dst && skb_dst(skb) == dst)
2051					sk_tx_queue_set(sk, queue_index);
2052			}
2053		}
2054	}
2055
2056	skb_set_queue_mapping(skb, queue_index);
2057	return netdev_get_tx_queue(dev, queue_index);
2058}
2059
2060static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2061				 struct net_device *dev,
2062				 struct netdev_queue *txq)
2063{
2064	spinlock_t *root_lock = qdisc_lock(q);
2065	int rc;
2066
2067	spin_lock(root_lock);
2068	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2069		kfree_skb(skb);
2070		rc = NET_XMIT_DROP;
2071	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2072		   !test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) {
2073		/*
2074		 * This is a work-conserving queue; there are no old skbs
2075		 * waiting to be sent out; and the qdisc is not running -
2076		 * xmit the skb directly.
2077		 */
2078		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2079			skb_dst_force(skb);
2080		__qdisc_update_bstats(q, skb->len);
2081		if (sch_direct_xmit(skb, q, dev, txq, root_lock))
2082			__qdisc_run(q);
2083		else
2084			clear_bit(__QDISC_STATE_RUNNING, &q->state);
2085
2086		rc = NET_XMIT_SUCCESS;
2087	} else {
2088		skb_dst_force(skb);
2089		rc = qdisc_enqueue_root(skb, q);
2090		qdisc_run(q);
2091	}
2092	spin_unlock(root_lock);
2093
2094	return rc;
2095}
2096
2097/*
2098 * Returns true if either:
2099 *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2100 *	2. skb is fragmented and the device does not support SG, or if
2101 *	   at least one of fragments is in highmem and device does not
2102 *	   support DMA from it.
2103 */
2104static inline int skb_needs_linearize(struct sk_buff *skb,
2105				      struct net_device *dev)
2106{
2107	return (skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
2108	       (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
2109					      illegal_highdma(dev, skb)));
2110}
2111
/**
 *	dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      This function can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can
 *      also be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send through this function.  (You can bump the
 *      ref count before sending to hold a reference for retry if you are
 *      careful.)
 *
 *      When calling this function, interrupts MUST be enabled.  This is
 *      because the BH enable code must have IRQs enabled so that it will not
 *      deadlock.
 *          --BLG
 */
int dev_queue_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	/* returned if linearizing or software checksumming fails below */
	int rc = -ENOMEM;

	/* GSO will handle the following emulations directly. */
	if (netif_needs_gso(dev, skb))
		goto gso;

	/* Convert a paged skb to linear, if required */
	if (skb_needs_linearize(skb, dev) && __skb_linearize(skb))
		goto out_kfree_skb;

	/* If packet is not checksummed and device does not support
	 * checksumming for this protocol, complete checksumming here.
	 */
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		skb_set_transport_header(skb, skb->csum_start -
					      skb_headroom(skb));
		if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
			goto out_kfree_skb;
	}

gso:
	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	txq = dev_pick_tx(dev, skb);
	q = rcu_dereference_bh(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
	/* The qdisc has an enqueue method: let __dev_xmit_skb() deal with it */
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all the sorts of tunnels...

	   Really, it is unlikely that netif_tx_lock protection is necessary
	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	   counters.)
	   However, it is possible, that they rely on protection
	   made by us here.

	   Check this and shot the lock. It is not prone from deadlocks.
	   Either shot noqueue qdisc, it is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		/* owner == this CPU would mean we re-entered while transmitting */
		if (txq->xmit_lock_owner != cpu) {

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_tx_queue_stopped(txq)) {
				rc = dev_hard_start_xmit(skb, dev, txq);
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			if (net_ratelimit())
				printk(KERN_CRIT "Virtual device %s asks to "
				       "queue packet!\n", dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately */
			if (net_ratelimit())
				printk(KERN_CRIT "Dead loop on virtual device "
				       "%s, fix it urgently!\n", dev->name);
		}
	}

	/* Queueless device that is down, stopped, refused the skb or recursed */
	rc = -ENETDOWN;
	rcu_read_unlock_bh();

	/* Failure exit: the skb is still ours, free it.  Reached either by
	 * falling through from above or by goto before rcu_read_lock_bh().
	 */
out_kfree_skb:
	kfree_skb(skb);
	return rc;
	/* Exit for paths that handed the skb on; only drop the RCU-bh lock */
out:
	rcu_read_unlock_bh();
	return rc;
}
EXPORT_SYMBOL(dev_queue_xmit);
2229
2230
2231/*=======================================================================
2232			Receiver routines
2233  =======================================================================*/
2234
/* Queue length limit tested by enqueue_to_backlog(); skbs beyond it are dropped */
int netdev_max_backlog __read_mostly = 1000;
/* Non-zero: net_timestamp_check() runs in netif_rx()/netif_receive_skb(),
 * zero: it runs later, in __netif_receive_skb().
 */
int netdev_tstamp_prequeue __read_mostly = 1;
/* NOTE(review): not referenced in this part of the file; presumably the
 * per-softirq RX packet budget - confirm against net_rx_action().
 */
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */
2239
2240/* Called with irq disabled */
2241static inline void ____napi_schedule(struct softnet_data *sd,
2242				     struct napi_struct *napi)
2243{
2244	list_add_tail(&napi->poll_list, &sd->poll_list);
2245	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2246}
2247
2248#ifdef CONFIG_RPS
2249
/* One global table that all flow-based protocols share.
 * get_rps_cpu() reads it through rcu_dereference().
 */
struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
2253
/*
 * get_rps_cpu is called from netif_receive_skb and returns the target
 * CPU from the RPS map of the receiving queue for a given skb.
 * rcu_read_lock must be held on entry.
 *
 * @dev:    device whose RX queues are consulted
 * @skb:    received packet; skb->rxhash is filled in here if it was zero
 * @rflowp: set to the matching flow table entry, but only when the CPU is
 *          chosen through the flow tables; otherwise left untouched
 *
 * Returns the chosen CPU number, or -1 when no steering applies (RX queue
 * index out of range, nothing configured on the queue, protocol not
 * handled, header not pullable, or no online CPU found).
 */
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
		       struct rps_dev_flow **rflowp)
{
	struct ipv6hdr *ip6;
	struct iphdr *ip;
	struct netdev_rx_queue *rxqueue;
	struct rps_map *map;
	struct rps_dev_flow_table *flow_table;
	struct rps_sock_flow_table *sock_flow_table;
	int cpu = -1;
	u8 ip_proto;
	u16 tcpu;
	u32 addr1, addr2, ihl;
	/* first 32 bits after the network header, viewed as two 16 bit halves */
	union {
		u32 v32;
		u16 v16[2];
	} ports;

	/* Pick the RX queue the driver recorded, falling back to queue 0 */
	if (skb_rx_queue_recorded(skb)) {
		u16 index = skb_get_rx_queue(skb);
		if (unlikely(index >= dev->num_rx_queues)) {
			WARN_ONCE(dev->num_rx_queues > 1, "%s received packet "
				"on queue %u, but number of RX queues is %u\n",
				dev->name, index, dev->num_rx_queues);
			goto done;
		}
		rxqueue = dev->_rx + index;
	} else
		rxqueue = dev->_rx;

	/* Neither a CPU map nor a flow table on this queue: nothing to do.
	 * NOTE(review): these two pointers are tested here without
	 * rcu_dereference(), unlike the reads further down - confirm intent.
	 */
	if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
		goto done;

	if (skb->rxhash)
		goto got_hash; /* Skip hash computation on packet header */

	/* Extract addresses, protocol and header length (in 32 bit words) */
	switch (skb->protocol) {
	case __constant_htons(ETH_P_IP):
		if (!pskb_may_pull(skb, sizeof(*ip)))
			goto done;

		ip = (struct iphdr *) skb->data;
		ip_proto = ip->protocol;
		addr1 = (__force u32) ip->saddr;
		addr2 = (__force u32) ip->daddr;
		/* NOTE(review): ihl is taken from the packet as is, and
		 * fragmented datagrams are not treated specially here.
		 */
		ihl = ip->ihl;
		break;
	case __constant_htons(ETH_P_IPV6):
		if (!pskb_may_pull(skb, sizeof(*ip6)))
			goto done;

		ip6 = (struct ipv6hdr *) skb->data;
		ip_proto = ip6->nexthdr;
		/* only the last 32 bits of each address enter the hash */
		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
		ihl = (40 >> 2);
		break;
	default:
		goto done;
	}
	switch (ip_proto) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_DCCP:
	case IPPROTO_ESP:
	case IPPROTO_AH:
	case IPPROTO_SCTP:
	case IPPROTO_UDPLITE:
		if (pskb_may_pull(skb, (ihl * 4) + 4)) {
			ports.v32 = * (__force u32 *) (skb->data + (ihl * 4));
			/* order the halves so both directions hash alike */
			if (ports.v16[1] < ports.v16[0])
				swap(ports.v16[0], ports.v16[1]);
			break;
		}
		/* pull failed: fall through and hash without the ports */
	default:
		ports.v32 = 0;
		break;
	}

	/* get a consistent hash (same value on both flow directions) */
	if (addr2 < addr1)
		swap(addr1, addr2);
	skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
	/* zero is reserved for "no hash yet" (see the test above) */
	if (!skb->rxhash)
		skb->rxhash = 1;

got_hash:
	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	sock_flow_table = rcu_dereference(rps_sock_flow_table);
	if (flow_table && sock_flow_table) {
		u16 next_cpu;
		struct rps_dev_flow *rflow;

		/* CPU currently recorded for this flow on this RX queue */
		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
		tcpu = rflow->cpu;

		/* CPU recorded for this flow in the global socket table */
		next_cpu = sock_flow_table->ents[skb->rxhash &
		    sock_flow_table->mask];

		/*
		 * If the desired CPU (where last recvmsg was done) is
		 * different from current CPU (one in the rx-queue flow
		 * table entry), switch if one of the following holds:
		 *   - Current CPU is unset (equal to RPS_NO_CPU).
		 *   - Current CPU is offline.
		 *   - The current CPU's queue tail has advanced beyond the
		 *     last packet that was enqueued using this table entry.
		 *     This guarantees that all previous packets for the flow
		 *     have been dequeued, thus preserving in order delivery.
		 */
		if (unlikely(tcpu != next_cpu) &&
		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
		      rflow->last_qtail)) >= 0)) {
			tcpu = rflow->cpu = next_cpu;
			if (tcpu != RPS_NO_CPU)
				rflow->last_qtail = per_cpu(softnet_data,
				    tcpu).input_queue_head;
		}
		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
			*rflowp = rflow;
			cpu = tcpu;
			goto done;
		}
	}

	/* No usable flow entry: fall back to the per-queue CPU map */
	map = rcu_dereference(rxqueue->rps_map);
	if (map) {
		/* scale the 32 bit hash onto the index range [0, map->len) */
		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];

		if (cpu_online(tcpu)) {
			cpu = tcpu;
			goto done;
		}
	}

done:
	return cpu;
}
2398
2399/* Called from hardirq (IPI) context */
2400static void rps_trigger_softirq(void *data)
2401{
2402	struct softnet_data *sd = data;
2403
2404	____napi_schedule(sd, &sd->backlog);
2405	sd->received_rps++;
2406}
2407
2408#endif /* CONFIG_RPS */
2409
/*
 * Check whether @sd belongs to another CPU.
 * If it does, link it into this CPU's pending IPI list, raise
 * NET_RX_SOFTIRQ and return 1.
 * If it is our own softnet_data (or RPS is compiled out), return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local_sd = &__get_cpu_var(softnet_data);

	if (sd == local_sd)
		return 0;

	/* push @sd on the head of the local IPI list */
	sd->rps_ipi_next = local_sd->rps_ipi_list;
	local_sd->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
2430
2431/*
2432 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2433 * queue (may be a remote CPU queue).
2434 */
2435static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2436			      unsigned int *qtail)
2437{
2438	struct softnet_data *sd;
2439	unsigned long flags;
2440
2441	sd = &per_cpu(softnet_data, cpu);
2442
2443	local_irq_save(flags);
2444
2445	rps_lock(sd);
2446	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2447		if (skb_queue_len(&sd->input_pkt_queue)) {
2448enqueue:
2449			__skb_queue_tail(&sd->input_pkt_queue, skb);
2450			input_queue_tail_incr_save(sd, qtail);
2451			rps_unlock(sd);
2452			local_irq_restore(flags);
2453			return NET_RX_SUCCESS;
2454		}
2455
2456		/* Schedule NAPI for backlog device
2457		 * We can use non atomic operation since we own the queue lock
2458		 */
2459		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2460			if (!rps_ipi_queued(sd))
2461				____napi_schedule(sd, &sd->backlog);
2462		}
2463		goto enqueue;
2464	}
2465
2466	sd->dropped++;
2467	rps_unlock(sd);
2468
2469	local_irq_restore(flags);
2470
2471	kfree_skb(skb);
2472	return NET_RX_DROP;
2473}
2474
2475/**
2476 *	netif_rx	-	post buffer to the network code
2477 *	@skb: buffer to post
2478 *
2479 *	This function receives a packet from a device driver and queues it for
2480 *	the upper (protocol) levels to process.  It always succeeds. The buffer
2481 *	may be dropped during processing for congestion control or by the
2482 *	protocol layers.
2483 *
2484 *	return values:
2485 *	NET_RX_SUCCESS	(no congestion)
2486 *	NET_RX_DROP     (packet was dropped)
2487 *
2488 */
2489
2490int netif_rx(struct sk_buff *skb)
2491{
2492	int ret;
2493
2494	/* if netpoll wants it, pretend we never saw it */
2495	if (netpoll_rx(skb))
2496		return NET_RX_DROP;
2497
2498	if (netdev_tstamp_prequeue)
2499		net_timestamp_check(skb);
2500
2501#ifdef CONFIG_RPS
2502	{
2503		struct rps_dev_flow voidflow, *rflow = &voidflow;
2504		int cpu;
2505
2506		rcu_read_lock();
2507
2508		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2509		if (cpu < 0)
2510			cpu = smp_processor_id();
2511
2512		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2513
2514		rcu_read_unlock();
2515	}
2516#else
2517	{
2518		unsigned int qtail;
2519		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2520		put_cpu();
2521	}
2522#endif
2523	return ret;
2524}
2525EXPORT_SYMBOL(netif_rx);
2526
/*
 * Wrapper around netif_rx() that disables preemption around the call and
 * then runs do_softirq() itself if any softirq is pending.
 * Returns whatever netif_rx() returned.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	preempt_disable();

	ret = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();
	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
2540
2541static void net_tx_action(struct softirq_action *h)
2542{
2543	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2544
2545	if (sd->completion_queue) {
2546		struct sk_buff *clist;
2547
2548		local_irq_disable();
2549		clist = sd->completion_queue;
2550		sd->completion_queue = NULL;
2551		local_irq_enable();
2552
2553		while (clist) {
2554			struct sk_buff *skb = clist;
2555			clist = clist->next;
2556
2557			WARN_ON(atomic_read(&skb->users));
2558			__kfree_skb(skb);
2559		}
2560	}
2561
2562	if (sd->output_queue) {
2563		struct Qdisc *head;
2564
2565		local_irq_disable();
2566		head = sd->output_queue;
2567		sd->output_queue = NULL;
2568		sd->output_queue_tailp = &sd->output_queue;
2569		local_irq_enable();
2570
2571		while (head) {
2572			struct Qdisc *q = head;
2573			spinlock_t *root_lock;
2574
2575			head = head->next_sched;
2576
2577			root_lock = qdisc_lock(q);
2578			if (spin_trylock(root_lock)) {
2579				smp_mb__before_clear_bit();
2580				clear_bit(__QDISC_STATE_SCHED,
2581					  &q->state);
2582				qdisc_run(q);
2583				spin_unlock(root_lock);
2584			} else {
2585				if (!test_bit(__QDISC_STATE_DEACTIVATED,
2586					      &q->state)) {
2587					__netif_reschedule(q);
2588				} else {
2589					smp_mb__before_clear_bit();
2590					clear_bit(__QDISC_STATE_SCHED,
2591						  &q->state);
2592				}
2593			}
2594		}
2595	}
2596}
2597
2598static inline int deliver_skb(struct sk_buff *skb,
2599			      struct packet_type *pt_prev,
2600			      struct net_device *orig_dev)
2601{
2602	atomic_inc(&skb->users);
2603	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2604}
2605
#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)

#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
/* This hook is defined here for ATM LANE */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif

/*
 * If bridge module is loaded call bridging hook.
 *  returns NULL if packet was consumed.
 */
struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
					struct sk_buff *skb) __read_mostly;
EXPORT_SYMBOL_GPL(br_handle_frame_hook);

/*
 * Pass @skb to the bridge hook when its device is a bridge port.
 * Loopback packets and devices without br_port are returned untouched.
 * A handler pending in *@pt_prev is delivered first and then cleared.
 */
static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
					    struct packet_type **pt_prev, int *ret,
					    struct net_device *orig_dev)
{
	struct net_bridge_port *port;

	if (skb->pkt_type == PACKET_LOOPBACK)
		return skb;

	port = rcu_dereference(skb->dev->br_port);
	if (port == NULL)
		return skb;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	return br_handle_frame_hook(port, skb);
}
#else
#define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
#endif
2643
#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
struct sk_buff *(*macvlan_handle_frame_hook)(struct macvlan_port *p,
					     struct sk_buff *skb) __read_mostly;
EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);

/*
 * Pass @skb to the macvlan hook when its device has a macvlan port;
 * otherwise return it untouched.  A handler pending in *@pt_prev is
 * delivered first and then cleared.
 */
static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
					     struct packet_type **pt_prev,
					     int *ret,
					     struct net_device *orig_dev)
{
	struct macvlan_port *port = rcu_dereference(skb->dev->macvlan_port);

	if (port == NULL)
		return skb;

	if (*pt_prev != NULL) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	return macvlan_handle_frame_hook(port, skb);
}
#else
#define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
#endif
2669
2670#ifdef CONFIG_NET_CLS_ACT
2671/* TODO: Maybe we should just force sch_ingress to be compiled in
2672 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2673 * a compare and 2 stores extra right now if we dont have it on
2674 * but have CONFIG_NET_CLS_ACT
2675 * NOTE: This doesnt stop any functionality; if you dont have
2676 * the ingress scheduler, you just cant add policies on ingress.
2677 *
2678 */
2679static int ing_filter(struct sk_buff *skb)
2680{
2681	struct net_device *dev = skb->dev;
2682	u32 ttl = G_TC_RTTL(skb->tc_verd);
2683	struct netdev_queue *rxq;
2684	int result = TC_ACT_OK;
2685	struct Qdisc *q;
2686
2687	if (MAX_RED_LOOP < ttl++) {
2688		printk(KERN_WARNING
2689		       "Redir loop detected Dropping packet (%d->%d)\n",
2690		       skb->skb_iif, dev->ifindex);
2691		return TC_ACT_SHOT;
2692	}
2693
2694	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2695	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2696
2697	rxq = &dev->rx_queue;
2698
2699	q = rxq->qdisc;
2700	if (q != &noop_qdisc) {
2701		spin_lock(qdisc_lock(q));
2702		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2703			result = qdisc_enqueue_root(skb, q);
2704		spin_unlock(qdisc_lock(q));
2705	}
2706
2707	return result;
2708}
2709
2710static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2711					 struct packet_type **pt_prev,
2712					 int *ret, struct net_device *orig_dev)
2713{
2714	if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2715		goto out;
2716
2717	if (*pt_prev) {
2718		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2719		*pt_prev = NULL;
2720	} else {
2721		/* Huh? Why does turning on AF_PACKET affect this? */
2722		skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2723	}
2724
2725	switch (ing_filter(skb)) {
2726	case TC_ACT_SHOT:
2727	case TC_ACT_STOLEN:
2728		kfree_skb(skb);
2729		return NULL;
2730	}
2731
2732out:
2733	skb->tc_verd = 0;
2734	return skb;
2735}
2736#endif
2737
2738/*
2739 * 	netif_nit_deliver - deliver received packets to network taps
2740 * 	@skb: buffer
2741 *
2742 * 	This function is used to deliver incoming packets to network
2743 * 	taps. It should be used when the normal netif_receive_skb path
2744 * 	is bypassed, for example because of VLAN acceleration.
2745 */
2746void netif_nit_deliver(struct sk_buff *skb)
2747{
2748	struct packet_type *ptype;
2749
2750	if (list_empty(&ptype_all))
2751		return;
2752
2753	skb_reset_network_header(skb);
2754	skb_reset_transport_header(skb);
2755	skb->mac_len = skb->network_header - skb->mac_header;
2756
2757	rcu_read_lock();
2758	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2759		if (!ptype->dev || ptype->dev == skb->dev)
2760			deliver_skb(skb, ptype, skb->dev);
2761	}
2762	rcu_read_unlock();
2763}
2764
2765static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2766					      struct net_device *master)
2767{
2768	if (skb->pkt_type == PACKET_HOST) {
2769		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2770
2771		memcpy(dest, master->dev_addr, ETH_ALEN);
2772	}
2773}
2774
2775/* On bonding slaves other than the currently active slave, suppress
2776 * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2777 * ARP on active-backup slaves with arp_validate enabled.
2778 */
2779int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2780{
2781	struct net_device *dev = skb->dev;
2782
2783	if (master->priv_flags & IFF_MASTER_ARPMON)
2784		dev->last_rx = jiffies;
2785
2786	if ((master->priv_flags & IFF_MASTER_ALB) && master->br_port) {
2787		/* Do address unmangle. The local destination address
2788		 * will be always the one master has. Provides the right
2789		 * functionality in a bridge.
2790		 */
2791		skb_bond_set_mac_by_master(skb, master);
2792	}
2793
2794	if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
2795		if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
2796		    skb->protocol == __cpu_to_be16(ETH_P_ARP))
2797			return 0;
2798
2799		if (master->priv_flags & IFF_MASTER_ALB) {
2800			if (skb->pkt_type != PACKET_BROADCAST &&
2801			    skb->pkt_type != PACKET_MULTICAST)
2802				return 0;
2803		}
2804		if (master->priv_flags & IFF_MASTER_8023AD &&
2805		    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
2806			return 0;
2807
2808		return 1;
2809	}
2810	return 0;
2811}
2812EXPORT_SYMBOL(__skb_bond_should_drop);
2813
/*
 * Core receive routine: hands @skb to the packet handlers registered on
 * ptype_all and on the ptype_base bucket of its protocol, after the
 * VLAN-accel, netpoll, ingress, bridge and macvlan hooks have had their
 * chance to take it.
 *
 * Delivery is deferred by one step: a matching handler is remembered in
 * pt_prev and only called (through deliver_skb(), which takes an extra
 * reference) once the next match is found.  The last match is called
 * directly with the caller's reference; with no match the skb is freed.
 *
 * Returns NET_RX_SUCCESS when the VLAN accel path took the skb,
 * NET_RX_DROP when netpoll took it or no handler matched, otherwise the
 * value returned by the last handler invoked.
 */
static int __netif_receive_skb(struct sk_buff *skb)
{
	struct packet_type *ptype, *pt_prev;
	struct net_device *orig_dev;
	struct net_device *master;
	struct net_device *null_or_orig;
	struct net_device *orig_or_bond;
	int ret = NET_RX_DROP;
	__be16 type;

	/* when prequeue is set the callers have done this check already */
	if (!netdev_tstamp_prequeue)
		net_timestamp_check(skb);

	if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
		return NET_RX_SUCCESS;

	/* if we've gotten here through NAPI, check netpoll */
	if (netpoll_receive_skb(skb))
		return NET_RX_DROP;

	if (!skb->skb_iif)
		skb->skb_iif = skb->dev->ifindex;

	/*
	 * bonding note: skbs received on inactive slaves should only
	 * be delivered to pkt handlers that are exact matches.  Also
	 * the deliver_no_wcard flag will be set.  If packet handlers
	 * are sensitive to duplicate packets these skbs will need to
	 * be dropped at the handler.  The vlan accel path may have
	 * already set the deliver_no_wcard flag.
	 */
	/* NULL here lets handlers with ptype->dev == NULL (wildcards) match */
	null_or_orig = NULL;
	orig_dev = skb->dev;
	master = ACCESS_ONCE(orig_dev->master);
	if (skb->deliver_no_wcard)
		null_or_orig = orig_dev;
	else if (master) {
		if (skb_bond_should_drop(skb, master)) {
			skb->deliver_no_wcard = 1;
			null_or_orig = orig_dev; /* deliver only exact match */
		} else
			skb->dev = master;
	}

	__get_cpu_var(softnet_data).processed++;

	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;

	pt_prev = NULL;

	rcu_read_lock();

#ifdef CONFIG_NET_CLS_ACT
	/* TC_NCLS set: clear it and skip both the taps and handle_ing() */
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	/* handlers registered for every protocol */
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
		    ptype->dev == orig_dev) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

#ifdef CONFIG_NET_CLS_ACT
	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
ncls:
#endif

	/* each hook returns NULL once it has consumed the skb */
	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;

	/*
	 * Make sure frames received on VLAN interfaces stacked on
	 * bonding interfaces still make their way to any base bonding
	 * device that may have registered for a specific ptype.  The
	 * handler may have to adjust skb->dev and orig_dev.
	 */
	orig_or_bond = orig_dev;
	if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
	    (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
		orig_or_bond = vlan_dev_real_dev(skb->dev);
	}

	/* handlers registered for this skb's protocol */
	type = skb->protocol;
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type && (ptype->dev == null_or_orig ||
		     ptype->dev == skb->dev || ptype->dev == orig_dev ||
		     ptype->dev == orig_or_bond)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	if (pt_prev) {
		/* last match: called without taking an extra reference */
		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

out:
	rcu_read_unlock();
	return ret;
}
2936
2937/**
2938 *	netif_receive_skb - process receive buffer from network
2939 *	@skb: buffer to process
2940 *
2941 *	netif_receive_skb() is the main receive data processing function.
2942 *	It always succeeds. The buffer may be dropped during processing
2943 *	for congestion control or by the protocol layers.
2944 *
2945 *	This function may only be called from softirq context and interrupts
2946 *	should be enabled.
2947 *
2948 *	Return values (usually ignored):
2949 *	NET_RX_SUCCESS: no congestion
2950 *	NET_RX_DROP: packet was dropped
2951 */
2952int netif_receive_skb(struct sk_buff *skb)
2953{
2954	if (netdev_tstamp_prequeue)
2955		net_timestamp_check(skb);
2956
2957#ifdef CONFIG_RPS
2958	{
2959		struct rps_dev_flow voidflow, *rflow = &voidflow;
2960		int cpu, ret;
2961
2962		rcu_read_lock();
2963
2964		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2965
2966		if (cpu >= 0) {
2967			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2968			rcu_read_unlock();
2969		} else {
2970			rcu_read_unlock();
2971			ret = __netif_receive_skb(skb);
2972		}
2973
2974		return ret;
2975	}
2976#else
2977	return __netif_receive_skb(skb);
2978#endif
2979}
2980EXPORT_SYMBOL(netif_receive_skb);
2981
2982/* Network device is going away, flush any packets still pending
2983 * Called with irqs disabled.
2984 */
2985static void flush_backlog(void *arg)
2986{
2987	struct net_device *dev = arg;
2988	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2989	struct sk_buff *skb, *tmp;
2990
2991	rps_lock(sd);
2992	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
2993		if (skb->dev == dev) {
2994			__skb_unlink(skb, &sd->input_pkt_queue);
2995			kfree_skb(skb);
2996			input_queue_head_incr(sd);
2997		}
2998	}
2999	rps_unlock(sd);
3000
3001	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3002		if (skb->dev == dev) {
3003			__skb_unlink(skb, &sd->process_queue);
3004			kfree_skb(skb);
3005			input_queue_head_incr(sd);
3006		}
3007	}
3008}
3009
3010static int napi_gro_complete(struct sk_buff *skb)
3011{
3012	struct packet_type *ptype;
3013	__be16 type = skb->protocol;
3014	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3015	int err = -ENOENT;
3016
3017	if (NAPI_GRO_CB(skb)->count == 1) {
3018		skb_shinfo(skb)->gso_size = 0;
3019		goto out;
3020	}
3021
3022	rcu_read_lock();
3023	list_for_each_entry_rcu(ptype, head, list) {
3024		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3025			continue;
3026
3027		err = ptype->gro_complete(skb);
3028		break;
3029	}
3030	rcu_read_unlock();
3031
3032	if (err) {
3033		WARN_ON(&ptype->list == head);
3034		kfree_skb(skb);
3035		return NET_RX_SUCCESS;
3036	}
3037
3038out:
3039	return netif_receive_skb(skb);
3040}
3041
3042static void napi_gro_flush(struct napi_struct *napi)
3043{
3044	struct sk_buff *skb, *next;
3045
3046	for (skb = napi->gro_list; skb; skb = next) {
3047		next = skb->next;
3048		skb->next = NULL;
3049		napi_gro_complete(skb);
3050	}
3051
3052	napi->gro_count = 0;
3053	napi->gro_list = NULL;
3054}
3055
3056enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3057{
3058	struct sk_buff **pp = NULL;
3059	struct packet_type *ptype;
3060	__be16 type = skb->protocol;
3061	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3062	int same_flow;
3063	int mac_len;
3064	enum gro_result ret;
3065
3066	if (!(skb->dev->features & NETIF_F_GRO))
3067		goto normal;
3068
3069	if (skb_is_gso(skb) || skb_has_frags(skb))
3070		goto normal;
3071
3072	rcu_read_lock();
3073	list_for_each_entry_rcu(ptype, head, list) {
3074		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3075			continue;
3076
3077		skb_set_network_header(skb, skb_gro_offset(skb));
3078		mac_len = skb->network_header - skb->mac_header;
3079		skb->mac_len = mac_len;
3080		NAPI_GRO_CB(skb)->same_flow = 0;
3081		NAPI_GRO_CB(skb)->flush = 0;
3082		NAPI_GRO_CB(skb)->free = 0;
3083
3084		pp = ptype->gro_receive(&napi->gro_list, skb);
3085		break;
3086	}
3087	rcu_read_unlock();
3088
3089	if (&ptype->list == head)
3090		goto normal;
3091
3092	same_flow = NAPI_GRO_CB(skb)->same_flow;
3093	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3094
3095	if (pp) {
3096		struct sk_buff *nskb = *pp;
3097
3098		*pp = nskb->next;
3099		nskb->next = NULL;
3100		napi_gro_complete(nskb);
3101		napi->gro_count--;
3102	}
3103
3104	if (same_flow)
3105		goto ok;
3106
3107	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3108		goto normal;
3109
3110	napi->gro_count++;
3111	NAPI_GRO_CB(skb)->count = 1;
3112	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3113	skb->next = napi->gro_list;
3114	napi->gro_list = skb;
3115	ret = GRO_HELD;
3116
3117pull:
3118	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3119		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3120
3121		BUG_ON(skb->end - skb->tail < grow);
3122
3123		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3124
3125		skb->tail += grow;
3126		skb->data_len -= grow;
3127
3128		skb_shinfo(skb)->frags[0].page_offset += grow;
3129		skb_shinfo(skb)->frags[0].size -= grow;
3130
3131		if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3132			put_page(skb_shinfo(skb)->frags[0].page);
3133			memmove(skb_shinfo(skb)->frags,
3134				skb_shinfo(skb)->frags + 1,
3135				--skb_shinfo(skb)->nr_frags);
3136		}
3137	}
3138
3139ok:
3140	return ret;
3141
3142normal:
3143	ret = GRO_NORMAL;
3144	goto pull;
3145}
3146EXPORT_SYMBOL(dev_gro_receive);
3147
3148static gro_result_t
3149__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3150{
3151	struct sk_buff *p;
3152
3153	if (netpoll_rx_on(skb))
3154		return GRO_NORMAL;
3155
3156	for (p = napi->gro_list; p; p = p->next) {
3157		NAPI_GRO_CB(p)->same_flow =
3158			(p->dev == skb->dev) &&
3159			!compare_ether_header(skb_mac_header(p),
3160					      skb_gro_mac_header(skb));
3161		NAPI_GRO_CB(p)->flush = 0;
3162	}
3163
3164	return dev_gro_receive(napi, skb);
3165}
3166
3167gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3168{
3169	switch (ret) {
3170	case GRO_NORMAL:
3171		if (netif_receive_skb(skb))
3172			ret = GRO_DROP;
3173		break;
3174
3175	case GRO_DROP:
3176	case GRO_MERGED_FREE:
3177		kfree_skb(skb);
3178		break;
3179
3180	case GRO_HELD:
3181	case GRO_MERGED:
3182		break;
3183	}
3184
3185	return ret;
3186}
3187EXPORT_SYMBOL(napi_skb_finish);
3188
3189void skb_gro_reset_offset(struct sk_buff *skb)
3190{
3191	NAPI_GRO_CB(skb)->data_offset = 0;
3192	NAPI_GRO_CB(skb)->frag0 = NULL;
3193	NAPI_GRO_CB(skb)->frag0_len = 0;
3194
3195	if (skb->mac_header == skb->tail &&
3196	    !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3197		NAPI_GRO_CB(skb)->frag0 =
3198			page_address(skb_shinfo(skb)->frags[0].page) +
3199			skb_shinfo(skb)->frags[0].page_offset;
3200		NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3201	}
3202}
3203EXPORT_SYMBOL(skb_gro_reset_offset);
3204
3205gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3206{
3207	skb_gro_reset_offset(skb);
3208
3209	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3210}
3211EXPORT_SYMBOL(napi_gro_receive);
3212
3213void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3214{
3215	__skb_pull(skb, skb_headlen(skb));
3216	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3217
3218	napi->skb = skb;
3219}
3220EXPORT_SYMBOL(napi_reuse_skb);
3221
3222struct sk_buff *napi_get_frags(struct napi_struct *napi)
3223{
3224	struct sk_buff *skb = napi->skb;
3225
3226	if (!skb) {
3227		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3228		if (skb)
3229			napi->skb = skb;
3230	}
3231	return skb;
3232}
3233EXPORT_SYMBOL(napi_get_frags);
3234
3235gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3236			       gro_result_t ret)
3237{
3238	switch (ret) {
3239	case GRO_NORMAL:
3240	case GRO_HELD:
3241		skb->protocol = eth_type_trans(skb, skb->dev);
3242
3243		if (ret == GRO_HELD)
3244			skb_gro_pull(skb, -ETH_HLEN);
3245		else if (netif_receive_skb(skb))
3246			ret = GRO_DROP;
3247		break;
3248
3249	case GRO_DROP:
3250	case GRO_MERGED_FREE:
3251		napi_reuse_skb(napi, skb);
3252		break;
3253
3254	case GRO_MERGED:
3255		break;
3256	}
3257
3258	return ret;
3259}
3260EXPORT_SYMBOL(napi_frags_finish);
3261
3262struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3263{
3264	struct sk_buff *skb = napi->skb;
3265	struct ethhdr *eth;
3266	unsigned int hlen;
3267	unsigned int off;
3268
3269	napi->skb = NULL;
3270
3271	skb_reset_mac_header(skb);
3272	skb_gro_reset_offset(skb);
3273
3274	off = skb_gro_offset(skb);
3275	hlen = off + sizeof(*eth);
3276	eth = skb_gro_header_fast(skb, off);
3277	if (skb_gro_header_hard(skb, hlen)) {
3278		eth = skb_gro_header_slow(skb, hlen, off);
3279		if (unlikely(!eth)) {
3280			napi_reuse_skb(napi, skb);
3281			skb = NULL;
3282			goto out;
3283		}
3284	}
3285
3286	skb_gro_pull(skb, sizeof(*eth));
3287
3288	/*
3289	 * This works because the only protocols we care about don't require
3290	 * special handling.  We'll fix it up properly at the end.
3291	 */
3292	skb->protocol = eth->h_proto;
3293
3294out:
3295	return skb;
3296}
3297EXPORT_SYMBOL(napi_frags_skb);
3298
3299gro_result_t napi_gro_frags(struct napi_struct *napi)
3300{
3301	struct sk_buff *skb = napi_frags_skb(napi);
3302
3303	if (!skb)
3304		return GRO_DROP;
3305
3306	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3307}
3308EXPORT_SYMBOL(napi_gro_frags);
3309
/*
 * Send any pending RPS IPIs queued on @sd, then leave with interrupts on.
 *
 * Must be called with local interrupts disabled; always returns with
 * local interrupts enabled.  Without CONFIG_RPS this only re-enables
 * interrupts.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *pending = sd->rps_ipi_list;

	if (!pending) {
		local_irq_enable();
		return;
	}

	/* Detach the whole list while interrupts are still off. */
	sd->rps_ipi_list = NULL;
	local_irq_enable();

	/* Kick RPS processing on each remote cpu that is still online. */
	while (pending) {
		struct softnet_data *target = pending;

		/* Read the link before the IPI is sent. */
		pending = target->rps_ipi_next;

		if (cpu_online(target->cpu))
			__smp_call_function_single(target->cpu,
						   &target->csd, 0);
	}
#else
	local_irq_enable();
#endif
}
3337
3338static int process_backlog(struct napi_struct *napi, int quota)
3339{
3340	int work = 0;
3341	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3342
3343#ifdef CONFIG_RPS
3344	/* Check if we have pending ipi, its better to send them now,
3345	 * not waiting net_rx_action() end.
3346	 */
3347	if (sd->rps_ipi_list) {
3348		local_irq_disable();
3349		net_rps_action_and_irq_enable(sd);
3350	}
3351#endif
3352	napi->weight = weight_p;
3353	local_irq_disable();
3354	while (work < quota) {
3355		struct sk_buff *skb;
3356		unsigned int qlen;
3357
3358		while ((skb = __skb_dequeue(&sd->process_queue))) {
3359			local_irq_enable();
3360			__netif_receive_skb(skb);
3361			local_irq_disable();
3362			input_queue_head_incr(sd);
3363			if (++work >= quota) {
3364				local_irq_enable();
3365				return work;
3366			}
3367		}
3368
3369		rps_lock(sd);
3370		qlen = skb_queue_len(&sd->input_pkt_queue);
3371		if (qlen)
3372			skb_queue_splice_tail_init(&sd->input_pkt_queue,
3373						   &sd->process_queue);
3374
3375		if (qlen < quota - work) {
3376			/*
3377			 * Inline a custom version of __napi_complete().
3378			 * only current cpu owns and manipulates this napi,
3379			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3380			 * we can use a plain write instead of clear_bit(),
3381			 * and we dont need an smp_mb() memory barrier.
3382			 */
3383			list_del(&napi->poll_list);
3384			napi->state = 0;
3385
3386			quota = work + qlen;
3387		}
3388		rps_unlock(sd);
3389	}
3390	local_irq_enable();
3391
3392	return work;
3393}
3394
3395/**
3396 * __napi_schedule - schedule for receive
3397 * @n: entry to schedule
3398 *
3399 * The entry's receive function will be scheduled to run
3400 */
3401void __napi_schedule(struct napi_struct *n)
3402{
3403	unsigned long flags;
3404
3405	local_irq_save(flags);
3406	____napi_schedule(&__get_cpu_var(softnet_data), n);
3407	local_irq_restore(flags);
3408}
3409EXPORT_SYMBOL(__napi_schedule);
3410
3411void __napi_complete(struct napi_struct *n)
3412{
3413	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3414	BUG_ON(n->gro_list);
3415
3416	list_del(&n->poll_list);
3417	smp_mb__before_clear_bit();
3418	clear_bit(NAPI_STATE_SCHED, &n->state);
3419}
3420EXPORT_SYMBOL(__napi_complete);
3421
3422void napi_complete(struct napi_struct *n)
3423{
3424	unsigned long flags;
3425
3426	/*
3427	 * don't let napi dequeue from the cpu poll list
3428	 * just in case its running on a different cpu
3429	 */
3430	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3431		return;
3432
3433	napi_gro_flush(n);
3434	local_irq_save(flags);
3435	__napi_complete(n);
3436	local_irq_restore(flags);
3437}
3438EXPORT_SYMBOL(napi_complete);
3439
3440void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3441		    int (*poll)(struct napi_struct *, int), int weight)
3442{
3443	INIT_LIST_HEAD(&napi->poll_list);
3444	napi->gro_count = 0;
3445	napi->gro_list = NULL;
3446	napi->skb = NULL;
3447	napi->poll = poll;
3448	napi->weight = weight;
3449	list_add(&napi->dev_list, &dev->napi_list);
3450	napi->dev = dev;
3451#ifdef CONFIG_NETPOLL
3452	spin_lock_init(&napi->poll_lock);
3453	napi->poll_owner = -1;
3454#endif
3455	set_bit(NAPI_STATE_SCHED, &napi->state);
3456}
3457EXPORT_SYMBOL(netif_napi_add);
3458
3459void netif_napi_del(struct napi_struct *napi)
3460{
3461	struct sk_buff *skb, *next;
3462
3463	list_del_init(&napi->dev_list);
3464	napi_free_frags(napi);
3465
3466	for (skb = napi->gro_list; skb; skb = next) {
3467		next = skb->next;
3468		skb->next = NULL;
3469		kfree_skb(skb);
3470	}
3471
3472	napi->gro_list = NULL;
3473	napi->gro_count = 0;
3474}
3475EXPORT_SYMBOL(netif_napi_del);
3476
3477static void net_rx_action(struct softirq_action *h)
3478{
3479	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3480	unsigned long time_limit = jiffies + 2;
3481	int budget = netdev_budget;
3482	void *have;
3483
3484	local_irq_disable();
3485
3486	while (!list_empty(&sd->poll_list)) {
3487		struct napi_struct *n;
3488		int work, weight;
3489
3490		/* If softirq window is exhuasted then punt.
3491		 * Allow this to run for 2 jiffies since which will allow
3492		 * an average latency of 1.5/HZ.
3493		 */
3494		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3495			goto softnet_break;
3496
3497		local_irq_enable();
3498
3499		/* Even though interrupts have been re-enabled, this
3500		 * access is safe because interrupts can only add new
3501		 * entries to the tail of this list, and only ->poll()
3502		 * calls can remove this head entry from the list.
3503		 */
3504		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3505
3506		have = netpoll_poll_lock(n);
3507
3508		weight = n->weight;
3509
3510		/* This NAPI_STATE_SCHED test is for avoiding a race
3511		 * with netpoll's poll_napi().  Only the entity which
3512		 * obtains the lock and sees NAPI_STATE_SCHED set will
3513		 * actually make the ->poll() call.  Therefore we avoid
3514		 * accidently calling ->poll() when NAPI is not scheduled.
3515		 */
3516		work = 0;
3517		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3518			work = n->poll(n, weight);
3519			trace_napi_poll(n);
3520		}
3521
3522		WARN_ON_ONCE(work > weight);
3523
3524		budget -= work;
3525
3526		local_irq_disable();
3527
3528		/* Drivers must not modify the NAPI state if they
3529		 * consume the entire weight.  In such cases this code
3530		 * still "owns" the NAPI instance and therefore can
3531		 * move the instance around on the list at-will.
3532		 */
3533		if (unlikely(work == weight)) {
3534			if (unlikely(napi_disable_pending(n))) {
3535				local_irq_enable();
3536				napi_complete(n);
3537				local_irq_disable();
3538			} else
3539				list_move_tail(&n->poll_list, &sd->poll_list);
3540		}
3541
3542		netpoll_poll_unlock(have);
3543	}
3544out:
3545	net_rps_action_and_irq_enable(sd);
3546
3547#ifdef CONFIG_NET_DMA
3548	/*
3549	 * There may not be any more sk_buffs coming right now, so push
3550	 * any pending DMA copies to hardware
3551	 */
3552	dma_issue_pending_all();
3553#endif
3554
3555	return;
3556
3557softnet_break:
3558	sd->time_squeeze++;
3559	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3560	goto out;
3561}
3562
3563static gifconf_func_t *gifconf_list[NPROTO];
3564
3565/**
3566 *	register_gifconf	-	register a SIOCGIF handler
3567 *	@family: Address family
3568 *	@gifconf: Function handler
3569 *
3570 *	Register protocol dependent address dumping routines. The handler
3571 *	that is passed must not be freed or reused until it has been replaced
3572 *	by another handler.
3573 */
3574int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3575{
3576	if (family >= NPROTO)
3577		return -EINVAL;
3578	gifconf_list[family] = gifconf;
3579	return 0;
3580}
3581EXPORT_SYMBOL(register_gifconf);
3582
3583
3584/*
3585 *	Map an interface index to its name (SIOCGIFNAME)
3586 */
3587
3588/*
3589 *	We need this ioctl for efficient implementation of the
3590 *	if_indextoname() function required by the IPv6 API.  Without
3591 *	it, we would have to search all the interfaces to find a
3592 *	match.  --pb
3593 */
3594
3595static int dev_ifname(struct net *net, struct ifreq __user *arg)
3596{
3597	struct net_device *dev;
3598	struct ifreq ifr;
3599
3600	/*
3601	 *	Fetch the caller's info block.
3602	 */
3603
3604	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3605		return -EFAULT;
3606
3607	rcu_read_lock();
3608	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3609	if (!dev) {
3610		rcu_read_unlock();
3611		return -ENODEV;
3612	}
3613
3614	strcpy(ifr.ifr_name, dev->name);
3615	rcu_read_unlock();
3616
3617	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3618		return -EFAULT;
3619	return 0;
3620}
3621
3622/*
3623 *	Perform a SIOCGIFCONF call. This structure will change
3624 *	size eventually, and there is nothing I can do about it.
3625 *	Thus we will need a 'compatibility mode'.
3626 */
3627
3628static int dev_ifconf(struct net *net, char __user *arg)
3629{
3630	struct ifconf ifc;
3631	struct net_device *dev;
3632	char __user *pos;
3633	int len;
3634	int total;
3635	int i;
3636
3637	/*
3638	 *	Fetch the caller's info block.
3639	 */
3640
3641	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3642		return -EFAULT;
3643
3644	pos = ifc.ifc_buf;
3645	len = ifc.ifc_len;
3646
3647	/*
3648	 *	Loop over the interfaces, and write an info block for each.
3649	 */
3650
3651	total = 0;
3652	for_each_netdev(net, dev) {
3653		for (i = 0; i < NPROTO; i++) {
3654			if (gifconf_list[i]) {
3655				int done;
3656				if (!pos)
3657					done = gifconf_list[i](dev, NULL, 0);
3658				else
3659					done = gifconf_list[i](dev, pos + total,
3660							       len - total);
3661				if (done < 0)
3662					return -EFAULT;
3663				total += done;
3664			}
3665		}
3666	}
3667
3668	/*
3669	 *	All done.  Write the updated control block back to the caller.
3670	 */
3671	ifc.ifc_len = total;
3672
3673	/*
3674	 * 	Both BSD and Solaris return 0 here, so we do too.
3675	 */
3676	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3677}
3678
3679#ifdef CONFIG_PROC_FS
3680/*
3681 *	This is invoked by the /proc filesystem handler to display a device
3682 *	in detail.
3683 */
3684void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3685	__acquires(RCU)
3686{
3687	struct net *net = seq_file_net(seq);
3688	loff_t off;
3689	struct net_device *dev;
3690
3691	rcu_read_lock();
3692	if (!*pos)
3693		return SEQ_START_TOKEN;
3694
3695	off = 1;
3696	for_each_netdev_rcu(net, dev)
3697		if (off++ == *pos)
3698			return dev;
3699
3700	return NULL;
3701}
3702
3703void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3704{
3705	struct net_device *dev = (v == SEQ_START_TOKEN) ?
3706				  first_net_device(seq_file_net(seq)) :
3707				  next_net_device((struct net_device *)v);
3708
3709	++*pos;
3710	return rcu_dereference(dev);
3711}
3712
3713void dev_seq_stop(struct seq_file *seq, void *v)
3714	__releases(RCU)
3715{
3716	rcu_read_unlock();
3717}
3718
3719static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3720{
3721	const struct net_device_stats *stats = dev_get_stats(dev);
3722
3723	seq_printf(seq, "%6s: %7lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
3724		   "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
3725		   dev->name, stats->rx_bytes, stats->rx_packets,
3726		   stats->rx_errors,
3727		   stats->rx_dropped + stats->rx_missed_errors,
3728		   stats->rx_fifo_errors,
3729		   stats->rx_length_errors + stats->rx_over_errors +
3730		    stats->rx_crc_errors + stats->rx_frame_errors,
3731		   stats->rx_compressed, stats->multicast,
3732		   stats->tx_bytes, stats->tx_packets,
3733		   stats->tx_errors, stats->tx_dropped,
3734		   stats->tx_fifo_errors, stats->collisions,
3735		   stats->tx_carrier_errors +
3736		    stats->tx_aborted_errors +
3737		    stats->tx_window_errors +
3738		    stats->tx_heartbeat_errors,
3739		   stats->tx_compressed);
3740}
3741
3742/*
3743 *	Called from the PROCfs module. This now uses the new arbitrary sized
3744 *	/proc/net interface to create /proc/net/dev
3745 */
3746static int dev_seq_show(struct seq_file *seq, void *v)
3747{
3748	if (v == SEQ_START_TOKEN)
3749		seq_puts(seq, "Inter-|   Receive                            "
3750			      "                    |  Transmit\n"
3751			      " face |bytes    packets errs drop fifo frame "
3752			      "compressed multicast|bytes    packets errs "
3753			      "drop fifo colls carrier compressed\n");
3754	else
3755		dev_seq_printf_stats(seq, v);
3756	return 0;
3757}
3758
3759static struct softnet_data *softnet_get_online(loff_t *pos)
3760{
3761	struct softnet_data *sd = NULL;
3762
3763	while (*pos < nr_cpu_ids)
3764		if (cpu_online(*pos)) {
3765			sd = &per_cpu(softnet_data, *pos);
3766			break;
3767		} else
3768			++*pos;
3769	return sd;
3770}
3771
3772static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3773{
3774	return softnet_get_online(pos);
3775}
3776
3777static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3778{
3779	++*pos;
3780	return softnet_get_online(pos);
3781}
3782
/* seq_file ->stop: intentionally empty, ->start acquires nothing. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
3786
3787static int softnet_seq_show(struct seq_file *seq, void *v)
3788{
3789	struct softnet_data *sd = v;
3790
3791	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
3792		   sd->processed, sd->dropped, sd->time_squeeze, 0,
3793		   0, 0, 0, 0, /* was fastroute */
3794		   sd->cpu_collision, sd->received_rps);
3795	return 0;
3796}
3797
3798static const struct seq_operations dev_seq_ops = {
3799	.start = dev_seq_start,
3800	.next  = dev_seq_next,
3801	.stop  = dev_seq_stop,
3802	.show  = dev_seq_show,
3803};
3804
3805static int dev_seq_open(struct inode *inode, struct file *file)
3806{
3807	return seq_open_net(inode, file, &dev_seq_ops,
3808			    sizeof(struct seq_net_private));
3809}
3810
3811static const struct file_operations dev_seq_fops = {
3812	.owner	 = THIS_MODULE,
3813	.open    = dev_seq_open,
3814	.read    = seq_read,
3815	.llseek  = seq_lseek,
3816	.release = seq_release_net,
3817};
3818
3819static const struct seq_operations softnet_seq_ops = {
3820	.start = softnet_seq_start,
3821	.next  = softnet_seq_next,
3822	.stop  = softnet_seq_stop,
3823	.show  = softnet_seq_show,
3824};
3825
3826static int softnet_seq_open(struct inode *inode, struct file *file)
3827{
3828	return seq_open(file, &softnet_seq_ops);
3829}
3830
3831static const struct file_operations softnet_seq_fops = {
3832	.owner	 = THIS_MODULE,
3833	.open    = softnet_seq_open,
3834	.read    = seq_read,
3835	.llseek  = seq_lseek,
3836	.release = seq_release,
3837};
3838
3839static void *ptype_get_idx(loff_t pos)
3840{
3841	struct packet_type *pt = NULL;
3842	loff_t i = 0;
3843	int t;
3844
3845	list_for_each_entry_rcu(pt, &ptype_all, list) {
3846		if (i == pos)
3847			return pt;
3848		++i;
3849	}
3850
3851	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3852		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3853			if (i == pos)
3854				return pt;
3855			++i;
3856		}
3857	}
3858	return NULL;
3859}
3860
3861static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3862	__acquires(RCU)
3863{
3864	rcu_read_lock();
3865	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3866}
3867
3868static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3869{
3870	struct packet_type *pt;
3871	struct list_head *nxt;
3872	int hash;
3873
3874	++*pos;
3875	if (v == SEQ_START_TOKEN)
3876		return ptype_get_idx(0);
3877
3878	pt = v;
3879	nxt = pt->list.next;
3880	if (pt->type == htons(ETH_P_ALL)) {
3881		if (nxt != &ptype_all)
3882			goto found;
3883		hash = 0;
3884		nxt = ptype_base[0].next;
3885	} else
3886		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3887
3888	while (nxt == &ptype_base[hash]) {
3889		if (++hash >= PTYPE_HASH_SIZE)
3890			return NULL;
3891		nxt = ptype_base[hash].next;
3892	}
3893found:
3894	return list_entry(nxt, struct packet_type, list);
3895}
3896
3897static void ptype_seq_stop(struct seq_file *seq, void *v)
3898	__releases(RCU)
3899{
3900	rcu_read_unlock();
3901}
3902
3903static int ptype_seq_show(struct seq_file *seq, void *v)
3904{
3905	struct packet_type *pt = v;
3906
3907	if (v == SEQ_START_TOKEN)
3908		seq_puts(seq, "Type Device      Function\n");
3909	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3910		if (pt->type == htons(ETH_P_ALL))
3911			seq_puts(seq, "ALL ");
3912		else
3913			seq_printf(seq, "%04x", ntohs(pt->type));
3914
3915		seq_printf(seq, " %-8s %pF\n",
3916			   pt->dev ? pt->dev->name : "", pt->func);
3917	}
3918
3919	return 0;
3920}
3921
3922static const struct seq_operations ptype_seq_ops = {
3923	.start = ptype_seq_start,
3924	.next  = ptype_seq_next,
3925	.stop  = ptype_seq_stop,
3926	.show  = ptype_seq_show,
3927};
3928
3929static int ptype_seq_open(struct inode *inode, struct file *file)
3930{
3931	return seq_open_net(inode, file, &ptype_seq_ops,
3932			sizeof(struct seq_net_private));
3933}
3934
3935static const struct file_operations ptype_seq_fops = {
3936	.owner	 = THIS_MODULE,
3937	.open    = ptype_seq_open,
3938	.read    = seq_read,
3939	.llseek  = seq_lseek,
3940	.release = seq_release_net,
3941};
3942
3943
3944static int __net_init dev_proc_net_init(struct net *net)
3945{
3946	int rc = -ENOMEM;
3947
3948	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3949		goto out;
3950	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3951		goto out_dev;
3952	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3953		goto out_softnet;
3954
3955	if (wext_proc_init(net))
3956		goto out_ptype;
3957	rc = 0;
3958out:
3959	return rc;
3960out_ptype:
3961	proc_net_remove(net, "ptype");
3962out_softnet:
3963	proc_net_remove(net, "softnet_stat");
3964out_dev:
3965	proc_net_remove(net, "dev");
3966	goto out;
3967}
3968
3969static void __net_exit dev_proc_net_exit(struct net *net)
3970{
3971	wext_proc_exit(net);
3972
3973	proc_net_remove(net, "ptype");
3974	proc_net_remove(net, "softnet_stat");
3975	proc_net_remove(net, "dev");
3976}
3977
3978static struct pernet_operations __net_initdata dev_proc_ops = {
3979	.init = dev_proc_net_init,
3980	.exit = dev_proc_net_exit,
3981};
3982
3983static int __init dev_proc_init(void)
3984{
3985	return register_pernet_subsys(&dev_proc_ops);
3986}
3987#else
3988#define dev_proc_init() 0
3989#endif	/* CONFIG_PROC_FS */
3990
3991
3992/**
3993 *	netdev_set_master	-	set up master/slave pair
3994 *	@slave: slave device
3995 *	@master: new master device
3996 *
3997 *	Changes the master device of the slave. Pass %NULL to break the
3998 *	bonding. The caller must hold the RTNL semaphore. On a failure
3999 *	a negative errno code is returned. On success the reference counts
4000 *	are adjusted, %RTM_NEWLINK is sent to the routing socket and the
4001 *	function returns zero.
4002 */
4003int netdev_set_master(struct net_device *slave, struct net_device *master)
4004{
4005	struct net_device *old = slave->master;
4006
4007	ASSERT_RTNL();
4008
4009	if (master) {
4010		if (old)
4011			return -EBUSY;
4012		dev_hold(master);
4013	}
4014
4015	slave->master = master;
4016
4017	if (old) {
4018		synchronize_net();
4019		dev_put(old);
4020	}
4021	if (master)
4022		slave->flags |= IFF_SLAVE;
4023	else
4024		slave->flags &= ~IFF_SLAVE;
4025
4026	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4027	return 0;
4028}
4029EXPORT_SYMBOL(netdev_set_master);
4030
4031static void dev_change_rx_flags(struct net_device *dev, int flags)
4032{
4033	const struct net_device_ops *ops = dev->netdev_ops;
4034
4035	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4036		ops->ndo_change_rx_flags(dev, flags);
4037}
4038
4039static int __dev_set_promiscuity(struct net_device *dev, int inc)
4040{
4041	unsigned short old_flags = dev->flags;
4042	uid_t uid;
4043	gid_t gid;
4044
4045	ASSERT_RTNL();
4046
4047	dev->flags |= IFF_PROMISC;
4048	dev->promiscuity += inc;
4049	if (dev->promiscuity == 0) {
4050		/*
4051		 * Avoid overflow.
4052		 * If inc causes overflow, untouch promisc and return error.
4053		 */
4054		if (inc < 0)
4055			dev->flags &= ~IFF_PROMISC;
4056		else {
4057			dev->promiscuity -= inc;
4058			printk(KERN_WARNING "%s: promiscuity touches roof, "
4059				"set promiscuity failed, promiscuity feature "
4060				"of device might be broken.\n", dev->name);
4061			return -EOVERFLOW;
4062		}
4063	}
4064	if (dev->flags != old_flags) {
4065		printk(KERN_INFO "device %s %s promiscuous mode\n",
4066		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4067							       "left");
4068		if (audit_enabled) {
4069			current_uid_gid(&uid, &gid);
4070			audit_log(current->audit_context, GFP_ATOMIC,
4071				AUDIT_ANOM_PROMISCUOUS,
4072				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4073				dev->name, (dev->flags & IFF_PROMISC),
4074				(old_flags & IFF_PROMISC),
4075				audit_get_loginuid(current),
4076				uid, gid,
4077				audit_get_sessionid(current));
4078		}
4079
4080		dev_change_rx_flags(dev, IFF_PROMISC);
4081	}
4082	return 0;
4083}
4084
4085/**
4086 *	dev_set_promiscuity	- update promiscuity count on a device
4087 *	@dev: device
4088 *	@inc: modifier
4089 *
4090 *	Add or remove promiscuity from a device. While the count in the device
4091 *	remains above zero the interface remains promiscuous. Once it hits zero
4092 *	the device reverts back to normal filtering operation. A negative inc
4093 *	value is used to drop promiscuity on the device.
4094 *	Return 0 if successful or a negative errno code on error.
4095 */
4096int dev_set_promiscuity(struct net_device *dev, int inc)
4097{
4098	unsigned short old_flags = dev->flags;
4099	int err;
4100
4101	err = __dev_set_promiscuity(dev, inc);
4102	if (err < 0)
4103		return err;
4104	if (dev->flags != old_flags)
4105		dev_set_rx_mode(dev);
4106	return err;
4107}
4108EXPORT_SYMBOL(dev_set_promiscuity);
4109
4110/**
4111 *	dev_set_allmulti	- update allmulti count on a device
4112 *	@dev: device
4113 *	@inc: modifier
4114 *
4115 *	Add or remove reception of all multicast frames to a device. While the
4116 *	count in the device remains above zero the interface remains listening
4117 *	to all interfaces. Once it hits zero the device reverts back to normal
4118 *	filtering operation. A negative @inc value is used to drop the counter
4119 *	when releasing a resource needing all multicasts.
4120 *	Return 0 if successful or a negative errno code on error.
4121 */
4122
4123int dev_set_allmulti(struct net_device *dev, int inc)
4124{
4125	unsigned short old_flags = dev->flags;
4126
4127	ASSERT_RTNL();
4128
4129	dev->flags |= IFF_ALLMULTI;
4130	dev->allmulti += inc;
4131	if (dev->allmulti == 0) {
4132		/*
4133		 * Avoid overflow.
4134		 * If inc causes overflow, untouch allmulti and return error.
4135		 */
4136		if (inc < 0)
4137			dev->flags &= ~IFF_ALLMULTI;
4138		else {
4139			dev->allmulti -= inc;
4140			printk(KERN_WARNING "%s: allmulti touches roof, "
4141				"set allmulti failed, allmulti feature of "
4142				"device might be broken.\n", dev->name);
4143			return -EOVERFLOW;
4144		}
4145	}
4146	if (dev->flags ^ old_flags) {
4147		dev_change_rx_flags(dev, IFF_ALLMULTI);
4148		dev_set_rx_mode(dev);
4149	}
4150	return 0;
4151}
4152EXPORT_SYMBOL(dev_set_allmulti);
4153
4154/*
4155 *	Upload unicast and multicast address lists to device and
4156 *	configure RX filtering. When the device doesn't support unicast
4157 *	filtering it is put in promiscuous mode while unicast addresses
4158 *	are present.
4159 */
4160void __dev_set_rx_mode(struct net_device *dev)
4161{
4162	const struct net_device_ops *ops = dev->netdev_ops;
4163
4164	/* dev_open will call this function so the list will stay sane. */
4165	if (!(dev->flags&IFF_UP))
4166		return;
4167
4168	if (!netif_device_present(dev))
4169		return;
4170
4171	if (ops->ndo_set_rx_mode)
4172		ops->ndo_set_rx_mode(dev);
4173	else {
4174		/* Unicast addresses changes may only happen under the rtnl,
4175		 * therefore calling __dev_set_promiscuity here is safe.
4176		 */
4177		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4178			__dev_set_promiscuity(dev, 1);
4179			dev->uc_promisc = 1;
4180		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4181			__dev_set_promiscuity(dev, -1);
4182			dev->uc_promisc = 0;
4183		}
4184
4185		if (ops->ndo_set_multicast_list)
4186			ops->ndo_set_multicast_list(dev);
4187	}
4188}
4189
/*
 *	Locked wrapper: run __dev_set_rx_mode() while holding the device's
 *	address lock (bottom-half variant).
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
4196
4197/**
4198 *	dev_get_flags - get flags reported to userspace
4199 *	@dev: device
4200 *
4201 *	Get the combination of flag bits exported through APIs to userspace.
4202 */
4203unsigned dev_get_flags(const struct net_device *dev)
4204{
4205	unsigned flags;
4206
4207	flags = (dev->flags & ~(IFF_PROMISC |
4208				IFF_ALLMULTI |
4209				IFF_RUNNING |
4210				IFF_LOWER_UP |
4211				IFF_DORMANT)) |
4212		(dev->gflags & (IFF_PROMISC |
4213				IFF_ALLMULTI));
4214
4215	if (netif_running(dev)) {
4216		if (netif_oper_up(dev))
4217			flags |= IFF_RUNNING;
4218		if (netif_carrier_ok(dev))
4219			flags |= IFF_LOWER_UP;
4220		if (netif_dormant(dev))
4221			flags |= IFF_DORMANT;
4222	}
4223
4224	return flags;
4225}
4226EXPORT_SYMBOL(dev_get_flags);
4227
4228int __dev_change_flags(struct net_device *dev, unsigned int flags)
4229{
4230	int old_flags = dev->flags;
4231	int ret;
4232
4233	ASSERT_RTNL();
4234
4235	/*
4236	 *	Set the flags on our device.
4237	 */
4238
4239	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4240			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4241			       IFF_AUTOMEDIA)) |
4242		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4243				    IFF_ALLMULTI));
4244
4245	/*
4246	 *	Load in the correct multicast list now the flags have changed.
4247	 */
4248
4249	if ((old_flags ^ flags) & IFF_MULTICAST)
4250		dev_change_rx_flags(dev, IFF_MULTICAST);
4251
4252	dev_set_rx_mode(dev);
4253
4254	/*
4255	 *	Have we downed the interface. We handle IFF_UP ourselves
4256	 *	according to user attempts to set it, rather than blindly
4257	 *	setting it.
4258	 */
4259
4260	ret = 0;
4261	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4262		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4263
4264		if (!ret)
4265			dev_set_rx_mode(dev);
4266	}
4267
4268	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4269		int inc = (flags & IFF_PROMISC) ? 1 : -1;
4270
4271		dev->gflags ^= IFF_PROMISC;
4272		dev_set_promiscuity(dev, inc);
4273	}
4274
4275	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4276	   is important. Some (broken) drivers set IFF_PROMISC, when
4277	   IFF_ALLMULTI is requested not asking us and not reporting.
4278	 */
4279	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4280		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4281
4282		dev->gflags ^= IFF_ALLMULTI;
4283		dev_set_allmulti(dev, inc);
4284	}
4285
4286	return ret;
4287}
4288
4289void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4290{
4291	unsigned int changes = dev->flags ^ old_flags;
4292
4293	if (changes & IFF_UP) {
4294		if (dev->flags & IFF_UP)
4295			call_netdevice_notifiers(NETDEV_UP, dev);
4296		else
4297			call_netdevice_notifiers(NETDEV_DOWN, dev);
4298	}
4299
4300	if (dev->flags & IFF_UP &&
4301	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4302		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4303}
4304
4305/**
4306 *	dev_change_flags - change device settings
4307 *	@dev: device
4308 *	@flags: device state flags
4309 *
4310 *	Change settings on device based state flags. The flags are
4311 *	in the userspace exported format.
4312 */
4313int dev_change_flags(struct net_device *dev, unsigned flags)
4314{
4315	int ret, changes;
4316	int old_flags = dev->flags;
4317
4318	ret = __dev_change_flags(dev, flags);
4319	if (ret < 0)
4320		return ret;
4321
4322	changes = old_flags ^ dev->flags;
4323	if (changes)
4324		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4325
4326	__dev_notify_flags(dev, old_flags);
4327	return ret;
4328}
4329EXPORT_SYMBOL(dev_change_flags);
4330
4331/**
4332 *	dev_set_mtu - Change maximum transfer unit
4333 *	@dev: device
4334 *	@new_mtu: new transfer unit
4335 *
4336 *	Change the maximum transfer size of the network device.
4337 */
4338int dev_set_mtu(struct net_device *dev, int new_mtu)
4339{
4340	const struct net_device_ops *ops = dev->netdev_ops;
4341	int err;
4342
4343	if (new_mtu == dev->mtu)
4344		return 0;
4345
4346	/*	MTU must be positive.	 */
4347	if (new_mtu < 0)
4348		return -EINVAL;
4349
4350	if (!netif_device_present(dev))
4351		return -ENODEV;
4352
4353	err = 0;
4354	if (ops->ndo_change_mtu)
4355		err = ops->ndo_change_mtu(dev, new_mtu);
4356	else
4357		dev->mtu = new_mtu;
4358
4359	if (!err && dev->flags & IFF_UP)
4360		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4361	return err;
4362}
4363EXPORT_SYMBOL(dev_set_mtu);
4364
4365/**
4366 *	dev_set_mac_address - Change Media Access Control Address
4367 *	@dev: device
4368 *	@sa: new address
4369 *
4370 *	Change the hardware (MAC) address of the device
4371 */
4372int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4373{
4374	const struct net_device_ops *ops = dev->netdev_ops;
4375	int err;
4376
4377	if (!ops->ndo_set_mac_address)
4378		return -EOPNOTSUPP;
4379	if (sa->sa_family != dev->type)
4380		return -EINVAL;
4381	if (!netif_device_present(dev))
4382		return -ENODEV;
4383	err = ops->ndo_set_mac_address(dev, sa);
4384	if (!err)
4385		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4386	return err;
4387}
4388EXPORT_SYMBOL(dev_set_mac_address);
4389
4390/*
4391 *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4392 */
4393static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4394{
4395	int err;
4396	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4397
4398	if (!dev)
4399		return -ENODEV;
4400
4401	switch (cmd) {
4402	case SIOCGIFFLAGS:	/* Get interface flags */
4403		ifr->ifr_flags = (short) dev_get_flags(dev);
4404		return 0;
4405
4406	case SIOCGIFMETRIC:	/* Get the metric on the interface
4407				   (currently unused) */
4408		ifr->ifr_metric = 0;
4409		return 0;
4410
4411	case SIOCGIFMTU:	/* Get the MTU of a device */
4412		ifr->ifr_mtu = dev->mtu;
4413		return 0;
4414
4415	case SIOCGIFHWADDR:
4416		if (!dev->addr_len)
4417			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4418		else
4419			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4420			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4421		ifr->ifr_hwaddr.sa_family = dev->type;
4422		return 0;
4423
4424	case SIOCGIFSLAVE:
4425		err = -EINVAL;
4426		break;
4427
4428	case SIOCGIFMAP:
4429		ifr->ifr_map.mem_start = dev->mem_start;
4430		ifr->ifr_map.mem_end   = dev->mem_end;
4431		ifr->ifr_map.base_addr = dev->base_addr;
4432		ifr->ifr_map.irq       = dev->irq;
4433		ifr->ifr_map.dma       = dev->dma;
4434		ifr->ifr_map.port      = dev->if_port;
4435		return 0;
4436
4437	case SIOCGIFINDEX:
4438		ifr->ifr_ifindex = dev->ifindex;
4439		return 0;
4440
4441	case SIOCGIFTXQLEN:
4442		ifr->ifr_qlen = dev->tx_queue_len;
4443		return 0;
4444
4445	default:
4446		/* dev_ioctl() should ensure this case
4447		 * is never reached
4448		 */
4449		WARN_ON(1);
4450		err = -EINVAL;
4451		break;
4452
4453	}
4454	return err;
4455}
4456
4457/*
4458 *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
4459 */
4460static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4461{
4462	int err;
4463	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4464	const struct net_device_ops *ops;
4465
4466	if (!dev)
4467		return -ENODEV;
4468
4469	ops = dev->netdev_ops;
4470
4471	switch (cmd) {
4472	case SIOCSIFFLAGS:	/* Set interface flags */
4473		return dev_change_flags(dev, ifr->ifr_flags);
4474
4475	case SIOCSIFMETRIC:	/* Set the metric on the interface
4476				   (currently unused) */
4477		return -EOPNOTSUPP;
4478
4479	case SIOCSIFMTU:	/* Set the MTU of a device */
4480		return dev_set_mtu(dev, ifr->ifr_mtu);
4481
4482	case SIOCSIFHWADDR:
4483		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4484
4485	case SIOCSIFHWBROADCAST:
4486		if (ifr->ifr_hwaddr.sa_family != dev->type)
4487			return -EINVAL;
4488		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4489		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4490		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4491		return 0;
4492
4493	case SIOCSIFMAP:
4494		if (ops->ndo_set_config) {
4495			if (!netif_device_present(dev))
4496				return -ENODEV;
4497			return ops->ndo_set_config(dev, &ifr->ifr_map);
4498		}
4499		return -EOPNOTSUPP;
4500
4501	case SIOCADDMULTI:
4502		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4503		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4504			return -EINVAL;
4505		if (!netif_device_present(dev))
4506			return -ENODEV;
4507		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4508
4509	case SIOCDELMULTI:
4510		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4511		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4512			return -EINVAL;
4513		if (!netif_device_present(dev))
4514			return -ENODEV;
4515		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4516
4517	case SIOCSIFTXQLEN:
4518		if (ifr->ifr_qlen < 0)
4519			return -EINVAL;
4520		dev->tx_queue_len = ifr->ifr_qlen;
4521		return 0;
4522
4523	case SIOCSIFNAME:
4524		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4525		return dev_change_name(dev, ifr->ifr_newname);
4526
4527	/*
4528	 *	Unknown or private ioctl
4529	 */
4530	default:
4531		if ((cmd >= SIOCDEVPRIVATE &&
4532		    cmd <= SIOCDEVPRIVATE + 15) ||
4533		    cmd == SIOCBONDENSLAVE ||
4534		    cmd == SIOCBONDRELEASE ||
4535		    cmd == SIOCBONDSETHWADDR ||
4536		    cmd == SIOCBONDSLAVEINFOQUERY ||
4537		    cmd == SIOCBONDINFOQUERY ||
4538		    cmd == SIOCBONDCHANGEACTIVE ||
4539		    cmd == SIOCGMIIPHY ||
4540		    cmd == SIOCGMIIREG ||
4541		    cmd == SIOCSMIIREG ||
4542		    cmd == SIOCBRADDIF ||
4543		    cmd == SIOCBRDELIF ||
4544		    cmd == SIOCSHWTSTAMP ||
4545		    cmd == SIOCWANDEV) {
4546			err = -EOPNOTSUPP;
4547			if (ops->ndo_do_ioctl) {
4548				if (netif_device_present(dev))
4549					err = ops->ndo_do_ioctl(dev, ifr, cmd);
4550				else
4551					err = -ENODEV;
4552			}
4553		} else
4554			err = -EINVAL;
4555
4556	}
4557	return err;
4558}
4559
4560/*
4561 *	This function handles all "interface"-type I/O control requests. The actual
4562 *	'doing' part of this is dev_ifsioc above.
4563 */
4564
4565/**
4566 *	dev_ioctl	-	network device ioctl
4567 *	@net: the applicable net namespace
4568 *	@cmd: command to issue
4569 *	@arg: pointer to a struct ifreq in user space
4570 *
4571 *	Issue ioctl functions to devices. This is normally called by the
4572 *	user space syscall interfaces but can sometimes be useful for
4573 *	other purposes. The return value is the return from the syscall if
4574 *	positive or a negative errno code on error.
4575 */
4576
4577int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4578{
4579	struct ifreq ifr;
4580	int ret;
4581	char *colon;
4582
4583	/* One special case: SIOCGIFCONF takes ifconf argument
4584	   and requires shared lock, because it sleeps writing
4585	   to user space.
4586	 */
4587
4588	if (cmd == SIOCGIFCONF) {
4589		rtnl_lock();
4590		ret = dev_ifconf(net, (char __user *) arg);
4591		rtnl_unlock();
4592		return ret;
4593	}
4594	if (cmd == SIOCGIFNAME)
4595		return dev_ifname(net, (struct ifreq __user *)arg);
4596
4597	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4598		return -EFAULT;
4599
4600	ifr.ifr_name[IFNAMSIZ-1] = 0;
4601
4602	colon = strchr(ifr.ifr_name, ':');
4603	if (colon)
4604		*colon = 0;
4605
4606	/*
4607	 *	See which interface the caller is talking about.
4608	 */
4609
4610	switch (cmd) {
4611	/*
4612	 *	These ioctl calls:
4613	 *	- can be done by all.
4614	 *	- atomic and do not require locking.
4615	 *	- return a value
4616	 */
4617	case SIOCGIFFLAGS:
4618	case SIOCGIFMETRIC:
4619	case SIOCGIFMTU:
4620	case SIOCGIFHWADDR:
4621	case SIOCGIFSLAVE:
4622	case SIOCGIFMAP:
4623	case SIOCGIFINDEX:
4624	case SIOCGIFTXQLEN:
4625		dev_load(net, ifr.ifr_name);
4626		rcu_read_lock();
4627		ret = dev_ifsioc_locked(net, &ifr, cmd);
4628		rcu_read_unlock();
4629		if (!ret) {
4630			if (colon)
4631				*colon = ':';
4632			if (copy_to_user(arg, &ifr,
4633					 sizeof(struct ifreq)))
4634				ret = -EFAULT;
4635		}
4636		return ret;
4637
4638	case SIOCETHTOOL:
4639		dev_load(net, ifr.ifr_name);
4640		rtnl_lock();
4641		ret = dev_ethtool(net, &ifr);
4642		rtnl_unlock();
4643		if (!ret) {
4644			if (colon)
4645				*colon = ':';
4646			if (copy_to_user(arg, &ifr,
4647					 sizeof(struct ifreq)))
4648				ret = -EFAULT;
4649		}
4650		return ret;
4651
4652	/*
4653	 *	These ioctl calls:
4654	 *	- require superuser power.
4655	 *	- require strict serialization.
4656	 *	- return a value
4657	 */
4658	case SIOCGMIIPHY:
4659	case SIOCGMIIREG:
4660	case SIOCSIFNAME:
4661		if (!capable(CAP_NET_ADMIN))
4662			return -EPERM;
4663		dev_load(net, ifr.ifr_name);
4664		rtnl_lock();
4665		ret = dev_ifsioc(net, &ifr, cmd);
4666		rtnl_unlock();
4667		if (!ret) {
4668			if (colon)
4669				*colon = ':';
4670			if (copy_to_user(arg, &ifr,
4671					 sizeof(struct ifreq)))
4672				ret = -EFAULT;
4673		}
4674		return ret;
4675
4676	/*
4677	 *	These ioctl calls:
4678	 *	- require superuser power.
4679	 *	- require strict serialization.
4680	 *	- do not return a value
4681	 */
4682	case SIOCSIFFLAGS:
4683	case SIOCSIFMETRIC:
4684	case SIOCSIFMTU:
4685	case SIOCSIFMAP:
4686	case SIOCSIFHWADDR:
4687	case SIOCSIFSLAVE:
4688	case SIOCADDMULTI:
4689	case SIOCDELMULTI:
4690	case SIOCSIFHWBROADCAST:
4691	case SIOCSIFTXQLEN:
4692	case SIOCSMIIREG:
4693	case SIOCBONDENSLAVE:
4694	case SIOCBONDRELEASE:
4695	case SIOCBONDSETHWADDR:
4696	case SIOCBONDCHANGEACTIVE:
4697	case SIOCBRADDIF:
4698	case SIOCBRDELIF:
4699	case SIOCSHWTSTAMP:
4700		if (!capable(CAP_NET_ADMIN))
4701			return -EPERM;
4702		/* fall through */
4703	case SIOCBONDSLAVEINFOQUERY:
4704	case SIOCBONDINFOQUERY:
4705		dev_load(net, ifr.ifr_name);
4706		rtnl_lock();
4707		ret = dev_ifsioc(net, &ifr, cmd);
4708		rtnl_unlock();
4709		return ret;
4710
4711	case SIOCGIFMEM:
4712		/* Get the per device memory space. We can add this but
4713		 * currently do not support it */
4714	case SIOCSIFMEM:
4715		/* Set the per device memory buffer space.
4716		 * Not applicable in our case */
4717	case SIOCSIFLINK:
4718		return -EINVAL;
4719
4720	/*
4721	 *	Unknown or private ioctl.
4722	 */
4723	default:
4724		if (cmd == SIOCWANDEV ||
4725		    (cmd >= SIOCDEVPRIVATE &&
4726		     cmd <= SIOCDEVPRIVATE + 15)) {
4727			dev_load(net, ifr.ifr_name);
4728			rtnl_lock();
4729			ret = dev_ifsioc(net, &ifr, cmd);
4730			rtnl_unlock();
4731			if (!ret && copy_to_user(arg, &ifr,
4732						 sizeof(struct ifreq)))
4733				ret = -EFAULT;
4734			return ret;
4735		}
4736		/* Take care of Wireless Extensions */
4737		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4738			return wext_handle_ioctl(net, &ifr, cmd, arg);
4739		return -EINVAL;
4740	}
4741}
4742
4743
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns a positive interface index that no device in @net is
 *	currently using.  The caller must hold the rtnl semaphore or the
 *	dev_base_lock to be sure it remains unique.
 */
static int dev_new_index(struct net *net)
{
	/* Last index tried; persists across calls. */
	static int ifindex;

	do {
		ifindex++;
		/* Restart at 1 once the counter is no longer positive. */
		if (ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
4762
4763/* Delayed registration/unregisteration */
4764static LIST_HEAD(net_todo_list);
4765
4766static void net_set_todo(struct net_device *dev)
4767{
4768	list_add_tail(&dev->todo_list, &net_todo_list);
4769}
4770
4771static void rollback_registered_many(struct list_head *head)
4772{
4773	struct net_device *dev, *tmp;
4774
4775	BUG_ON(dev_boot_phase);
4776	ASSERT_RTNL();
4777
4778	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4779		/* Some devices call without registering
4780		 * for initialization unwind. Remove those
4781		 * devices and proceed with the remaining.
4782		 */
4783		if (dev->reg_state == NETREG_UNINITIALIZED) {
4784			pr_debug("unregister_netdevice: device %s/%p never "
4785				 "was registered\n", dev->name, dev);
4786
4787			WARN_ON(1);
4788			list_del(&dev->unreg_list);
4789			continue;
4790		}
4791
4792		BUG_ON(dev->reg_state != NETREG_REGISTERED);
4793
4794		/* If device is running, close it first. */
4795		dev_close(dev);
4796
4797		/* And unlink it from device chain. */
4798		unlist_netdevice(dev);
4799
4800		dev->reg_state = NETREG_UNREGISTERING;
4801	}
4802
4803	synchronize_net();
4804
4805	list_for_each_entry(dev, head, unreg_list) {
4806		/* Shutdown queueing discipline. */
4807		dev_shutdown(dev);
4808
4809
4810		/* Notify protocols, that we are about to destroy
4811		   this device. They should clean all the things.
4812		*/
4813		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4814
4815		if (!dev->rtnl_link_ops ||
4816		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4817			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
4818
4819		/*
4820		 *	Flush the unicast and multicast chains
4821		 */
4822		dev_uc_flush(dev);
4823		dev_mc_flush(dev);
4824
4825		if (dev->netdev_ops->ndo_uninit)
4826			dev->netdev_ops->ndo_uninit(dev);
4827
4828		/* Notifier chain MUST detach us from master device. */
4829		WARN_ON(dev->master);
4830
4831		/* Remove entries from kobject tree */
4832		netdev_unregister_kobject(dev);
4833	}
4834
4835	/* Process any work delayed until the end of the batch */
4836	dev = list_first_entry(head, struct net_device, unreg_list);
4837	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
4838
4839	synchronize_net();
4840
4841	list_for_each_entry(dev, head, unreg_list)
4842		dev_put(dev);
4843}
4844
4845static void rollback_registered(struct net_device *dev)
4846{
4847	LIST_HEAD(single);
4848
4849	list_add(&dev->unreg_list, &single);
4850	rollback_registered_many(&single);
4851}
4852
4853static void __netdev_init_queue_locks_one(struct net_device *dev,
4854					  struct netdev_queue *dev_queue,
4855					  void *_unused)
4856{
4857	spin_lock_init(&dev_queue->_xmit_lock);
4858	netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4859	dev_queue->xmit_lock_owner = -1;
4860}
4861
4862static void netdev_init_queue_locks(struct net_device *dev)
4863{
4864	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4865	__netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
4866}
4867
4868unsigned long netdev_fix_features(unsigned long features, const char *name)
4869{
4870	/* Fix illegal SG+CSUM combinations. */
4871	if ((features & NETIF_F_SG) &&
4872	    !(features & NETIF_F_ALL_CSUM)) {
4873		if (name)
4874			printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4875			       "checksum feature.\n", name);
4876		features &= ~NETIF_F_SG;
4877	}
4878
4879	/* TSO requires that SG is present as well. */
4880	if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4881		if (name)
4882			printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4883			       "SG feature.\n", name);
4884		features &= ~NETIF_F_TSO;
4885	}
4886
4887	if (features & NETIF_F_UFO) {
4888		if (!(features & NETIF_F_GEN_CSUM)) {
4889			if (name)
4890				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4891				       "since no NETIF_F_HW_CSUM feature.\n",
4892				       name);
4893			features &= ~NETIF_F_UFO;
4894		}
4895
4896		if (!(features & NETIF_F_SG)) {
4897			if (name)
4898				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4899				       "since no NETIF_F_SG feature.\n", name);
4900			features &= ~NETIF_F_UFO;
4901		}
4902	}
4903
4904	return features;
4905}
4906EXPORT_SYMBOL(netdev_fix_features);
4907
4908/**
4909 *	netif_stacked_transfer_operstate -	transfer operstate
4910 *	@rootdev: the root or lower level device to transfer state from
4911 *	@dev: the device to transfer operstate to
4912 *
4913 *	Transfer operational state from root to device. This is normally
4914 *	called when a stacking relationship exists between the root
4915 *	device and the device(a leaf device).
4916 */
4917void netif_stacked_transfer_operstate(const struct net_device *rootdev,
4918					struct net_device *dev)
4919{
4920	if (rootdev->operstate == IF_OPER_DORMANT)
4921		netif_dormant_on(dev);
4922	else
4923		netif_dormant_off(dev);
4924
4925	if (netif_carrier_ok(rootdev)) {
4926		if (!netif_carrier_ok(dev))
4927			netif_carrier_on(dev);
4928	} else {
4929		if (netif_carrier_ok(dev))
4930			netif_carrier_off(dev);
4931	}
4932}
4933EXPORT_SYMBOL(netif_stacked_transfer_operstate);
4934
4935/**
4936 *	register_netdevice	- register a network device
4937 *	@dev: device to register
4938 *
4939 *	Take a completed network device structure and add it to the kernel
4940 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4941 *	chain. 0 is returned on success. A negative errno code is returned
4942 *	on a failure to set up the device, or if the name is a duplicate.
4943 *
4944 *	Callers must hold the rtnl semaphore. You may want
4945 *	register_netdev() instead of this.
4946 *
4947 *	BUGS:
4948 *	The locking appears insufficient to guarantee two parallel registers
4949 *	will not get the same name.
4950 */
4951
4952int register_netdevice(struct net_device *dev)
4953{
4954	int ret;
4955	struct net *net = dev_net(dev);
4956
4957	BUG_ON(dev_boot_phase);
4958	ASSERT_RTNL();
4959
4960	might_sleep();
4961
4962	/* When net_device's are persistent, this will be fatal. */
4963	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
4964	BUG_ON(!net);
4965
4966	spin_lock_init(&dev->addr_list_lock);
4967	netdev_set_addr_lockdep_class(dev);
4968	netdev_init_queue_locks(dev);
4969
4970	dev->iflink = -1;
4971
4972#ifdef CONFIG_RPS
4973	if (!dev->num_rx_queues) {
4974		/*
4975		 * Allocate a single RX queue if driver never called
4976		 * alloc_netdev_mq
4977		 */
4978
4979		dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL);
4980		if (!dev->_rx) {
4981			ret = -ENOMEM;
4982			goto out;
4983		}
4984
4985		dev->_rx->first = dev->_rx;
4986		atomic_set(&dev->_rx->count, 1);
4987		dev->num_rx_queues = 1;
4988	}
4989#endif
4990	/* Init, if this function is available */
4991	if (dev->netdev_ops->ndo_init) {
4992		ret = dev->netdev_ops->ndo_init(dev);
4993		if (ret) {
4994			if (ret > 0)
4995				ret = -EIO;
4996			goto out;
4997		}
4998	}
4999
5000	ret = dev_get_valid_name(dev, dev->name, 0);
5001	if (ret)
5002		goto err_uninit;
5003
5004	dev->ifindex = dev_new_index(net);
5005	if (dev->iflink == -1)
5006		dev->iflink = dev->ifindex;
5007
5008	/* Fix illegal checksum combinations */
5009	if ((dev->features & NETIF_F_HW_CSUM) &&
5010	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5011		printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
5012		       dev->name);
5013		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5014	}
5015
5016	if ((dev->features & NETIF_F_NO_CSUM) &&
5017	    (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5018		printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
5019		       dev->name);
5020		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5021	}
5022
5023	dev->features = netdev_fix_features(dev->features, dev->name);
5024
5025	/* Enable software GSO if SG is supported. */
5026	if (dev->features & NETIF_F_SG)
5027		dev->features |= NETIF_F_GSO;
5028
5029	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5030	ret = notifier_to_errno(ret);
5031	if (ret)
5032		goto err_uninit;
5033
5034	ret = netdev_register_kobject(dev);
5035	if (ret)
5036		goto err_uninit;
5037	dev->reg_state = NETREG_REGISTERED;
5038
5039	/*
5040	 *	Default initial state at registry is that the
5041	 *	device is present.
5042	 */
5043
5044	set_bit(__LINK_STATE_PRESENT, &dev->state);
5045
5046	dev_init_scheduler(dev);
5047	dev_hold(dev);
5048	list_netdevice(dev);
5049
5050	/* Notify protocols, that a new device appeared. */
5051	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5052	ret = notifier_to_errno(ret);
5053	if (ret) {
5054		rollback_registered(dev);
5055		dev->reg_state = NETREG_UNREGISTERED;
5056	}
5057	/*
5058	 *	Prevent userspace races by waiting until the network
5059	 *	device is fully setup before sending notifications.
5060	 */
5061	if (!dev->rtnl_link_ops ||
5062	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5063		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5064
5065out:
5066	return ret;
5067
5068err_uninit:
5069	if (dev->netdev_ops->ndo_uninit)
5070		dev->netdev_ops->ndo_uninit(dev);
5071	goto out;
5072}
5073EXPORT_SYMBOL(register_netdevice);
5074
5075/**
5076 *	init_dummy_netdev	- init a dummy network device for NAPI
5077 *	@dev: device to init
5078 *
5079 *	This takes a network device structure and initialize the minimum
5080 *	amount of fields so it can be used to schedule NAPI polls without
5081 *	registering a full blown interface. This is to be used by drivers
5082 *	that need to tie several hardware interfaces to a single NAPI
5083 *	poll scheduler due to HW limitations.
5084 */
5085int init_dummy_netdev(struct net_device *dev)
5086{
5087	/* Clear everything. Note we don't initialize spinlocks
5088	 * are they aren't supposed to be taken by any of the
5089	 * NAPI code and this dummy netdev is supposed to be
5090	 * only ever used for NAPI polls
5091	 */
5092	memset(dev, 0, sizeof(struct net_device));
5093
5094	/* make sure we BUG if trying to hit standard
5095	 * register/unregister code path
5096	 */
5097	dev->reg_state = NETREG_DUMMY;
5098
5099	/* initialize the ref count */
5100	atomic_set(&dev->refcnt, 1);
5101
5102	/* NAPI wants this */
5103	INIT_LIST_HEAD(&dev->napi_list);
5104
5105	/* a dummy interface is started by default */
5106	set_bit(__LINK_STATE_PRESENT, &dev->state);
5107	set_bit(__LINK_STATE_START, &dev->state);
5108
5109	return 0;
5110}
5111EXPORT_SYMBOL_GPL(init_dummy_netdev);
5112
5113
5114/**
5115 *	register_netdev	- register a network device
5116 *	@dev: device to register
5117 *
5118 *	Take a completed network device structure and add it to the kernel
5119 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5120 *	chain. 0 is returned on success. A negative errno code is returned
5121 *	on a failure to set up the device, or if the name is a duplicate.
5122 *
5123 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
5124 *	and expands the device name if you passed a format string to
5125 *	alloc_netdev.
5126 */
5127int register_netdev(struct net_device *dev)
5128{
5129	int err;
5130
5131	rtnl_lock();
5132
5133	/*
5134	 * If the name is a format string the caller wants us to do a
5135	 * name allocation.
5136	 */
5137	if (strchr(dev->name, '%')) {
5138		err = dev_alloc_name(dev, dev->name);
5139		if (err < 0)
5140			goto out;
5141	}
5142
5143	err = register_netdevice(dev);
5144out:
5145	rtnl_unlock();
5146	return err;
5147}
5148EXPORT_SYMBOL(register_netdev);
5149
5150/*
5151 * netdev_wait_allrefs - wait until all references are gone.
5152 *
5153 * This is called when unregistering network devices.
5154 *
5155 * Any protocol or device that holds a reference should register
5156 * for netdevice notification, and cleanup and put back the
5157 * reference if they receive an UNREGISTER event.
5158 * We can get stuck here if buggy protocols don't correctly
5159 * call dev_put.
5160 */
5161static void netdev_wait_allrefs(struct net_device *dev)
5162{
5163	unsigned long rebroadcast_time, warning_time;
5164
5165	linkwatch_forget_dev(dev);
5166
5167	rebroadcast_time = warning_time = jiffies;
5168	while (atomic_read(&dev->refcnt) != 0) {
5169		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5170			rtnl_lock();
5171
5172			/* Rebroadcast unregister notification */
5173			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5174			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5175			 * should have already handle it the first time */
5176
5177			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5178				     &dev->state)) {
5179				/* We must not have linkwatch events
5180				 * pending on unregister. If this
5181				 * happens, we simply run the queue
5182				 * unscheduled, resulting in a noop
5183				 * for this device.
5184				 */
5185				linkwatch_run_queue();
5186			}
5187
5188			__rtnl_unlock();
5189
5190			rebroadcast_time = jiffies;
5191		}
5192
5193		msleep(250);
5194
5195		if (time_after(jiffies, warning_time + 10 * HZ)) {
5196			printk(KERN_EMERG "unregister_netdevice: "
5197			       "waiting for %s to become free. Usage "
5198			       "count = %d\n",
5199			       dev->name, atomic_read(&dev->refcnt));
5200			warning_time = jiffies;
5201		}
5202	}
5203}
5204
5205/* The sequence is:
5206 *
5207 *	rtnl_lock();
5208 *	...
5209 *	register_netdevice(x1);
5210 *	register_netdevice(x2);
5211 *	...
5212 *	unregister_netdevice(y1);
5213 *	unregister_netdevice(y2);
5214 *      ...
5215 *	rtnl_unlock();
5216 *	free_netdev(y1);
5217 *	free_netdev(y2);
5218 *
5219 * We are invoked by rtnl_unlock().
5220 * This allows us to deal with problems:
5221 * 1) We can delete sysfs objects which invoke hotplug
5222 *    without deadlocking with linkwatch via keventd.
5223 * 2) Since we run with the RTNL semaphore not held, we can sleep
5224 *    safely in order to wait for the netdev refcnt to drop to zero.
5225 *
5226 * We must not return until all unregister events added during
5227 * the interval the lock was held have been completed.
5228 */
5229void netdev_run_todo(void)
5230{
5231	struct list_head list;
5232
5233	/* Snapshot list, allow later requests */
5234	list_replace_init(&net_todo_list, &list);
5235
5236	__rtnl_unlock();
5237
5238	while (!list_empty(&list)) {
5239		struct net_device *dev
5240			= list_first_entry(&list, struct net_device, todo_list);
5241		list_del(&dev->todo_list);
5242
5243		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5244			printk(KERN_ERR "network todo '%s' but state %d\n",
5245			       dev->name, dev->reg_state);
5246			dump_stack();
5247			continue;
5248		}
5249
5250		dev->reg_state = NETREG_UNREGISTERED;
5251
5252		on_each_cpu(flush_backlog, dev, 1);
5253
5254		netdev_wait_allrefs(dev);
5255
5256		/* paranoia */
5257		BUG_ON(atomic_read(&dev->refcnt));
5258		WARN_ON(dev->ip_ptr);
5259		WARN_ON(dev->ip6_ptr);
5260		WARN_ON(dev->dn_ptr);
5261
5262		if (dev->destructor)
5263			dev->destructor(dev);
5264
5265		/* Free network device */
5266		kobject_put(&dev->dev.kobj);
5267	}
5268}
5269
5270/**
5271 *	dev_txq_stats_fold - fold tx_queues stats
5272 *	@dev: device to get statistics from
5273 *	@stats: struct net_device_stats to hold results
5274 */
5275void dev_txq_stats_fold(const struct net_device *dev,
5276			struct net_device_stats *stats)
5277{
5278	unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5279	unsigned int i;
5280	struct netdev_queue *txq;
5281
5282	for (i = 0; i < dev->num_tx_queues; i++) {
5283		txq = netdev_get_tx_queue(dev, i);
5284		tx_bytes   += txq->tx_bytes;
5285		tx_packets += txq->tx_packets;
5286		tx_dropped += txq->tx_dropped;
5287	}
5288	if (tx_bytes || tx_packets || tx_dropped) {
5289		stats->tx_bytes   = tx_bytes;
5290		stats->tx_packets = tx_packets;
5291		stats->tx_dropped = tx_dropped;
5292	}
5293}
5294EXPORT_SYMBOL(dev_txq_stats_fold);
5295
5296/**
5297 *	dev_get_stats	- get network device statistics
5298 *	@dev: device to get statistics from
5299 *
5300 *	Get network statistics from device. The device driver may provide
5301 *	its own method by setting dev->netdev_ops->get_stats; otherwise
5302 *	the internal statistics structure is used.
5303 */
5304const struct net_device_stats *dev_get_stats(struct net_device *dev)
5305{
5306	const struct net_device_ops *ops = dev->netdev_ops;
5307
5308	if (ops->ndo_get_stats)
5309		return ops->ndo_get_stats(dev);
5310
5311	dev_txq_stats_fold(dev, &dev->stats);
5312	return &dev->stats;
5313}
5314EXPORT_SYMBOL(dev_get_stats);
5315
5316static void netdev_init_one_queue(struct net_device *dev,
5317				  struct netdev_queue *queue,
5318				  void *_unused)
5319{
5320	queue->dev = dev;
5321}
5322
5323static void netdev_init_queues(struct net_device *dev)
5324{
5325	netdev_init_one_queue(dev, &dev->rx_queue, NULL);
5326	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5327	spin_lock_init(&dev->tx_global_lock);
5328}
5329
5330/**
5331 *	alloc_netdev_mq - allocate network device
5332 *	@sizeof_priv:	size of private data to allocate space for
5333 *	@name:		device name format string
5334 *	@setup:		callback to initialize device
5335 *	@queue_count:	the number of subqueues to allocate
5336 *
5337 *	Allocates a struct net_device with private data area for driver use
5338 *	and performs basic initialization.  Also allocates subquue structs
5339 *	for each queue on the device at the end of the netdevice.
5340 */
5341struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5342		void (*setup)(struct net_device *), unsigned int queue_count)
5343{
5344	struct netdev_queue *tx;
5345	struct net_device *dev;
5346	size_t alloc_size;
5347	struct net_device *p;
5348#ifdef CONFIG_RPS
5349	struct netdev_rx_queue *rx;
5350	int i;
5351#endif
5352
5353	BUG_ON(strlen(name) >= sizeof(dev->name));
5354
5355	alloc_size = sizeof(struct net_device);
5356	if (sizeof_priv) {
5357		/* ensure 32-byte alignment of private area */
5358		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5359		alloc_size += sizeof_priv;
5360	}
5361	/* ensure 32-byte alignment of whole construct */
5362	alloc_size += NETDEV_ALIGN - 1;
5363
5364	p = kzalloc(alloc_size, GFP_KERNEL);
5365	if (!p) {
5366		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5367		return NULL;
5368	}
5369
5370	tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
5371	if (!tx) {
5372		printk(KERN_ERR "alloc_netdev: Unable to allocate "
5373		       "tx qdiscs.\n");
5374		goto free_p;
5375	}
5376
5377#ifdef CONFIG_RPS
5378	rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5379	if (!rx) {
5380		printk(KERN_ERR "alloc_netdev: Unable to allocate "
5381		       "rx queues.\n");
5382		goto free_tx;
5383	}
5384
5385	atomic_set(&rx->count, queue_count);
5386
5387	/*
5388	 * Set a pointer to first element in the array which holds the
5389	 * reference count.
5390	 */
5391	for (i = 0; i < queue_count; i++)
5392		rx[i].first = rx;
5393#endif
5394
5395	dev = PTR_ALIGN(p, NETDEV_ALIGN);
5396	dev->padded = (char *)dev - (char *)p;
5397
5398	if (dev_addr_init(dev))
5399		goto free_rx;
5400
5401	dev_mc_init(dev);
5402	dev_uc_init(dev);
5403
5404	dev_net_set(dev, &init_net);
5405
5406	dev->_tx = tx;
5407	dev->num_tx_queues = queue_count;
5408	dev->real_num_tx_queues = queue_count;
5409
5410#ifdef CONFIG_RPS
5411	dev->_rx = rx;
5412	dev->num_rx_queues = queue_count;
5413#endif
5414
5415	dev->gso_max_size = GSO_MAX_SIZE;
5416
5417	netdev_init_queues(dev);
5418
5419	INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5420	dev->ethtool_ntuple_list.count = 0;
5421	INIT_LIST_HEAD(&dev->napi_list);
5422	INIT_LIST_HEAD(&dev->unreg_list);
5423	INIT_LIST_HEAD(&dev->link_watch_list);
5424	dev->priv_flags = IFF_XMIT_DST_RELEASE;
5425	setup(dev);
5426	strcpy(dev->name, name);
5427	return dev;
5428
5429free_rx:
5430#ifdef CONFIG_RPS
5431	kfree(rx);
5432free_tx:
5433#endif
5434	kfree(tx);
5435free_p:
5436	kfree(p);
5437	return NULL;
5438}
5439EXPORT_SYMBOL(alloc_netdev_mq);
5440
5441/**
5442 *	free_netdev - free network device
5443 *	@dev: device
5444 *
5445 *	This function does the last stage of destroying an allocated device
5446 * 	interface. The reference to the device object is released.
5447 *	If this is the last reference then it will be freed.
5448 */
5449void free_netdev(struct net_device *dev)
5450{
5451	struct napi_struct *p, *n;
5452
5453	release_net(dev_net(dev));
5454
5455	kfree(dev->_tx);
5456
5457	/* Flush device addresses */
5458	dev_addr_flush(dev);
5459
5460	/* Clear ethtool n-tuple list */
5461	ethtool_ntuple_flush(dev);
5462
5463	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5464		netif_napi_del(p);
5465
5466	/*  Compatibility with error handling in drivers */
5467	if (dev->reg_state == NETREG_UNINITIALIZED) {
5468		kfree((char *)dev - dev->padded);
5469		return;
5470	}
5471
5472	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5473	dev->reg_state = NETREG_RELEASED;
5474
5475	/* will free via device release */
5476	put_device(&dev->dev);
5477}
5478EXPORT_SYMBOL(free_netdev);
5479
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done, by waiting
 *	for synchronize_rcu() to return. Does not block later packets
 *	from starting. May sleep.
 */
void synchronize_net(void)
{
	might_sleep();
	synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
5492
5493/**
5494 *	unregister_netdevice_queue - remove device from the kernel
5495 *	@dev: device
5496 *	@head: list
5497 *
5498 *	This function shuts down a device interface and removes it
5499 *	from the kernel tables.
5500 *	If head not NULL, device is queued to be unregistered later.
5501 *
5502 *	Callers must hold the rtnl semaphore.  You may want
5503 *	unregister_netdev() instead of this.
5504 */
5505
5506void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5507{
5508	ASSERT_RTNL();
5509
5510	if (head) {
5511		list_move_tail(&dev->unreg_list, head);
5512	} else {
5513		rollback_registered(dev);
5514		/* Finish processing unregister after unlock */
5515		net_set_todo(dev);
5516	}
5517}
5518EXPORT_SYMBOL(unregister_netdevice_queue);
5519
5520/**
5521 *	unregister_netdevice_many - unregister many devices
5522 *	@head: list of devices
5523 */
5524void unregister_netdevice_many(struct list_head *head)
5525{
5526	struct net_device *dev;
5527
5528	if (!list_empty(head)) {
5529		rollback_registered_many(head);
5530		list_for_each_entry(dev, head, unreg_list)
5531			net_set_todo(dev);
5532	}
5533}
5534EXPORT_SYMBOL(unregister_netdevice_many);
5535
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	It calls unregister_netdevice() with the rtnl semaphore taken
 *	around the call.  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
5554
5555/**
5556 *	dev_change_net_namespace - move device to different nethost namespace
5557 *	@dev: device
5558 *	@net: network namespace
5559 *	@pat: If not NULL name pattern to try if the current device name
5560 *	      is already taken in the destination network namespace.
5561 *
5562 *	This function shuts down a device interface and moves it
5563 *	to a new network namespace. On success 0 is returned, on
5564 *	a failure a netagive errno code is returned.
5565 *
5566 *	Callers must hold the rtnl semaphore.
5567 */
5568
5569int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5570{
5571	int err;
5572
5573	ASSERT_RTNL();
5574
5575	/* Don't allow namespace local devices to be moved. */
5576	err = -EINVAL;
5577	if (dev->features & NETIF_F_NETNS_LOCAL)
5578		goto out;
5579
5580	/* Ensure the device has been registrered */
5581	err = -EINVAL;
5582	if (dev->reg_state != NETREG_REGISTERED)
5583		goto out;
5584
5585	/* Get out if there is nothing todo */
5586	err = 0;
5587	if (net_eq(dev_net(dev), net))
5588		goto out;
5589
5590	/* Pick the destination device name, and ensure
5591	 * we can use it in the destination network namespace.
5592	 */
5593	err = -EEXIST;
5594	if (__dev_get_by_name(net, dev->name)) {
5595		/* We get here if we can't use the current device name */
5596		if (!pat)
5597			goto out;
5598		if (dev_get_valid_name(dev, pat, 1))
5599			goto out;
5600	}
5601
5602	/*
5603	 * And now a mini version of register_netdevice unregister_netdevice.
5604	 */
5605
5606	/* If device is running close it first. */
5607	dev_close(dev);
5608
5609	/* And unlink it from device chain */
5610	err = -ENODEV;
5611	unlist_netdevice(dev);
5612
5613	synchronize_net();
5614
5615	/* Shutdown queueing discipline. */
5616	dev_shutdown(dev);
5617
5618	/* Notify protocols, that we are about to destroy
5619	   this device. They should clean all the things.
5620	*/
5621	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5622	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5623
5624	/*
5625	 *	Flush the unicast and multicast chains
5626	 */
5627	dev_uc_flush(dev);
5628	dev_mc_flush(dev);
5629
5630	/* Actually switch the network namespace */
5631	dev_net_set(dev, net);
5632
5633	/* If there is an ifindex conflict assign a new one */
5634	if (__dev_get_by_index(net, dev->ifindex)) {
5635		int iflink = (dev->iflink == dev->ifindex);
5636		dev->ifindex = dev_new_index(net);
5637		if (iflink)
5638			dev->iflink = dev->ifindex;
5639	}
5640
5641	/* Fixup kobjects */
5642	err = device_rename(&dev->dev, dev->name);
5643	WARN_ON(err);
5644
5645	/* Add the device back in the hashes */
5646	list_netdevice(dev);
5647
5648	/* Notify protocols, that a new device appeared. */
5649	call_netdevice_notifiers(NETDEV_REGISTER, dev);
5650
5651	/*
5652	 *	Prevent userspace races by waiting until the network
5653	 *	device is fully setup before sending notifications.
5654	 */
5655	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5656
5657	synchronize_net();
5658	err = 0;
5659out:
5660	return err;
5661}
5662EXPORT_SYMBOL_GPL(dev_change_net_namespace);
5663
5664static int dev_cpu_callback(struct notifier_block *nfb,
5665			    unsigned long action,
5666			    void *ocpu)
5667{
5668	struct sk_buff **list_skb;
5669	struct sk_buff *skb;
5670	unsigned int cpu, oldcpu = (unsigned long)ocpu;
5671	struct softnet_data *sd, *oldsd;
5672
5673	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5674		return NOTIFY_OK;
5675
5676	local_irq_disable();
5677	cpu = smp_processor_id();
5678	sd = &per_cpu(softnet_data, cpu);
5679	oldsd = &per_cpu(softnet_data, oldcpu);
5680
5681	/* Find end of our completion_queue. */
5682	list_skb = &sd->completion_queue;
5683	while (*list_skb)
5684		list_skb = &(*list_skb)->next;
5685	/* Append completion queue from offline CPU. */
5686	*list_skb = oldsd->completion_queue;
5687	oldsd->completion_queue = NULL;
5688
5689	/* Append output queue from offline CPU. */
5690	if (oldsd->output_queue) {
5691		*sd->output_queue_tailp = oldsd->output_queue;
5692		sd->output_queue_tailp = oldsd->output_queue_tailp;
5693		oldsd->output_queue = NULL;
5694		oldsd->output_queue_tailp = &oldsd->output_queue;
5695	}
5696
5697	raise_softirq_irqoff(NET_TX_SOFTIRQ);
5698	local_irq_enable();
5699
5700	/* Process offline CPU's input_pkt_queue */
5701	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
5702		netif_rx(skb);
5703		input_queue_head_incr(oldsd);
5704	}
5705	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
5706		netif_rx(skb);
5707		input_queue_head_incr(oldsd);
5708	}
5709
5710	return NOTIFY_OK;
5711}
5712
5713
5714/**
5715 *	netdev_increment_features - increment feature set by one
5716 *	@all: current feature set
5717 *	@one: new feature set
5718 *	@mask: mask feature set
5719 *
5720 *	Computes a new feature set after adding a device with feature set
5721 *	@one to the master device with current feature set @all.  Will not
5722 *	enable anything that is off in @mask. Returns the new feature set.
5723 */
5724unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5725					unsigned long mask)
5726{
5727	/* If device needs checksumming, downgrade to it. */
5728	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5729		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5730	else if (mask & NETIF_F_ALL_CSUM) {
5731		/* If one device supports v4/v6 checksumming, set for all. */
5732		if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5733		    !(all & NETIF_F_GEN_CSUM)) {
5734			all &= ~NETIF_F_ALL_CSUM;
5735			all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5736		}
5737
5738		/* If one device supports hw checksumming, set for all. */
5739		if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5740			all &= ~NETIF_F_ALL_CSUM;
5741			all |= NETIF_F_HW_CSUM;
5742		}
5743	}
5744
5745	one |= NETIF_F_ALL_CSUM;
5746
5747	one |= all & NETIF_F_ONE_FOR_ALL;
5748	all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
5749	all |= one & mask & NETIF_F_ONE_FOR_ALL;
5750
5751	return all;
5752}
5753EXPORT_SYMBOL(netdev_increment_features);
5754
5755static struct hlist_head *netdev_create_hash(void)
5756{
5757	int i;
5758	struct hlist_head *hash;
5759
5760	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5761	if (hash != NULL)
5762		for (i = 0; i < NETDEV_HASHENTRIES; i++)
5763			INIT_HLIST_HEAD(&hash[i]);
5764
5765	return hash;
5766}
5767
5768/* Initialize per network namespace state */
5769static int __net_init netdev_init(struct net *net)
5770{
5771	INIT_LIST_HEAD(&net->dev_base_head);
5772
5773	net->dev_name_head = netdev_create_hash();
5774	if (net->dev_name_head == NULL)
5775		goto err_name;
5776
5777	net->dev_index_head = netdev_create_hash();
5778	if (net->dev_index_head == NULL)
5779		goto err_idx;
5780
5781	return 0;
5782
5783err_idx:
5784	kfree(net->dev_name_head);
5785err_name:
5786	return -ENOMEM;
5787}
5788
5789/**
5790 *	netdev_drivername - network driver for the device
5791 *	@dev: network device
5792 *	@buffer: buffer for resulting name
5793 *	@len: size of buffer
5794 *
5795 *	Determine network driver for device.
5796 */
5797char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5798{
5799	const struct device_driver *driver;
5800	const struct device *parent;
5801
5802	if (len <= 0 || !buffer)
5803		return buffer;
5804	buffer[0] = 0;
5805
5806	parent = dev->dev.parent;
5807
5808	if (!parent)
5809		return buffer;
5810
5811	driver = parent->driver;
5812	if (driver && driver->name)
5813		strlcpy(buffer, driver->name, len);
5814	return buffer;
5815}
5816
5817static void __net_exit netdev_exit(struct net *net)
5818{
5819	kfree(net->dev_name_head);
5820	kfree(net->dev_index_head);
5821}
5822
/* Per network namespace setup/teardown of the device list and hash tables. */
static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};
5827
5828static void __net_exit default_device_exit(struct net *net)
5829{
5830	struct net_device *dev, *aux;
5831	/*
5832	 * Push all migratable network devices back to the
5833	 * initial network namespace
5834	 */
5835	rtnl_lock();
5836	for_each_netdev_safe(net, dev, aux) {
5837		int err;
5838		char fb_name[IFNAMSIZ];
5839
5840		/* Ignore unmoveable devices (i.e. loopback) */
5841		if (dev->features & NETIF_F_NETNS_LOCAL)
5842			continue;
5843
5844		/* Leave virtual devices for the generic cleanup */
5845		if (dev->rtnl_link_ops)
5846			continue;
5847
5848		/* Push remaing network devices to init_net */
5849		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5850		err = dev_change_net_namespace(dev, &init_net, fb_name);
5851		if (err) {
5852			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
5853				__func__, dev->name, err);
5854			BUG();
5855		}
5856	}
5857	rtnl_unlock();
5858}
5859
5860static void __net_exit default_device_exit_batch(struct list_head *net_list)
5861{
5862	/* At exit all network devices most be removed from a network
5863	 * namespace.  Do this in the reverse order of registeration.
5864	 * Do this across as many network namespaces as possible to
5865	 * improve batching efficiency.
5866	 */
5867	struct net_device *dev;
5868	struct net *net;
5869	LIST_HEAD(dev_kill_list);
5870
5871	rtnl_lock();
5872	list_for_each_entry(net, net_list, exit_list) {
5873		for_each_netdev_reverse(net, dev) {
5874			if (dev->rtnl_link_ops)
5875				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
5876			else
5877				unregister_netdevice_queue(dev, &dev_kill_list);
5878		}
5879	}
5880	unregister_netdevice_many(&dev_kill_list);
5881	rtnl_unlock();
5882}
5883
/*
 * Namespace exit handling for network devices: .exit moves migratable
 * devices back to init_net, .exit_batch unregisters whatever is left.
 */
static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};
5888
5889/*
5890 *	Initialize the DEV module. At boot time this walks the device list and
5891 *	unhooks any devices that fail to initialise (normally hardware not
5892 *	present) and leaves us with a valid list of present and active devices.
5893 *
5894 */
5895
5896/*
5897 *       This is called single threaded during boot, so no need
5898 *       to take the rtnl semaphore.
5899 */
5900static int __init net_dev_init(void)
5901{
5902	int i, rc = -ENOMEM;
5903
5904	BUG_ON(!dev_boot_phase);
5905
5906	if (dev_proc_init())
5907		goto out;
5908
5909	if (netdev_kobject_init())
5910		goto out;
5911
5912	INIT_LIST_HEAD(&ptype_all);
5913	for (i = 0; i < PTYPE_HASH_SIZE; i++)
5914		INIT_LIST_HEAD(&ptype_base[i]);
5915
5916	if (register_pernet_subsys(&netdev_net_ops))
5917		goto out;
5918
5919	/*
5920	 *	Initialise the packet receive queues.
5921	 */
5922
5923	for_each_possible_cpu(i) {
5924		struct softnet_data *sd = &per_cpu(softnet_data, i);
5925
5926		memset(sd, 0, sizeof(*sd));
5927		skb_queue_head_init(&sd->input_pkt_queue);
5928		skb_queue_head_init(&sd->process_queue);
5929		sd->completion_queue = NULL;
5930		INIT_LIST_HEAD(&sd->poll_list);
5931		sd->output_queue = NULL;
5932		sd->output_queue_tailp = &sd->output_queue;
5933#ifdef CONFIG_RPS
5934		sd->csd.func = rps_trigger_softirq;
5935		sd->csd.info = sd;
5936		sd->csd.flags = 0;
5937		sd->cpu = i;
5938#endif
5939
5940		sd->backlog.poll = process_backlog;
5941		sd->backlog.weight = weight_p;
5942		sd->backlog.gro_list = NULL;
5943		sd->backlog.gro_count = 0;
5944	}
5945
5946	dev_boot_phase = 0;
5947
5948	/* The loopback device is special if any other network devices
5949	 * is present in a network namespace the loopback device must
5950	 * be present. Since we now dynamically allocate and free the
5951	 * loopback device ensure this invariant is maintained by
5952	 * keeping the loopback device as the first device on the
5953	 * list of network devices.  Ensuring the loopback devices
5954	 * is the first device that appears and the last network device
5955	 * that disappears.
5956	 */
5957	if (register_pernet_device(&loopback_net_ops))
5958		goto out;
5959
5960	if (register_pernet_device(&default_device_ops))
5961		goto out;
5962
5963	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
5964	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
5965
5966	hotcpu_notifier(dev_cpu_callback, 0);
5967	dst_init();
5968	dev_mcast_init();
5969	rc = 0;
5970out:
5971	return rc;
5972}
5973
5974subsys_initcall(net_dev_init);
5975
5976static int __init initialize_hashrnd(void)
5977{
5978	get_random_bytes(&hashrnd, sizeof(hashrnd));
5979	return 0;
5980}
5981
5982late_initcall_sync(initialize_hashrnd);
5983