net/core/dev.c at v2.6.35 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / core / dev.c
at v2.6.35 148 kB view raw
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <asm/system.h>
  77#include <linux/bitops.h>
  78#include <linux/capability.h>
  79#include <linux/cpu.h>
  80#include <linux/types.h>
  81#include <linux/kernel.h>
  82#include <linux/hash.h>
  83#include <linux/slab.h>
  84#include <linux/sched.h>
  85#include <linux/mutex.h>
  86#include <linux/string.h>
  87#include <linux/mm.h>
  88#include <linux/socket.h>
  89#include <linux/sockios.h>
  90#include <linux/errno.h>
  91#include <linux/interrupt.h>
  92#include <linux/if_ether.h>
  93#include <linux/netdevice.h>
  94#include <linux/etherdevice.h>
  95#include <linux/ethtool.h>
  96#include <linux/notifier.h>
  97#include <linux/skbuff.h>
  98#include <net/net_namespace.h>
  99#include <net/sock.h>
 100#include <linux/rtnetlink.h>
 101#include <linux/proc_fs.h>
 102#include <linux/seq_file.h>
 103#include <linux/stat.h>
 104#include <linux/if_bridge.h>
 105#include <linux/if_macvlan.h>
 106#include <net/dst.h>
 107#include <net/pkt_sched.h>
 108#include <net/checksum.h>
 109#include <net/xfrm.h>
 110#include <linux/highmem.h>
 111#include <linux/init.h>
 112#include <linux/kmod.h>
 113#include <linux/module.h>
 114#include <linux/netpoll.h>
 115#include <linux/rcupdate.h>
 116#include <linux/delay.h>
 117#include <net/wext.h>
 118#include <net/iw_handler.h>
 119#include <asm/current.h>
 120#include <linux/audit.h>
 121#include <linux/dmaengine.h>
 122#include <linux/err.h>
 123#include <linux/ctype.h>
 124#include <linux/if_arp.h>
 125#include <linux/if_vlan.h>
 126#include <linux/ip.h>
 127#include <net/ip.h>
 128#include <linux/ipv6.h>
 129#include <linux/in.h>
 130#include <linux/jhash.h>
 131#include <linux/random.h>
 132#include <trace/events/napi.h>
 133#include <linux/pci.h>
 134
 135#include "net-sysfs.h"
 136
 137/* Instead of increasing this, you should create a hash table. */
 138#define MAX_GRO_SKBS 8
 139
 140/* This should be increased if a protocol with a bigger head is added. */
 141#define GRO_MAX_HEAD (MAX_HEADER + 128)
 142
 143/*
 144 *	The list of packet types we will receive (as opposed to discard)
 145 *	and the routines to invoke.
 146 *
 147 *	Why 16. Because with 16 the only overlap we get on a hash of the
 148 *	low nibble of the protocol value is RARP/SNAP/X.25.
 149 *
 150 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 151 *             sure which should go first, but I bet it won't make much
 152 *             difference if we are running VLANs.  The good news is that
 153 *             this protocol won't be in the list unless compiled in, so
 154 *             the average user (w/out VLANs) will not be adversely affected.
 155 *             --BLG
 156 *
 157 *		0800	IP
 158 *		8100    802.1Q VLAN
 159 *		0001	802.3
 160 *		0002	AX.25
 161 *		0004	802.2
 162 *		8035	RARP
 163 *		0005	SNAP
 164 *		0805	X.25
 165 *		0806	ARP
 166 *		8137	IPX
 167 *		0009	Localtalk
 168 *		86DD	IPv6
 169 */
 170
/* Number of buckets in the packet-type hash, and the mask used to index it. */
#define PTYPE_HASH_SIZE	(16)
#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)

/* Taken by dev_add_pack()/__dev_remove_pack() around updates of the lists below. */
static DEFINE_SPINLOCK(ptype_lock);
/* Handlers for specific protocols, bucketed by ntohs(type) & PTYPE_HASH_MASK. */
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
/* Handlers registered for ETH_P_ALL. */
static struct list_head ptype_all __read_mostly;	/* Taps */
 177
 178/*
 179 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 180 * semaphore.
 181 *
 182 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 183 *
 184 * Writers must hold the rtnl semaphore while they loop through the
 185 * dev_base_head list, and hold dev_base_lock for writing when they do the
 186 * actual updates.  This allows pure readers to access the list even
 187 * while a writer is preparing to update it.
 188 *
 189 * To put it another way, dev_base_lock is held for writing only to
 190 * protect against pure readers; the rtnl semaphore provides the
 191 * protection against other writers.
 192 *
 193 * See, for example usages, register_netdevice() and
 194 * unregister_netdevice(), which must be called with the rtnl
 195 * semaphore held.
 196 */
 197DEFINE_RWLOCK(dev_base_lock);
 198EXPORT_SYMBOL(dev_base_lock);
 199
 200static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 201{
 202	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 203	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 204}
 205
 206static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 207{
 208	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 209}
 210
 211static inline void rps_lock(struct softnet_data *sd)
 212{
 213#ifdef CONFIG_RPS
 214	spin_lock(&sd->input_pkt_queue.lock);
 215#endif
 216}
 217
 218static inline void rps_unlock(struct softnet_data *sd)
 219{
 220#ifdef CONFIG_RPS
 221	spin_unlock(&sd->input_pkt_queue.lock);
 222#endif
 223}
 224
 225/* Device list insertion */
 226static int list_netdevice(struct net_device *dev)
 227{
 228	struct net *net = dev_net(dev);
 229
 230	ASSERT_RTNL();
 231
 232	write_lock_bh(&dev_base_lock);
 233	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 234	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 235	hlist_add_head_rcu(&dev->index_hlist,
 236			   dev_index_hash(net, dev->ifindex));
 237	write_unlock_bh(&dev_base_lock);
 238	return 0;
 239}
 240
 241/* Device list removal
 242 * caller must respect a RCU grace period before freeing/reusing dev
 243 */
 244static void unlist_netdevice(struct net_device *dev)
 245{
 246	ASSERT_RTNL();
 247
 248	/* Unlink dev from the device chain */
 249	write_lock_bh(&dev_base_lock);
 250	list_del_rcu(&dev->dev_list);
 251	hlist_del_rcu(&dev->name_hlist);
 252	hlist_del_rcu(&dev->index_hlist);
 253	write_unlock_bh(&dev_base_lock);
 254}
 255
 256/*
 257 *	Our notifier list
 258 */
 259
 260static RAW_NOTIFIER_HEAD(netdev_chain);
 261
 262/*
 263 *	Device drivers call our routines to queue packets here. We empty the
 264 *	queue in the local softnet handler.
 265 */
 266
 267DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 268EXPORT_PER_CPU_SYMBOL(softnet_data);
 269
 270#ifdef CONFIG_LOCKDEP
 271/*
 272 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 273 * according to dev->type
 274 */
/*
 * Device types that get their own lockdep class.  netdev_lock_pos()
 * searches this table; the matching index is then used to pick the
 * entry of netdev_lock_name[] and of the lock class key arrays, so the
 * order here must stay in step with netdev_lock_name[].  The last entry
 * doubles as the fallback for types that are not listed.
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
	 ARPHRD_VOID, ARPHRD_NONE};
 292
/*
 * Lockdep class names, indexed by the position netdev_lock_pos() returns
 * for a device type; entry N names the class for netdev_lock_type[N].
 * The same names are used for both the xmit lock and the address list
 * lock classes.
 */
static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
	 "_xmit_VOID", "_xmit_NONE"};
 310
 311static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 312static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 313
 314static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 315{
 316	int i;
 317
 318	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 319		if (netdev_lock_type[i] == dev_type)
 320			return i;
 321	/* the last key is used by default */
 322	return ARRAY_SIZE(netdev_lock_type) - 1;
 323}
 324
 325static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 326						 unsigned short dev_type)
 327{
 328	int i;
 329
 330	i = netdev_lock_pos(dev_type);
 331	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 332				   netdev_lock_name[i]);
 333}
 334
 335static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 336{
 337	int i;
 338
 339	i = netdev_lock_pos(dev->type);
 340	lockdep_set_class_and_name(&dev->addr_list_lock,
 341				   &netdev_addr_lock_key[i],
 342				   netdev_lock_name[i]);
 343}
 344#else
/* !CONFIG_LOCKDEP: there are no lock classes to assign, so this is a no-op. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
/* !CONFIG_LOCKDEP: there are no lock classes to assign, so this is a no-op. */
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
 352#endif
 353
 354/*******************************************************************************
 355
 356		Protocol management and registration routines
 357
 358*******************************************************************************/
 359
 360/*
 361 *	Add a protocol ID to the list. Now that the input handler is
 362 *	smarter we can dispense with all the messy stuff that used to be
 363 *	here.
 364 *
 365 *	BEWARE!!! Protocol handlers, mangling input packets,
 366 *	MUST BE last in hash buckets and checking protocol handlers
 367 *	MUST start from promiscuous ptype_all chain in net_bh.
 368 *	It is true now, do not change it.
 369 *	Explanation follows: if protocol handler, mangling packet, will
 370 *	be the first on list, it is not able to sense, that packet
 371 *	is cloned and should be copied-on-write, so that it will
 372 *	change it and subsequent readers will get broken packet.
 373 *							--ANK (980803)
 374 */
 375
 376/**
 377 *	dev_add_pack - add packet handler
 378 *	@pt: packet type declaration
 379 *
 380 *	Add a protocol handler to the networking stack. The passed &packet_type
 381 *	is linked into kernel lists and may not be freed until it has been
 382 *	removed from the kernel lists.
 383 *
 384 *	This call does not sleep therefore it can not
 385 *	guarantee all CPU's that are in middle of receiving packets
 386 *	will see the new packet type (until the next received packet).
 387 */
 388
 389void dev_add_pack(struct packet_type *pt)
 390{
 391	int hash;
 392
 393	spin_lock_bh(&ptype_lock);
 394	if (pt->type == htons(ETH_P_ALL))
 395		list_add_rcu(&pt->list, &ptype_all);
 396	else {
 397		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
 398		list_add_rcu(&pt->list, &ptype_base[hash]);
 399	}
 400	spin_unlock_bh(&ptype_lock);
 401}
 402EXPORT_SYMBOL(dev_add_pack);
 403
 404/**
 405 *	__dev_remove_pack	 - remove packet handler
 406 *	@pt: packet type declaration
 407 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is
 *	unlinked from the kernel lists, but no grace period is waited for
 *	before this function returns.
 412 *
 413 *      The packet type might still be in use by receivers
 414 *	and must not be freed until after all the CPU's have gone
 415 *	through a quiescent state.
 416 */
 417void __dev_remove_pack(struct packet_type *pt)
 418{
 419	struct list_head *head;
 420	struct packet_type *pt1;
 421
 422	spin_lock_bh(&ptype_lock);
 423
 424	if (pt->type == htons(ETH_P_ALL))
 425		head = &ptype_all;
 426	else
 427		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 428
 429	list_for_each_entry(pt1, head, list) {
 430		if (pt == pt1) {
 431			list_del_rcu(&pt->list);
 432			goto out;
 433		}
 434	}
 435
 436	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 437out:
 438	spin_unlock_bh(&ptype_lock);
 439}
 440EXPORT_SYMBOL(__dev_remove_pack);
 441
 442/**
 443 *	dev_remove_pack	 - remove packet handler
 444 *	@pt: packet type declaration
 445 *
 446 *	Remove a protocol handler that was previously added to the kernel
 447 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 448 *	from the kernel lists and can be freed or reused once this function
 449 *	returns.
 450 *
 451 *	This call sleeps to guarantee that no CPU is looking at the packet
 452 *	type after return.
 453 */
 454void dev_remove_pack(struct packet_type *pt)
 455{
 456	__dev_remove_pack(pt);
 457
 458	synchronize_net();
 459}
 460EXPORT_SYMBOL(dev_remove_pack);
 461
 462/******************************************************************************
 463
 464		      Device Boot-time Settings Routines
 465
 466*******************************************************************************/
 467
 468/* Boot time configuration table */
 469static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 470
 471/**
 472 *	netdev_boot_setup_add	- add new setup entry
 473 *	@name: name of the device
 474 *	@map: configured settings for the device
 475 *
 476 *	Adds new setup entry to the dev_boot_setup list.  The function
 477 *	returns 0 on error and 1 on success.  This is a generic routine to
 478 *	all netdevices.
 479 */
 480static int netdev_boot_setup_add(char *name, struct ifmap *map)
 481{
 482	struct netdev_boot_setup *s;
 483	int i;
 484
 485	s = dev_boot_setup;
 486	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 487		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 488			memset(s[i].name, 0, sizeof(s[i].name));
 489			strlcpy(s[i].name, name, IFNAMSIZ);
 490			memcpy(&s[i].map, map, sizeof(s[i].map));
 491			break;
 492		}
 493	}
 494
 495	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 496}
 497
 498/**
 499 *	netdev_boot_setup_check	- check boot time settings
 500 *	@dev: the netdevice
 501 *
 502 * 	Check boot time settings for the device.
 503 *	The found settings are set for the device to be used
 504 *	later in the device probing.
 505 *	Returns 0 if no settings found, 1 if they are.
 506 */
 507int netdev_boot_setup_check(struct net_device *dev)
 508{
 509	struct netdev_boot_setup *s = dev_boot_setup;
 510	int i;
 511
 512	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 513		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 514		    !strcmp(dev->name, s[i].name)) {
 515			dev->irq 	= s[i].map.irq;
 516			dev->base_addr 	= s[i].map.base_addr;
 517			dev->mem_start 	= s[i].map.mem_start;
 518			dev->mem_end 	= s[i].map.mem_end;
 519			return 1;
 520		}
 521	}
 522	return 0;
 523}
 524EXPORT_SYMBOL(netdev_boot_setup_check);
 525
 526
 527/**
 528 *	netdev_boot_base	- get address from boot time settings
 529 *	@prefix: prefix for network device
 530 *	@unit: id for network device
 531 *
 532 * 	Check boot time settings for the base address of device.
 533 *	The found settings are set for the device to be used
 534 *	later in the device probing.
 535 *	Returns 0 if no settings found.
 536 */
 537unsigned long netdev_boot_base(const char *prefix, int unit)
 538{
 539	const struct netdev_boot_setup *s = dev_boot_setup;
 540	char name[IFNAMSIZ];
 541	int i;
 542
 543	sprintf(name, "%s%d", prefix, unit);
 544
 545	/*
 546	 * If device already registered then return base of 1
 547	 * to indicate not to probe for this interface
 548	 */
 549	if (__dev_get_by_name(&init_net, name))
 550		return 1;
 551
 552	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 553		if (!strcmp(name, s[i].name))
 554			return s[i].map.base_addr;
 555	return 0;
 556}
 557
 558/*
 559 * Saves at boot time configured settings for any netdevice.
 560 */
 561int __init netdev_boot_setup(char *str)
 562{
 563	int ints[5];
 564	struct ifmap map;
 565
 566	str = get_options(str, ARRAY_SIZE(ints), ints);
 567	if (!str || !*str)
 568		return 0;
 569
 570	/* Save settings */
 571	memset(&map, 0, sizeof(map));
 572	if (ints[0] > 0)
 573		map.irq = ints[1];
 574	if (ints[0] > 1)
 575		map.base_addr = ints[2];
 576	if (ints[0] > 2)
 577		map.mem_start = ints[3];
 578	if (ints[0] > 3)
 579		map.mem_end = ints[4];
 580
 581	/* Add new entry to the list */
 582	return netdev_boot_setup_add(str, &map);
 583}
 584
 585__setup("netdev=", netdev_boot_setup);
 586
 587/*******************************************************************************
 588
 589			    Device Interface Subroutines
 590
 591*******************************************************************************/
 592
 593/**
 594 *	__dev_get_by_name	- find a device by its name
 595 *	@net: the applicable net namespace
 596 *	@name: name to find
 597 *
 598 *	Find an interface by name. Must be called under RTNL semaphore
 599 *	or @dev_base_lock. If the name is found a pointer to the device
 600 *	is returned. If the name is not found then %NULL is returned. The
 601 *	reference counters are not incremented so the caller must be
 602 *	careful with locks.
 603 */
 604
 605struct net_device *__dev_get_by_name(struct net *net, const char *name)
 606{
 607	struct hlist_node *p;
 608	struct net_device *dev;
 609	struct hlist_head *head = dev_name_hash(net, name);
 610
 611	hlist_for_each_entry(dev, p, head, name_hlist)
 612		if (!strncmp(dev->name, name, IFNAMSIZ))
 613			return dev;
 614
 615	return NULL;
 616}
 617EXPORT_SYMBOL(__dev_get_by_name);
 618
 619/**
 620 *	dev_get_by_name_rcu	- find a device by its name
 621 *	@net: the applicable net namespace
 622 *	@name: name to find
 623 *
 624 *	Find an interface by name.
 625 *	If the name is found a pointer to the device is returned.
 626 * 	If the name is not found then %NULL is returned.
 627 *	The reference counters are not incremented so the caller must be
 628 *	careful with locks. The caller must hold RCU lock.
 629 */
 630
 631struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 632{
 633	struct hlist_node *p;
 634	struct net_device *dev;
 635	struct hlist_head *head = dev_name_hash(net, name);
 636
 637	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 638		if (!strncmp(dev->name, name, IFNAMSIZ))
 639			return dev;
 640
 641	return NULL;
 642}
 643EXPORT_SYMBOL(dev_get_by_name_rcu);
 644
 645/**
 646 *	dev_get_by_name		- find a device by its name
 647 *	@net: the applicable net namespace
 648 *	@name: name to find
 649 *
 650 *	Find an interface by name. This can be called from any
 651 *	context and does its own locking. The returned handle has
 652 *	the usage count incremented and the caller must use dev_put() to
 653 *	release it when it is no longer needed. %NULL is returned if no
 654 *	matching device is found.
 655 */
 656
 657struct net_device *dev_get_by_name(struct net *net, const char *name)
 658{
 659	struct net_device *dev;
 660
 661	rcu_read_lock();
 662	dev = dev_get_by_name_rcu(net, name);
 663	if (dev)
 664		dev_hold(dev);
 665	rcu_read_unlock();
 666	return dev;
 667}
 668EXPORT_SYMBOL(dev_get_by_name);
 669
 670/**
 671 *	__dev_get_by_index - find a device by its ifindex
 672 *	@net: the applicable net namespace
 673 *	@ifindex: index of device
 674 *
 675 *	Search for an interface by index. Returns %NULL if the device
 676 *	is not found or a pointer to the device. The device has not
 677 *	had its reference counter increased so the caller must be careful
 678 *	about locking. The caller must hold either the RTNL semaphore
 679 *	or @dev_base_lock.
 680 */
 681
 682struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 683{
 684	struct hlist_node *p;
 685	struct net_device *dev;
 686	struct hlist_head *head = dev_index_hash(net, ifindex);
 687
 688	hlist_for_each_entry(dev, p, head, index_hlist)
 689		if (dev->ifindex == ifindex)
 690			return dev;
 691
 692	return NULL;
 693}
 694EXPORT_SYMBOL(__dev_get_by_index);
 695
 696/**
 697 *	dev_get_by_index_rcu - find a device by its ifindex
 698 *	@net: the applicable net namespace
 699 *	@ifindex: index of device
 700 *
 701 *	Search for an interface by index. Returns %NULL if the device
 702 *	is not found or a pointer to the device. The device has not
 703 *	had its reference counter increased so the caller must be careful
 704 *	about locking. The caller must hold RCU lock.
 705 */
 706
 707struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 708{
 709	struct hlist_node *p;
 710	struct net_device *dev;
 711	struct hlist_head *head = dev_index_hash(net, ifindex);
 712
 713	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 714		if (dev->ifindex == ifindex)
 715			return dev;
 716
 717	return NULL;
 718}
 719EXPORT_SYMBOL(dev_get_by_index_rcu);
 720
 721
 722/**
 723 *	dev_get_by_index - find a device by its ifindex
 724 *	@net: the applicable net namespace
 725 *	@ifindex: index of device
 726 *
 727 *	Search for an interface by index. Returns NULL if the device
 728 *	is not found or a pointer to the device. The device returned has
 729 *	had a reference added and the pointer is safe until the user calls
 730 *	dev_put to indicate they have finished with it.
 731 */
 732
 733struct net_device *dev_get_by_index(struct net *net, int ifindex)
 734{
 735	struct net_device *dev;
 736
 737	rcu_read_lock();
 738	dev = dev_get_by_index_rcu(net, ifindex);
 739	if (dev)
 740		dev_hold(dev);
 741	rcu_read_unlock();
 742	return dev;
 743}
 744EXPORT_SYMBOL(dev_get_by_index);
 745
 746/**
 747 *	dev_getbyhwaddr - find a device by its hardware address
 748 *	@net: the applicable net namespace
 749 *	@type: media type of device
 750 *	@ha: hardware address
 751 *
 752 *	Search for an interface by MAC address. Returns NULL if the device
 753 *	is not found or a pointer to the device. The caller must hold the
 754 *	rtnl semaphore. The returned device has not had its ref count increased
 755 *	and the caller must therefore be careful about locking
 756 *
 757 *	BUGS:
 758 *	If the API was consistent this would be __dev_get_by_hwaddr
 759 */
 760
 761struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
 762{
 763	struct net_device *dev;
 764
 765	ASSERT_RTNL();
 766
 767	for_each_netdev(net, dev)
 768		if (dev->type == type &&
 769		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 770			return dev;
 771
 772	return NULL;
 773}
 774EXPORT_SYMBOL(dev_getbyhwaddr);
 775
 776struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 777{
 778	struct net_device *dev;
 779
 780	ASSERT_RTNL();
 781	for_each_netdev(net, dev)
 782		if (dev->type == type)
 783			return dev;
 784
 785	return NULL;
 786}
 787EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 788
 789struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 790{
 791	struct net_device *dev, *ret = NULL;
 792
 793	rcu_read_lock();
 794	for_each_netdev_rcu(net, dev)
 795		if (dev->type == type) {
 796			dev_hold(dev);
 797			ret = dev;
 798			break;
 799		}
 800	rcu_read_unlock();
 801	return ret;
 802}
 803EXPORT_SYMBOL(dev_getfirstbyhwtype);
 804
 805/**
 806 *	dev_get_by_flags - find any device with given flags
 807 *	@net: the applicable net namespace
 808 *	@if_flags: IFF_* values
 809 *	@mask: bitmask of bits in if_flags to check
 810 *
 811 *	Search for any interface with the given flags. Returns NULL if a device
 812 *	is not found or a pointer to the device. The device returned has
 813 *	had a reference added and the pointer is safe until the user calls
 814 *	dev_put to indicate they have finished with it.
 815 */
 816
 817struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags,
 818				    unsigned short mask)
 819{
 820	struct net_device *dev, *ret;
 821
 822	ret = NULL;
 823	rcu_read_lock();
 824	for_each_netdev_rcu(net, dev) {
 825		if (((dev->flags ^ if_flags) & mask) == 0) {
 826			dev_hold(dev);
 827			ret = dev;
 828			break;
 829		}
 830	}
 831	rcu_read_unlock();
 832	return ret;
 833}
 834EXPORT_SYMBOL(dev_get_by_flags);
 835
 836/**
 837 *	dev_valid_name - check if name is okay for network device
 838 *	@name: name string
 839 *
 *	Network device names need to be valid file names
 *	to allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 843 */
 844int dev_valid_name(const char *name)
 845{
 846	if (*name == '\0')
 847		return 0;
 848	if (strlen(name) >= IFNAMSIZ)
 849		return 0;
 850	if (!strcmp(name, ".") || !strcmp(name, ".."))
 851		return 0;
 852
 853	while (*name) {
 854		if (*name == '/' || isspace(*name))
 855			return 0;
 856		name++;
 857	}
 858	return 1;
 859}
 860EXPORT_SYMBOL(dev_valid_name);
 861
 862/**
 863 *	__dev_alloc_name - allocate a name for a device
 864 *	@net: network namespace to allocate the device name in
 865 *	@name: name format string
 866 *	@buf:  scratch buffer and result name string
 867 *
 868 *	Passed a format string - eg "lt%d" it will try and find a suitable
 869 *	id. It scans list of devices to build up a free map, then chooses
 870 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 871 *	while allocating the name and adding the device in order to avoid
 872 *	duplicates.
 873 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 874 *	Returns the number of the unit assigned or a negative errno code.
 875 */
 876
 877static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 878{
 879	int i = 0;
 880	const char *p;
 881	const int max_netdevices = 8*PAGE_SIZE;
 882	unsigned long *inuse;
 883	struct net_device *d;
 884
 885	p = strnchr(name, IFNAMSIZ-1, '%');
 886	if (p) {
 887		/*
 888		 * Verify the string as this thing may have come from
 889		 * the user.  There must be either one "%d" and no other "%"
 890		 * characters.
 891		 */
 892		if (p[1] != 'd' || strchr(p + 2, '%'))
 893			return -EINVAL;
 894
 895		/* Use one page as a bit array of possible slots */
 896		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 897		if (!inuse)
 898			return -ENOMEM;
 899
 900		for_each_netdev(net, d) {
 901			if (!sscanf(d->name, name, &i))
 902				continue;
 903			if (i < 0 || i >= max_netdevices)
 904				continue;
 905
 906			/*  avoid cases where sscanf is not exact inverse of printf */
 907			snprintf(buf, IFNAMSIZ, name, i);
 908			if (!strncmp(buf, d->name, IFNAMSIZ))
 909				set_bit(i, inuse);
 910		}
 911
 912		i = find_first_zero_bit(inuse, max_netdevices);
 913		free_page((unsigned long) inuse);
 914	}
 915
 916	if (buf != name)
 917		snprintf(buf, IFNAMSIZ, name, i);
 918	if (!__dev_get_by_name(net, buf))
 919		return i;
 920
 921	/* It is possible to run out of possible slots
 922	 * when the name is long and there isn't enough space left
 923	 * for the digits, or if all bits are used.
 924	 */
 925	return -ENFILE;
 926}
 927
 928/**
 929 *	dev_alloc_name - allocate a name for a device
 930 *	@dev: device
 931 *	@name: name format string
 932 *
 933 *	Passed a format string - eg "lt%d" it will try and find a suitable
 934 *	id. It scans list of devices to build up a free map, then chooses
 935 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 936 *	while allocating the name and adding the device in order to avoid
 937 *	duplicates.
 938 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 939 *	Returns the number of the unit assigned or a negative errno code.
 940 */
 941
 942int dev_alloc_name(struct net_device *dev, const char *name)
 943{
 944	char buf[IFNAMSIZ];
 945	struct net *net;
 946	int ret;
 947
 948	BUG_ON(!dev_net(dev));
 949	net = dev_net(dev);
 950	ret = __dev_alloc_name(net, name, buf);
 951	if (ret >= 0)
 952		strlcpy(dev->name, buf, IFNAMSIZ);
 953	return ret;
 954}
 955EXPORT_SYMBOL(dev_alloc_name);
 956
 957static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
 958{
 959	struct net *net;
 960
 961	BUG_ON(!dev_net(dev));
 962	net = dev_net(dev);
 963
 964	if (!dev_valid_name(name))
 965		return -EINVAL;
 966
 967	if (fmt && strchr(name, '%'))
 968		return dev_alloc_name(dev, name);
 969	else if (__dev_get_by_name(net, name))
 970		return -EEXIST;
 971	else if (dev->name != name)
 972		strlcpy(dev->name, name, IFNAMSIZ);
 973
 974	return 0;
 975}
 976
/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device, can pass format strings "eth%d".
 *	for wildcarding.
 *
 *	Must be called with the RTNL held. Returns -EBUSY if the device is
 *	up and 0 if @newname already equals the current name. Otherwise the
 *	new name is validated, the underlying struct device is renamed, the
 *	device is rehashed under its new name and NETDEV_CHANGENAME is sent.
 *	If a notifier rejects the change the old name is restored through
 *	the same sequence and the notifier's errno is returned. On success
 *	the (non-negative) result of dev_get_valid_name() is returned.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
		return 0;

	/* Keep a copy so the name can be restored on failure. */
	memcpy(oldname, dev->name, IFNAMSIZ);

	/* On success dev->name already holds the new name. */
	err = dev_get_valid_name(dev, newname, 1);
	if (err < 0)
		return err;

	/*
	 * Reached once with the new name in dev->name, and a second time
	 * (via the goto below) with the old name restored when a notifier
	 * vetoed the change.
	 */
rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		return ret;
	}

	/* Unhash under the old name ... */
	write_lock_bh(&dev_base_lock);
	hlist_del(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	/* ... wait for RCU readers before the node is reused ... */
	synchronize_rcu();

	/* ... and rehash under whatever dev->name holds now. */
	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			/* First veto: remember it and undo the rename. */
			err = ret;
			memcpy(dev->name, oldname, IFNAMSIZ);
			goto rollback;
		} else {
			/* The rollback itself was vetoed; only log it. */
			printk(KERN_ERR
			       "%s: name change rollback failed: %d.\n",
			       dev->name, ret);
		}
	}

	return err;
}
1043
1044/**
1045 *	dev_set_alias - change ifalias of a device
1046 *	@dev: device
1047 *	@alias: name up to IFALIASZ
1048 *	@len: limit of bytes to copy from info
1049 *
1050 *	Set ifalias for a device,
1051 */
1052int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1053{
1054	ASSERT_RTNL();
1055
1056	if (len >= IFALIASZ)
1057		return -EINVAL;
1058
1059	if (!len) {
1060		if (dev->ifalias) {
1061			kfree(dev->ifalias);
1062			dev->ifalias = NULL;
1063		}
1064		return 0;
1065	}
1066
1067	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1068	if (!dev->ifalias)
1069		return -ENOMEM;
1070
1071	strlcpy(dev->ifalias, alias, len+1);
1072	return len;
1073}
1074
1075
/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features. Sends
 *	NETDEV_FEAT_CHANGE to the netdev notifier chain; the notifier
 *	result is discarded.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);
1087
1088/**
1089 *	netdev_state_change - device changes state
1090 *	@dev: device to cause notification
1091 *
1092 *	Called to indicate a device has changed state. This function calls
1093 *	the notifier chains for netdev_chain and sends a NEWLINK message
1094 *	to the routing socket.
1095 */
1096void netdev_state_change(struct net_device *dev)
1097{
1098	if (dev->flags & IFF_UP) {
1099		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1100		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1101	}
1102}
1103EXPORT_SYMBOL(netdev_state_change);
1104
/*
 * Send @event for @dev to the netdev notifier chain and return the
 * chain's result unchanged.
 */
int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	return call_netdevice_notifiers(event, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);
1110
1111/**
1112 *	dev_load 	- load a network module
1113 *	@net: the applicable net namespace
1114 *	@name: name of interface
1115 *
1116 *	If a network interface is not present and the process has suitable
1117 *	privileges this function loads the module. If module loading is not
1118 *	available in this kernel then it becomes a nop.
1119 */
1120
1121void dev_load(struct net *net, const char *name)
1122{
1123	struct net_device *dev;
1124
1125	rcu_read_lock();
1126	dev = dev_get_by_name_rcu(net, name);
1127	rcu_read_unlock();
1128
1129	if (!dev && capable(CAP_NET_ADMIN))
1130		request_module("%s", name);
1131}
1132EXPORT_SYMBOL(dev_load);
1133
1134static int __dev_open(struct net_device *dev)
1135{
1136	const struct net_device_ops *ops = dev->netdev_ops;
1137	int ret;
1138
1139	ASSERT_RTNL();
1140
1141	/*
1142	 *	Is it even present?
1143	 */
1144	if (!netif_device_present(dev))
1145		return -ENODEV;
1146
1147	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1148	ret = notifier_to_errno(ret);
1149	if (ret)
1150		return ret;
1151
1152	/*
1153	 *	Call device private open method
1154	 */
1155	set_bit(__LINK_STATE_START, &dev->state);
1156
1157	if (ops->ndo_validate_addr)
1158		ret = ops->ndo_validate_addr(dev);
1159
1160	if (!ret && ops->ndo_open)
1161		ret = ops->ndo_open(dev);
1162
1163	/*
1164	 *	If it went open OK then:
1165	 */
1166
1167	if (ret)
1168		clear_bit(__LINK_STATE_START, &dev->state);
1169	else {
1170		/*
1171		 *	Set the flags.
1172		 */
1173		dev->flags |= IFF_UP;
1174
1175		/*
1176		 *	Enable NET_DMA
1177		 */
1178		net_dmaengine_get();
1179
1180		/*
1181		 *	Initialize multicasting status
1182		 */
1183		dev_set_rx_mode(dev);
1184
1185		/*
1186		 *	Wakeup transmit queue engine
1187		 */
1188		dev_activate(dev);
1189	}
1190
1191	return ret;
1192}
1193
1194/**
1195 *	dev_open	- prepare an interface for use.
1196 *	@dev:	device to open
1197 *
1198 *	Takes a device from down to up state. The device's private open
1199 *	function is invoked and then the multicast lists are loaded. Finally
1200 *	the device is moved into the up state and a %NETDEV_UP message is
1201 *	sent to the netdev notifier chain.
1202 *
1203 *	Calling this function on an active interface is a nop. On a failure
1204 *	a negative errno code is returned.
1205 */
1206int dev_open(struct net_device *dev)
1207{
1208	int ret;
1209
1210	/*
1211	 *	Is it already up?
1212	 */
1213	if (dev->flags & IFF_UP)
1214		return 0;
1215
1216	/*
1217	 *	Open device
1218	 */
1219	ret = __dev_open(dev);
1220	if (ret < 0)
1221		return ret;
1222
1223	/*
1224	 *	... and announce new interface.
1225	 */
1226	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1227	call_netdevice_notifiers(NETDEV_UP, dev);
1228
1229	return ret;
1230}
1231EXPORT_SYMBOL(dev_open);
1232
/*
 * Take @dev down without announcing the final state. Must be called
 * with the RTNL held and may sleep. Sends NETDEV_GOING_DOWN, clears the
 * started bit, deactivates the transmit queues, calls the driver's
 * ndo_stop (if any), clears IFF_UP and releases NET_DMA. Always
 * returns 0.
 */
static int __dev_close(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	ASSERT_RTNL();
	might_sleep();

	/*
	 *	Tell people we are going down, so that they can
	 *	prepare for shutdown while the device is still operating.
	 */
	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

	clear_bit(__LINK_STATE_START, &dev->state);

	/* Synchronize to scheduled poll. We cannot touch poll list,
	 * it can be even on different cpu. So just clear netif_running().
	 *
	 * dev->stop() will invoke napi_disable() on all of its
	 * napi_struct instances on this device.
	 */
	smp_mb__after_clear_bit(); /* Commit netif_running(). */

	dev_deactivate(dev);

	/*
	 *	Call the device specific close. This cannot fail.
	 *	Only if device is UP
	 *
	 *	We allow it to be called even after a DETACH hot-plug
	 *	event.
	 */
	if (ops->ndo_stop)
		ops->ndo_stop(dev);

	/*
	 *	Device is now down.
	 */

	dev->flags &= ~IFF_UP;

	/*
	 *	Shutdown NET_DMA
	 */
	net_dmaengine_put();

	return 0;
}
1281
1282/**
1283 *	dev_close - shutdown an interface.
1284 *	@dev: device to shutdown
1285 *
1286 *	This function moves an active device into down state. A
1287 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1288 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1289 *	chain.
1290 */
1291int dev_close(struct net_device *dev)
1292{
1293	if (!(dev->flags & IFF_UP))
1294		return 0;
1295
1296	__dev_close(dev);
1297
1298	/*
1299	 * Tell people we are down
1300	 */
1301	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1302	call_netdevice_notifiers(NETDEV_DOWN, dev);
1303
1304	return 0;
1305}
1306EXPORT_SYMBOL(dev_close);
1307
1308
1309/**
1310 *	dev_disable_lro - disable Large Receive Offload on a device
1311 *	@dev: device
1312 *
1313 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1314 *	called under RTNL.  This is needed if received packets may be
1315 *	forwarded to another interface.
1316 */
1317void dev_disable_lro(struct net_device *dev)
1318{
1319	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1320	    dev->ethtool_ops->set_flags) {
1321		u32 flags = dev->ethtool_ops->get_flags(dev);
1322		if (flags & ETH_FLAG_LRO) {
1323			flags &= ~ETH_FLAG_LRO;
1324			dev->ethtool_ops->set_flags(dev, flags);
1325		}
1326	}
1327	WARN_ON(dev->features & NETIF_F_LRO);
1328}
1329EXPORT_SYMBOL(dev_disable_lro);
1330
1331
/*
 * Starts out non-zero. While it is set, register_netdevice_notifier()
 * does not replay REGISTER/UP events to a newly added notifier.
 * NOTE(review): presumably cleared once core network initialisation has
 * finished - confirm against the initialisation code in this file.
 */
static int dev_boot_phase = 1;
1333
1334/*
1335 *	Device change register/unregister. These are not inline or static
1336 *	as we export them to the world.
1337 */
1338
1339/**
1340 *	register_netdevice_notifier - register a network notifier block
1341 *	@nb: notifier
1342 *
1343 *	Register a notifier to be called when network device events occur.
1344 *	The notifier passed is linked into the kernel structures and must
1345 *	not be reused until it has been unregistered. A negative errno code
1346 *	is returned on a failure.
1347 *
1348 * 	When registered all registration and up events are replayed
1349 *	to the new notifier to allow device to have a race free
1350 *	view of the network device list.
1351 */
1352
1353int register_netdevice_notifier(struct notifier_block *nb)
1354{
1355	struct net_device *dev;
1356	struct net_device *last;
1357	struct net *net;
1358	int err;
1359
1360	rtnl_lock();
1361	err = raw_notifier_chain_register(&netdev_chain, nb);
1362	if (err)
1363		goto unlock;
1364	if (dev_boot_phase)
1365		goto unlock;
1366	for_each_net(net) {
1367		for_each_netdev(net, dev) {
1368			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1369			err = notifier_to_errno(err);
1370			if (err)
1371				goto rollback;
1372
1373			if (!(dev->flags & IFF_UP))
1374				continue;
1375
1376			nb->notifier_call(nb, NETDEV_UP, dev);
1377		}
1378	}
1379
1380unlock:
1381	rtnl_unlock();
1382	return err;
1383
1384rollback:
1385	last = dev;
1386	for_each_net(net) {
1387		for_each_netdev(net, dev) {
1388			if (dev == last)
1389				break;
1390
1391			if (dev->flags & IFF_UP) {
1392				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1393				nb->notifier_call(nb, NETDEV_DOWN, dev);
1394			}
1395			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1396			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1397		}
1398	}
1399
1400	raw_notifier_chain_unregister(&netdev_chain, nb);
1401	goto unlock;
1402}
1403EXPORT_SYMBOL(register_netdevice_notifier);
1404
/**
 *	unregister_netdevice_notifier - unregister a network notifier block
 *	@nb: notifier
 *
 *	Unregister a notifier previously registered by
 *	register_netdevice_notifier(). The notifier is unlinked from the
 *	kernel structures and may then be reused. A negative errno code
 *	is returned on a failure.
 *
 *	Takes the RTNL around the chain update.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
	int err;

	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);
1425
/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *      @val: value passed unmodified to notifier function
 *      @dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain(). The caller must hold the
 *	RTNL (checked with ASSERT_RTNL()).
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	ASSERT_RTNL();
	return raw_notifier_call_chain(&netdev_chain, val, dev);
}
1440
/* When > 0 there are consumers of rx skb time stamps */
static atomic_t netstamp_needed = ATOMIC_INIT(0);

/*
 * Register one more consumer of skb time stamps by incrementing
 * netstamp_needed.
 */
void net_enable_timestamp(void)
{
	atomic_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);
1449
/*
 * Drop one consumer of skb time stamps by decrementing netstamp_needed.
 */
void net_disable_timestamp(void)
{
	atomic_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);
1455
1456static inline void net_timestamp_set(struct sk_buff *skb)
1457{
1458	if (atomic_read(&netstamp_needed))
1459		__net_timestamp(skb);
1460	else
1461		skb->tstamp.tv64 = 0;
1462}
1463
1464static inline void net_timestamp_check(struct sk_buff *skb)
1465{
1466	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1467		__net_timestamp(skb);
1468}
1469
1470/**
1471 * dev_forward_skb - loopback an skb to another netif
1472 *
1473 * @dev: destination network device
1474 * @skb: buffer to forward
1475 *
1476 * return values:
1477 *	NET_RX_SUCCESS	(no congestion)
1478 *	NET_RX_DROP     (packet was dropped, but freed)
1479 *
1480 * dev_forward_skb can be used for injecting an skb from the
1481 * start_xmit function of one device into the receive queue
1482 * of another device.
1483 *
1484 * The receiving device may be in another namespace, so
1485 * we have to clear all information in the skb that could
1486 * impact namespace isolation.
1487 */
1488int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1489{
1490	skb_orphan(skb);
1491	nf_reset(skb);
1492
1493	if (!(dev->flags & IFF_UP) ||
1494	    (skb->len > (dev->mtu + dev->hard_header_len))) {
1495		kfree_skb(skb);
1496		return NET_RX_DROP;
1497	}
1498	skb_set_dev(skb, dev);
1499	skb->tstamp.tv64 = 0;
1500	skb->pkt_type = PACKET_HOST;
1501	skb->protocol = eth_type_trans(skb, dev);
1502	return netif_rx(skb);
1503}
1504EXPORT_SYMBOL_GPL(dev_forward_skb);
1505
1506/*
1507 *	Support routine. Sends outgoing frames to any network
1508 *	taps currently in use.
1509 */
1510
1511static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1512{
1513	struct packet_type *ptype;
1514
1515#ifdef CONFIG_NET_CLS_ACT
1516	if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
1517		net_timestamp_set(skb);
1518#else
1519	net_timestamp_set(skb);
1520#endif
1521
1522	rcu_read_lock();
1523	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1524		/* Never send packets back to the socket
1525		 * they originated from - MvS (miquels@drinkel.ow.org)
1526		 */
1527		if ((ptype->dev == dev || !ptype->dev) &&
1528		    (ptype->af_packet_priv == NULL ||
1529		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1530			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1531			if (!skb2)
1532				break;
1533
1534			/* skb->nh should be correctly
1535			   set by sender, so that the second statement is
1536			   just protection against buggy protocols.
1537			 */
1538			skb_reset_mac_header(skb2);
1539
1540			if (skb_network_header(skb2) < skb2->data ||
1541			    skb2->network_header > skb2->tail) {
1542				if (net_ratelimit())
1543					printk(KERN_CRIT "protocol %04x is "
1544					       "buggy, dev %s\n",
1545					       skb2->protocol, dev->name);
1546				skb_reset_network_header(skb2);
1547			}
1548
1549			skb2->transport_header = skb2->network_header;
1550			skb2->pkt_type = PACKET_OUTGOING;
1551			ptype->func(skb2, skb->dev, ptype, skb->dev);
1552		}
1553	}
1554	rcu_read_unlock();
1555}
1556
1557/*
1558 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1559 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1560 */
1561void netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1562{
1563	unsigned int real_num = dev->real_num_tx_queues;
1564
1565	if (unlikely(txq > dev->num_tx_queues))
1566		;
1567	else if (txq > real_num)
1568		dev->real_num_tx_queues = txq;
1569	else if (txq < real_num) {
1570		dev->real_num_tx_queues = txq;
1571		qdisc_reset_all_tx_gt(dev, txq);
1572	}
1573}
1574EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1575
/*
 * Append @q to the tail of this CPU's softnet_data output queue and
 * raise NET_TX_SOFTIRQ. Local interrupts are disabled around the list
 * update.
 */
static inline void __netif_reschedule(struct Qdisc *q)
{
	struct softnet_data *sd;
	unsigned long flags;

	local_irq_save(flags);
	sd = &__get_cpu_var(softnet_data);
	/* q becomes the new last element ... */
	q->next_sched = NULL;
	*sd->output_queue_tailp = q;
	/* ... and the tail pointer now refers to its link field. */
	sd->output_queue_tailp = &q->next_sched;
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}
1589
/*
 * Schedule @q for transmission unless its __QDISC_STATE_SCHED bit was
 * already set; the bit is set atomically by the test.
 */
void __netif_schedule(struct Qdisc *q)
{
	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
		__netif_reschedule(q);
}
EXPORT_SYMBOL(__netif_schedule);
1596
1597void dev_kfree_skb_irq(struct sk_buff *skb)
1598{
1599	if (atomic_dec_and_test(&skb->users)) {
1600		struct softnet_data *sd;
1601		unsigned long flags;
1602
1603		local_irq_save(flags);
1604		sd = &__get_cpu_var(softnet_data);
1605		skb->next = sd->completion_queue;
1606		sd->completion_queue = skb;
1607		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1608		local_irq_restore(flags);
1609	}
1610}
1611EXPORT_SYMBOL(dev_kfree_skb_irq);
1612
/*
 * Free @skb from any context: use dev_kfree_skb_irq() when in hard
 * interrupt context or with interrupts disabled, dev_kfree_skb()
 * otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (!in_irq() && !irqs_disabled())
		dev_kfree_skb(skb);
	else
		dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1621
1622
1623/**
1624 * netif_device_detach - mark device as removed
1625 * @dev: network device
1626 *
1627 * Mark device as removed from system and therefore no longer available.
1628 */
1629void netif_device_detach(struct net_device *dev)
1630{
1631	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1632	    netif_running(dev)) {
1633		netif_tx_stop_all_queues(dev);
1634	}
1635}
1636EXPORT_SYMBOL(netif_device_detach);
1637
1638/**
1639 * netif_device_attach - mark device as attached
1640 * @dev: network device
1641 *
1642 * Mark device as attached from system and restart if needed.
1643 */
1644void netif_device_attach(struct net_device *dev)
1645{
1646	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1647	    netif_running(dev)) {
1648		netif_tx_wake_all_queues(dev);
1649		__netdev_watchdog_up(dev);
1650	}
1651}
1652EXPORT_SYMBOL(netif_device_attach);
1653
1654static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1655{
1656	return ((features & NETIF_F_GEN_CSUM) ||
1657		((features & NETIF_F_IP_CSUM) &&
1658		 protocol == htons(ETH_P_IP)) ||
1659		((features & NETIF_F_IPV6_CSUM) &&
1660		 protocol == htons(ETH_P_IPV6)) ||
1661		((features & NETIF_F_FCOE_CRC) &&
1662		 protocol == htons(ETH_P_FCOE)));
1663}
1664
1665static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1666{
1667	if (can_checksum_protocol(dev->features, skb->protocol))
1668		return true;
1669
1670	if (skb->protocol == htons(ETH_P_8021Q)) {
1671		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1672		if (can_checksum_protocol(dev->features & dev->vlan_features,
1673					  veh->h_vlan_encapsulated_proto))
1674			return true;
1675	}
1676
1677	return false;
1678}
1679
/**
 * skb_set_dev - assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * If an skb is owned by a device already, we have to reset
 * all data private to the namespace a device belongs to
 * before assigning it a new device.
 *
 * The dst reference is always dropped. The remaining per-namespace
 * state is cleared only when the skb already has a device and that
 * device lives in a different namespace than @dev.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
	skb_dst_drop(skb);
	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
		/* Crossing namespaces: wipe state tied to the old one. */
		secpath_reset(skb);
		nf_reset(skb);
		skb_init_secmark(skb);
		skb->mark = 0;
		skb->priority = 0;
		skb->nf_trace = 0;
		skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
		skb->tc_index = 0;
#endif
	}
	skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */
1709
/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 *
 * Returns 0 on success, or the error from pskb_expand_head() when a
 * private copy of the header was needed and could not be made; in that
 * case ip_summed is left unchanged. On every other path ip_summed ends
 * up as CHECKSUM_NONE.
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	/* Nothing to compute here, just invalidate. */
	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	if (unlikely(skb_shinfo(skb)->gso_size)) {
		/* Let GSO fix up the checksum. */
		goto out_set_summed;
	}

	/* csum_start is relative to skb->head; make it relative to data. */
	offset = skb->csum_start - skb_headroom(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	/* Position of the checksum field itself. */
	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	/* Get a private header copy if the field is not writable. */
	if (skb_cloned(skb) &&
	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}
EXPORT_SYMBOL(skb_checksum_help);
1748
/**
 *	skb_gso_segment - Perform segmentation on skb.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *
 *	This function segments the given skb and returns a list of segments.
 *
 *	It may return NULL if the skb requires no segmentation.  This is
 *	only possible when GSO is used for verifying header integrity.
 *
 *	An ERR_PTR() is returned on failure; -EPROTONOSUPPORT when no
 *	handler with a gso_segment callback is registered for the skb's
 *	protocol.
 */
struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
{
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_type *ptype;
	__be16 type = skb->protocol;
	int err;

	/* Treat the current data pointer as the MAC header and step past it. */
	skb_reset_mac_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;
	__skb_pull(skb, skb->mac_len);

	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
		struct net_device *dev = skb->dev;
		struct ethtool_drvinfo info = {};

		/* Collect the driver name for the warning below. */
		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
			dev->ethtool_ops->get_drvinfo(dev, &info);

		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
			"ip_summed=%d",
		     info.driver, dev ? dev->features : 0L,
		     skb->sk ? skb->sk->sk_route_caps : 0L,
		     skb->len, skb->data_len, skb->ip_summed);

		/* Make sure the header is private before it gets touched. */
		if (skb_header_cloned(skb) &&
		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
			return ERR_PTR(err);
	}

	rcu_read_lock();
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
				err = ptype->gso_send_check(skb);
				segs = ERR_PTR(err);
				/* Done on error, or if no segmentation is needed. */
				if (err || skb_gso_ok(skb, features))
					break;
				/* Move data back to the network header. */
				__skb_push(skb, (skb->data -
						 skb_network_header(skb)));
			}
			segs = ptype->gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	/* Restore skb->data to the MAC header. */
	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}
EXPORT_SYMBOL(skb_gso_segment);
1811
/*
 * Take action when hardware reception checksum errors are detected:
 * log a rate-limited error naming the device (or "<unknown>" when @dev
 * is NULL) and dump the stack.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	printk(KERN_ERR "%s: hw csum failure.\n",
		dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1824
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */

/*
 * Return 1 if @skb has a page fragment that @dev cannot DMA from,
 * 0 otherwise. Always 0 when CONFIG_HIGHMEM is not set.
 */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int i;
	/* Device without HIGHDMA: any highmem fragment is illegal. */
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
			if (PageHighMem(skb_shinfo(skb)->frags[i].page))
				return 1;
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *pdev = dev->dev.parent;

		/* No parent device to take a DMA mask from. */
		if (!pdev)
			return 0;
		/* Every fragment page must end below the parent's DMA mask. */
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}
1854
/*
 * Per-skb state kept in skb->cb while software GSO is in progress:
 * the skb's original destructor, saved by dev_gso_segment() when it
 * installs dev_gso_skb_destructor().
 */
struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

/* View the control buffer of @skb as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1860
/*
 * Destructor installed by dev_gso_segment(): free every segment still
 * chained on skb->next, then call the skb's original destructor if it
 * had one.
 * NOTE(review): the do/while dereferences skb->next unconditionally, so
 * it assumes at least one segment is still chained - confirm callers
 * restore the original destructor once the list is empty.
 */
static void dev_gso_skb_destructor(struct sk_buff *skb)
{
	struct dev_gso_cb *cb;

	do {
		struct sk_buff *nskb = skb->next;

		/* Unlink the first segment and free it. */
		skb->next = nskb->next;
		nskb->next = NULL;
		kfree_skb(nskb);
	} while (skb->next);

	cb = DEV_GSO_CB(skb);
	if (cb->destructor)
		cb->destructor(skb);
}
1877
1878/**
1879 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1880 *	@skb: buffer to segment
1881 *
1882 *	This function segments the given skb and stores the list of segments
1883 *	in skb->next.
1884 */
1885static int dev_gso_segment(struct sk_buff *skb)
1886{
1887	struct net_device *dev = skb->dev;
1888	struct sk_buff *segs;
1889	int features = dev->features & ~(illegal_highdma(dev, skb) ?
1890					 NETIF_F_SG : 0);
1891
1892	segs = skb_gso_segment(skb, features);
1893
1894	/* Verifying header integrity only. */
1895	if (!segs)
1896		return 0;
1897
1898	if (IS_ERR(segs))
1899		return PTR_ERR(segs);
1900
1901	skb->next = segs;
1902	DEV_GSO_CB(skb)->destructor = skb->destructor;
1903	skb->destructor = dev_gso_skb_destructor;
1904
1905	return 0;
1906}
1907
1908/*
1909 * Try to orphan skb early, right before transmission by the device.
1910 * We cannot orphan skb if tx timestamp is requested, since
1911 * drivers need to call skb_tstamp_tx() to send the timestamp.
1912 */
1913static inline void skb_orphan_try(struct sk_buff *skb)
1914{
1915	struct sock *sk = skb->sk;
1916
1917	if (sk && !skb_tx(skb)->flags) {
1918		/* skb_tx_hash() wont be able to get sk.
1919		 * We copy sk_hash into skb->rxhash
1920		 */
1921		if (!skb->rxhash)
1922			skb->rxhash = sk->sk_hash;
1923		skb_orphan(skb);
1924	}
1925}
1926
/*
 * Hand @skb to the driver of @dev on queue @txq.
 *
 * An skb without a chained segment list is passed to taps, optionally
 * orphaned and software-segmented, then given to ndo_start_xmit(). An
 * skb with a segment list on skb->next (from dev_gso_segment()) has its
 * segments transmitted one by one. Returns the driver's return code,
 * or NETDEV_TX_BUSY when the queue stopped with segments remaining.
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
			struct netdev_queue *txq)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int rc = NETDEV_TX_OK;

	if (likely(!skb->next)) {
		/* Feed a copy to any registered taps. */
		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(skb, dev);

		/*
		 * If device doesnt need skb->dst, release it right now while
		 * its hot in this cpu cache
		 */
		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
			skb_dst_drop(skb);

		skb_orphan_try(skb);

		if (netif_needs_gso(dev, skb)) {
			if (unlikely(dev_gso_segment(skb)))
				goto out_kfree_skb;
			/* Segments were produced: send them individually. */
			if (skb->next)
				goto gso;
		}

		rc = ops->ndo_start_xmit(skb, dev);
		if (rc == NETDEV_TX_OK)
			txq_trans_update(txq);
		return rc;
	}

gso:
	do {
		struct sk_buff *nskb = skb->next;

		/* Detach the first segment from the list. */
		skb->next = nskb->next;
		nskb->next = NULL;

		/*
		 * If device doesnt need nskb->dst, release it right now while
		 * its hot in this cpu cache
		 */
		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
			skb_dst_drop(nskb);

		rc = ops->ndo_start_xmit(nskb, dev);
		if (unlikely(rc != NETDEV_TX_OK)) {
			/* Codes outside NETDEV_TX_MASK: give up on the rest. */
			if (rc & ~NETDEV_TX_MASK)
				goto out_kfree_gso_skb;
			/* Otherwise put the segment back for a later retry. */
			nskb->next = skb->next;
			skb->next = nskb;
			return rc;
		}
		txq_trans_update(txq);
		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
			return NETDEV_TX_BUSY;
	} while (skb->next);

out_kfree_gso_skb:
	/* All segments gone: restore the original destructor first. */
	if (likely(skb->next == NULL))
		skb->destructor = DEV_GSO_CB(skb)->destructor;
out_kfree_skb:
	kfree_skb(skb);
	return rc;
}
1993
1994static u32 hashrnd __read_mostly;
1995
1996u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
1997{
1998	u32 hash;
1999
2000	if (skb_rx_queue_recorded(skb)) {
2001		hash = skb_get_rx_queue(skb);
2002		while (unlikely(hash >= dev->real_num_tx_queues))
2003			hash -= dev->real_num_tx_queues;
2004		return hash;
2005	}
2006
2007	if (skb->sk && skb->sk->sk_hash)
2008		hash = skb->sk->sk_hash;
2009	else
2010		hash = (__force u16) skb->protocol ^ skb->rxhash;
2011	hash = jhash_1word(hash, hashrnd);
2012
2013	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
2014}
2015EXPORT_SYMBOL(skb_tx_hash);
2016
2017static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2018{
2019	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2020		if (net_ratelimit()) {
2021			pr_warning("%s selects TX queue %d, but "
2022				"real number of TX queues is %d\n",
2023				dev->name, queue_index, dev->real_num_tx_queues);
2024		}
2025		return 0;
2026	}
2027	return queue_index;
2028}
2029
/*
 * Choose the transmit queue of @dev for @skb, record the choice in the
 * skb's queue mapping and return the queue.
 *
 * A queue index obtained from the socket via sk_tx_queue_get() is used
 * when it is non-negative. Otherwise the driver's ndo_select_queue()
 * decides (capped by dev_cap_txqueue()), or, without that hook,
 * skb_tx_hash() when the device has more than one real queue.
 */
static struct netdev_queue *dev_pick_tx(struct net_device *dev,
					struct sk_buff *skb)
{
	int queue_index;
	struct sock *sk = skb->sk;

	queue_index = sk_tx_queue_get(sk);
	if (queue_index < 0) {
		const struct net_device_ops *ops = dev->netdev_ops;

		if (ops->ndo_select_queue) {
			queue_index = ops->ndo_select_queue(dev, skb);
			queue_index = dev_cap_txqueue(dev, queue_index);
		} else {
			queue_index = 0;
			if (dev->real_num_tx_queues > 1)
				queue_index = skb_tx_hash(dev, skb);

			if (sk) {
				struct dst_entry *dst = rcu_dereference_check(sk->sk_dst_cache, 1);

				/*
				 * Remember the choice on the socket only
				 * while the skb still uses the socket's
				 * cached route.
				 */
				if (dst && skb_dst(skb) == dst)
					sk_tx_queue_set(sk, queue_index);
			}
		}
	}

	skb_set_queue_mapping(skb, queue_index);
	return netdev_get_tx_queue(dev, queue_index);
}
2060
/*
 * Send @skb through qdisc @q under the qdisc's root lock.
 *
 * Returns NET_XMIT_DROP (skb freed) if the qdisc has been deactivated,
 * NET_XMIT_SUCCESS if the skb took the direct-transmit bypass, and the
 * result of qdisc_enqueue_root() otherwise.
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	int rc;

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		kfree_skb(skb);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   !test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */
		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
			skb_dst_force(skb);
		__qdisc_update_bstats(q, skb->len);
		/* Non-zero from sch_direct_xmit(): keep running the qdisc. */
		if (sch_direct_xmit(skb, q, dev, txq, root_lock))
			__qdisc_run(q);
		else
			clear_bit(__QDISC_STATE_RUNNING, &q->state);

		rc = NET_XMIT_SUCCESS;
	} else {
		/* Normal path: enqueue, then try to run the qdisc. */
		skb_dst_force(skb);
		rc = qdisc_enqueue_root(skb, q);
		qdisc_run(q);
	}
	spin_unlock(root_lock);

	return rc;
}
2097
2098/*
2099 * Returns true if either:
2100 *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2101 *	2. skb is fragmented and the device does not support SG, or if
2102 *	   at least one of fragments is in highmem and device does not
2103 *	   support DMA from it.
2104 */
2105static inline int skb_needs_linearize(struct sk_buff *skb,
2106				      struct net_device *dev)
2107{
2108	return (skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
2109	       (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
2110					      illegal_highdma(dev, skb)));
2111}
2112
/**
 *	dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 */
int dev_queue_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	/* Returned as-is if linearizing or checksumming below fails. */
	int rc = -ENOMEM;

	/* GSO will handle the following emulations directly. */
	if (netif_needs_gso(dev, skb))
		goto gso;

	/* Convert a paged skb to linear, if required */
	if (skb_needs_linearize(skb, dev) && __skb_linearize(skb))
		goto out_kfree_skb;

	/* If packet is not checksummed and device does not support
	 * checksumming for this protocol, complete checksumming here.
	 */
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		skb_set_transport_header(skb, skb->csum_start -
					      skb_headroom(skb));
		if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
			goto out_kfree_skb;
	}

gso:
	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	txq = dev_pick_tx(dev, skb);
	q = rcu_dereference_bh(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
	/* The qdisc has an enqueue method: let __dev_xmit_skb() handle it. */
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all the sorts of tunnels...

	   Really, it is unlikely that netif_tx_lock protection is necessary
	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	   counters.)
	   However, it is possible, that they rely on protection
	   made by us here.

	   Check this and shot the lock. It is not prone to deadlocks.
	   Either shot noqueue qdisc, it is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		/* Only transmit if this CPU is not already the recorded
		 * owner of this queue's xmit lock; otherwise we recursed.
		 */
		if (txq->xmit_lock_owner != cpu) {

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_tx_queue_stopped(txq)) {
				rc = dev_hard_start_xmit(skb, dev, txq);
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			/* Queue was stopped or the transmit did not complete;
			 * there is no queue to fall back on, so complain and
			 * drop the skb below.
			 */
			if (net_ratelimit())
				printk(KERN_CRIT "Virtual device %s asks to "
				       "queue packet!\n", dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately */
			if (net_ratelimit())
				printk(KERN_CRIT "Dead loop on virtual device "
				       "%s, fix it urgently!\n", dev->name);
		}
	}

	/* Device down, or the queueless transmit above failed. */
	rc = -ENETDOWN;
	rcu_read_unlock_bh();

	/* Failure exit: entered without the RCU-bh read lock held; frees skb. */
out_kfree_skb:
	kfree_skb(skb);
	return rc;
	/* Normal exit: the skb was handed on; only drop the RCU-bh read lock. */
out:
	rcu_read_unlock_bh();
	return rc;
}
2229EXPORT_SYMBOL(dev_queue_xmit);
2230
2231
2232/*=======================================================================
2233			Receiver routines
2234  =======================================================================*/
2235
/* Length limit for a per-CPU input_pkt_queue; checked in enqueue_to_backlog(). */
int netdev_max_backlog __read_mostly = 1000;
/*
 * Non-zero: netif_rx()/netif_receive_skb() call net_timestamp_check() before
 * queueing; zero: __netif_receive_skb() does it instead.
 */
int netdev_tstamp_prequeue __read_mostly = 1;
/* NOTE(review): not referenced in this part of the file; presumably the
 * packet budget for one NET_RX softirq run -- confirm against net_rx_action().
 */
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */
2240
/*
 * Append @napi to the tail of @sd's poll_list and raise NET_RX_SOFTIRQ.
 * Called with irq disabled (hence the _irqoff softirq variant).
 */
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
2248
2249#ifdef CONFIG_RPS
2250
2251/* One global table that all flow-based protocols share. */
2252struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
2253EXPORT_SYMBOL(rps_sock_flow_table);
2254
/*
 * get_rps_cpu is called from netif_receive_skb and returns the target
 * CPU from the RPS map of the receiving queue for a given skb.
 * rcu_read_lock must be held on entry.
 *
 * Returns the chosen CPU number, or -1 when no steering applies (no RPS
 * configuration on the queue, unsupported protocol, header not pullable,
 * or the selected CPU is offline).
 *
 * @rflowp is only written when the CPU was picked through the per-queue
 * flow table; in every other case the caller's value is left untouched.
 *
 * Side effect: skb->rxhash is filled in if it was zero and a hash could be
 * computed.
 */
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
		       struct rps_dev_flow **rflowp)
{
	struct ipv6hdr *ip6;
	struct iphdr *ip;
	struct netdev_rx_queue *rxqueue;
	struct rps_map *map;
	struct rps_dev_flow_table *flow_table;
	struct rps_sock_flow_table *sock_flow_table;
	int cpu = -1;
	u8 ip_proto;
	u16 tcpu;
	u32 addr1, addr2, ihl;
	union {
		u32 v32;
		u16 v16[2];
	} ports;

	/* Locate the receive queue: the recorded one if valid, else queue 0. */
	if (skb_rx_queue_recorded(skb)) {
		u16 index = skb_get_rx_queue(skb);
		if (unlikely(index >= dev->num_rx_queues)) {
			WARN_ONCE(dev->num_rx_queues > 1, "%s received packet "
				"on queue %u, but number of RX queues is %u\n",
				dev->name, index, dev->num_rx_queues);
			goto done;
		}
		rxqueue = dev->_rx + index;
	} else
		rxqueue = dev->_rx;

	/* Nothing configured on this queue: no steering. */
	if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
		goto done;

	if (skb->rxhash)
		goto got_hash; /* Skip hash computation on packet header */

	/* Extract addresses, L4 protocol and header length (in 32-bit words). */
	switch (skb->protocol) {
	case __constant_htons(ETH_P_IP):
		if (!pskb_may_pull(skb, sizeof(*ip)))
			goto done;

		ip = (struct iphdr *) skb->data;
		ip_proto = ip->protocol;
		addr1 = (__force u32) ip->saddr;
		addr2 = (__force u32) ip->daddr;
		ihl = ip->ihl;
		break;
	case __constant_htons(ETH_P_IPV6):
		if (!pskb_may_pull(skb, sizeof(*ip6)))
			goto done;

		ip6 = (struct ipv6hdr *) skb->data;
		ip_proto = ip6->nexthdr;
		/* Only the low 32 bits of each IPv6 address enter the hash. */
		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
		/* 40-byte fixed header; nexthdr is used as-is, extension
		 * headers are not walked here.
		 */
		ihl = (40 >> 2);
		break;
	default:
		goto done;
	}
	switch (ip_proto) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_DCCP:
	case IPPROTO_ESP:
	case IPPROTO_AH:
	case IPPROTO_SCTP:
	case IPPROTO_UDPLITE:
		if (pskb_may_pull(skb, (ihl * 4) + 4)) {
			/* First 4 bytes after the network header, ordered so
			 * both flow directions yield the same value.
			 */
			ports.v32 = * (__force u32 *) (skb->data + (ihl * 4));
			if (ports.v16[1] < ports.v16[0])
				swap(ports.v16[0], ports.v16[1]);
			break;
		}
		/* fall through: the 4 bytes are not available, hash without them */
	default:
		ports.v32 = 0;
		break;
	}

	/* get a consistent hash (same value on both flow directions) */
	if (addr2 < addr1)
		swap(addr1, addr2);
	skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
	/* Zero means "no hash yet" (see the check above), so never store it. */
	if (!skb->rxhash)
		skb->rxhash = 1;

got_hash:
	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	sock_flow_table = rcu_dereference(rps_sock_flow_table);
	if (flow_table && sock_flow_table) {
		u16 next_cpu;
		struct rps_dev_flow *rflow;

		/* CPU currently recorded for this flow on this rx queue. */
		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
		tcpu = rflow->cpu;

		/* CPU recorded in the global socket flow table. */
		next_cpu = sock_flow_table->ents[skb->rxhash &
		    sock_flow_table->mask];

		/*
		 * If the desired CPU (where last recvmsg was done) is
		 * different from current CPU (one in the rx-queue flow
		 * table entry), switch if one of the following holds:
		 *   - Current CPU is unset (equal to RPS_NO_CPU).
		 *   - Current CPU is offline.
		 *   - The current CPU's queue tail has advanced beyond the
		 *     last packet that was enqueued using this table entry.
		 *     This guarantees that all previous packets for the flow
		 *     have been dequeued, thus preserving in order delivery.
		 */
		if (unlikely(tcpu != next_cpu) &&
		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
		      rflow->last_qtail)) >= 0)) {
			tcpu = rflow->cpu = next_cpu;
			if (tcpu != RPS_NO_CPU)
				rflow->last_qtail = per_cpu(softnet_data,
				    tcpu).input_queue_head;
		}
		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
			*rflowp = rflow;
			cpu = tcpu;
			goto done;
		}
	}

	/* Fall back to the static per-queue CPU map. */
	map = rcu_dereference(rxqueue->rps_map);
	if (map) {
		/* Scale the 32-bit hash into the index range [0, map->len). */
		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];

		if (cpu_online(tcpu)) {
			cpu = tcpu;
			goto done;
		}
	}

done:
	return cpu;
}
2399
2400/* Called from hardirq (IPI) context */
2401static void rps_trigger_softirq(void *data)
2402{
2403	struct softnet_data *sd = data;
2404
2405	____napi_schedule(sd, &sd->backlog);
2406	sd->received_rps++;
2407}
2408
2409#endif /* CONFIG_RPS */
2410
/*
 * Check whether @sd belongs to a CPU other than the current one.
 * If so, push it onto this CPU's pending IPI list, raise NET_RX_SOFTIRQ
 * and return 1. Otherwise (or when CONFIG_RPS is off) return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local = &__get_cpu_var(softnet_data);

	if (sd == local)
		return 0;

	/* Link @sd in at the head of our rps_ipi_list. */
	sd->rps_ipi_next = local->rps_ipi_list;
	local->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
2431
2432/*
2433 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2434 * queue (may be a remote CPU queue).
2435 */
2436static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2437			      unsigned int *qtail)
2438{
2439	struct softnet_data *sd;
2440	unsigned long flags;
2441
2442	sd = &per_cpu(softnet_data, cpu);
2443
2444	local_irq_save(flags);
2445
2446	rps_lock(sd);
2447	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2448		if (skb_queue_len(&sd->input_pkt_queue)) {
2449enqueue:
2450			__skb_queue_tail(&sd->input_pkt_queue, skb);
2451			input_queue_tail_incr_save(sd, qtail);
2452			rps_unlock(sd);
2453			local_irq_restore(flags);
2454			return NET_RX_SUCCESS;
2455		}
2456
2457		/* Schedule NAPI for backlog device
2458		 * We can use non atomic operation since we own the queue lock
2459		 */
2460		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2461			if (!rps_ipi_queued(sd))
2462				____napi_schedule(sd, &sd->backlog);
2463		}
2464		goto enqueue;
2465	}
2466
2467	sd->dropped++;
2468	rps_unlock(sd);
2469
2470	local_irq_restore(flags);
2471
2472	kfree_skb(skb);
2473	return NET_RX_DROP;
2474}
2475
2476/**
2477 *	netif_rx	-	post buffer to the network code
2478 *	@skb: buffer to post
2479 *
2480 *	This function receives a packet from a device driver and queues it for
2481 *	the upper (protocol) levels to process.  It always succeeds. The buffer
2482 *	may be dropped during processing for congestion control or by the
2483 *	protocol layers.
2484 *
2485 *	return values:
2486 *	NET_RX_SUCCESS	(no congestion)
2487 *	NET_RX_DROP     (packet was dropped)
2488 *
2489 */
2490
2491int netif_rx(struct sk_buff *skb)
2492{
2493	int ret;
2494
2495	/* if netpoll wants it, pretend we never saw it */
2496	if (netpoll_rx(skb))
2497		return NET_RX_DROP;
2498
2499	if (netdev_tstamp_prequeue)
2500		net_timestamp_check(skb);
2501
2502#ifdef CONFIG_RPS
2503	{
2504		struct rps_dev_flow voidflow, *rflow = &voidflow;
2505		int cpu;
2506
2507		rcu_read_lock();
2508
2509		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2510		if (cpu < 0)
2511			cpu = smp_processor_id();
2512
2513		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2514
2515		rcu_read_unlock();
2516	}
2517#else
2518	{
2519		unsigned int qtail;
2520		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2521		put_cpu();
2522	}
2523#endif
2524	return ret;
2525}
2526EXPORT_SYMBOL(netif_rx);
2527
/*
 * netif_rx_ni - netif_rx() wrapper that also runs pending softirqs
 * @skb: buffer to post
 *
 * Calls netif_rx() with preemption disabled and, if any softirq is
 * pending afterwards, runs do_softirq() before re-enabling preemption.
 * Returns whatever netif_rx() returned.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int status;

	preempt_disable();

	status = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return status;
}
2540EXPORT_SYMBOL(netif_rx_ni);
2541
/*
 * Softirq handler for the transmit side. Works on this CPU's softnet_data
 * in two steps:
 *   1. free every skb parked on the completion queue;
 *   2. run (or reschedule) every qdisc parked on the output queue.
 * Both lists are detached with local irqs disabled and then processed with
 * irqs enabled.
 */
static void net_tx_action(struct softirq_action *h)
{
	struct softnet_data *sd = &__get_cpu_var(softnet_data);

	if (sd->completion_queue) {
		struct sk_buff *clist;

		/* Steal the whole list atomically w.r.t. local interrupts. */
		local_irq_disable();
		clist = sd->completion_queue;
		sd->completion_queue = NULL;
		local_irq_enable();

		while (clist) {
			struct sk_buff *skb = clist;
			clist = clist->next;

			/* Anything queued here is expected to have no users left. */
			WARN_ON(atomic_read(&skb->users));
			__kfree_skb(skb);
		}
	}

	if (sd->output_queue) {
		struct Qdisc *head;

		/* Detach the list and reset the tail pointer to the empty state. */
		local_irq_disable();
		head = sd->output_queue;
		sd->output_queue = NULL;
		sd->output_queue_tailp = &sd->output_queue;
		local_irq_enable();

		while (head) {
			struct Qdisc *q = head;
			spinlock_t *root_lock;

			/* Advance first: q may be re-linked by __netif_reschedule(). */
			head = head->next_sched;

			root_lock = qdisc_lock(q);
			if (spin_trylock(root_lock)) {
				/* Got the lock: clear SCHED, then run the qdisc. */
				smp_mb__before_clear_bit();
				clear_bit(__QDISC_STATE_SCHED,
					  &q->state);
				qdisc_run(q);
				spin_unlock(root_lock);
			} else {
				/* Lock is contended: retry later unless the
				 * qdisc has been deactivated, in which case
				 * just drop the SCHED mark.
				 */
				if (!test_bit(__QDISC_STATE_DEACTIVATED,
					      &q->state)) {
					__netif_reschedule(q);
				} else {
					smp_mb__before_clear_bit();
					clear_bit(__QDISC_STATE_SCHED,
						  &q->state);
				}
			}
		}
	}
}
2598
2599static inline int deliver_skb(struct sk_buff *skb,
2600			      struct packet_type *pt_prev,
2601			      struct net_device *orig_dev)
2602{
2603	atomic_inc(&skb->users);
2604	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2605}
2606
2607#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
2608
2609#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
2610/* This hook is defined here for ATM LANE */
2611int (*br_fdb_test_addr_hook)(struct net_device *dev,
2612			     unsigned char *addr) __read_mostly;
2613EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2614#endif
2615
2616/*
2617 * If bridge module is loaded call bridging hook.
2618 *  returns NULL if packet was consumed.
2619 */
2620struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
2621					struct sk_buff *skb) __read_mostly;
2622EXPORT_SYMBOL_GPL(br_handle_frame_hook);
2623
2624static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
2625					    struct packet_type **pt_prev, int *ret,
2626					    struct net_device *orig_dev)
2627{
2628	struct net_bridge_port *port;
2629
2630	if (skb->pkt_type == PACKET_LOOPBACK ||
2631	    (port = rcu_dereference(skb->dev->br_port)) == NULL)
2632		return skb;
2633
2634	if (*pt_prev) {
2635		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2636		*pt_prev = NULL;
2637	}
2638
2639	return br_handle_frame_hook(port, skb);
2640}
2641#else
2642#define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
2643#endif
2644
2645#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
2646struct sk_buff *(*macvlan_handle_frame_hook)(struct macvlan_port *p,
2647					     struct sk_buff *skb) __read_mostly;
2648EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
2649
2650static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
2651					     struct packet_type **pt_prev,
2652					     int *ret,
2653					     struct net_device *orig_dev)
2654{
2655	struct macvlan_port *port;
2656
2657	port = rcu_dereference(skb->dev->macvlan_port);
2658	if (!port)
2659		return skb;
2660
2661	if (*pt_prev) {
2662		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2663		*pt_prev = NULL;
2664	}
2665	return macvlan_handle_frame_hook(port, skb);
2666}
2667#else
2668#define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
2669#endif
2670
2671#ifdef CONFIG_NET_CLS_ACT
2672/* TODO: Maybe we should just force sch_ingress to be compiled in
2673 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2674 * a compare and 2 stores extra right now if we dont have it on
2675 * but have CONFIG_NET_CLS_ACT
2676 * NOTE: This doesnt stop any functionality; if you dont have
2677 * the ingress scheduler, you just cant add policies on ingress.
2678 *
2679 */
2680static int ing_filter(struct sk_buff *skb)
2681{
2682	struct net_device *dev = skb->dev;
2683	u32 ttl = G_TC_RTTL(skb->tc_verd);
2684	struct netdev_queue *rxq;
2685	int result = TC_ACT_OK;
2686	struct Qdisc *q;
2687
2688	if (MAX_RED_LOOP < ttl++) {
2689		printk(KERN_WARNING
2690		       "Redir loop detected Dropping packet (%d->%d)\n",
2691		       skb->skb_iif, dev->ifindex);
2692		return TC_ACT_SHOT;
2693	}
2694
2695	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2696	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2697
2698	rxq = &dev->rx_queue;
2699
2700	q = rxq->qdisc;
2701	if (q != &noop_qdisc) {
2702		spin_lock(qdisc_lock(q));
2703		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2704			result = qdisc_enqueue_root(skb, q);
2705		spin_unlock(qdisc_lock(q));
2706	}
2707
2708	return result;
2709}
2710
2711static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2712					 struct packet_type **pt_prev,
2713					 int *ret, struct net_device *orig_dev)
2714{
2715	if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2716		goto out;
2717
2718	if (*pt_prev) {
2719		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2720		*pt_prev = NULL;
2721	} else {
2722		/* Huh? Why does turning on AF_PACKET affect this? */
2723		skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2724	}
2725
2726	switch (ing_filter(skb)) {
2727	case TC_ACT_SHOT:
2728	case TC_ACT_STOLEN:
2729		kfree_skb(skb);
2730		return NULL;
2731	}
2732
2733out:
2734	skb->tc_verd = 0;
2735	return skb;
2736}
2737#endif
2738
2739/*
2740 * 	netif_nit_deliver - deliver received packets to network taps
2741 * 	@skb: buffer
2742 *
2743 * 	This function is used to deliver incoming packets to network
2744 * 	taps. It should be used when the normal netif_receive_skb path
2745 * 	is bypassed, for example because of VLAN acceleration.
2746 */
2747void netif_nit_deliver(struct sk_buff *skb)
2748{
2749	struct packet_type *ptype;
2750
2751	if (list_empty(&ptype_all))
2752		return;
2753
2754	skb_reset_network_header(skb);
2755	skb_reset_transport_header(skb);
2756	skb->mac_len = skb->network_header - skb->mac_header;
2757
2758	rcu_read_lock();
2759	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2760		if (!ptype->dev || ptype->dev == skb->dev)
2761			deliver_skb(skb, ptype, skb->dev);
2762	}
2763	rcu_read_unlock();
2764}
2765
2766static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2767					      struct net_device *master)
2768{
2769	if (skb->pkt_type == PACKET_HOST) {
2770		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2771
2772		memcpy(dest, master->dev_addr, ETH_ALEN);
2773	}
2774}
2775
2776/* On bonding slaves other than the currently active slave, suppress
2777 * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2778 * ARP on active-backup slaves with arp_validate enabled.
2779 */
2780int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2781{
2782	struct net_device *dev = skb->dev;
2783
2784	if (master->priv_flags & IFF_MASTER_ARPMON)
2785		dev->last_rx = jiffies;
2786
2787	if ((master->priv_flags & IFF_MASTER_ALB) && master->br_port) {
2788		/* Do address unmangle. The local destination address
2789		 * will be always the one master has. Provides the right
2790		 * functionality in a bridge.
2791		 */
2792		skb_bond_set_mac_by_master(skb, master);
2793	}
2794
2795	if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
2796		if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
2797		    skb->protocol == __cpu_to_be16(ETH_P_ARP))
2798			return 0;
2799
2800		if (master->priv_flags & IFF_MASTER_ALB) {
2801			if (skb->pkt_type != PACKET_BROADCAST &&
2802			    skb->pkt_type != PACKET_MULTICAST)
2803				return 0;
2804		}
2805		if (master->priv_flags & IFF_MASTER_8023AD &&
2806		    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
2807			return 0;
2808
2809		return 1;
2810	}
2811	return 0;
2812}
2813EXPORT_SYMBOL(__skb_bond_should_drop);
2814
/*
 * Core receive path: deliver @skb to taps (ptype_all), run the optional
 * ingress/bridge/macvlan hooks, then deliver to the protocol handlers
 * registered for skb->protocol.
 *
 * Delivery is deferred by one handler: pt_prev remembers the last match,
 * every earlier match is served through deliver_skb() (which takes an
 * extra skb reference), and the final match gets the skb directly.
 *
 * Returns the last handler's return value, NET_RX_SUCCESS if the VLAN
 * hwaccel path consumed the skb, or NET_RX_DROP if netpoll took it or no
 * handler matched (the skb is freed in the latter case).
 */
static int __netif_receive_skb(struct sk_buff *skb)
{
	struct packet_type *ptype, *pt_prev;
	struct net_device *orig_dev;
	struct net_device *master;
	struct net_device *null_or_orig;
	struct net_device *orig_or_bond;
	int ret = NET_RX_DROP;
	__be16 type;

	/* Timestamp here only if it was not already done before queueing. */
	if (!netdev_tstamp_prequeue)
		net_timestamp_check(skb);

	if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
		return NET_RX_SUCCESS;

	/* if we've gotten here through NAPI, check netpoll */
	if (netpoll_receive_skb(skb))
		return NET_RX_DROP;

	if (!skb->skb_iif)
		skb->skb_iif = skb->dev->ifindex;

	/*
	 * bonding note: skbs received on inactive slaves should only
	 * be delivered to pkt handlers that are exact matches.  Also
	 * the deliver_no_wcard flag will be set.  If packet handlers
	 * are sensitive to duplicate packets these skbs will need to
	 * be dropped at the handler.  The vlan accel path may have
	 * already set the deliver_no_wcard flag.
	 */
	/* null_or_orig == NULL lets handlers with ptype->dev == NULL match. */
	null_or_orig = NULL;
	orig_dev = skb->dev;
	master = ACCESS_ONCE(orig_dev->master);
	if (skb->deliver_no_wcard)
		null_or_orig = orig_dev;
	else if (master) {
		if (skb_bond_should_drop(skb, master)) {
			skb->deliver_no_wcard = 1;
			null_or_orig = orig_dev; /* deliver only exact match */
		} else
			skb->dev = master;
	}

	__get_cpu_var(softnet_data).processed++;

	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;

	pt_prev = NULL;

	rcu_read_lock();

#ifdef CONFIG_NET_CLS_ACT
	/* TC_NCLS set: clear it and skip both the taps and ingress filtering. */
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	/* Taps: handlers registered for all protocols. */
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
		    ptype->dev == orig_dev) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

#ifdef CONFIG_NET_CLS_ACT
	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
ncls:
#endif

	/* Each hook returns NULL when it has consumed the skb. */
	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;

	/*
	 * Make sure frames received on VLAN interfaces stacked on
	 * bonding interfaces still make their way to any base bonding
	 * device that may have registered for a specific ptype.  The
	 * handler may have to adjust skb->dev and orig_dev.
	 */
	orig_or_bond = orig_dev;
	if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
	    (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
		orig_or_bond = vlan_dev_real_dev(skb->dev);
	}

	/* Protocol handlers: hash bucket selected by the packet's protocol. */
	type = skb->protocol;
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type && (ptype->dev == null_or_orig ||
		     ptype->dev == skb->dev || ptype->dev == orig_dev ||
		     ptype->dev == orig_or_bond)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	if (pt_prev) {
		/* Last match: called without taking an extra reference. */
		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

out:
	rcu_read_unlock();
	return ret;
}
2937
2938/**
2939 *	netif_receive_skb - process receive buffer from network
2940 *	@skb: buffer to process
2941 *
2942 *	netif_receive_skb() is the main receive data processing function.
2943 *	It always succeeds. The buffer may be dropped during processing
2944 *	for congestion control or by the protocol layers.
2945 *
2946 *	This function may only be called from softirq context and interrupts
2947 *	should be enabled.
2948 *
2949 *	Return values (usually ignored):
2950 *	NET_RX_SUCCESS: no congestion
2951 *	NET_RX_DROP: packet was dropped
2952 */
2953int netif_receive_skb(struct sk_buff *skb)
2954{
2955	if (netdev_tstamp_prequeue)
2956		net_timestamp_check(skb);
2957
2958#ifdef CONFIG_RPS
2959	{
2960		struct rps_dev_flow voidflow, *rflow = &voidflow;
2961		int cpu, ret;
2962
2963		rcu_read_lock();
2964
2965		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2966
2967		if (cpu >= 0) {
2968			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2969			rcu_read_unlock();
2970		} else {
2971			rcu_read_unlock();
2972			ret = __netif_receive_skb(skb);
2973		}
2974
2975		return ret;
2976	}
2977#else
2978	return __netif_receive_skb(skb);
2979#endif
2980}
2981EXPORT_SYMBOL(netif_receive_skb);
2982
2983/* Network device is going away, flush any packets still pending
2984 * Called with irqs disabled.
2985 */
2986static void flush_backlog(void *arg)
2987{
2988	struct net_device *dev = arg;
2989	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2990	struct sk_buff *skb, *tmp;
2991
2992	rps_lock(sd);
2993	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
2994		if (skb->dev == dev) {
2995			__skb_unlink(skb, &sd->input_pkt_queue);
2996			kfree_skb(skb);
2997			input_queue_head_incr(sd);
2998		}
2999	}
3000	rps_unlock(sd);
3001
3002	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3003		if (skb->dev == dev) {
3004			__skb_unlink(skb, &sd->process_queue);
3005			kfree_skb(skb);
3006			input_queue_head_incr(sd);
3007		}
3008	}
3009}
3010
3011static int napi_gro_complete(struct sk_buff *skb)
3012{
3013	struct packet_type *ptype;
3014	__be16 type = skb->protocol;
3015	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3016	int err = -ENOENT;
3017
3018	if (NAPI_GRO_CB(skb)->count == 1) {
3019		skb_shinfo(skb)->gso_size = 0;
3020		goto out;
3021	}
3022
3023	rcu_read_lock();
3024	list_for_each_entry_rcu(ptype, head, list) {
3025		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3026			continue;
3027
3028		err = ptype->gro_complete(skb);
3029		break;
3030	}
3031	rcu_read_unlock();
3032
3033	if (err) {
3034		WARN_ON(&ptype->list == head);
3035		kfree_skb(skb);
3036		return NET_RX_SUCCESS;
3037	}
3038
3039out:
3040	return netif_receive_skb(skb);
3041}
3042
3043static void napi_gro_flush(struct napi_struct *napi)
3044{
3045	struct sk_buff *skb, *next;
3046
3047	for (skb = napi->gro_list; skb; skb = next) {
3048		next = skb->next;
3049		skb->next = NULL;
3050		napi_gro_complete(skb);
3051	}
3052
3053	napi->gro_count = 0;
3054	napi->gro_list = NULL;
3055}
3056
3057enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3058{
3059	struct sk_buff **pp = NULL;
3060	struct packet_type *ptype;
3061	__be16 type = skb->protocol;
3062	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3063	int same_flow;
3064	int mac_len;
3065	enum gro_result ret;
3066
3067	if (!(skb->dev->features & NETIF_F_GRO))
3068		goto normal;
3069
3070	if (skb_is_gso(skb) || skb_has_frags(skb))
3071		goto normal;
3072
3073	rcu_read_lock();
3074	list_for_each_entry_rcu(ptype, head, list) {
3075		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3076			continue;
3077
3078		skb_set_network_header(skb, skb_gro_offset(skb));
3079		mac_len = skb->network_header - skb->mac_header;
3080		skb->mac_len = mac_len;
3081		NAPI_GRO_CB(skb)->same_flow = 0;
3082		NAPI_GRO_CB(skb)->flush = 0;
3083		NAPI_GRO_CB(skb)->free = 0;
3084
3085		pp = ptype->gro_receive(&napi->gro_list, skb);
3086		break;
3087	}
3088	rcu_read_unlock();
3089
3090	if (&ptype->list == head)
3091		goto normal;
3092
3093	same_flow = NAPI_GRO_CB(skb)->same_flow;
3094	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3095
3096	if (pp) {
3097		struct sk_buff *nskb = *pp;
3098
3099		*pp = nskb->next;
3100		nskb->next = NULL;
3101		napi_gro_complete(nskb);
3102		napi->gro_count--;
3103	}
3104
3105	if (same_flow)
3106		goto ok;
3107
3108	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3109		goto normal;
3110
3111	napi->gro_count++;
3112	NAPI_GRO_CB(skb)->count = 1;
3113	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3114	skb->next = napi->gro_list;
3115	napi->gro_list = skb;
3116	ret = GRO_HELD;
3117
3118pull:
3119	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3120		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3121
3122		BUG_ON(skb->end - skb->tail < grow);
3123
3124		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3125
3126		skb->tail += grow;
3127		skb->data_len -= grow;
3128
3129		skb_shinfo(skb)->frags[0].page_offset += grow;
3130		skb_shinfo(skb)->frags[0].size -= grow;
3131
3132		if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3133			put_page(skb_shinfo(skb)->frags[0].page);
3134			memmove(skb_shinfo(skb)->frags,
3135				skb_shinfo(skb)->frags + 1,
3136				--skb_shinfo(skb)->nr_frags);
3137		}
3138	}
3139
3140ok:
3141	return ret;
3142
3143normal:
3144	ret = GRO_NORMAL;
3145	goto pull;
3146}
3147EXPORT_SYMBOL(dev_gro_receive);
3148
3149static gro_result_t
3150__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3151{
3152	struct sk_buff *p;
3153
3154	if (netpoll_rx_on(skb))
3155		return GRO_NORMAL;
3156
3157	for (p = napi->gro_list; p; p = p->next) {
3158		NAPI_GRO_CB(p)->same_flow =
3159			(p->dev == skb->dev) &&
3160			!compare_ether_header(skb_mac_header(p),
3161					      skb_gro_mac_header(skb));
3162		NAPI_GRO_CB(p)->flush = 0;
3163	}
3164
3165	return dev_gro_receive(napi, skb);
3166}
3167
3168gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3169{
3170	switch (ret) {
3171	case GRO_NORMAL:
3172		if (netif_receive_skb(skb))
3173			ret = GRO_DROP;
3174		break;
3175
3176	case GRO_DROP:
3177	case GRO_MERGED_FREE:
3178		kfree_skb(skb);
3179		break;
3180
3181	case GRO_HELD:
3182	case GRO_MERGED:
3183		break;
3184	}
3185
3186	return ret;
3187}
3188EXPORT_SYMBOL(napi_skb_finish);
3189
3190void skb_gro_reset_offset(struct sk_buff *skb)
3191{
3192	NAPI_GRO_CB(skb)->data_offset = 0;
3193	NAPI_GRO_CB(skb)->frag0 = NULL;
3194	NAPI_GRO_CB(skb)->frag0_len = 0;
3195
3196	if (skb->mac_header == skb->tail &&
3197	    !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3198		NAPI_GRO_CB(skb)->frag0 =
3199			page_address(skb_shinfo(skb)->frags[0].page) +
3200			skb_shinfo(skb)->frags[0].page_offset;
3201		NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3202	}
3203}
3204EXPORT_SYMBOL(skb_gro_reset_offset);
3205
3206gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3207{
3208	skb_gro_reset_offset(skb);
3209
3210	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3211}
3212EXPORT_SYMBOL(napi_gro_receive);
3213
3214void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3215{
3216	__skb_pull(skb, skb_headlen(skb));
3217	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3218
3219	napi->skb = skb;
3220}
3221EXPORT_SYMBOL(napi_reuse_skb);
3222
3223struct sk_buff *napi_get_frags(struct napi_struct *napi)
3224{
3225	struct sk_buff *skb = napi->skb;
3226
3227	if (!skb) {
3228		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3229		if (skb)
3230			napi->skb = skb;
3231	}
3232	return skb;
3233}
3234EXPORT_SYMBOL(napi_get_frags);
3235
3236gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3237			       gro_result_t ret)
3238{
3239	switch (ret) {
3240	case GRO_NORMAL:
3241	case GRO_HELD:
3242		skb->protocol = eth_type_trans(skb, skb->dev);
3243
3244		if (ret == GRO_HELD)
3245			skb_gro_pull(skb, -ETH_HLEN);
3246		else if (netif_receive_skb(skb))
3247			ret = GRO_DROP;
3248		break;
3249
3250	case GRO_DROP:
3251	case GRO_MERGED_FREE:
3252		napi_reuse_skb(napi, skb);
3253		break;
3254
3255	case GRO_MERGED:
3256		break;
3257	}
3258
3259	return ret;
3260}
3261EXPORT_SYMBOL(napi_frags_finish);
3262
3263struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3264{
3265	struct sk_buff *skb = napi->skb;
3266	struct ethhdr *eth;
3267	unsigned int hlen;
3268	unsigned int off;
3269
3270	napi->skb = NULL;
3271
3272	skb_reset_mac_header(skb);
3273	skb_gro_reset_offset(skb);
3274
3275	off = skb_gro_offset(skb);
3276	hlen = off + sizeof(*eth);
3277	eth = skb_gro_header_fast(skb, off);
3278	if (skb_gro_header_hard(skb, hlen)) {
3279		eth = skb_gro_header_slow(skb, hlen, off);
3280		if (unlikely(!eth)) {
3281			napi_reuse_skb(napi, skb);
3282			skb = NULL;
3283			goto out;
3284		}
3285	}
3286
3287	skb_gro_pull(skb, sizeof(*eth));
3288
3289	/*
3290	 * This works because the only protocols we care about don't require
3291	 * special handling.  We'll fix it up properly at the end.
3292	 */
3293	skb->protocol = eth->h_proto;
3294
3295out:
3296	return skb;
3297}
3298EXPORT_SYMBOL(napi_frags_skb);
3299
3300gro_result_t napi_gro_frags(struct napi_struct *napi)
3301{
3302	struct sk_buff *skb = napi_frags_skb(napi);
3303
3304	if (!skb)
3305		return GRO_DROP;
3306
3307	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3308}
3309EXPORT_SYMBOL(napi_gro_frags);
3310
/*
 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
 * Note: called with local irqs disabled, but exits with local irqs enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *cur = sd->rps_ipi_list;
	struct softnet_data *next;

	if (!cur) {
		local_irq_enable();
		return;
	}

	/* Detach the list while irqs are still off, then walk it. */
	sd->rps_ipi_list = NULL;
	local_irq_enable();

	/* Send pending IPIs to kick RPS processing on remote cpus. */
	for (; cur; cur = next) {
		next = cur->rps_ipi_next;

		if (cpu_online(cur->cpu))
			__smp_call_function_single(cur->cpu, &cur->csd, 0);
	}
#else
	local_irq_enable();
#endif
}
3338
/*
 * Poll callback of the backlog napi instance embedded in softnet_data.
 *
 * Delivers packets from sd->process_queue through __netif_receive_skb(),
 * refilling that queue from sd->input_pkt_queue under rps_lock().
 * Returns the number of packets processed, which never exceeds @quota.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
	int work = 0;
	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);

#ifdef CONFIG_RPS
	/* Check if we have pending IPIs; it is better to send them now
	 * than to wait for the end of net_rx_action().
	 */
	if (sd->rps_ipi_list) {
		local_irq_disable();
		net_rps_action_and_irq_enable(sd);
	}
#endif
	napi->weight = weight_p;
	local_irq_disable();
	while (work < quota) {
		struct sk_buff *skb;
		unsigned int qlen;

		/* Irqs are off while dequeuing and on while delivering. */
		while ((skb = __skb_dequeue(&sd->process_queue))) {
			local_irq_enable();
			__netif_receive_skb(skb);
			local_irq_disable();
			input_queue_head_incr(sd);
			if (++work >= quota) {
				local_irq_enable();
				return work;
			}
		}

		/* process_queue is empty: move over everything queued so far. */
		rps_lock(sd);
		qlen = skb_queue_len(&sd->input_pkt_queue);
		if (qlen)
			skb_queue_splice_tail_init(&sd->input_pkt_queue,
						   &sd->process_queue);

		if (qlen < quota - work) {
			/*
			 * Inline a custom version of __napi_complete().
			 * Only the current cpu owns and manipulates this napi,
			 * and NAPI_STATE_SCHED is the only possible flag set on
			 * backlog.  We can use a plain write instead of
			 * clear_bit(), and we don't need an smp_mb() memory
			 * barrier.
			 */
			list_del(&napi->poll_list);
			napi->state = 0;

			/* Shrink the quota so the outer loop stops once the
			 * packets just spliced have been delivered.
			 */
			quota = work + qlen;
		}
		rps_unlock(sd);
	}
	local_irq_enable();

	return work;
}
3395
3396/**
3397 * __napi_schedule - schedule for receive
3398 * @n: entry to schedule
3399 *
3400 * The entry's receive function will be scheduled to run
3401 */
3402void __napi_schedule(struct napi_struct *n)
3403{
3404	unsigned long flags;
3405
3406	local_irq_save(flags);
3407	____napi_schedule(&__get_cpu_var(softnet_data), n);
3408	local_irq_restore(flags);
3409}
3410EXPORT_SYMBOL(__napi_schedule);
3411
/*
 * Take @n off its poll list and clear NAPI_STATE_SCHED.
 *
 * It is a bug to call this on an instance that is not scheduled or that
 * still has packets on its GRO list.  napi_complete() in this file calls
 * it with local interrupts disabled.
 */
void __napi_complete(struct napi_struct *n)
{
	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
	BUG_ON(n->gro_list);

	list_del(&n->poll_list);
	/* Barrier before the SCHED bit is cleared. */
	smp_mb__before_clear_bit();
	clear_bit(NAPI_STATE_SCHED, &n->state);
}
EXPORT_SYMBOL(__napi_complete);
3422
3423void napi_complete(struct napi_struct *n)
3424{
3425	unsigned long flags;
3426
3427	/*
3428	 * don't let napi dequeue from the cpu poll list
3429	 * just in case its running on a different cpu
3430	 */
3431	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3432		return;
3433
3434	napi_gro_flush(n);
3435	local_irq_save(flags);
3436	__napi_complete(n);
3437	local_irq_restore(flags);
3438}
3439EXPORT_SYMBOL(napi_complete);
3440
3441void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3442		    int (*poll)(struct napi_struct *, int), int weight)
3443{
3444	INIT_LIST_HEAD(&napi->poll_list);
3445	napi->gro_count = 0;
3446	napi->gro_list = NULL;
3447	napi->skb = NULL;
3448	napi->poll = poll;
3449	napi->weight = weight;
3450	list_add(&napi->dev_list, &dev->napi_list);
3451	napi->dev = dev;
3452#ifdef CONFIG_NETPOLL
3453	spin_lock_init(&napi->poll_lock);
3454	napi->poll_owner = -1;
3455#endif
3456	set_bit(NAPI_STATE_SCHED, &napi->state);
3457}
3458EXPORT_SYMBOL(netif_napi_add);
3459
3460void netif_napi_del(struct napi_struct *napi)
3461{
3462	struct sk_buff *skb, *next;
3463
3464	list_del_init(&napi->dev_list);
3465	napi_free_frags(napi);
3466
3467	for (skb = napi->gro_list; skb; skb = next) {
3468		next = skb->next;
3469		skb->next = NULL;
3470		kfree_skb(skb);
3471	}
3472
3473	napi->gro_list = NULL;
3474	napi->gro_count = 0;
3475}
3476EXPORT_SYMBOL(netif_napi_del);
3477
/*
 * Receive softirq handler: polls the napi instances queued on this cpu's
 * softnet_data poll list until the list is empty, the packet budget
 * (netdev_budget) is used up, or the time limit passes.  In the latter
 * two cases the softirq is raised again and time_squeeze is incremented.
 */
static void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = &__get_cpu_var(softnet_data);
	unsigned long time_limit = jiffies + 2;
	int budget = netdev_budget;
	void *have;

	local_irq_disable();

	while (!list_empty(&sd->poll_list)) {
		struct napi_struct *n;
		int work, weight;

		/* If the softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies, which will allow
		 * an average latency of 1.5/HZ.
		 */
		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
			goto softnet_break;

		local_irq_enable();

		/* Even though interrupts have been re-enabled, this
		 * access is safe because interrupts can only add new
		 * entries to the tail of this list, and only ->poll()
		 * calls can remove this head entry from the list.
		 */
		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);

		have = netpoll_poll_lock(n);

		weight = n->weight;

		/* This NAPI_STATE_SCHED test is for avoiding a race
		 * with netpoll's poll_napi().  Only the entity which
		 * obtains the lock and sees NAPI_STATE_SCHED set will
		 * actually make the ->poll() call.  Therefore we avoid
		 * accidentally calling ->poll() when NAPI is not scheduled.
		 */
		work = 0;
		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
			work = n->poll(n, weight);
			trace_napi_poll(n);
		}

		/* A poll callback must not report more work than its weight. */
		WARN_ON_ONCE(work > weight);

		budget -= work;

		local_irq_disable();

		/* Drivers must not modify the NAPI state if they
		 * consume the entire weight.  In such cases this code
		 * still "owns" the NAPI instance and therefore can
		 * move the instance around on the list at-will.
		 */
		if (unlikely(work == weight)) {
			if (unlikely(napi_disable_pending(n))) {
				/* napi_complete() is called with irqs enabled. */
				local_irq_enable();
				napi_complete(n);
				local_irq_disable();
			} else
				list_move_tail(&n->poll_list, &sd->poll_list);
		}

		netpoll_poll_unlock(have);
	}
out:
	/* Sends pending RPS IPIs and re-enables local irqs. */
	net_rps_action_and_irq_enable(sd);

#ifdef CONFIG_NET_DMA
	/*
	 * There may not be any more sk_buffs coming right now, so push
	 * any pending DMA copies to hardware
	 */
	dma_issue_pending_all();
#endif

	return;

softnet_break:
	/* Out of budget or time: account for it and reschedule ourselves. */
	sd->time_squeeze++;
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	goto out;
}
3563
3564static gifconf_func_t *gifconf_list[NPROTO];
3565
3566/**
3567 *	register_gifconf	-	register a SIOCGIF handler
3568 *	@family: Address family
3569 *	@gifconf: Function handler
3570 *
3571 *	Register protocol dependent address dumping routines. The handler
3572 *	that is passed must not be freed or reused until it has been replaced
3573 *	by another handler.
3574 */
3575int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3576{
3577	if (family >= NPROTO)
3578		return -EINVAL;
3579	gifconf_list[family] = gifconf;
3580	return 0;
3581}
3582EXPORT_SYMBOL(register_gifconf);
3583
3584
3585/*
3586 *	Map an interface index to its name (SIOCGIFNAME)
3587 */
3588
3589/*
3590 *	We need this ioctl for efficient implementation of the
3591 *	if_indextoname() function required by the IPv6 API.  Without
3592 *	it, we would have to search all the interfaces to find a
3593 *	match.  --pb
3594 */
3595
3596static int dev_ifname(struct net *net, struct ifreq __user *arg)
3597{
3598	struct net_device *dev;
3599	struct ifreq ifr;
3600
3601	/*
3602	 *	Fetch the caller's info block.
3603	 */
3604
3605	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3606		return -EFAULT;
3607
3608	rcu_read_lock();
3609	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3610	if (!dev) {
3611		rcu_read_unlock();
3612		return -ENODEV;
3613	}
3614
3615	strcpy(ifr.ifr_name, dev->name);
3616	rcu_read_unlock();
3617
3618	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3619		return -EFAULT;
3620	return 0;
3621}
3622
3623/*
3624 *	Perform a SIOCGIFCONF call. This structure will change
3625 *	size eventually, and there is nothing I can do about it.
3626 *	Thus we will need a 'compatibility mode'.
3627 */
3628
3629static int dev_ifconf(struct net *net, char __user *arg)
3630{
3631	struct ifconf ifc;
3632	struct net_device *dev;
3633	char __user *pos;
3634	int len;
3635	int total;
3636	int i;
3637
3638	/*
3639	 *	Fetch the caller's info block.
3640	 */
3641
3642	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3643		return -EFAULT;
3644
3645	pos = ifc.ifc_buf;
3646	len = ifc.ifc_len;
3647
3648	/*
3649	 *	Loop over the interfaces, and write an info block for each.
3650	 */
3651
3652	total = 0;
3653	for_each_netdev(net, dev) {
3654		for (i = 0; i < NPROTO; i++) {
3655			if (gifconf_list[i]) {
3656				int done;
3657				if (!pos)
3658					done = gifconf_list[i](dev, NULL, 0);
3659				else
3660					done = gifconf_list[i](dev, pos + total,
3661							       len - total);
3662				if (done < 0)
3663					return -EFAULT;
3664				total += done;
3665			}
3666		}
3667	}
3668
3669	/*
3670	 *	All done.  Write the updated control block back to the caller.
3671	 */
3672	ifc.ifc_len = total;
3673
3674	/*
3675	 * 	Both BSD and Solaris return 0 here, so we do too.
3676	 */
3677	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3678}
3679
3680#ifdef CONFIG_PROC_FS
3681/*
3682 *	This is invoked by the /proc filesystem handler to display a device
3683 *	in detail.
3684 */
3685void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3686	__acquires(RCU)
3687{
3688	struct net *net = seq_file_net(seq);
3689	loff_t off;
3690	struct net_device *dev;
3691
3692	rcu_read_lock();
3693	if (!*pos)
3694		return SEQ_START_TOKEN;
3695
3696	off = 1;
3697	for_each_netdev_rcu(net, dev)
3698		if (off++ == *pos)
3699			return dev;
3700
3701	return NULL;
3702}
3703
3704void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3705{
3706	struct net_device *dev = (v == SEQ_START_TOKEN) ?
3707				  first_net_device(seq_file_net(seq)) :
3708				  next_net_device((struct net_device *)v);
3709
3710	++*pos;
3711	return rcu_dereference(dev);
3712}
3713
/*
 * seq_file stop callback for the per-device listing: drops the RCU read
 * lock taken in dev_seq_start().
 */
void dev_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
3719
3720static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3721{
3722	const struct net_device_stats *stats = dev_get_stats(dev);
3723
3724	seq_printf(seq, "%6s: %7lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
3725		   "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
3726		   dev->name, stats->rx_bytes, stats->rx_packets,
3727		   stats->rx_errors,
3728		   stats->rx_dropped + stats->rx_missed_errors,
3729		   stats->rx_fifo_errors,
3730		   stats->rx_length_errors + stats->rx_over_errors +
3731		    stats->rx_crc_errors + stats->rx_frame_errors,
3732		   stats->rx_compressed, stats->multicast,
3733		   stats->tx_bytes, stats->tx_packets,
3734		   stats->tx_errors, stats->tx_dropped,
3735		   stats->tx_fifo_errors, stats->collisions,
3736		   stats->tx_carrier_errors +
3737		    stats->tx_aborted_errors +
3738		    stats->tx_window_errors +
3739		    stats->tx_heartbeat_errors,
3740		   stats->tx_compressed);
3741}
3742
3743/*
3744 *	Called from the PROCfs module. This now uses the new arbitrary sized
3745 *	/proc/net interface to create /proc/net/dev
3746 */
3747static int dev_seq_show(struct seq_file *seq, void *v)
3748{
3749	if (v == SEQ_START_TOKEN)
3750		seq_puts(seq, "Inter-|   Receive                            "
3751			      "                    |  Transmit\n"
3752			      " face |bytes    packets errs drop fifo frame "
3753			      "compressed multicast|bytes    packets errs "
3754			      "drop fifo colls carrier compressed\n");
3755	else
3756		dev_seq_printf_stats(seq, v);
3757	return 0;
3758}
3759
3760static struct softnet_data *softnet_get_online(loff_t *pos)
3761{
3762	struct softnet_data *sd = NULL;
3763
3764	while (*pos < nr_cpu_ids)
3765		if (cpu_online(*pos)) {
3766			sd = &per_cpu(softnet_data, *pos);
3767			break;
3768		} else
3769			++*pos;
3770	return sd;
3771}
3772
/*
 * seq_file start callback for the softnet statistics: returns the
 * softnet_data of the first online cpu at or after *pos (NULL if none).
 */
static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
{
	return softnet_get_online(pos);
}
3777
3778static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3779{
3780	++*pos;
3781	return softnet_get_online(pos);
3782}
3783
/* seq_file stop callback for the softnet statistics: nothing to release. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
3787
/*
 * Print one line of per-cpu softnet counters as ten hexadecimal columns.
 * Columns four to eight are always zero; four of those are marked below
 * as former fastroute counters.
 */
static int softnet_seq_show(struct seq_file *seq, void *v)
{
	struct softnet_data *sd = v;

	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
		   sd->processed, sd->dropped, sd->time_squeeze, 0,
		   0, 0, 0, 0, /* was fastroute */
		   sd->cpu_collision, sd->received_rps);
	return 0;
}
3798
/* Iterator callbacks for the per-device statistics listing. */
static const struct seq_operations dev_seq_ops = {
	.start = dev_seq_start,
	.next  = dev_seq_next,
	.stop  = dev_seq_stop,
	.show  = dev_seq_show,
};

/* Open handler: set up a net-namespace aware seq_file using dev_seq_ops. */
static int dev_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &dev_seq_ops,
			    sizeof(struct seq_net_private));
}

/* File operations registered for the "dev" proc entry. */
static const struct file_operations dev_seq_fops = {
	.owner	 = THIS_MODULE,
	.open    = dev_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net,
};
3819
/* Iterator callbacks for the per-cpu softnet statistics listing. */
static const struct seq_operations softnet_seq_ops = {
	.start = softnet_seq_start,
	.next  = softnet_seq_next,
	.stop  = softnet_seq_stop,
	.show  = softnet_seq_show,
};

/* Open handler: plain seq_open(), no per-namespace private data. */
static int softnet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &softnet_seq_ops);
}

/* File operations registered for the "softnet_stat" proc entry. */
static const struct file_operations softnet_seq_fops = {
	.owner	 = THIS_MODULE,
	.open    = softnet_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};
3839
3840static void *ptype_get_idx(loff_t pos)
3841{
3842	struct packet_type *pt = NULL;
3843	loff_t i = 0;
3844	int t;
3845
3846	list_for_each_entry_rcu(pt, &ptype_all, list) {
3847		if (i == pos)
3848			return pt;
3849		++i;
3850	}
3851
3852	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3853		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3854			if (i == pos)
3855				return pt;
3856			++i;
3857		}
3858	}
3859	return NULL;
3860}
3861
3862static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3863	__acquires(RCU)
3864{
3865	rcu_read_lock();
3866	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3867}
3868
3869static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3870{
3871	struct packet_type *pt;
3872	struct list_head *nxt;
3873	int hash;
3874
3875	++*pos;
3876	if (v == SEQ_START_TOKEN)
3877		return ptype_get_idx(0);
3878
3879	pt = v;
3880	nxt = pt->list.next;
3881	if (pt->type == htons(ETH_P_ALL)) {
3882		if (nxt != &ptype_all)
3883			goto found;
3884		hash = 0;
3885		nxt = ptype_base[0].next;
3886	} else
3887		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3888
3889	while (nxt == &ptype_base[hash]) {
3890		if (++hash >= PTYPE_HASH_SIZE)
3891			return NULL;
3892		nxt = ptype_base[hash].next;
3893	}
3894found:
3895	return list_entry(nxt, struct packet_type, list);
3896}
3897
/*
 * seq_file stop callback for the packet type listing: drops the RCU read
 * lock taken in ptype_seq_start().
 */
static void ptype_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
3903
3904static int ptype_seq_show(struct seq_file *seq, void *v)
3905{
3906	struct packet_type *pt = v;
3907
3908	if (v == SEQ_START_TOKEN)
3909		seq_puts(seq, "Type Device      Function\n");
3910	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3911		if (pt->type == htons(ETH_P_ALL))
3912			seq_puts(seq, "ALL ");
3913		else
3914			seq_printf(seq, "%04x", ntohs(pt->type));
3915
3916		seq_printf(seq, " %-8s %pF\n",
3917			   pt->dev ? pt->dev->name : "", pt->func);
3918	}
3919
3920	return 0;
3921}
3922
/* Iterator callbacks for the packet type listing. */
static const struct seq_operations ptype_seq_ops = {
	.start = ptype_seq_start,
	.next  = ptype_seq_next,
	.stop  = ptype_seq_stop,
	.show  = ptype_seq_show,
};

/* Open handler: set up a net-namespace aware seq_file using ptype_seq_ops. */
static int ptype_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &ptype_seq_ops,
			sizeof(struct seq_net_private));
}

/* File operations registered for the "ptype" proc entry. */
static const struct file_operations ptype_seq_fops = {
	.owner	 = THIS_MODULE,
	.open    = ptype_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net,
};
3943
3944
3945static int __net_init dev_proc_net_init(struct net *net)
3946{
3947	int rc = -ENOMEM;
3948
3949	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3950		goto out;
3951	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3952		goto out_dev;
3953	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3954		goto out_softnet;
3955
3956	if (wext_proc_init(net))
3957		goto out_ptype;
3958	rc = 0;
3959out:
3960	return rc;
3961out_ptype:
3962	proc_net_remove(net, "ptype");
3963out_softnet:
3964	proc_net_remove(net, "softnet_stat");
3965out_dev:
3966	proc_net_remove(net, "dev");
3967	goto out;
3968}
3969
/*
 * Per-namespace exit: tear down what dev_proc_net_init() set up, in the
 * reverse order of creation.
 */
static void __net_exit dev_proc_net_exit(struct net *net)
{
	wext_proc_exit(net);

	proc_net_remove(net, "ptype");
	proc_net_remove(net, "softnet_stat");
	proc_net_remove(net, "dev");
}
3978
/* Per-namespace hooks that create and remove the proc entries above. */
static struct pernet_operations __net_initdata dev_proc_ops = {
	.init = dev_proc_net_init,
	.exit = dev_proc_net_exit,
};

/* Register the proc entry hooks as a pernet subsystem. */
static int __init dev_proc_init(void)
{
	return register_pernet_subsys(&dev_proc_ops);
}
3988#else
3989#define dev_proc_init() 0
3990#endif	/* CONFIG_PROC_FS */
3991
3992
3993/**
3994 *	netdev_set_master	-	set up master/slave pair
3995 *	@slave: slave device
3996 *	@master: new master device
3997 *
3998 *	Changes the master device of the slave. Pass %NULL to break the
3999 *	bonding. The caller must hold the RTNL semaphore. On a failure
4000 *	a negative errno code is returned. On success the reference counts
4001 *	are adjusted, %RTM_NEWLINK is sent to the routing socket and the
4002 *	function returns zero.
4003 */
4004int netdev_set_master(struct net_device *slave, struct net_device *master)
4005{
4006	struct net_device *old = slave->master;
4007
4008	ASSERT_RTNL();
4009
4010	if (master) {
4011		if (old)
4012			return -EBUSY;
4013		dev_hold(master);
4014	}
4015
4016	slave->master = master;
4017
4018	if (old) {
4019		synchronize_net();
4020		dev_put(old);
4021	}
4022	if (master)
4023		slave->flags |= IFF_SLAVE;
4024	else
4025		slave->flags &= ~IFF_SLAVE;
4026
4027	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4028	return 0;
4029}
4030EXPORT_SYMBOL(netdev_set_master);
4031
4032static void dev_change_rx_flags(struct net_device *dev, int flags)
4033{
4034	const struct net_device_ops *ops = dev->netdev_ops;
4035
4036	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4037		ops->ndo_change_rx_flags(dev, flags);
4038}
4039
4040static int __dev_set_promiscuity(struct net_device *dev, int inc)
4041{
4042	unsigned short old_flags = dev->flags;
4043	uid_t uid;
4044	gid_t gid;
4045
4046	ASSERT_RTNL();
4047
4048	dev->flags |= IFF_PROMISC;
4049	dev->promiscuity += inc;
4050	if (dev->promiscuity == 0) {
4051		/*
4052		 * Avoid overflow.
4053		 * If inc causes overflow, untouch promisc and return error.
4054		 */
4055		if (inc < 0)
4056			dev->flags &= ~IFF_PROMISC;
4057		else {
4058			dev->promiscuity -= inc;
4059			printk(KERN_WARNING "%s: promiscuity touches roof, "
4060				"set promiscuity failed, promiscuity feature "
4061				"of device might be broken.\n", dev->name);
4062			return -EOVERFLOW;
4063		}
4064	}
4065	if (dev->flags != old_flags) {
4066		printk(KERN_INFO "device %s %s promiscuous mode\n",
4067		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4068							       "left");
4069		if (audit_enabled) {
4070			current_uid_gid(&uid, &gid);
4071			audit_log(current->audit_context, GFP_ATOMIC,
4072				AUDIT_ANOM_PROMISCUOUS,
4073				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4074				dev->name, (dev->flags & IFF_PROMISC),
4075				(old_flags & IFF_PROMISC),
4076				audit_get_loginuid(current),
4077				uid, gid,
4078				audit_get_sessionid(current));
4079		}
4080
4081		dev_change_rx_flags(dev, IFF_PROMISC);
4082	}
4083	return 0;
4084}
4085
4086/**
4087 *	dev_set_promiscuity	- update promiscuity count on a device
4088 *	@dev: device
4089 *	@inc: modifier
4090 *
4091 *	Add or remove promiscuity from a device. While the count in the device
4092 *	remains above zero the interface remains promiscuous. Once it hits zero
4093 *	the device reverts back to normal filtering operation. A negative inc
4094 *	value is used to drop promiscuity on the device.
4095 *	Return 0 if successful or a negative errno code on error.
4096 */
4097int dev_set_promiscuity(struct net_device *dev, int inc)
4098{
4099	unsigned short old_flags = dev->flags;
4100	int err;
4101
4102	err = __dev_set_promiscuity(dev, inc);
4103	if (err < 0)
4104		return err;
4105	if (dev->flags != old_flags)
4106		dev_set_rx_mode(dev);
4107	return err;
4108}
4109EXPORT_SYMBOL(dev_set_promiscuity);
4110
4111/**
4112 *	dev_set_allmulti	- update allmulti count on a device
4113 *	@dev: device
4114 *	@inc: modifier
4115 *
4116 *	Add or remove reception of all multicast frames to a device. While the
4117 *	count in the device remains above zero the interface remains listening
4118 *	to all interfaces. Once it hits zero the device reverts back to normal
4119 *	filtering operation. A negative @inc value is used to drop the counter
4120 *	when releasing a resource needing all multicasts.
4121 *	Return 0 if successful or a negative errno code on error.
4122 */
4123
4124int dev_set_allmulti(struct net_device *dev, int inc)
4125{
4126	unsigned short old_flags = dev->flags;
4127
4128	ASSERT_RTNL();
4129
4130	dev->flags |= IFF_ALLMULTI;
4131	dev->allmulti += inc;
4132	if (dev->allmulti == 0) {
4133		/*
4134		 * Avoid overflow.
4135		 * If inc causes overflow, untouch allmulti and return error.
4136		 */
4137		if (inc < 0)
4138			dev->flags &= ~IFF_ALLMULTI;
4139		else {
4140			dev->allmulti -= inc;
4141			printk(KERN_WARNING "%s: allmulti touches roof, "
4142				"set allmulti failed, allmulti feature of "
4143				"device might be broken.\n", dev->name);
4144			return -EOVERFLOW;
4145		}
4146	}
4147	if (dev->flags ^ old_flags) {
4148		dev_change_rx_flags(dev, IFF_ALLMULTI);
4149		dev_set_rx_mode(dev);
4150	}
4151	return 0;
4152}
4153EXPORT_SYMBOL(dev_set_allmulti);
4154
4155/*
4156 *	Upload unicast and multicast address lists to device and
4157 *	configure RX filtering. When the device doesn't support unicast
4158 *	filtering it is put in promiscuous mode while unicast addresses
4159 *	are present.
4160 */
4161void __dev_set_rx_mode(struct net_device *dev)
4162{
4163	const struct net_device_ops *ops = dev->netdev_ops;
4164
4165	/* dev_open will call this function so the list will stay sane. */
4166	if (!(dev->flags&IFF_UP))
4167		return;
4168
4169	if (!netif_device_present(dev))
4170		return;
4171
4172	if (ops->ndo_set_rx_mode)
4173		ops->ndo_set_rx_mode(dev);
4174	else {
4175		/* Unicast addresses changes may only happen under the rtnl,
4176		 * therefore calling __dev_set_promiscuity here is safe.
4177		 */
4178		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4179			__dev_set_promiscuity(dev, 1);
4180			dev->uc_promisc = 1;
4181		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4182			__dev_set_promiscuity(dev, -1);
4183			dev->uc_promisc = 0;
4184		}
4185
4186		if (ops->ndo_set_multicast_list)
4187			ops->ndo_set_multicast_list(dev);
4188	}
4189}
4190
/*
 * Locked wrapper around __dev_set_rx_mode(): runs it with the device
 * address lock held (bottom halves disabled).
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
4197
4198/**
4199 *	dev_get_flags - get flags reported to userspace
4200 *	@dev: device
4201 *
4202 *	Get the combination of flag bits exported through APIs to userspace.
4203 */
4204unsigned dev_get_flags(const struct net_device *dev)
4205{
4206	unsigned flags;
4207
4208	flags = (dev->flags & ~(IFF_PROMISC |
4209				IFF_ALLMULTI |
4210				IFF_RUNNING |
4211				IFF_LOWER_UP |
4212				IFF_DORMANT)) |
4213		(dev->gflags & (IFF_PROMISC |
4214				IFF_ALLMULTI));
4215
4216	if (netif_running(dev)) {
4217		if (netif_oper_up(dev))
4218			flags |= IFF_RUNNING;
4219		if (netif_carrier_ok(dev))
4220			flags |= IFF_LOWER_UP;
4221		if (netif_dormant(dev))
4222			flags |= IFF_DORMANT;
4223	}
4224
4225	return flags;
4226}
4227EXPORT_SYMBOL(dev_get_flags);
4228
/*
 * Apply the userspace flag word @flags to @dev.  Must be called under
 * RTNL.  Handles, in order: the directly settable flag bits, the
 * IFF_UP transition (via __dev_open/__dev_close), and the user-requested
 * IFF_PROMISC and IFF_ALLMULTI state tracked in dev->gflags.
 *
 * Returns 0, or the error from opening/closing the device.  The results
 * of dev_set_promiscuity() and dev_set_allmulti() are not propagated.
 */
int __dev_change_flags(struct net_device *dev, unsigned int flags)
{
	int old_flags = dev->flags;
	int ret;

	ASSERT_RTNL();

	/*
	 *	Set the flags on our device: take the user-settable bits from
	 *	@flags and keep the remaining managed bits as they are.
	 */

	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
			       IFF_AUTOMEDIA)) |
		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
				    IFF_ALLMULTI));

	/*
	 *	Load in the correct multicast list now the flags have changed.
	 */

	if ((old_flags ^ flags) & IFF_MULTICAST)
		dev_change_rx_flags(dev, IFF_MULTICAST);

	dev_set_rx_mode(dev);

	/*
	 *	Have we downed the interface. We handle IFF_UP ourselves
	 *	according to user attempts to set it, rather than blindly
	 *	setting it.
	 */

	ret = 0;
	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);

		if (!ret)
			dev_set_rx_mode(dev);
	}

	/* gflags holds what the user asked for; toggle it and adjust the
	 * promiscuity count by one in the matching direction.
	 */
	if ((flags ^ dev->gflags) & IFF_PROMISC) {
		int inc = (flags & IFF_PROMISC) ? 1 : -1;

		dev->gflags ^= IFF_PROMISC;
		dev_set_promiscuity(dev, inc);
	}

	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
	   is important. Some (broken) drivers set IFF_PROMISC, when
	   IFF_ALLMULTI is requested not asking us and not reporting.
	 */
	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;

		dev->gflags ^= IFF_ALLMULTI;
		dev_set_allmulti(dev, inc);
	}

	return ret;
}
4289
4290void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4291{
4292	unsigned int changes = dev->flags ^ old_flags;
4293
4294	if (changes & IFF_UP) {
4295		if (dev->flags & IFF_UP)
4296			call_netdevice_notifiers(NETDEV_UP, dev);
4297		else
4298			call_netdevice_notifiers(NETDEV_DOWN, dev);
4299	}
4300
4301	if (dev->flags & IFF_UP &&
4302	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4303		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4304}
4305
4306/**
4307 *	dev_change_flags - change device settings
4308 *	@dev: device
4309 *	@flags: device state flags
4310 *
4311 *	Change settings on device based state flags. The flags are
4312 *	in the userspace exported format.
4313 */
4314int dev_change_flags(struct net_device *dev, unsigned flags)
4315{
4316	int ret, changes;
4317	int old_flags = dev->flags;
4318
4319	ret = __dev_change_flags(dev, flags);
4320	if (ret < 0)
4321		return ret;
4322
4323	changes = old_flags ^ dev->flags;
4324	if (changes)
4325		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4326
4327	__dev_notify_flags(dev, old_flags);
4328	return ret;
4329}
4330EXPORT_SYMBOL(dev_change_flags);
4331
4332/**
4333 *	dev_set_mtu - Change maximum transfer unit
4334 *	@dev: device
4335 *	@new_mtu: new transfer unit
4336 *
4337 *	Change the maximum transfer size of the network device.
4338 */
4339int dev_set_mtu(struct net_device *dev, int new_mtu)
4340{
4341	const struct net_device_ops *ops = dev->netdev_ops;
4342	int err;
4343
4344	if (new_mtu == dev->mtu)
4345		return 0;
4346
4347	/*	MTU must be positive.	 */
4348	if (new_mtu < 0)
4349		return -EINVAL;
4350
4351	if (!netif_device_present(dev))
4352		return -ENODEV;
4353
4354	err = 0;
4355	if (ops->ndo_change_mtu)
4356		err = ops->ndo_change_mtu(dev, new_mtu);
4357	else
4358		dev->mtu = new_mtu;
4359
4360	if (!err && dev->flags & IFF_UP)
4361		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4362	return err;
4363}
4364EXPORT_SYMBOL(dev_set_mtu);
4365
4366/**
4367 *	dev_set_mac_address - Change Media Access Control Address
4368 *	@dev: device
4369 *	@sa: new address
4370 *
4371 *	Change the hardware (MAC) address of the device
4372 */
4373int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4374{
4375	const struct net_device_ops *ops = dev->netdev_ops;
4376	int err;
4377
4378	if (!ops->ndo_set_mac_address)
4379		return -EOPNOTSUPP;
4380	if (sa->sa_family != dev->type)
4381		return -EINVAL;
4382	if (!netif_device_present(dev))
4383		return -ENODEV;
4384	err = ops->ndo_set_mac_address(dev, sa);
4385	if (!err)
4386		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4387	return err;
4388}
4389EXPORT_SYMBOL(dev_set_mac_address);
4390
4391/*
4392 *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4393 */
4394static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4395{
4396	int err;
4397	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4398
4399	if (!dev)
4400		return -ENODEV;
4401
4402	switch (cmd) {
4403	case SIOCGIFFLAGS:	/* Get interface flags */
4404		ifr->ifr_flags = (short) dev_get_flags(dev);
4405		return 0;
4406
4407	case SIOCGIFMETRIC:	/* Get the metric on the interface
4408				   (currently unused) */
4409		ifr->ifr_metric = 0;
4410		return 0;
4411
4412	case SIOCGIFMTU:	/* Get the MTU of a device */
4413		ifr->ifr_mtu = dev->mtu;
4414		return 0;
4415
4416	case SIOCGIFHWADDR:
4417		if (!dev->addr_len)
4418			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4419		else
4420			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4421			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4422		ifr->ifr_hwaddr.sa_family = dev->type;
4423		return 0;
4424
4425	case SIOCGIFSLAVE:
4426		err = -EINVAL;
4427		break;
4428
4429	case SIOCGIFMAP:
4430		ifr->ifr_map.mem_start = dev->mem_start;
4431		ifr->ifr_map.mem_end   = dev->mem_end;
4432		ifr->ifr_map.base_addr = dev->base_addr;
4433		ifr->ifr_map.irq       = dev->irq;
4434		ifr->ifr_map.dma       = dev->dma;
4435		ifr->ifr_map.port      = dev->if_port;
4436		return 0;
4437
4438	case SIOCGIFINDEX:
4439		ifr->ifr_ifindex = dev->ifindex;
4440		return 0;
4441
4442	case SIOCGIFTXQLEN:
4443		ifr->ifr_qlen = dev->tx_queue_len;
4444		return 0;
4445
4446	default:
4447		/* dev_ioctl() should ensure this case
4448		 * is never reached
4449		 */
4450		WARN_ON(1);
4451		err = -EINVAL;
4452		break;
4453
4454	}
4455	return err;
4456}
4457
4458/*
4459 *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
4460 */
4461static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4462{
4463	int err;
4464	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4465	const struct net_device_ops *ops;
4466
4467	if (!dev)
4468		return -ENODEV;
4469
4470	ops = dev->netdev_ops;
4471
4472	switch (cmd) {
4473	case SIOCSIFFLAGS:	/* Set interface flags */
4474		return dev_change_flags(dev, ifr->ifr_flags);
4475
4476	case SIOCSIFMETRIC:	/* Set the metric on the interface
4477				   (currently unused) */
4478		return -EOPNOTSUPP;
4479
4480	case SIOCSIFMTU:	/* Set the MTU of a device */
4481		return dev_set_mtu(dev, ifr->ifr_mtu);
4482
4483	case SIOCSIFHWADDR:
4484		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4485
4486	case SIOCSIFHWBROADCAST:
4487		if (ifr->ifr_hwaddr.sa_family != dev->type)
4488			return -EINVAL;
4489		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4490		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4491		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4492		return 0;
4493
4494	case SIOCSIFMAP:
4495		if (ops->ndo_set_config) {
4496			if (!netif_device_present(dev))
4497				return -ENODEV;
4498			return ops->ndo_set_config(dev, &ifr->ifr_map);
4499		}
4500		return -EOPNOTSUPP;
4501
4502	case SIOCADDMULTI:
4503		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4504		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4505			return -EINVAL;
4506		if (!netif_device_present(dev))
4507			return -ENODEV;
4508		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4509
4510	case SIOCDELMULTI:
4511		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4512		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4513			return -EINVAL;
4514		if (!netif_device_present(dev))
4515			return -ENODEV;
4516		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4517
4518	case SIOCSIFTXQLEN:
4519		if (ifr->ifr_qlen < 0)
4520			return -EINVAL;
4521		dev->tx_queue_len = ifr->ifr_qlen;
4522		return 0;
4523
4524	case SIOCSIFNAME:
4525		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4526		return dev_change_name(dev, ifr->ifr_newname);
4527
4528	/*
4529	 *	Unknown or private ioctl
4530	 */
4531	default:
4532		if ((cmd >= SIOCDEVPRIVATE &&
4533		    cmd <= SIOCDEVPRIVATE + 15) ||
4534		    cmd == SIOCBONDENSLAVE ||
4535		    cmd == SIOCBONDRELEASE ||
4536		    cmd == SIOCBONDSETHWADDR ||
4537		    cmd == SIOCBONDSLAVEINFOQUERY ||
4538		    cmd == SIOCBONDINFOQUERY ||
4539		    cmd == SIOCBONDCHANGEACTIVE ||
4540		    cmd == SIOCGMIIPHY ||
4541		    cmd == SIOCGMIIREG ||
4542		    cmd == SIOCSMIIREG ||
4543		    cmd == SIOCBRADDIF ||
4544		    cmd == SIOCBRDELIF ||
4545		    cmd == SIOCSHWTSTAMP ||
4546		    cmd == SIOCWANDEV) {
4547			err = -EOPNOTSUPP;
4548			if (ops->ndo_do_ioctl) {
4549				if (netif_device_present(dev))
4550					err = ops->ndo_do_ioctl(dev, ifr, cmd);
4551				else
4552					err = -ENODEV;
4553			}
4554		} else
4555			err = -EINVAL;
4556
4557	}
4558	return err;
4559}
4560
4561/*
4562 *	This function handles all "interface"-type I/O control requests. The actual
4563 *	'doing' part of this is dev_ifsioc above.
4564 */
4565
4566/**
4567 *	dev_ioctl	-	network device ioctl
4568 *	@net: the applicable net namespace
4569 *	@cmd: command to issue
4570 *	@arg: pointer to a struct ifreq in user space
4571 *
4572 *	Issue ioctl functions to devices. This is normally called by the
4573 *	user space syscall interfaces but can sometimes be useful for
4574 *	other purposes. The return value is the return from the syscall if
4575 *	positive or a negative errno code on error.
4576 */
4577
int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct ifreq ifr;
	int ret;
	char *colon;

	/* One special case: SIOCGIFCONF takes ifconf argument
	   and requires shared lock, because it sleeps writing
	   to user space.
	 */

	if (cmd == SIOCGIFCONF) {
		rtnl_lock();
		ret = dev_ifconf(net, (char __user *) arg);
		rtnl_unlock();
		return ret;
	}
	/* SIOCGIFNAME works on the user pointer directly, no local copy. */
	if (cmd == SIOCGIFNAME)
		return dev_ifname(net, (struct ifreq __user *)arg);

	/* Everything else operates on a kernel copy of the request. */
	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
		return -EFAULT;

	/* Never trust user space to have terminated the name. */
	ifr.ifr_name[IFNAMSIZ-1] = 0;

	/*
	 * Cut the name at the first ':' for the device lookup.  The
	 * position is remembered so the ':' can be put back before the
	 * request is copied back to the caller.
	 */
	colon = strchr(ifr.ifr_name, ':');
	if (colon)
		*colon = 0;

	/*
	 *	See which interface the caller is talking about.
	 */

	switch (cmd) {
	/*
	 *	These ioctl calls:
	 *	- can be done by all.
	 *	- atomic and do not require locking.
	 *	- return a value
	 */
	case SIOCGIFFLAGS:
	case SIOCGIFMETRIC:
	case SIOCGIFMTU:
	case SIOCGIFHWADDR:
	case SIOCGIFSLAVE:
	case SIOCGIFMAP:
	case SIOCGIFINDEX:
	case SIOCGIFTXQLEN:
		dev_load(net, ifr.ifr_name);
		/* Read-only queries only need RCU protection. */
		rcu_read_lock();
		ret = dev_ifsioc_locked(net, &ifr, cmd);
		rcu_read_unlock();
		if (!ret) {
			/* Restore the caller's original name before copy-out. */
			if (colon)
				*colon = ':';
			if (copy_to_user(arg, &ifr,
					 sizeof(struct ifreq)))
				ret = -EFAULT;
		}
		return ret;

	case SIOCETHTOOL:
		dev_load(net, ifr.ifr_name);
		rtnl_lock();
		ret = dev_ethtool(net, &ifr);
		rtnl_unlock();
		if (!ret) {
			if (colon)
				*colon = ':';
			if (copy_to_user(arg, &ifr,
					 sizeof(struct ifreq)))
				ret = -EFAULT;
		}
		return ret;

	/*
	 *	These ioctl calls:
	 *	- require superuser power.
	 *	- require strict serialization.
	 *	- return a value
	 */
	case SIOCGMIIPHY:
	case SIOCGMIIREG:
	case SIOCSIFNAME:
		if (!capable(CAP_NET_ADMIN))
			return -EPERM;
		dev_load(net, ifr.ifr_name);
		rtnl_lock();
		ret = dev_ifsioc(net, &ifr, cmd);
		rtnl_unlock();
		if (!ret) {
			if (colon)
				*colon = ':';
			if (copy_to_user(arg, &ifr,
					 sizeof(struct ifreq)))
				ret = -EFAULT;
		}
		return ret;

	/*
	 *	These ioctl calls:
	 *	- require superuser power.
	 *	- require strict serialization.
	 *	- do not return a value
	 */
	case SIOCSIFFLAGS:
	case SIOCSIFMETRIC:
	case SIOCSIFMTU:
	case SIOCSIFMAP:
	case SIOCSIFHWADDR:
	case SIOCSIFSLAVE:
	case SIOCADDMULTI:
	case SIOCDELMULTI:
	case SIOCSIFHWBROADCAST:
	case SIOCSIFTXQLEN:
	case SIOCSMIIREG:
	case SIOCBONDENSLAVE:
	case SIOCBONDRELEASE:
	case SIOCBONDSETHWADDR:
	case SIOCBONDCHANGEACTIVE:
	case SIOCBRADDIF:
	case SIOCBRDELIF:
	case SIOCSHWTSTAMP:
		if (!capable(CAP_NET_ADMIN))
			return -EPERM;
		/* fall through */
	/* The two bonding queries below skip the capability check above. */
	case SIOCBONDSLAVEINFOQUERY:
	case SIOCBONDINFOQUERY:
		dev_load(net, ifr.ifr_name);
		rtnl_lock();
		ret = dev_ifsioc(net, &ifr, cmd);
		rtnl_unlock();
		/* Nothing is copied back to user space for this group. */
		return ret;

	case SIOCGIFMEM:
		/* Get the per device memory space. We can add this but
		 * currently do not support it */
	case SIOCSIFMEM:
		/* Set the per device memory buffer space.
		 * Not applicable in our case */
	case SIOCSIFLINK:
		return -EINVAL;

	/*
	 *	Unknown or private ioctl.
	 */
	default:
		if (cmd == SIOCWANDEV ||
		    (cmd >= SIOCDEVPRIVATE &&
		     cmd <= SIOCDEVPRIVATE + 15)) {
			dev_load(net, ifr.ifr_name);
			rtnl_lock();
			ret = dev_ifsioc(net, &ifr, cmd);
			rtnl_unlock();
			/*
			 * NOTE(review): unlike the cases above, the ':' is
			 * not restored here before the copy-out, so the
			 * caller gets the truncated name back - confirm
			 * this is intended.
			 */
			if (!ret && copy_to_user(arg, &ifr,
						 sizeof(struct ifreq)))
				ret = -EFAULT;
			return ret;
		}
		/* Take care of Wireless Extensions */
		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
			return wext_handle_ioctl(net, &ifr, cmd, arg);
		return -EINVAL;
	}
}
4743
4744
4745/**
4746 *	dev_new_index	-	allocate an ifindex
4747 *	@net: the applicable net namespace
4748 *
4749 *	Returns a suitable unique value for a new device interface
4750 *	number.  The caller must hold the rtnl semaphore or the
4751 *	dev_base_lock to be sure it remains unique.
4752 */
static int dev_new_index(struct net *net)
{
	/* Last value handed out; persists across calls. */
	static int ifindex;

	do {
		ifindex++;
		/* Restart from 1 once the counter is no longer positive. */
		if (ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
4763
/* Delayed registration/unregistration */
4765static LIST_HEAD(net_todo_list);
4766
4767static void net_set_todo(struct net_device *dev)
4768{
4769	list_add_tail(&dev->todo_list, &net_todo_list);
4770}
4771
4772static void rollback_registered_many(struct list_head *head)
4773{
4774	struct net_device *dev, *tmp;
4775
4776	BUG_ON(dev_boot_phase);
4777	ASSERT_RTNL();
4778
4779	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4780		/* Some devices call without registering
4781		 * for initialization unwind. Remove those
4782		 * devices and proceed with the remaining.
4783		 */
4784		if (dev->reg_state == NETREG_UNINITIALIZED) {
4785			pr_debug("unregister_netdevice: device %s/%p never "
4786				 "was registered\n", dev->name, dev);
4787
4788			WARN_ON(1);
4789			list_del(&dev->unreg_list);
4790			continue;
4791		}
4792
4793		BUG_ON(dev->reg_state != NETREG_REGISTERED);
4794
4795		/* If device is running, close it first. */
4796		dev_close(dev);
4797
4798		/* And unlink it from device chain. */
4799		unlist_netdevice(dev);
4800
4801		dev->reg_state = NETREG_UNREGISTERING;
4802	}
4803
4804	synchronize_net();
4805
4806	list_for_each_entry(dev, head, unreg_list) {
4807		/* Shutdown queueing discipline. */
4808		dev_shutdown(dev);
4809
4810
4811		/* Notify protocols, that we are about to destroy
4812		   this device. They should clean all the things.
4813		*/
4814		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4815
4816		if (!dev->rtnl_link_ops ||
4817		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4818			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
4819
4820		/*
4821		 *	Flush the unicast and multicast chains
4822		 */
4823		dev_uc_flush(dev);
4824		dev_mc_flush(dev);
4825
4826		if (dev->netdev_ops->ndo_uninit)
4827			dev->netdev_ops->ndo_uninit(dev);
4828
4829		/* Notifier chain MUST detach us from master device. */
4830		WARN_ON(dev->master);
4831
4832		/* Remove entries from kobject tree */
4833		netdev_unregister_kobject(dev);
4834	}
4835
4836	/* Process any work delayed until the end of the batch */
4837	dev = list_first_entry(head, struct net_device, unreg_list);
4838	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
4839
4840	synchronize_net();
4841
4842	list_for_each_entry(dev, head, unreg_list)
4843		dev_put(dev);
4844}
4845
4846static void rollback_registered(struct net_device *dev)
4847{
4848	LIST_HEAD(single);
4849
4850	list_add(&dev->unreg_list, &single);
4851	rollback_registered_many(&single);
4852}
4853
4854static void __netdev_init_queue_locks_one(struct net_device *dev,
4855					  struct netdev_queue *dev_queue,
4856					  void *_unused)
4857{
4858	spin_lock_init(&dev_queue->_xmit_lock);
4859	netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4860	dev_queue->xmit_lock_owner = -1;
4861}
4862
4863static void netdev_init_queue_locks(struct net_device *dev)
4864{
4865	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4866	__netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
4867}
4868
4869unsigned long netdev_fix_features(unsigned long features, const char *name)
4870{
4871	/* Fix illegal SG+CSUM combinations. */
4872	if ((features & NETIF_F_SG) &&
4873	    !(features & NETIF_F_ALL_CSUM)) {
4874		if (name)
4875			printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4876			       "checksum feature.\n", name);
4877		features &= ~NETIF_F_SG;
4878	}
4879
4880	/* TSO requires that SG is present as well. */
4881	if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4882		if (name)
4883			printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4884			       "SG feature.\n", name);
4885		features &= ~NETIF_F_TSO;
4886	}
4887
4888	if (features & NETIF_F_UFO) {
4889		if (!(features & NETIF_F_GEN_CSUM)) {
4890			if (name)
4891				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4892				       "since no NETIF_F_HW_CSUM feature.\n",
4893				       name);
4894			features &= ~NETIF_F_UFO;
4895		}
4896
4897		if (!(features & NETIF_F_SG)) {
4898			if (name)
4899				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4900				       "since no NETIF_F_SG feature.\n", name);
4901			features &= ~NETIF_F_UFO;
4902		}
4903	}
4904
4905	return features;
4906}
4907EXPORT_SYMBOL(netdev_fix_features);
4908
4909/**
4910 *	netif_stacked_transfer_operstate -	transfer operstate
4911 *	@rootdev: the root or lower level device to transfer state from
4912 *	@dev: the device to transfer operstate to
4913 *
4914 *	Transfer operational state from root to device. This is normally
4915 *	called when a stacking relationship exists between the root
4916 *	device and the device(a leaf device).
4917 */
4918void netif_stacked_transfer_operstate(const struct net_device *rootdev,
4919					struct net_device *dev)
4920{
4921	if (rootdev->operstate == IF_OPER_DORMANT)
4922		netif_dormant_on(dev);
4923	else
4924		netif_dormant_off(dev);
4925
4926	if (netif_carrier_ok(rootdev)) {
4927		if (!netif_carrier_ok(dev))
4928			netif_carrier_on(dev);
4929	} else {
4930		if (netif_carrier_ok(dev))
4931			netif_carrier_off(dev);
4932	}
4933}
4934EXPORT_SYMBOL(netif_stacked_transfer_operstate);
4935
4936/**
4937 *	register_netdevice	- register a network device
4938 *	@dev: device to register
4939 *
4940 *	Take a completed network device structure and add it to the kernel
4941 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4942 *	chain. 0 is returned on success. A negative errno code is returned
4943 *	on a failure to set up the device, or if the name is a duplicate.
4944 *
4945 *	Callers must hold the rtnl semaphore. You may want
4946 *	register_netdev() instead of this.
4947 *
4948 *	BUGS:
4949 *	The locking appears insufficient to guarantee two parallel registers
4950 *	will not get the same name.
4951 */
4952
int register_netdevice(struct net_device *dev)
{
	int ret;
	struct net *net = dev_net(dev);

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	might_sleep();

	/* When net_device's are persistent, this will be fatal. */
	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
	BUG_ON(!net);

	spin_lock_init(&dev->addr_list_lock);
	netdev_set_addr_lockdep_class(dev);
	netdev_init_queue_locks(dev);

	/*
	 * -1 marks iflink as "not set"; if it is still -1 once the
	 * ifindex has been chosen below, it defaults to the ifindex.
	 */
	dev->iflink = -1;

#ifdef CONFIG_RPS
	if (!dev->num_rx_queues) {
		/*
		 * Allocate a single RX queue if driver never called
		 * alloc_netdev_mq
		 */

		dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL);
		if (!dev->_rx) {
			ret = -ENOMEM;
			goto out;
		}

		/*
		 * NOTE(review): none of the error paths below free this
		 * allocation - confirm it is released elsewhere.
		 */
		dev->_rx->first = dev->_rx;
		atomic_set(&dev->_rx->count, 1);
		dev->num_rx_queues = 1;
	}
#endif
	/* Init, if this function is available */
	if (dev->netdev_ops->ndo_init) {
		ret = dev->netdev_ops->ndo_init(dev);
		if (ret) {
			/* A positive driver return is reported as -EIO. */
			if (ret > 0)
				ret = -EIO;
			goto out;
		}
	}

	/* From here on a failure must also run ndo_uninit (err_uninit). */
	ret = dev_get_valid_name(dev, dev->name, 0);
	if (ret)
		goto err_uninit;

	dev->ifindex = dev_new_index(net);
	if (dev->iflink == -1)
		dev->iflink = dev->ifindex;

	/* Fix illegal checksum combinations */
	if ((dev->features & NETIF_F_HW_CSUM) &&
	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
		printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
		       dev->name);
		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
	}

	if ((dev->features & NETIF_F_NO_CSUM) &&
	    (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
		printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
		       dev->name);
		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
	}

	/* Drop SG/TSO/UFO bits whose prerequisites are missing. */
	dev->features = netdev_fix_features(dev->features, dev->name);

	/* Enable software GSO if SG is supported. */
	if (dev->features & NETIF_F_SG)
		dev->features |= NETIF_F_GSO;

	/* Translate the notifier chain result into an errno. */
	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		goto err_uninit;

	ret = netdev_register_kobject(dev);
	if (ret)
		goto err_uninit;
	dev->reg_state = NETREG_REGISTERED;

	/*
	 *	Default initial state at registry is that the
	 *	device is present.
	 */

	set_bit(__LINK_STATE_PRESENT, &dev->state);

	dev_init_scheduler(dev);
	/* This reference is dropped again by rollback_registered_many(). */
	dev_hold(dev);
	list_netdevice(dev);

	/* Notify protocols, that a new device appeared. */
	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
	ret = notifier_to_errno(ret);
	if (ret) {
		/* A notifier vetoed the device: undo everything done above. */
		rollback_registered(dev);
		dev->reg_state = NETREG_UNREGISTERED;
	}
	/*
	 *	Prevent userspace races by waiting until the network
	 *	device is fully setup before sending notifications.
	 *
	 *	NOTE(review): this is also reached after the rollback
	 *	above, so RTM_NEWLINK can be sent for a device whose
	 *	registration failed - confirm this is intended.
	 */
	if (!dev->rtnl_link_ops ||
	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);

out:
	return ret;

err_uninit:
	/* Give the driver the chance to undo whatever ndo_init set up. */
	if (dev->netdev_ops->ndo_uninit)
		dev->netdev_ops->ndo_uninit(dev);
	goto out;
}
EXPORT_SYMBOL(register_netdevice);
5075
5076/**
5077 *	init_dummy_netdev	- init a dummy network device for NAPI
5078 *	@dev: device to init
5079 *
5080 *	This takes a network device structure and initialize the minimum
5081 *	amount of fields so it can be used to schedule NAPI polls without
5082 *	registering a full blown interface. This is to be used by drivers
5083 *	that need to tie several hardware interfaces to a single NAPI
5084 *	poll scheduler due to HW limitations.
5085 */
5086int init_dummy_netdev(struct net_device *dev)
5087{
5088	/* Clear everything. Note we don't initialize spinlocks
5089	 * are they aren't supposed to be taken by any of the
5090	 * NAPI code and this dummy netdev is supposed to be
5091	 * only ever used for NAPI polls
5092	 */
5093	memset(dev, 0, sizeof(struct net_device));
5094
5095	/* make sure we BUG if trying to hit standard
5096	 * register/unregister code path
5097	 */
5098	dev->reg_state = NETREG_DUMMY;
5099
5100	/* initialize the ref count */
5101	atomic_set(&dev->refcnt, 1);
5102
5103	/* NAPI wants this */
5104	INIT_LIST_HEAD(&dev->napi_list);
5105
5106	/* a dummy interface is started by default */
5107	set_bit(__LINK_STATE_PRESENT, &dev->state);
5108	set_bit(__LINK_STATE_START, &dev->state);
5109
5110	return 0;
5111}
5112EXPORT_SYMBOL_GPL(init_dummy_netdev);
5113
5114
5115/**
5116 *	register_netdev	- register a network device
5117 *	@dev: device to register
5118 *
5119 *	Take a completed network device structure and add it to the kernel
5120 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5121 *	chain. 0 is returned on success. A negative errno code is returned
5122 *	on a failure to set up the device, or if the name is a duplicate.
5123 *
5124 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
5125 *	and expands the device name if you passed a format string to
5126 *	alloc_netdev.
5127 */
5128int register_netdev(struct net_device *dev)
5129{
5130	int err;
5131
5132	rtnl_lock();
5133
5134	/*
5135	 * If the name is a format string the caller wants us to do a
5136	 * name allocation.
5137	 */
5138	if (strchr(dev->name, '%')) {
5139		err = dev_alloc_name(dev, dev->name);
5140		if (err < 0)
5141			goto out;
5142	}
5143
5144	err = register_netdevice(dev);
5145out:
5146	rtnl_unlock();
5147	return err;
5148}
5149EXPORT_SYMBOL(register_netdev);
5150
5151/*
5152 * netdev_wait_allrefs - wait until all references are gone.
5153 *
5154 * This is called when unregistering network devices.
5155 *
5156 * Any protocol or device that holds a reference should register
5157 * for netdevice notification, and cleanup and put back the
5158 * reference if they receive an UNREGISTER event.
5159 * We can get stuck here if buggy protocols don't correctly
5160 * call dev_put.
5161 */
5162static void netdev_wait_allrefs(struct net_device *dev)
5163{
5164	unsigned long rebroadcast_time, warning_time;
5165
5166	linkwatch_forget_dev(dev);
5167
5168	rebroadcast_time = warning_time = jiffies;
5169	while (atomic_read(&dev->refcnt) != 0) {
5170		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5171			rtnl_lock();
5172
5173			/* Rebroadcast unregister notification */
5174			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5175			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5176			 * should have already handle it the first time */
5177
5178			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5179				     &dev->state)) {
5180				/* We must not have linkwatch events
5181				 * pending on unregister. If this
5182				 * happens, we simply run the queue
5183				 * unscheduled, resulting in a noop
5184				 * for this device.
5185				 */
5186				linkwatch_run_queue();
5187			}
5188
5189			__rtnl_unlock();
5190
5191			rebroadcast_time = jiffies;
5192		}
5193
5194		msleep(250);
5195
5196		if (time_after(jiffies, warning_time + 10 * HZ)) {
5197			printk(KERN_EMERG "unregister_netdevice: "
5198			       "waiting for %s to become free. Usage "
5199			       "count = %d\n",
5200			       dev->name, atomic_read(&dev->refcnt));
5201			warning_time = jiffies;
5202		}
5203	}
5204}
5205
5206/* The sequence is:
5207 *
5208 *	rtnl_lock();
5209 *	...
5210 *	register_netdevice(x1);
5211 *	register_netdevice(x2);
5212 *	...
5213 *	unregister_netdevice(y1);
5214 *	unregister_netdevice(y2);
5215 *      ...
5216 *	rtnl_unlock();
5217 *	free_netdev(y1);
5218 *	free_netdev(y2);
5219 *
5220 * We are invoked by rtnl_unlock().
5221 * This allows us to deal with problems:
5222 * 1) We can delete sysfs objects which invoke hotplug
5223 *    without deadlocking with linkwatch via keventd.
5224 * 2) Since we run with the RTNL semaphore not held, we can sleep
5225 *    safely in order to wait for the netdev refcnt to drop to zero.
5226 *
5227 * We must not return until all unregister events added during
5228 * the interval the lock was held have been completed.
5229 */
5230void netdev_run_todo(void)
5231{
5232	struct list_head list;
5233
5234	/* Snapshot list, allow later requests */
5235	list_replace_init(&net_todo_list, &list);
5236
5237	__rtnl_unlock();
5238
5239	while (!list_empty(&list)) {
5240		struct net_device *dev
5241			= list_first_entry(&list, struct net_device, todo_list);
5242		list_del(&dev->todo_list);
5243
5244		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5245			printk(KERN_ERR "network todo '%s' but state %d\n",
5246			       dev->name, dev->reg_state);
5247			dump_stack();
5248			continue;
5249		}
5250
5251		dev->reg_state = NETREG_UNREGISTERED;
5252
5253		on_each_cpu(flush_backlog, dev, 1);
5254
5255		netdev_wait_allrefs(dev);
5256
5257		/* paranoia */
5258		BUG_ON(atomic_read(&dev->refcnt));
5259		WARN_ON(dev->ip_ptr);
5260		WARN_ON(dev->ip6_ptr);
5261		WARN_ON(dev->dn_ptr);
5262
5263		if (dev->destructor)
5264			dev->destructor(dev);
5265
5266		/* Free network device */
5267		kobject_put(&dev->dev.kobj);
5268	}
5269}
5270
5271/**
5272 *	dev_txq_stats_fold - fold tx_queues stats
5273 *	@dev: device to get statistics from
5274 *	@stats: struct net_device_stats to hold results
5275 */
5276void dev_txq_stats_fold(const struct net_device *dev,
5277			struct net_device_stats *stats)
5278{
5279	unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5280	unsigned int i;
5281	struct netdev_queue *txq;
5282
5283	for (i = 0; i < dev->num_tx_queues; i++) {
5284		txq = netdev_get_tx_queue(dev, i);
5285		tx_bytes   += txq->tx_bytes;
5286		tx_packets += txq->tx_packets;
5287		tx_dropped += txq->tx_dropped;
5288	}
5289	if (tx_bytes || tx_packets || tx_dropped) {
5290		stats->tx_bytes   = tx_bytes;
5291		stats->tx_packets = tx_packets;
5292		stats->tx_dropped = tx_dropped;
5293	}
5294}
5295EXPORT_SYMBOL(dev_txq_stats_fold);
5296
5297/**
5298 *	dev_get_stats	- get network device statistics
5299 *	@dev: device to get statistics from
5300 *
5301 *	Get network statistics from device. The device driver may provide
5302 *	its own method by setting dev->netdev_ops->get_stats; otherwise
5303 *	the internal statistics structure is used.
5304 */
5305const struct net_device_stats *dev_get_stats(struct net_device *dev)
5306{
5307	const struct net_device_ops *ops = dev->netdev_ops;
5308
5309	if (ops->ndo_get_stats)
5310		return ops->ndo_get_stats(dev);
5311
5312	dev_txq_stats_fold(dev, &dev->stats);
5313	return &dev->stats;
5314}
5315EXPORT_SYMBOL(dev_get_stats);
5316
/*
 * Per-queue callback: point @queue back at the device that owns it.
 * The third argument is not used.
 */
static void netdev_init_one_queue(struct net_device *dev,
				  struct netdev_queue *queue,
				  void *_unused)
{
	queue->dev = dev;
}
5323
5324static void netdev_init_queues(struct net_device *dev)
5325{
5326	netdev_init_one_queue(dev, &dev->rx_queue, NULL);
5327	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5328	spin_lock_init(&dev->tx_global_lock);
5329}
5330
5331/**
5332 *	alloc_netdev_mq - allocate network device
5333 *	@sizeof_priv:	size of private data to allocate space for
5334 *	@name:		device name format string
5335 *	@setup:		callback to initialize device
5336 *	@queue_count:	the number of subqueues to allocate
5337 *
5338 *	Allocates a struct net_device with private data area for driver use
5339 *	and performs basic initialization.  Also allocates subquue structs
5340 *	for each queue on the device at the end of the netdevice.
5341 */
5342struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5343		void (*setup)(struct net_device *), unsigned int queue_count)
5344{
5345	struct netdev_queue *tx;
5346	struct net_device *dev;
5347	size_t alloc_size;
5348	struct net_device *p;
5349#ifdef CONFIG_RPS
5350	struct netdev_rx_queue *rx;
5351	int i;
5352#endif
5353
5354	BUG_ON(strlen(name) >= sizeof(dev->name));
5355
5356	alloc_size = sizeof(struct net_device);
5357	if (sizeof_priv) {
5358		/* ensure 32-byte alignment of private area */
5359		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5360		alloc_size += sizeof_priv;
5361	}
5362	/* ensure 32-byte alignment of whole construct */
5363	alloc_size += NETDEV_ALIGN - 1;
5364
5365	p = kzalloc(alloc_size, GFP_KERNEL);
5366	if (!p) {
5367		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5368		return NULL;
5369	}
5370
5371	tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
5372	if (!tx) {
5373		printk(KERN_ERR "alloc_netdev: Unable to allocate "
5374		       "tx qdiscs.\n");
5375		goto free_p;
5376	}
5377
5378#ifdef CONFIG_RPS
5379	rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5380	if (!rx) {
5381		printk(KERN_ERR "alloc_netdev: Unable to allocate "
5382		       "rx queues.\n");
5383		goto free_tx;
5384	}
5385
5386	atomic_set(&rx->count, queue_count);
5387
5388	/*
5389	 * Set a pointer to first element in the array which holds the
5390	 * reference count.
5391	 */
5392	for (i = 0; i < queue_count; i++)
5393		rx[i].first = rx;
5394#endif
5395
5396	dev = PTR_ALIGN(p, NETDEV_ALIGN);
5397	dev->padded = (char *)dev - (char *)p;
5398
5399	if (dev_addr_init(dev))
5400		goto free_rx;
5401
5402	dev_mc_init(dev);
5403	dev_uc_init(dev);
5404
5405	dev_net_set(dev, &init_net);
5406
5407	dev->_tx = tx;
5408	dev->num_tx_queues = queue_count;
5409	dev->real_num_tx_queues = queue_count;
5410
5411#ifdef CONFIG_RPS
5412	dev->_rx = rx;
5413	dev->num_rx_queues = queue_count;
5414#endif
5415
5416	dev->gso_max_size = GSO_MAX_SIZE;
5417
5418	netdev_init_queues(dev);
5419
5420	INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5421	dev->ethtool_ntuple_list.count = 0;
5422	INIT_LIST_HEAD(&dev->napi_list);
5423	INIT_LIST_HEAD(&dev->unreg_list);
5424	INIT_LIST_HEAD(&dev->link_watch_list);
5425	dev->priv_flags = IFF_XMIT_DST_RELEASE;
5426	setup(dev);
5427	strcpy(dev->name, name);
5428	return dev;
5429
5430free_rx:
5431#ifdef CONFIG_RPS
5432	kfree(rx);
5433free_tx:
5434#endif
5435	kfree(tx);
5436free_p:
5437	kfree(p);
5438	return NULL;
5439}
5440EXPORT_SYMBOL(alloc_netdev_mq);
5441
5442/**
5443 *	free_netdev - free network device
5444 *	@dev: device
5445 *
5446 *	This function does the last stage of destroying an allocated device
5447 * 	interface. The reference to the device object is released.
5448 *	If this is the last reference then it will be freed.
5449 */
5450void free_netdev(struct net_device *dev)
5451{
5452	struct napi_struct *p, *n;
5453
5454	release_net(dev_net(dev));
5455
5456	kfree(dev->_tx);
5457
5458	/* Flush device addresses */
5459	dev_addr_flush(dev);
5460
5461	/* Clear ethtool n-tuple list */
5462	ethtool_ntuple_flush(dev);
5463
5464	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5465		netif_napi_del(p);
5466
5467	/*  Compatibility with error handling in drivers */
5468	if (dev->reg_state == NETREG_UNINITIALIZED) {
5469		kfree((char *)dev - dev->padded);
5470		return;
5471	}
5472
5473	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5474	dev->reg_state = NETREG_RELEASED;
5475
5476	/* will free via device release */
5477	put_device(&dev->dev);
5478}
5479EXPORT_SYMBOL(free_netdev);
5480
5481/**
5482 *	synchronize_net -  Synchronize with packet receive processing
5483 *
5484 *	Wait for packets currently being received to be done.
5485 *	Does not block later packets from starting.
5486 */
void synchronize_net(void)
{
	/* Flag callers that are in a context where sleeping is not allowed. */
	might_sleep();
	/* The actual wait is delegated to synchronize_rcu(). */
	synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
5493
5494/**
5495 *	unregister_netdevice_queue - remove device from the kernel
5496 *	@dev: device
5497 *	@head: list
5498 *
5499 *	This function shuts down a device interface and removes it
5500 *	from the kernel tables.
5501 *	If head not NULL, device is queued to be unregistered later.
5502 *
5503 *	Callers must hold the rtnl semaphore.  You may want
5504 *	unregister_netdev() instead of this.
5505 */
5506
5507void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5508{
5509	ASSERT_RTNL();
5510
5511	if (head) {
5512		list_move_tail(&dev->unreg_list, head);
5513	} else {
5514		rollback_registered(dev);
5515		/* Finish processing unregister after unlock */
5516		net_set_todo(dev);
5517	}
5518}
5519EXPORT_SYMBOL(unregister_netdevice_queue);
5520
5521/**
5522 *	unregister_netdevice_many - unregister many devices
5523 *	@head: list of devices
5524 */
5525void unregister_netdevice_many(struct list_head *head)
5526{
5527	struct net_device *dev;
5528
5529	if (!list_empty(head)) {
5530		rollback_registered_many(head);
5531		list_for_each_entry(dev, head, unreg_list)
5532			net_set_todo(dev);
5533	}
5534}
5535EXPORT_SYMBOL(unregister_netdevice_many);
5536
5537/**
5538 *	unregister_netdev - remove device from the kernel
5539 *	@dev: device
5540 *
5541 *	This function shuts down a device interface and removes it
5542 *	from the kernel tables.
5543 *
5544 *	This is just a wrapper for unregister_netdevice that takes
5545 *	the rtnl semaphore.  In general you want to use this and not
5546 *	unregister_netdevice.
5547 */
void unregister_netdev(struct net_device *dev)
{
	/* unregister_netdevice() must run with the rtnl semaphore held. */
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
5555
5556/**
5557 *	dev_change_net_namespace - move device to different nethost namespace
5558 *	@dev: device
5559 *	@net: network namespace
5560 *	@pat: If not NULL name pattern to try if the current device name
5561 *	      is already taken in the destination network namespace.
5562 *
5563 *	This function shuts down a device interface and moves it
5564 *	to a new network namespace. On success 0 is returned, on
5565 *	a failure a netagive errno code is returned.
5566 *
5567 *	Callers must hold the rtnl semaphore.
5568 */
5569
5570int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5571{
5572	int err;
5573
5574	ASSERT_RTNL();
5575
5576	/* Don't allow namespace local devices to be moved. */
5577	err = -EINVAL;
5578	if (dev->features & NETIF_F_NETNS_LOCAL)
5579		goto out;
5580
5581	/* Ensure the device has been registrered */
5582	err = -EINVAL;
5583	if (dev->reg_state != NETREG_REGISTERED)
5584		goto out;
5585
5586	/* Get out if there is nothing todo */
5587	err = 0;
5588	if (net_eq(dev_net(dev), net))
5589		goto out;
5590
5591	/* Pick the destination device name, and ensure
5592	 * we can use it in the destination network namespace.
5593	 */
5594	err = -EEXIST;
5595	if (__dev_get_by_name(net, dev->name)) {
5596		/* We get here if we can't use the current device name */
5597		if (!pat)
5598			goto out;
5599		if (dev_get_valid_name(dev, pat, 1))
5600			goto out;
5601	}
5602
5603	/*
5604	 * And now a mini version of register_netdevice unregister_netdevice.
5605	 */
5606
5607	/* If device is running close it first. */
5608	dev_close(dev);
5609
5610	/* And unlink it from device chain */
5611	err = -ENODEV;
5612	unlist_netdevice(dev);
5613
5614	synchronize_net();
5615
5616	/* Shutdown queueing discipline. */
5617	dev_shutdown(dev);
5618
5619	/* Notify protocols, that we are about to destroy
5620	   this device. They should clean all the things.
5621	*/
5622	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5623	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5624
5625	/*
5626	 *	Flush the unicast and multicast chains
5627	 */
5628	dev_uc_flush(dev);
5629	dev_mc_flush(dev);
5630
5631	/* Actually switch the network namespace */
5632	dev_net_set(dev, net);
5633
5634	/* If there is an ifindex conflict assign a new one */
5635	if (__dev_get_by_index(net, dev->ifindex)) {
5636		int iflink = (dev->iflink == dev->ifindex);
5637		dev->ifindex = dev_new_index(net);
5638		if (iflink)
5639			dev->iflink = dev->ifindex;
5640	}
5641
5642	/* Fixup kobjects */
5643	err = device_rename(&dev->dev, dev->name);
5644	WARN_ON(err);
5645
5646	/* Add the device back in the hashes */
5647	list_netdevice(dev);
5648
5649	/* Notify protocols, that a new device appeared. */
5650	call_netdevice_notifiers(NETDEV_REGISTER, dev);
5651
5652	/*
5653	 *	Prevent userspace races by waiting until the network
5654	 *	device is fully setup before sending notifications.
5655	 */
5656	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5657
5658	synchronize_net();
5659	err = 0;
5660out:
5661	return err;
5662}
5663EXPORT_SYMBOL_GPL(dev_change_net_namespace);
5664
5665static int dev_cpu_callback(struct notifier_block *nfb,
5666			    unsigned long action,
5667			    void *ocpu)
5668{
5669	struct sk_buff **list_skb;
5670	struct sk_buff *skb;
5671	unsigned int cpu, oldcpu = (unsigned long)ocpu;
5672	struct softnet_data *sd, *oldsd;
5673
5674	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5675		return NOTIFY_OK;
5676
5677	local_irq_disable();
5678	cpu = smp_processor_id();
5679	sd = &per_cpu(softnet_data, cpu);
5680	oldsd = &per_cpu(softnet_data, oldcpu);
5681
5682	/* Find end of our completion_queue. */
5683	list_skb = &sd->completion_queue;
5684	while (*list_skb)
5685		list_skb = &(*list_skb)->next;
5686	/* Append completion queue from offline CPU. */
5687	*list_skb = oldsd->completion_queue;
5688	oldsd->completion_queue = NULL;
5689
5690	/* Append output queue from offline CPU. */
5691	if (oldsd->output_queue) {
5692		*sd->output_queue_tailp = oldsd->output_queue;
5693		sd->output_queue_tailp = oldsd->output_queue_tailp;
5694		oldsd->output_queue = NULL;
5695		oldsd->output_queue_tailp = &oldsd->output_queue;
5696	}
5697
5698	raise_softirq_irqoff(NET_TX_SOFTIRQ);
5699	local_irq_enable();
5700
5701	/* Process offline CPU's input_pkt_queue */
5702	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
5703		netif_rx(skb);
5704		input_queue_head_incr(oldsd);
5705	}
5706	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
5707		netif_rx(skb);
5708		input_queue_head_incr(oldsd);
5709	}
5710
5711	return NOTIFY_OK;
5712}
5713
5714
5715/**
5716 *	netdev_increment_features - increment feature set by one
5717 *	@all: current feature set
5718 *	@one: new feature set
5719 *	@mask: mask feature set
5720 *
5721 *	Computes a new feature set after adding a device with feature set
5722 *	@one to the master device with current feature set @all.  Will not
5723 *	enable anything that is off in @mask. Returns the new feature set.
5724 */
5725unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5726					unsigned long mask)
5727{
5728	/* If device needs checksumming, downgrade to it. */
5729	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5730		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5731	else if (mask & NETIF_F_ALL_CSUM) {
5732		/* If one device supports v4/v6 checksumming, set for all. */
5733		if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5734		    !(all & NETIF_F_GEN_CSUM)) {
5735			all &= ~NETIF_F_ALL_CSUM;
5736			all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5737		}
5738
5739		/* If one device supports hw checksumming, set for all. */
5740		if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5741			all &= ~NETIF_F_ALL_CSUM;
5742			all |= NETIF_F_HW_CSUM;
5743		}
5744	}
5745
5746	one |= NETIF_F_ALL_CSUM;
5747
5748	one |= all & NETIF_F_ONE_FOR_ALL;
5749	all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
5750	all |= one & mask & NETIF_F_ONE_FOR_ALL;
5751
5752	return all;
5753}
5754EXPORT_SYMBOL(netdev_increment_features);
5755
5756static struct hlist_head *netdev_create_hash(void)
5757{
5758	int i;
5759	struct hlist_head *hash;
5760
5761	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5762	if (hash != NULL)
5763		for (i = 0; i < NETDEV_HASHENTRIES; i++)
5764			INIT_HLIST_HEAD(&hash[i]);
5765
5766	return hash;
5767}
5768
5769/* Initialize per network namespace state */
5770static int __net_init netdev_init(struct net *net)
5771{
5772	INIT_LIST_HEAD(&net->dev_base_head);
5773
5774	net->dev_name_head = netdev_create_hash();
5775	if (net->dev_name_head == NULL)
5776		goto err_name;
5777
5778	net->dev_index_head = netdev_create_hash();
5779	if (net->dev_index_head == NULL)
5780		goto err_idx;
5781
5782	return 0;
5783
5784err_idx:
5785	kfree(net->dev_name_head);
5786err_name:
5787	return -ENOMEM;
5788}
5789
5790/**
5791 *	netdev_drivername - network driver for the device
5792 *	@dev: network device
5793 *	@buffer: buffer for resulting name
5794 *	@len: size of buffer
5795 *
5796 *	Determine network driver for device.
5797 */
5798char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5799{
5800	const struct device_driver *driver;
5801	const struct device *parent;
5802
5803	if (len <= 0 || !buffer)
5804		return buffer;
5805	buffer[0] = 0;
5806
5807	parent = dev->dev.parent;
5808
5809	if (!parent)
5810		return buffer;
5811
5812	driver = parent->driver;
5813	if (driver && driver->name)
5814		strlcpy(buffer, driver->name, len);
5815	return buffer;
5816}
5817
5818static void __net_exit netdev_exit(struct net *net)
5819{
5820	kfree(net->dev_name_head);
5821	kfree(net->dev_index_head);
5822}
5823
5824static struct pernet_operations __net_initdata netdev_net_ops = {
5825	.init = netdev_init,
5826	.exit = netdev_exit,
5827};
5828
5829static void __net_exit default_device_exit(struct net *net)
5830{
5831	struct net_device *dev, *aux;
5832	/*
5833	 * Push all migratable network devices back to the
5834	 * initial network namespace
5835	 */
5836	rtnl_lock();
5837	for_each_netdev_safe(net, dev, aux) {
5838		int err;
5839		char fb_name[IFNAMSIZ];
5840
5841		/* Ignore unmoveable devices (i.e. loopback) */
5842		if (dev->features & NETIF_F_NETNS_LOCAL)
5843			continue;
5844
5845		/* Leave virtual devices for the generic cleanup */
5846		if (dev->rtnl_link_ops)
5847			continue;
5848
5849		/* Push remaing network devices to init_net */
5850		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5851		err = dev_change_net_namespace(dev, &init_net, fb_name);
5852		if (err) {
5853			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
5854				__func__, dev->name, err);
5855			BUG();
5856		}
5857	}
5858	rtnl_unlock();
5859}
5860
5861static void __net_exit default_device_exit_batch(struct list_head *net_list)
5862{
5863	/* At exit all network devices most be removed from a network
5864	 * namespace.  Do this in the reverse order of registeration.
5865	 * Do this across as many network namespaces as possible to
5866	 * improve batching efficiency.
5867	 */
5868	struct net_device *dev;
5869	struct net *net;
5870	LIST_HEAD(dev_kill_list);
5871
5872	rtnl_lock();
5873	list_for_each_entry(net, net_list, exit_list) {
5874		for_each_netdev_reverse(net, dev) {
5875			if (dev->rtnl_link_ops)
5876				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
5877			else
5878				unregister_netdevice_queue(dev, &dev_kill_list);
5879		}
5880	}
5881	unregister_netdevice_many(&dev_kill_list);
5882	rtnl_unlock();
5883}
5884
5885static struct pernet_operations __net_initdata default_device_ops = {
5886	.exit = default_device_exit,
5887	.exit_batch = default_device_exit_batch,
5888};
5889
5890/*
5891 *	Initialize the DEV module. At boot time this walks the device list and
5892 *	unhooks any devices that fail to initialise (normally hardware not
5893 *	present) and leaves us with a valid list of present and active devices.
5894 *
5895 */
5896
5897/*
5898 *       This is called single threaded during boot, so no need
5899 *       to take the rtnl semaphore.
5900 */
5901static int __init net_dev_init(void)
5902{
5903	int i, rc = -ENOMEM;
5904
5905	BUG_ON(!dev_boot_phase);
5906
5907	if (dev_proc_init())
5908		goto out;
5909
5910	if (netdev_kobject_init())
5911		goto out;
5912
5913	INIT_LIST_HEAD(&ptype_all);
5914	for (i = 0; i < PTYPE_HASH_SIZE; i++)
5915		INIT_LIST_HEAD(&ptype_base[i]);
5916
5917	if (register_pernet_subsys(&netdev_net_ops))
5918		goto out;
5919
5920	/*
5921	 *	Initialise the packet receive queues.
5922	 */
5923
5924	for_each_possible_cpu(i) {
5925		struct softnet_data *sd = &per_cpu(softnet_data, i);
5926
5927		memset(sd, 0, sizeof(*sd));
5928		skb_queue_head_init(&sd->input_pkt_queue);
5929		skb_queue_head_init(&sd->process_queue);
5930		sd->completion_queue = NULL;
5931		INIT_LIST_HEAD(&sd->poll_list);
5932		sd->output_queue = NULL;
5933		sd->output_queue_tailp = &sd->output_queue;
5934#ifdef CONFIG_RPS
5935		sd->csd.func = rps_trigger_softirq;
5936		sd->csd.info = sd;
5937		sd->csd.flags = 0;
5938		sd->cpu = i;
5939#endif
5940
5941		sd->backlog.poll = process_backlog;
5942		sd->backlog.weight = weight_p;
5943		sd->backlog.gro_list = NULL;
5944		sd->backlog.gro_count = 0;
5945	}
5946
5947	dev_boot_phase = 0;
5948
5949	/* The loopback device is special if any other network devices
5950	 * is present in a network namespace the loopback device must
5951	 * be present. Since we now dynamically allocate and free the
5952	 * loopback device ensure this invariant is maintained by
5953	 * keeping the loopback device as the first device on the
5954	 * list of network devices.  Ensuring the loopback devices
5955	 * is the first device that appears and the last network device
5956	 * that disappears.
5957	 */
5958	if (register_pernet_device(&loopback_net_ops))
5959		goto out;
5960
5961	if (register_pernet_device(&default_device_ops))
5962		goto out;
5963
5964	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
5965	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
5966
5967	hotcpu_notifier(dev_cpu_callback, 0);
5968	dst_init();
5969	dev_mcast_init();
5970	rc = 0;
5971out:
5972	return rc;
5973}
5974
5975subsys_initcall(net_dev_init);
5976
5977static int __init initialize_hashrnd(void)
5978{
5979	get_random_bytes(&hashrnd, sizeof(hashrnd));
5980	return 0;
5981}
5982
5983late_initcall_sync(initialize_hashrnd);
5984