net/core/dev.c at v2.6.38-rc3

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / core / dev.c
at v2.6.38-rc3 6313 lines 156 kB view raw
wrap content
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <asm/system.h>
  77#include <linux/bitops.h>
  78#include <linux/capability.h>
  79#include <linux/cpu.h>
  80#include <linux/types.h>
  81#include <linux/kernel.h>
  82#include <linux/hash.h>
  83#include <linux/slab.h>
  84#include <linux/sched.h>
  85#include <linux/mutex.h>
  86#include <linux/string.h>
  87#include <linux/mm.h>
  88#include <linux/socket.h>
  89#include <linux/sockios.h>
  90#include <linux/errno.h>
  91#include <linux/interrupt.h>
  92#include <linux/if_ether.h>
  93#include <linux/netdevice.h>
  94#include <linux/etherdevice.h>
  95#include <linux/ethtool.h>
  96#include <linux/notifier.h>
  97#include <linux/skbuff.h>
  98#include <net/net_namespace.h>
  99#include <net/sock.h>
 100#include <linux/rtnetlink.h>
 101#include <linux/proc_fs.h>
 102#include <linux/seq_file.h>
 103#include <linux/stat.h>
 104#include <net/dst.h>
 105#include <net/pkt_sched.h>
 106#include <net/checksum.h>
 107#include <net/xfrm.h>
 108#include <linux/highmem.h>
 109#include <linux/init.h>
 110#include <linux/kmod.h>
 111#include <linux/module.h>
 112#include <linux/netpoll.h>
 113#include <linux/rcupdate.h>
 114#include <linux/delay.h>
 115#include <net/wext.h>
 116#include <net/iw_handler.h>
 117#include <asm/current.h>
 118#include <linux/audit.h>
 119#include <linux/dmaengine.h>
 120#include <linux/err.h>
 121#include <linux/ctype.h>
 122#include <linux/if_arp.h>
 123#include <linux/if_vlan.h>
 124#include <linux/ip.h>
 125#include <net/ip.h>
 126#include <linux/ipv6.h>
 127#include <linux/in.h>
 128#include <linux/jhash.h>
 129#include <linux/random.h>
 130#include <trace/events/napi.h>
 131#include <trace/events/net.h>
 132#include <trace/events/skb.h>
 133#include <linux/pci.h>
 134#include <linux/inetdevice.h>
 135
 136#include "net-sysfs.h"
 137
 138/* Instead of increasing this, you should create a hash table. */
 139#define MAX_GRO_SKBS 8
 140
 141/* This should be increased if a protocol with a bigger head is added. */
 142#define GRO_MAX_HEAD (MAX_HEADER + 128)
 143
 144/*
 145 *	The list of packet types we will receive (as opposed to discard)
 146 *	and the routines to invoke.
 147 *
 148 *	Why 16. Because with 16 the only overlap we get on a hash of the
 149 *	low nibble of the protocol value is RARP/SNAP/X.25.
 150 *
 151 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 152 *             sure which should go first, but I bet it won't make much
 153 *             difference if we are running VLANs.  The good news is that
 154 *             this protocol won't be in the list unless compiled in, so
 155 *             the average user (w/out VLANs) will not be adversely affected.
 156 *             --BLG
 157 *
 158 *		0800	IP
 159 *		8100    802.1Q VLAN
 160 *		0001	802.3
 161 *		0002	AX.25
 162 *		0004	802.2
 163 *		8035	RARP
 164 *		0005	SNAP
 165 *		0805	X.25
 166 *		0806	ARP
 167 *		8137	IPX
 168 *		0009	Localtalk
 169 *		86DD	IPv6
 170 */
 171
 172#define PTYPE_HASH_SIZE	(16)
 173#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
 174
 175static DEFINE_SPINLOCK(ptype_lock);
 176static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 177static struct list_head ptype_all __read_mostly;	/* Taps */
 178
 179/*
 180 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 181 * semaphore.
 182 *
 183 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 184 *
 185 * Writers must hold the rtnl semaphore while they loop through the
 186 * dev_base_head list, and hold dev_base_lock for writing when they do the
 187 * actual updates.  This allows pure readers to access the list even
 188 * while a writer is preparing to update it.
 189 *
 190 * To put it another way, dev_base_lock is held for writing only to
 191 * protect against pure readers; the rtnl semaphore provides the
 192 * protection against other writers.
 193 *
 194 * See, for example usages, register_netdevice() and
 195 * unregister_netdevice(), which must be called with the rtnl
 196 * semaphore held.
 197 */
 198DEFINE_RWLOCK(dev_base_lock);
 199EXPORT_SYMBOL(dev_base_lock);
 200
 201static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 202{
 203	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 204	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 205}
 206
 207static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 208{
 209	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 210}
 211
 212static inline void rps_lock(struct softnet_data *sd)
 213{
 214#ifdef CONFIG_RPS
 215	spin_lock(&sd->input_pkt_queue.lock);
 216#endif
 217}
 218
 219static inline void rps_unlock(struct softnet_data *sd)
 220{
 221#ifdef CONFIG_RPS
 222	spin_unlock(&sd->input_pkt_queue.lock);
 223#endif
 224}
 225
 226/* Device list insertion */
 227static int list_netdevice(struct net_device *dev)
 228{
 229	struct net *net = dev_net(dev);
 230
 231	ASSERT_RTNL();
 232
 233	write_lock_bh(&dev_base_lock);
 234	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 235	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 236	hlist_add_head_rcu(&dev->index_hlist,
 237			   dev_index_hash(net, dev->ifindex));
 238	write_unlock_bh(&dev_base_lock);
 239	return 0;
 240}
 241
 242/* Device list removal
 243 * caller must respect a RCU grace period before freeing/reusing dev
 244 */
 245static void unlist_netdevice(struct net_device *dev)
 246{
 247	ASSERT_RTNL();
 248
 249	/* Unlink dev from the device chain */
 250	write_lock_bh(&dev_base_lock);
 251	list_del_rcu(&dev->dev_list);
 252	hlist_del_rcu(&dev->name_hlist);
 253	hlist_del_rcu(&dev->index_hlist);
 254	write_unlock_bh(&dev_base_lock);
 255}
 256
 257/*
 258 *	Our notifier list
 259 */
 260
 261static RAW_NOTIFIER_HEAD(netdev_chain);
 262
 263/*
 264 *	Device drivers call our routines to queue packets here. We empty the
 265 *	queue in the local softnet handler.
 266 */
 267
 268DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 269EXPORT_PER_CPU_SYMBOL(softnet_data);
 270
 271#ifdef CONFIG_LOCKDEP
 272/*
 273 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 274 * according to dev->type
 275 */
/*
 * Device types that get a lockdep class of their own.  The position of a
 * type in this table is the index used for netdev_lock_name[] and for the
 * two key arrays below, so all four arrays must stay in the same order.
 * The last entry doubles as the default for types that are not listed
 * (see netdev_lock_pos()).
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
	 ARPHRD_VOID, ARPHRD_NONE};

/*
 * Lock class names, one per entry of netdev_lock_type[] and in the same
 * order.  Adding a type above requires adding its name here as well.
 */
static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
	 "_xmit_VOID", "_xmit_NONE"};

/* One lockdep key per table entry, for the xmit lock and the address list lock. */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 314
 315static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 316{
 317	int i;
 318
 319	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 320		if (netdev_lock_type[i] == dev_type)
 321			return i;
 322	/* the last key is used by default */
 323	return ARRAY_SIZE(netdev_lock_type) - 1;
 324}
 325
 326static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 327						 unsigned short dev_type)
 328{
 329	int i;
 330
 331	i = netdev_lock_pos(dev_type);
 332	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 333				   netdev_lock_name[i]);
 334}
 335
 336static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 337{
 338	int i;
 339
 340	i = netdev_lock_pos(dev->type);
 341	lockdep_set_class_and_name(&dev->addr_list_lock,
 342				   &netdev_addr_lock_key[i],
 343				   netdev_lock_name[i]);
 344}
 345#else
 346static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 347						 unsigned short dev_type)
 348{
 349}
 350static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 351{
 352}
 353#endif
 354
 355/*******************************************************************************
 356
 357		Protocol management and registration routines
 358
 359*******************************************************************************/
 360
 361/*
 362 *	Add a protocol ID to the list. Now that the input handler is
 363 *	smarter we can dispense with all the messy stuff that used to be
 364 *	here.
 365 *
 366 *	BEWARE!!! Protocol handlers, mangling input packets,
 367 *	MUST BE last in hash buckets and checking protocol handlers
 368 *	MUST start from promiscuous ptype_all chain in net_bh.
 369 *	It is true now, do not change it.
 370 *	Explanation follows: if protocol handler, mangling packet, will
 371 *	be the first on list, it is not able to sense, that packet
 372 *	is cloned and should be copied-on-write, so that it will
 373 *	change it and subsequent readers will get broken packet.
 374 *							--ANK (980803)
 375 */
 376
 377static inline struct list_head *ptype_head(const struct packet_type *pt)
 378{
 379	if (pt->type == htons(ETH_P_ALL))
 380		return &ptype_all;
 381	else
 382		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 383}
 384
 385/**
 386 *	dev_add_pack - add packet handler
 387 *	@pt: packet type declaration
 388 *
 389 *	Add a protocol handler to the networking stack. The passed &packet_type
 390 *	is linked into kernel lists and may not be freed until it has been
 391 *	removed from the kernel lists.
 392 *
 393 *	This call does not sleep therefore it can not
 394 *	guarantee all CPU's that are in middle of receiving packets
 395 *	will see the new packet type (until the next received packet).
 396 */
 397
 398void dev_add_pack(struct packet_type *pt)
 399{
 400	struct list_head *head = ptype_head(pt);
 401
 402	spin_lock(&ptype_lock);
 403	list_add_rcu(&pt->list, head);
 404	spin_unlock(&ptype_lock);
 405}
 406EXPORT_SYMBOL(dev_add_pack);
 407
 408/**
 409 *	__dev_remove_pack	 - remove packet handler
 410 *	@pt: packet type declaration
 411 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists, but this function does not wait for readers.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed or reused until after all the CPUs have gone
 *	through a quiescent state.
 420 */
 421void __dev_remove_pack(struct packet_type *pt)
 422{
 423	struct list_head *head = ptype_head(pt);
 424	struct packet_type *pt1;
 425
 426	spin_lock(&ptype_lock);
 427
 428	list_for_each_entry(pt1, head, list) {
 429		if (pt == pt1) {
 430			list_del_rcu(&pt->list);
 431			goto out;
 432		}
 433	}
 434
 435	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 436out:
 437	spin_unlock(&ptype_lock);
 438}
 439EXPORT_SYMBOL(__dev_remove_pack);
 440
 441/**
 442 *	dev_remove_pack	 - remove packet handler
 443 *	@pt: packet type declaration
 444 *
 445 *	Remove a protocol handler that was previously added to the kernel
 446 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 447 *	from the kernel lists and can be freed or reused once this function
 448 *	returns.
 449 *
 450 *	This call sleeps to guarantee that no CPU is looking at the packet
 451 *	type after return.
 452 */
 453void dev_remove_pack(struct packet_type *pt)
 454{
 455	__dev_remove_pack(pt);
 456
 457	synchronize_net();
 458}
 459EXPORT_SYMBOL(dev_remove_pack);
 460
 461/******************************************************************************
 462
 463		      Device Boot-time Settings Routines
 464
 465*******************************************************************************/
 466
 467/* Boot time configuration table */
 468static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 469
 470/**
 471 *	netdev_boot_setup_add	- add new setup entry
 472 *	@name: name of the device
 473 *	@map: configured settings for the device
 474 *
 475 *	Adds new setup entry to the dev_boot_setup list.  The function
 476 *	returns 0 on error and 1 on success.  This is a generic routine to
 477 *	all netdevices.
 478 */
 479static int netdev_boot_setup_add(char *name, struct ifmap *map)
 480{
 481	struct netdev_boot_setup *s;
 482	int i;
 483
 484	s = dev_boot_setup;
 485	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 486		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 487			memset(s[i].name, 0, sizeof(s[i].name));
 488			strlcpy(s[i].name, name, IFNAMSIZ);
 489			memcpy(&s[i].map, map, sizeof(s[i].map));
 490			break;
 491		}
 492	}
 493
 494	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 495}
 496
 497/**
 498 *	netdev_boot_setup_check	- check boot time settings
 499 *	@dev: the netdevice
 500 *
 501 * 	Check boot time settings for the device.
 502 *	The found settings are set for the device to be used
 503 *	later in the device probing.
 504 *	Returns 0 if no settings found, 1 if they are.
 505 */
 506int netdev_boot_setup_check(struct net_device *dev)
 507{
 508	struct netdev_boot_setup *s = dev_boot_setup;
 509	int i;
 510
 511	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 512		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 513		    !strcmp(dev->name, s[i].name)) {
 514			dev->irq 	= s[i].map.irq;
 515			dev->base_addr 	= s[i].map.base_addr;
 516			dev->mem_start 	= s[i].map.mem_start;
 517			dev->mem_end 	= s[i].map.mem_end;
 518			return 1;
 519		}
 520	}
 521	return 0;
 522}
 523EXPORT_SYMBOL(netdev_boot_setup_check);
 524
 525
 526/**
 527 *	netdev_boot_base	- get address from boot time settings
 528 *	@prefix: prefix for network device
 529 *	@unit: id for network device
 530 *
 * 	Check boot time settings for the base address of the device
 *	named "<prefix><unit>".  Returns 1 if a device with that name is
 *	already registered (so that it is not probed again), the base
 *	address from the boot settings if an entry is found, or 0 otherwise.
 535 */
 536unsigned long netdev_boot_base(const char *prefix, int unit)
 537{
 538	const struct netdev_boot_setup *s = dev_boot_setup;
 539	char name[IFNAMSIZ];
 540	int i;
 541
 542	sprintf(name, "%s%d", prefix, unit);
 543
 544	/*
 545	 * If device already registered then return base of 1
 546	 * to indicate not to probe for this interface
 547	 */
 548	if (__dev_get_by_name(&init_net, name))
 549		return 1;
 550
 551	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 552		if (!strcmp(name, s[i].name))
 553			return s[i].map.base_addr;
 554	return 0;
 555}
 556
 557/*
 558 * Saves at boot time configured settings for any netdevice.
 559 */
 560int __init netdev_boot_setup(char *str)
 561{
 562	int ints[5];
 563	struct ifmap map;
 564
 565	str = get_options(str, ARRAY_SIZE(ints), ints);
 566	if (!str || !*str)
 567		return 0;
 568
 569	/* Save settings */
 570	memset(&map, 0, sizeof(map));
 571	if (ints[0] > 0)
 572		map.irq = ints[1];
 573	if (ints[0] > 1)
 574		map.base_addr = ints[2];
 575	if (ints[0] > 2)
 576		map.mem_start = ints[3];
 577	if (ints[0] > 3)
 578		map.mem_end = ints[4];
 579
 580	/* Add new entry to the list */
 581	return netdev_boot_setup_add(str, &map);
 582}
 583
 584__setup("netdev=", netdev_boot_setup);
 585
 586/*******************************************************************************
 587
 588			    Device Interface Subroutines
 589
 590*******************************************************************************/
 591
 592/**
 593 *	__dev_get_by_name	- find a device by its name
 594 *	@net: the applicable net namespace
 595 *	@name: name to find
 596 *
 597 *	Find an interface by name. Must be called under RTNL semaphore
 598 *	or @dev_base_lock. If the name is found a pointer to the device
 599 *	is returned. If the name is not found then %NULL is returned. The
 600 *	reference counters are not incremented so the caller must be
 601 *	careful with locks.
 602 */
 603
 604struct net_device *__dev_get_by_name(struct net *net, const char *name)
 605{
 606	struct hlist_node *p;
 607	struct net_device *dev;
 608	struct hlist_head *head = dev_name_hash(net, name);
 609
 610	hlist_for_each_entry(dev, p, head, name_hlist)
 611		if (!strncmp(dev->name, name, IFNAMSIZ))
 612			return dev;
 613
 614	return NULL;
 615}
 616EXPORT_SYMBOL(__dev_get_by_name);
 617
 618/**
 619 *	dev_get_by_name_rcu	- find a device by its name
 620 *	@net: the applicable net namespace
 621 *	@name: name to find
 622 *
 623 *	Find an interface by name.
 624 *	If the name is found a pointer to the device is returned.
 625 * 	If the name is not found then %NULL is returned.
 626 *	The reference counters are not incremented so the caller must be
 627 *	careful with locks. The caller must hold RCU lock.
 628 */
 629
 630struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 631{
 632	struct hlist_node *p;
 633	struct net_device *dev;
 634	struct hlist_head *head = dev_name_hash(net, name);
 635
 636	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 637		if (!strncmp(dev->name, name, IFNAMSIZ))
 638			return dev;
 639
 640	return NULL;
 641}
 642EXPORT_SYMBOL(dev_get_by_name_rcu);
 643
 644/**
 645 *	dev_get_by_name		- find a device by its name
 646 *	@net: the applicable net namespace
 647 *	@name: name to find
 648 *
 649 *	Find an interface by name. This can be called from any
 650 *	context and does its own locking. The returned handle has
 651 *	the usage count incremented and the caller must use dev_put() to
 652 *	release it when it is no longer needed. %NULL is returned if no
 653 *	matching device is found.
 654 */
 655
 656struct net_device *dev_get_by_name(struct net *net, const char *name)
 657{
 658	struct net_device *dev;
 659
 660	rcu_read_lock();
 661	dev = dev_get_by_name_rcu(net, name);
 662	if (dev)
 663		dev_hold(dev);
 664	rcu_read_unlock();
 665	return dev;
 666}
 667EXPORT_SYMBOL(dev_get_by_name);
 668
 669/**
 670 *	__dev_get_by_index - find a device by its ifindex
 671 *	@net: the applicable net namespace
 672 *	@ifindex: index of device
 673 *
 674 *	Search for an interface by index. Returns %NULL if the device
 675 *	is not found or a pointer to the device. The device has not
 676 *	had its reference counter increased so the caller must be careful
 677 *	about locking. The caller must hold either the RTNL semaphore
 678 *	or @dev_base_lock.
 679 */
 680
 681struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 682{
 683	struct hlist_node *p;
 684	struct net_device *dev;
 685	struct hlist_head *head = dev_index_hash(net, ifindex);
 686
 687	hlist_for_each_entry(dev, p, head, index_hlist)
 688		if (dev->ifindex == ifindex)
 689			return dev;
 690
 691	return NULL;
 692}
 693EXPORT_SYMBOL(__dev_get_by_index);
 694
 695/**
 696 *	dev_get_by_index_rcu - find a device by its ifindex
 697 *	@net: the applicable net namespace
 698 *	@ifindex: index of device
 699 *
 700 *	Search for an interface by index. Returns %NULL if the device
 701 *	is not found or a pointer to the device. The device has not
 702 *	had its reference counter increased so the caller must be careful
 703 *	about locking. The caller must hold RCU lock.
 704 */
 705
 706struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 707{
 708	struct hlist_node *p;
 709	struct net_device *dev;
 710	struct hlist_head *head = dev_index_hash(net, ifindex);
 711
 712	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 713		if (dev->ifindex == ifindex)
 714			return dev;
 715
 716	return NULL;
 717}
 718EXPORT_SYMBOL(dev_get_by_index_rcu);
 719
 720
 721/**
 722 *	dev_get_by_index - find a device by its ifindex
 723 *	@net: the applicable net namespace
 724 *	@ifindex: index of device
 725 *
 726 *	Search for an interface by index. Returns NULL if the device
 727 *	is not found or a pointer to the device. The device returned has
 728 *	had a reference added and the pointer is safe until the user calls
 729 *	dev_put to indicate they have finished with it.
 730 */
 731
 732struct net_device *dev_get_by_index(struct net *net, int ifindex)
 733{
 734	struct net_device *dev;
 735
 736	rcu_read_lock();
 737	dev = dev_get_by_index_rcu(net, ifindex);
 738	if (dev)
 739		dev_hold(dev);
 740	rcu_read_unlock();
 741	return dev;
 742}
 743EXPORT_SYMBOL(dev_get_by_index);
 744
 745/**
 746 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 747 *	@net: the applicable net namespace
 748 *	@type: media type of device
 749 *	@ha: hardware address
 750 *
 751 *	Search for an interface by MAC address. Returns NULL if the device
 752 *	is not found or a pointer to the device.
 753 *	The caller must hold RCU or RTNL.
 754 *	The returned device has not had its ref count increased
 755 *	and the caller must therefore be careful about locking
 756 *
 757 */
 758
 759struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 760				       const char *ha)
 761{
 762	struct net_device *dev;
 763
 764	for_each_netdev_rcu(net, dev)
 765		if (dev->type == type &&
 766		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 767			return dev;
 768
 769	return NULL;
 770}
 771EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 772
 773struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 774{
 775	struct net_device *dev;
 776
 777	ASSERT_RTNL();
 778	for_each_netdev(net, dev)
 779		if (dev->type == type)
 780			return dev;
 781
 782	return NULL;
 783}
 784EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 785
 786struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 787{
 788	struct net_device *dev, *ret = NULL;
 789
 790	rcu_read_lock();
 791	for_each_netdev_rcu(net, dev)
 792		if (dev->type == type) {
 793			dev_hold(dev);
 794			ret = dev;
 795			break;
 796		}
 797	rcu_read_unlock();
 798	return ret;
 799}
 800EXPORT_SYMBOL(dev_getfirstbyhwtype);
 801
 802/**
 803 *	dev_get_by_flags_rcu - find any device with given flags
 804 *	@net: the applicable net namespace
 805 *	@if_flags: IFF_* values
 806 *	@mask: bitmask of bits in if_flags to check
 807 *
 808 *	Search for any interface with the given flags. Returns NULL if a device
 809 *	is not found or a pointer to the device. Must be called inside
 810 *	rcu_read_lock(), and result refcount is unchanged.
 811 */
 812
 813struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 814				    unsigned short mask)
 815{
 816	struct net_device *dev, *ret;
 817
 818	ret = NULL;
 819	for_each_netdev_rcu(net, dev) {
 820		if (((dev->flags ^ if_flags) & mask) == 0) {
 821			ret = dev;
 822			break;
 823		}
 824	}
 825	return ret;
 826}
 827EXPORT_SYMBOL(dev_get_by_flags_rcu);
 828
 829/**
 830 *	dev_valid_name - check if name is okay for network device
 831 *	@name: name string
 832 *
 *	Network device names need to be valid file names
 *	to allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 836 */
 837int dev_valid_name(const char *name)
 838{
 839	if (*name == '\0')
 840		return 0;
 841	if (strlen(name) >= IFNAMSIZ)
 842		return 0;
 843	if (!strcmp(name, ".") || !strcmp(name, ".."))
 844		return 0;
 845
 846	while (*name) {
 847		if (*name == '/' || isspace(*name))
 848			return 0;
 849		name++;
 850	}
 851	return 1;
 852}
 853EXPORT_SYMBOL(dev_valid_name);
 854
 855/**
 856 *	__dev_alloc_name - allocate a name for a device
 857 *	@net: network namespace to allocate the device name in
 858 *	@name: name format string
 859 *	@buf:  scratch buffer and result name string
 860 *
 861 *	Passed a format string - eg "lt%d" it will try and find a suitable
 862 *	id. It scans list of devices to build up a free map, then chooses
 863 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 864 *	while allocating the name and adding the device in order to avoid
 865 *	duplicates.
 866 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 867 *	Returns the number of the unit assigned or a negative errno code.
 868 */
 869
 870static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 871{
 872	int i = 0;
 873	const char *p;
 874	const int max_netdevices = 8*PAGE_SIZE;
 875	unsigned long *inuse;
 876	struct net_device *d;
 877
 878	p = strnchr(name, IFNAMSIZ-1, '%');
 879	if (p) {
 880		/*
 881		 * Verify the string as this thing may have come from
 882		 * the user.  There must be either one "%d" and no other "%"
 883		 * characters.
 884		 */
 885		if (p[1] != 'd' || strchr(p + 2, '%'))
 886			return -EINVAL;
 887
 888		/* Use one page as a bit array of possible slots */
 889		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 890		if (!inuse)
 891			return -ENOMEM;
 892
 893		for_each_netdev(net, d) {
 894			if (!sscanf(d->name, name, &i))
 895				continue;
 896			if (i < 0 || i >= max_netdevices)
 897				continue;
 898
 899			/*  avoid cases where sscanf is not exact inverse of printf */
 900			snprintf(buf, IFNAMSIZ, name, i);
 901			if (!strncmp(buf, d->name, IFNAMSIZ))
 902				set_bit(i, inuse);
 903		}
 904
 905		i = find_first_zero_bit(inuse, max_netdevices);
 906		free_page((unsigned long) inuse);
 907	}
 908
 909	if (buf != name)
 910		snprintf(buf, IFNAMSIZ, name, i);
 911	if (!__dev_get_by_name(net, buf))
 912		return i;
 913
 914	/* It is possible to run out of possible slots
 915	 * when the name is long and there isn't enough space left
 916	 * for the digits, or if all bits are used.
 917	 */
 918	return -ENFILE;
 919}
 920
 921/**
 922 *	dev_alloc_name - allocate a name for a device
 923 *	@dev: device
 924 *	@name: name format string
 925 *
 926 *	Passed a format string - eg "lt%d" it will try and find a suitable
 927 *	id. It scans list of devices to build up a free map, then chooses
 928 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 929 *	while allocating the name and adding the device in order to avoid
 930 *	duplicates.
 931 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 932 *	Returns the number of the unit assigned or a negative errno code.
 933 */
 934
 935int dev_alloc_name(struct net_device *dev, const char *name)
 936{
 937	char buf[IFNAMSIZ];
 938	struct net *net;
 939	int ret;
 940
 941	BUG_ON(!dev_net(dev));
 942	net = dev_net(dev);
 943	ret = __dev_alloc_name(net, name, buf);
 944	if (ret >= 0)
 945		strlcpy(dev->name, buf, IFNAMSIZ);
 946	return ret;
 947}
 948EXPORT_SYMBOL(dev_alloc_name);
 949
 950static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
 951{
 952	struct net *net;
 953
 954	BUG_ON(!dev_net(dev));
 955	net = dev_net(dev);
 956
 957	if (!dev_valid_name(name))
 958		return -EINVAL;
 959
 960	if (fmt && strchr(name, '%'))
 961		return dev_alloc_name(dev, name);
 962	else if (__dev_get_by_name(net, name))
 963		return -EEXIST;
 964	else if (dev->name != name)
 965		strlcpy(dev->name, name, IFNAMSIZ);
 966
 967	return 0;
 968}
 969
 970/**
 971 *	dev_change_name - change name of a device
 972 *	@dev: device
 973 *	@newname: name (or format string) must be at least IFNAMSIZ
 974 *
 975 *	Change name of a device, can pass format strings "eth%d".
 976 *	for wildcarding.
 977 */
 978int dev_change_name(struct net_device *dev, const char *newname)
 979{
 980	char oldname[IFNAMSIZ];
 981	int err = 0;
 982	int ret;
 983	struct net *net;
 984
 985	ASSERT_RTNL();
 986	BUG_ON(!dev_net(dev));
 987
 988	net = dev_net(dev);
 989	if (dev->flags & IFF_UP)
 990		return -EBUSY;
 991
 992	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 993		return 0;
 994
 995	memcpy(oldname, dev->name, IFNAMSIZ);
 996
 997	err = dev_get_valid_name(dev, newname, 1);
 998	if (err < 0)
 999		return err;
1000
1001rollback:
1002	ret = device_rename(&dev->dev, dev->name);
1003	if (ret) {
1004		memcpy(dev->name, oldname, IFNAMSIZ);
1005		return ret;
1006	}
1007
1008	write_lock_bh(&dev_base_lock);
1009	hlist_del(&dev->name_hlist);
1010	write_unlock_bh(&dev_base_lock);
1011
1012	synchronize_rcu();
1013
1014	write_lock_bh(&dev_base_lock);
1015	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1016	write_unlock_bh(&dev_base_lock);
1017
1018	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1019	ret = notifier_to_errno(ret);
1020
1021	if (ret) {
1022		/* err >= 0 after dev_alloc_name() or stores the first errno */
1023		if (err >= 0) {
1024			err = ret;
1025			memcpy(dev->name, oldname, IFNAMSIZ);
1026			goto rollback;
1027		} else {
1028			printk(KERN_ERR
1029			       "%s: name change rollback failed: %d.\n",
1030			       dev->name, ret);
1031		}
1032	}
1033
1034	return err;
1035}
1036
1037/**
1038 *	dev_set_alias - change ifalias of a device
1039 *	@dev: device
1040 *	@alias: name up to IFALIASZ
1041 *	@len: limit of bytes to copy from info
1042 *
1043 *	Set ifalias for a device,
1044 */
1045int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1046{
1047	ASSERT_RTNL();
1048
1049	if (len >= IFALIASZ)
1050		return -EINVAL;
1051
1052	if (!len) {
1053		if (dev->ifalias) {
1054			kfree(dev->ifalias);
1055			dev->ifalias = NULL;
1056		}
1057		return 0;
1058	}
1059
1060	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1061	if (!dev->ifalias)
1062		return -ENOMEM;
1063
1064	strlcpy(dev->ifalias, alias, len+1);
1065	return len;
1066}
1067
1068
1069/**
1070 *	netdev_features_change - device changes features
1071 *	@dev: device to cause notification
1072 *
1073 *	Called to indicate a device has changed features.
1074 */
1075void netdev_features_change(struct net_device *dev)
1076{
1077	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1078}
1079EXPORT_SYMBOL(netdev_features_change);
1080
1081/**
1082 *	netdev_state_change - device changes state
1083 *	@dev: device to cause notification
1084 *
1085 *	Called to indicate a device has changed state. This function calls
1086 *	the notifier chains for netdev_chain and sends a NEWLINK message
1087 *	to the routing socket.
1088 */
1089void netdev_state_change(struct net_device *dev)
1090{
1091	if (dev->flags & IFF_UP) {
1092		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1093		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1094	}
1095}
1096EXPORT_SYMBOL(netdev_state_change);
1097
/*
 *	Send @event for @dev down the netdev notifier chain and hand the
 *	chain's return value back to the caller.
 */
int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	int verdict;

	verdict = call_netdevice_notifiers(event, dev);
	return verdict;
}
EXPORT_SYMBOL(netdev_bonding_change);
1103
1104/**
1105 *	dev_load 	- load a network module
1106 *	@net: the applicable net namespace
1107 *	@name: name of interface
1108 *
1109 *	If a network interface is not present and the process has suitable
1110 *	privileges this function loads the module. If module loading is not
1111 *	available in this kernel then it becomes a nop.
1112 */
1113
1114void dev_load(struct net *net, const char *name)
1115{
1116	struct net_device *dev;
1117
1118	rcu_read_lock();
1119	dev = dev_get_by_name_rcu(net, name);
1120	rcu_read_unlock();
1121
1122	if (!dev && capable(CAP_NET_ADMIN))
1123		request_module("%s", name);
1124}
1125EXPORT_SYMBOL(dev_load);
1126
1127static int __dev_open(struct net_device *dev)
1128{
1129	const struct net_device_ops *ops = dev->netdev_ops;
1130	int ret;
1131
1132	ASSERT_RTNL();
1133
1134	/*
1135	 *	Is it even present?
1136	 */
1137	if (!netif_device_present(dev))
1138		return -ENODEV;
1139
1140	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1141	ret = notifier_to_errno(ret);
1142	if (ret)
1143		return ret;
1144
1145	/*
1146	 *	Call device private open method
1147	 */
1148	set_bit(__LINK_STATE_START, &dev->state);
1149
1150	if (ops->ndo_validate_addr)
1151		ret = ops->ndo_validate_addr(dev);
1152
1153	if (!ret && ops->ndo_open)
1154		ret = ops->ndo_open(dev);
1155
1156	/*
1157	 *	If it went open OK then:
1158	 */
1159
1160	if (ret)
1161		clear_bit(__LINK_STATE_START, &dev->state);
1162	else {
1163		/*
1164		 *	Set the flags.
1165		 */
1166		dev->flags |= IFF_UP;
1167
1168		/*
1169		 *	Enable NET_DMA
1170		 */
1171		net_dmaengine_get();
1172
1173		/*
1174		 *	Initialize multicasting status
1175		 */
1176		dev_set_rx_mode(dev);
1177
1178		/*
1179		 *	Wakeup transmit queue engine
1180		 */
1181		dev_activate(dev);
1182	}
1183
1184	return ret;
1185}
1186
1187/**
1188 *	dev_open	- prepare an interface for use.
1189 *	@dev:	device to open
1190 *
1191 *	Takes a device from down to up state. The device's private open
1192 *	function is invoked and then the multicast lists are loaded. Finally
1193 *	the device is moved into the up state and a %NETDEV_UP message is
1194 *	sent to the netdev notifier chain.
1195 *
1196 *	Calling this function on an active interface is a nop. On a failure
1197 *	a negative errno code is returned.
1198 */
1199int dev_open(struct net_device *dev)
1200{
1201	int ret;
1202
1203	/*
1204	 *	Is it already up?
1205	 */
1206	if (dev->flags & IFF_UP)
1207		return 0;
1208
1209	/*
1210	 *	Open device
1211	 */
1212	ret = __dev_open(dev);
1213	if (ret < 0)
1214		return ret;
1215
1216	/*
1217	 *	... and announce new interface.
1218	 */
1219	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1220	call_netdevice_notifiers(NETDEV_UP, dev);
1221
1222	return ret;
1223}
1224EXPORT_SYMBOL(dev_open);
1225
1226static int __dev_close_many(struct list_head *head)
1227{
1228	struct net_device *dev;
1229
1230	ASSERT_RTNL();
1231	might_sleep();
1232
1233	list_for_each_entry(dev, head, unreg_list) {
1234		/*
1235		 *	Tell people we are going down, so that they can
1236		 *	prepare to death, when device is still operating.
1237		 */
1238		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1239
1240		clear_bit(__LINK_STATE_START, &dev->state);
1241
1242		/* Synchronize to scheduled poll. We cannot touch poll list, it
1243		 * can be even on different cpu. So just clear netif_running().
1244		 *
1245		 * dev->stop() will invoke napi_disable() on all of it's
1246		 * napi_struct instances on this device.
1247		 */
1248		smp_mb__after_clear_bit(); /* Commit netif_running(). */
1249	}
1250
1251	dev_deactivate_many(head);
1252
1253	list_for_each_entry(dev, head, unreg_list) {
1254		const struct net_device_ops *ops = dev->netdev_ops;
1255
1256		/*
1257		 *	Call the device specific close. This cannot fail.
1258		 *	Only if device is UP
1259		 *
1260		 *	We allow it to be called even after a DETACH hot-plug
1261		 *	event.
1262		 */
1263		if (ops->ndo_stop)
1264			ops->ndo_stop(dev);
1265
1266		/*
1267		 *	Device is now down.
1268		 */
1269
1270		dev->flags &= ~IFF_UP;
1271
1272		/*
1273		 *	Shutdown NET_DMA
1274		 */
1275		net_dmaengine_put();
1276	}
1277
1278	return 0;
1279}
1280
1281static int __dev_close(struct net_device *dev)
1282{
1283	LIST_HEAD(single);
1284
1285	list_add(&dev->unreg_list, &single);
1286	return __dev_close_many(&single);
1287}
1288
1289int dev_close_many(struct list_head *head)
1290{
1291	struct net_device *dev, *tmp;
1292	LIST_HEAD(tmp_list);
1293
1294	list_for_each_entry_safe(dev, tmp, head, unreg_list)
1295		if (!(dev->flags & IFF_UP))
1296			list_move(&dev->unreg_list, &tmp_list);
1297
1298	__dev_close_many(head);
1299
1300	/*
1301	 * Tell people we are down
1302	 */
1303	list_for_each_entry(dev, head, unreg_list) {
1304		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1305		call_netdevice_notifiers(NETDEV_DOWN, dev);
1306	}
1307
1308	/* rollback_registered_many needs the complete original list */
1309	list_splice(&tmp_list, head);
1310	return 0;
1311}
1312
1313/**
1314 *	dev_close - shutdown an interface.
1315 *	@dev: device to shutdown
1316 *
1317 *	This function moves an active device into down state. A
1318 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1319 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1320 *	chain.
1321 */
1322int dev_close(struct net_device *dev)
1323{
1324	LIST_HEAD(single);
1325
1326	list_add(&dev->unreg_list, &single);
1327	dev_close_many(&single);
1328
1329	return 0;
1330}
1331EXPORT_SYMBOL(dev_close);
1332
1333
1334/**
1335 *	dev_disable_lro - disable Large Receive Offload on a device
1336 *	@dev: device
1337 *
1338 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1339 *	called under RTNL.  This is needed if received packets may be
1340 *	forwarded to another interface.
1341 */
1342void dev_disable_lro(struct net_device *dev)
1343{
1344	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1345	    dev->ethtool_ops->set_flags) {
1346		u32 flags = dev->ethtool_ops->get_flags(dev);
1347		if (flags & ETH_FLAG_LRO) {
1348			flags &= ~ETH_FLAG_LRO;
1349			dev->ethtool_ops->set_flags(dev, flags);
1350		}
1351	}
1352	WARN_ON(dev->features & NETIF_F_LRO);
1353}
1354EXPORT_SYMBOL(dev_disable_lro);
1355
1356
/*
 * Starts out as 1.  While it is non-zero, register_netdevice_notifier()
 * adds the notifier to the chain but does not replay REGISTER/UP events
 * for existing devices.
 */
static int dev_boot_phase = 1;
1358
1359/*
1360 *	Device change register/unregister. These are not inline or static
1361 *	as we export them to the world.
1362 */
1363
1364/**
1365 *	register_netdevice_notifier - register a network notifier block
1366 *	@nb: notifier
1367 *
1368 *	Register a notifier to be called when network device events occur.
1369 *	The notifier passed is linked into the kernel structures and must
1370 *	not be reused until it has been unregistered. A negative errno code
1371 *	is returned on a failure.
1372 *
1373 * 	When registered all registration and up events are replayed
1374 *	to the new notifier to allow device to have a race free
1375 *	view of the network device list.
1376 */
1377
1378int register_netdevice_notifier(struct notifier_block *nb)
1379{
1380	struct net_device *dev;
1381	struct net_device *last;
1382	struct net *net;
1383	int err;
1384
1385	rtnl_lock();
1386	err = raw_notifier_chain_register(&netdev_chain, nb);
1387	if (err)
1388		goto unlock;
1389	if (dev_boot_phase)
1390		goto unlock;
1391	for_each_net(net) {
1392		for_each_netdev(net, dev) {
1393			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1394			err = notifier_to_errno(err);
1395			if (err)
1396				goto rollback;
1397
1398			if (!(dev->flags & IFF_UP))
1399				continue;
1400
1401			nb->notifier_call(nb, NETDEV_UP, dev);
1402		}
1403	}
1404
1405unlock:
1406	rtnl_unlock();
1407	return err;
1408
1409rollback:
1410	last = dev;
1411	for_each_net(net) {
1412		for_each_netdev(net, dev) {
1413			if (dev == last)
1414				break;
1415
1416			if (dev->flags & IFF_UP) {
1417				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1418				nb->notifier_call(nb, NETDEV_DOWN, dev);
1419			}
1420			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1421			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1422		}
1423	}
1424
1425	raw_notifier_chain_unregister(&netdev_chain, nb);
1426	goto unlock;
1427}
1428EXPORT_SYMBOL(register_netdevice_notifier);
1429
1430/**
1431 *	unregister_netdevice_notifier - unregister a network notifier block
1432 *	@nb: notifier
1433 *
1434 *	Unregister a notifier previously registered by
1435 *	register_netdevice_notifier(). The notifier is unlinked into the
1436 *	kernel structures and may then be reused. A negative errno code
1437 *	is returned on a failure.
1438 */
1439
1440int unregister_netdevice_notifier(struct notifier_block *nb)
1441{
1442	int err;
1443
1444	rtnl_lock();
1445	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1446	rtnl_unlock();
1447	return err;
1448}
1449EXPORT_SYMBOL(unregister_netdevice_notifier);
1450
1451/**
1452 *	call_netdevice_notifiers - call all network notifier blocks
1453 *      @val: value passed unmodified to notifier function
1454 *      @dev: net_device pointer passed unmodified to notifier function
1455 *
1456 *	Call all network notifier blocks.  Parameters and return value
1457 *	are as for raw_notifier_call_chain().
1458 */
1459
1460int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1461{
1462	ASSERT_RTNL();
1463	return raw_notifier_call_chain(&netdev_chain, val, dev);
1464}
1465
/*
 * When > 0 there are consumers of rx skb time stamps.
 * Incremented by net_enable_timestamp() and decremented by
 * net_disable_timestamp(); read by net_timestamp_set() and
 * net_timestamp_check() to decide whether to stamp an skb.
 */
static atomic_t netstamp_needed = ATOMIC_INIT(0);
1468
1469void net_enable_timestamp(void)
1470{
1471	atomic_inc(&netstamp_needed);
1472}
1473EXPORT_SYMBOL(net_enable_timestamp);
1474
1475void net_disable_timestamp(void)
1476{
1477	atomic_dec(&netstamp_needed);
1478}
1479EXPORT_SYMBOL(net_disable_timestamp);
1480
1481static inline void net_timestamp_set(struct sk_buff *skb)
1482{
1483	if (atomic_read(&netstamp_needed))
1484		__net_timestamp(skb);
1485	else
1486		skb->tstamp.tv64 = 0;
1487}
1488
1489static inline void net_timestamp_check(struct sk_buff *skb)
1490{
1491	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1492		__net_timestamp(skb);
1493}
1494
1495/**
1496 * dev_forward_skb - loopback an skb to another netif
1497 *
1498 * @dev: destination network device
1499 * @skb: buffer to forward
1500 *
1501 * return values:
1502 *	NET_RX_SUCCESS	(no congestion)
1503 *	NET_RX_DROP     (packet was dropped, but freed)
1504 *
1505 * dev_forward_skb can be used for injecting an skb from the
1506 * start_xmit function of one device into the receive queue
1507 * of another device.
1508 *
1509 * The receiving device may be in another namespace, so
1510 * we have to clear all information in the skb that could
1511 * impact namespace isolation.
1512 */
1513int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1514{
1515	skb_orphan(skb);
1516	nf_reset(skb);
1517
1518	if (unlikely(!(dev->flags & IFF_UP) ||
1519		     (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) {
1520		atomic_long_inc(&dev->rx_dropped);
1521		kfree_skb(skb);
1522		return NET_RX_DROP;
1523	}
1524	skb_set_dev(skb, dev);
1525	skb->tstamp.tv64 = 0;
1526	skb->pkt_type = PACKET_HOST;
1527	skb->protocol = eth_type_trans(skb, dev);
1528	return netif_rx(skb);
1529}
1530EXPORT_SYMBOL_GPL(dev_forward_skb);
1531
1532static inline int deliver_skb(struct sk_buff *skb,
1533			      struct packet_type *pt_prev,
1534			      struct net_device *orig_dev)
1535{
1536	atomic_inc(&skb->users);
1537	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1538}
1539
1540/*
1541 *	Support routine. Sends outgoing frames to any network
1542 *	taps currently in use.
1543 */
1544
1545static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1546{
1547	struct packet_type *ptype;
1548	struct sk_buff *skb2 = NULL;
1549	struct packet_type *pt_prev = NULL;
1550
1551	rcu_read_lock();
1552	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1553		/* Never send packets back to the socket
1554		 * they originated from - MvS (miquels@drinkel.ow.org)
1555		 */
1556		if ((ptype->dev == dev || !ptype->dev) &&
1557		    (ptype->af_packet_priv == NULL ||
1558		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1559			if (pt_prev) {
1560				deliver_skb(skb2, pt_prev, skb->dev);
1561				pt_prev = ptype;
1562				continue;
1563			}
1564
1565			skb2 = skb_clone(skb, GFP_ATOMIC);
1566			if (!skb2)
1567				break;
1568
1569			net_timestamp_set(skb2);
1570
1571			/* skb->nh should be correctly
1572			   set by sender, so that the second statement is
1573			   just protection against buggy protocols.
1574			 */
1575			skb_reset_mac_header(skb2);
1576
1577			if (skb_network_header(skb2) < skb2->data ||
1578			    skb2->network_header > skb2->tail) {
1579				if (net_ratelimit())
1580					printk(KERN_CRIT "protocol %04x is "
1581					       "buggy, dev %s\n",
1582					       ntohs(skb2->protocol),
1583					       dev->name);
1584				skb_reset_network_header(skb2);
1585			}
1586
1587			skb2->transport_header = skb2->network_header;
1588			skb2->pkt_type = PACKET_OUTGOING;
1589			pt_prev = ptype;
1590		}
1591	}
1592	if (pt_prev)
1593		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1594	rcu_read_unlock();
1595}
1596
1597/*
1598 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1599 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1600 */
1601int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1602{
1603	int rc;
1604
1605	if (txq < 1 || txq > dev->num_tx_queues)
1606		return -EINVAL;
1607
1608	if (dev->reg_state == NETREG_REGISTERED) {
1609		ASSERT_RTNL();
1610
1611		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1612						  txq);
1613		if (rc)
1614			return rc;
1615
1616		if (txq < dev->real_num_tx_queues)
1617			qdisc_reset_all_tx_gt(dev, txq);
1618	}
1619
1620	dev->real_num_tx_queues = txq;
1621	return 0;
1622}
1623EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1624
1625#ifdef CONFIG_RPS
1626/**
1627 *	netif_set_real_num_rx_queues - set actual number of RX queues used
1628 *	@dev: Network device
1629 *	@rxq: Actual number of RX queues
1630 *
1631 *	This must be called either with the rtnl_lock held or before
1632 *	registration of the net device.  Returns 0 on success, or a
1633 *	negative error code.  If called before registration, it always
1634 *	succeeds.
1635 */
1636int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1637{
1638	int rc;
1639
1640	if (rxq < 1 || rxq > dev->num_rx_queues)
1641		return -EINVAL;
1642
1643	if (dev->reg_state == NETREG_REGISTERED) {
1644		ASSERT_RTNL();
1645
1646		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1647						  rxq);
1648		if (rc)
1649			return rc;
1650	}
1651
1652	dev->real_num_rx_queues = rxq;
1653	return 0;
1654}
1655EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1656#endif
1657
1658static inline void __netif_reschedule(struct Qdisc *q)
1659{
1660	struct softnet_data *sd;
1661	unsigned long flags;
1662
1663	local_irq_save(flags);
1664	sd = &__get_cpu_var(softnet_data);
1665	q->next_sched = NULL;
1666	*sd->output_queue_tailp = q;
1667	sd->output_queue_tailp = &q->next_sched;
1668	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1669	local_irq_restore(flags);
1670}
1671
1672void __netif_schedule(struct Qdisc *q)
1673{
1674	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1675		__netif_reschedule(q);
1676}
1677EXPORT_SYMBOL(__netif_schedule);
1678
1679void dev_kfree_skb_irq(struct sk_buff *skb)
1680{
1681	if (atomic_dec_and_test(&skb->users)) {
1682		struct softnet_data *sd;
1683		unsigned long flags;
1684
1685		local_irq_save(flags);
1686		sd = &__get_cpu_var(softnet_data);
1687		skb->next = sd->completion_queue;
1688		sd->completion_queue = skb;
1689		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1690		local_irq_restore(flags);
1691	}
1692}
1693EXPORT_SYMBOL(dev_kfree_skb_irq);
1694
/*
 *	Release @skb from any context: uses dev_kfree_skb_irq() when called
 *	in hard interrupt context or with interrupts disabled, and
 *	dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (!in_irq() && !irqs_disabled()) {
		dev_kfree_skb(skb);
		return;
	}

	dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1703
1704
1705/**
1706 * netif_device_detach - mark device as removed
1707 * @dev: network device
1708 *
1709 * Mark device as removed from system and therefore no longer available.
1710 */
1711void netif_device_detach(struct net_device *dev)
1712{
1713	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1714	    netif_running(dev)) {
1715		netif_tx_stop_all_queues(dev);
1716	}
1717}
1718EXPORT_SYMBOL(netif_device_detach);
1719
1720/**
1721 * netif_device_attach - mark device as attached
1722 * @dev: network device
1723 *
1724 * Mark device as attached from system and restart if needed.
1725 */
1726void netif_device_attach(struct net_device *dev)
1727{
1728	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1729	    netif_running(dev)) {
1730		netif_tx_wake_all_queues(dev);
1731		__netdev_watchdog_up(dev);
1732	}
1733}
1734EXPORT_SYMBOL(netif_device_attach);
1735
/**
 * skb_set_dev -- assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * The skb's dst is always dropped.  If the skb is owned by a device
 * already and that device lives in a different network namespace than
 * @dev, all data private to the old namespace is reset before the new
 * device is assigned.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
	struct net_device *prev;

	skb_dst_drop(skb);

	prev = skb->dev;
	if (prev && !net_eq(dev_net(prev), dev_net(dev))) {
		/* Crossing namespaces: scrub everything tied to the old one. */
		secpath_reset(skb);
		nf_reset(skb);
		skb_init_secmark(skb);
		skb->mark = 0;
		skb->priority = 0;
		skb->nf_trace = 0;
		skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
		skb->tc_index = 0;
#endif
	}

	skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */
1765
1766/*
1767 * Invalidate hardware checksum when packet is to be mangled, and
1768 * complete checksum manually on outgoing path.
1769 */
1770int skb_checksum_help(struct sk_buff *skb)
1771{
1772	__wsum csum;
1773	int ret = 0, offset;
1774
1775	if (skb->ip_summed == CHECKSUM_COMPLETE)
1776		goto out_set_summed;
1777
1778	if (unlikely(skb_shinfo(skb)->gso_size)) {
1779		/* Let GSO fix up the checksum. */
1780		goto out_set_summed;
1781	}
1782
1783	offset = skb_checksum_start_offset(skb);
1784	BUG_ON(offset >= skb_headlen(skb));
1785	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1786
1787	offset += skb->csum_offset;
1788	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1789
1790	if (skb_cloned(skb) &&
1791	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1792		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1793		if (ret)
1794			goto out;
1795	}
1796
1797	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1798out_set_summed:
1799	skb->ip_summed = CHECKSUM_NONE;
1800out:
1801	return ret;
1802}
1803EXPORT_SYMBOL(skb_checksum_help);
1804
1805/**
1806 *	skb_gso_segment - Perform segmentation on skb.
1807 *	@skb: buffer to segment
1808 *	@features: features for the output path (see dev->features)
1809 *
1810 *	This function segments the given skb and returns a list of segments.
1811 *
1812 *	It may return NULL if the skb requires no segmentation.  This is
1813 *	only possible when GSO is used for verifying header integrity.
1814 */
1815struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1816{
1817	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1818	struct packet_type *ptype;
1819	__be16 type = skb->protocol;
1820	int vlan_depth = ETH_HLEN;
1821	int err;
1822
1823	while (type == htons(ETH_P_8021Q)) {
1824		struct vlan_hdr *vh;
1825
1826		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1827			return ERR_PTR(-EINVAL);
1828
1829		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1830		type = vh->h_vlan_encapsulated_proto;
1831		vlan_depth += VLAN_HLEN;
1832	}
1833
1834	skb_reset_mac_header(skb);
1835	skb->mac_len = skb->network_header - skb->mac_header;
1836	__skb_pull(skb, skb->mac_len);
1837
1838	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1839		struct net_device *dev = skb->dev;
1840		struct ethtool_drvinfo info = {};
1841
1842		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1843			dev->ethtool_ops->get_drvinfo(dev, &info);
1844
1845		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1846		     info.driver, dev ? dev->features : 0L,
1847		     skb->sk ? skb->sk->sk_route_caps : 0L,
1848		     skb->len, skb->data_len, skb->ip_summed);
1849
1850		if (skb_header_cloned(skb) &&
1851		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1852			return ERR_PTR(err);
1853	}
1854
1855	rcu_read_lock();
1856	list_for_each_entry_rcu(ptype,
1857			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1858		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1859			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1860				err = ptype->gso_send_check(skb);
1861				segs = ERR_PTR(err);
1862				if (err || skb_gso_ok(skb, features))
1863					break;
1864				__skb_push(skb, (skb->data -
1865						 skb_network_header(skb)));
1866			}
1867			segs = ptype->gso_segment(skb, features);
1868			break;
1869		}
1870	}
1871	rcu_read_unlock();
1872
1873	__skb_push(skb, skb->data - skb_mac_header(skb));
1874
1875	return segs;
1876}
1877EXPORT_SYMBOL(skb_gso_segment);
1878
1879/* Take action when hardware reception checksum errors are detected. */
1880#ifdef CONFIG_BUG
1881void netdev_rx_csum_fault(struct net_device *dev)
1882{
1883	if (net_ratelimit()) {
1884		printk(KERN_ERR "%s: hw csum failure.\n",
1885			dev ? dev->name : "<unknown>");
1886		dump_stack();
1887	}
1888}
1889EXPORT_SYMBOL(netdev_rx_csum_fault);
1890#endif
1891
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 if @dev cannot DMA from one of the page fragments of @skb:
 * either the fragment is in high memory and the device lacks
 * NETIF_F_HIGHDMA, or (when bus addresses are physical addresses) the
 * parent device has no DMA mask or the page lies beyond it.
 * Returns 0 otherwise, and always when CONFIG_HIGHMEM is not set.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int idx;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (idx = 0; idx < shinfo->nr_frags; idx++) {
			if (PageHighMem(shinfo->frags[idx].page))
				return 1;
		}
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *parent = dev->dev.parent;

		/* No parent device, no mask to check against. */
		if (!parent)
			return 0;

		for (idx = 0; idx < shinfo->nr_frags; idx++) {
			dma_addr_t addr = page_to_phys(shinfo->frags[idx].page);

			if (!parent->dma_mask ||
			    addr + PAGE_SIZE - 1 > *parent->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}
1921
/*
 * Per-skb state kept in skb->cb while a segment list is attached to the
 * skb: the skb's original destructor, saved by dev_gso_segment() and
 * called again from dev_gso_skb_destructor().
 */
struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

/* View the control buffer of @skb as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1927
1928static void dev_gso_skb_destructor(struct sk_buff *skb)
1929{
1930	struct dev_gso_cb *cb;
1931
1932	do {
1933		struct sk_buff *nskb = skb->next;
1934
1935		skb->next = nskb->next;
1936		nskb->next = NULL;
1937		kfree_skb(nskb);
1938	} while (skb->next);
1939
1940	cb = DEV_GSO_CB(skb);
1941	if (cb->destructor)
1942		cb->destructor(skb);
1943}
1944
1945/**
1946 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1947 *	@skb: buffer to segment
1948 *	@features: device features as applicable to this skb
1949 *
1950 *	This function segments the given skb and stores the list of segments
1951 *	in skb->next.
1952 */
1953static int dev_gso_segment(struct sk_buff *skb, int features)
1954{
1955	struct sk_buff *segs;
1956
1957	segs = skb_gso_segment(skb, features);
1958
1959	/* Verifying header integrity only. */
1960	if (!segs)
1961		return 0;
1962
1963	if (IS_ERR(segs))
1964		return PTR_ERR(segs);
1965
1966	skb->next = segs;
1967	DEV_GSO_CB(skb)->destructor = skb->destructor;
1968	skb->destructor = dev_gso_skb_destructor;
1969
1970	return 0;
1971}
1972
1973/*
1974 * Try to orphan skb early, right before transmission by the device.
1975 * We cannot orphan skb if tx timestamp is requested or the sk-reference
1976 * is needed on driver level for other reasons, e.g. see net/can/raw.c
1977 */
1978static inline void skb_orphan_try(struct sk_buff *skb)
1979{
1980	struct sock *sk = skb->sk;
1981
1982	if (sk && !skb_shinfo(skb)->tx_flags) {
1983		/* skb_tx_hash() wont be able to get sk.
1984		 * We copy sk_hash into skb->rxhash
1985		 */
1986		if (!skb->rxhash)
1987			skb->rxhash = sk->sk_hash;
1988		skb_orphan(skb);
1989	}
1990}
1991
1992static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1993{
1994	return ((features & NETIF_F_GEN_CSUM) ||
1995		((features & NETIF_F_V4_CSUM) &&
1996		 protocol == htons(ETH_P_IP)) ||
1997		((features & NETIF_F_V6_CSUM) &&
1998		 protocol == htons(ETH_P_IPV6)) ||
1999		((features & NETIF_F_FCOE_CRC) &&
2000		 protocol == htons(ETH_P_FCOE)));
2001}
2002
2003static int harmonize_features(struct sk_buff *skb, __be16 protocol, int features)
2004{
2005	if (!can_checksum_protocol(features, protocol)) {
2006		features &= ~NETIF_F_ALL_CSUM;
2007		features &= ~NETIF_F_SG;
2008	} else if (illegal_highdma(skb->dev, skb)) {
2009		features &= ~NETIF_F_SG;
2010	}
2011
2012	return features;
2013}
2014
2015int netif_skb_features(struct sk_buff *skb)
2016{
2017	__be16 protocol = skb->protocol;
2018	int features = skb->dev->features;
2019
2020	if (protocol == htons(ETH_P_8021Q)) {
2021		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2022		protocol = veh->h_vlan_encapsulated_proto;
2023	} else if (!vlan_tx_tag_present(skb)) {
2024		return harmonize_features(skb, protocol, features);
2025	}
2026
2027	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2028
2029	if (protocol != htons(ETH_P_8021Q)) {
2030		return harmonize_features(skb, protocol, features);
2031	} else {
2032		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2033				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2034		return harmonize_features(skb, protocol, features);
2035	}
2036}
2037EXPORT_SYMBOL(netif_skb_features);
2038
2039/*
2040 * Returns true if either:
2041 *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2042 *	2. skb is fragmented and the device does not support SG, or if
2043 *	   at least one of fragments is in highmem and device does not
2044 *	   support DMA from it.
2045 */
2046static inline int skb_needs_linearize(struct sk_buff *skb,
2047				      int features)
2048{
2049	return skb_is_nonlinear(skb) &&
2050			((skb_has_frag_list(skb) &&
2051				!(features & NETIF_F_FRAGLIST)) ||
2052			(skb_shinfo(skb)->nr_frags &&
2053				!(features & NETIF_F_SG)));
2054}
2055
2056int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2057			struct netdev_queue *txq)
2058{
2059	const struct net_device_ops *ops = dev->netdev_ops;
2060	int rc = NETDEV_TX_OK;
2061
2062	if (likely(!skb->next)) {
2063		int features;
2064
2065		/*
2066		 * If device doesnt need skb->dst, release it right now while
2067		 * its hot in this cpu cache
2068		 */
2069		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2070			skb_dst_drop(skb);
2071
2072		if (!list_empty(&ptype_all))
2073			dev_queue_xmit_nit(skb, dev);
2074
2075		skb_orphan_try(skb);
2076
2077		features = netif_skb_features(skb);
2078
2079		if (vlan_tx_tag_present(skb) &&
2080		    !(features & NETIF_F_HW_VLAN_TX)) {
2081			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2082			if (unlikely(!skb))
2083				goto out;
2084
2085			skb->vlan_tci = 0;
2086		}
2087
2088		if (netif_needs_gso(skb, features)) {
2089			if (unlikely(dev_gso_segment(skb, features)))
2090				goto out_kfree_skb;
2091			if (skb->next)
2092				goto gso;
2093		} else {
2094			if (skb_needs_linearize(skb, features) &&
2095			    __skb_linearize(skb))
2096				goto out_kfree_skb;
2097
2098			/* If packet is not checksummed and device does not
2099			 * support checksumming for this protocol, complete
2100			 * checksumming here.
2101			 */
2102			if (skb->ip_summed == CHECKSUM_PARTIAL) {
2103				skb_set_transport_header(skb,
2104					skb_checksum_start_offset(skb));
2105				if (!(features & NETIF_F_ALL_CSUM) &&
2106				     skb_checksum_help(skb))
2107					goto out_kfree_skb;
2108			}
2109		}
2110
2111		rc = ops->ndo_start_xmit(skb, dev);
2112		trace_net_dev_xmit(skb, rc);
2113		if (rc == NETDEV_TX_OK)
2114			txq_trans_update(txq);
2115		return rc;
2116	}
2117
2118gso:
2119	do {
2120		struct sk_buff *nskb = skb->next;
2121
2122		skb->next = nskb->next;
2123		nskb->next = NULL;
2124
2125		/*
2126		 * If device doesnt need nskb->dst, release it right now while
2127		 * its hot in this cpu cache
2128		 */
2129		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2130			skb_dst_drop(nskb);
2131
2132		rc = ops->ndo_start_xmit(nskb, dev);
2133		trace_net_dev_xmit(nskb, rc);
2134		if (unlikely(rc != NETDEV_TX_OK)) {
2135			if (rc & ~NETDEV_TX_MASK)
2136				goto out_kfree_gso_skb;
2137			nskb->next = skb->next;
2138			skb->next = nskb;
2139			return rc;
2140		}
2141		txq_trans_update(txq);
2142		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2143			return NETDEV_TX_BUSY;
2144	} while (skb->next);
2145
2146out_kfree_gso_skb:
2147	if (likely(skb->next == NULL))
2148		skb->destructor = DEV_GSO_CB(skb)->destructor;
2149out_kfree_skb:
2150	kfree_skb(skb);
2151out:
2152	return rc;
2153}
2154
/*
 * Seed handed to jhash_1word()/jhash_3words() by the TX queue selection and
 * RX flow hashing helpers in this file.
 * NOTE(review): the place where it is initialised is not visible here.
 */
static u32 hashrnd __read_mostly;
2156
2157/*
2158 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2159 * to be used as a distribution range.
2160 */
2161u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2162		  unsigned int num_tx_queues)
2163{
2164	u32 hash;
2165
2166	if (skb_rx_queue_recorded(skb)) {
2167		hash = skb_get_rx_queue(skb);
2168		while (unlikely(hash >= num_tx_queues))
2169			hash -= num_tx_queues;
2170		return hash;
2171	}
2172
2173	if (skb->sk && skb->sk->sk_hash)
2174		hash = skb->sk->sk_hash;
2175	else
2176		hash = (__force u16) skb->protocol ^ skb->rxhash;
2177	hash = jhash_1word(hash, hashrnd);
2178
2179	return (u16) (((u64) hash * num_tx_queues) >> 32);
2180}
2181EXPORT_SYMBOL(__skb_tx_hash);
2182
2183static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2184{
2185	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2186		if (net_ratelimit()) {
2187			pr_warning("%s selects TX queue %d, but "
2188				"real number of TX queues is %d\n",
2189				dev->name, queue_index, dev->real_num_tx_queues);
2190		}
2191		return 0;
2192	}
2193	return queue_index;
2194}
2195
/*
 * Look up a TX queue for @skb in the per-CPU XPS map of @dev.
 *
 * Returns the selected queue index, or -1 when CONFIG_XPS is disabled, when
 * no map exists for the current CPU, or when the mapped queue is not below
 * dev->real_num_tx_queues.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
	int queue_index = -1;
#ifdef CONFIG_XPS
	struct xps_dev_maps *dev_maps;
	struct xps_map *map;

	rcu_read_lock();

	dev_maps = rcu_dereference(dev->xps_maps);
	if (!dev_maps)
		goto unlock;

	map = rcu_dereference(dev_maps->cpu_map[raw_smp_processor_id()]);
	if (!map)
		goto unlock;

	if (map->len == 1) {
		queue_index = map->queues[0];
	} else {
		/* Several candidate queues: spread flows across them. */
		u32 key;

		if (skb->sk && skb->sk->sk_hash)
			key = skb->sk->sk_hash;
		else
			key = (__force u16) skb->protocol ^ skb->rxhash;
		key = jhash_1word(key, hashrnd);
		queue_index = map->queues[((u64)key * map->len) >> 32];
	}

	if (unlikely(queue_index >= dev->real_num_tx_queues))
		queue_index = -1;

unlock:
	rcu_read_unlock();
#endif
	return queue_index;
}
2233
2234static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2235					struct sk_buff *skb)
2236{
2237	int queue_index;
2238	const struct net_device_ops *ops = dev->netdev_ops;
2239
2240	if (dev->real_num_tx_queues == 1)
2241		queue_index = 0;
2242	else if (ops->ndo_select_queue) {
2243		queue_index = ops->ndo_select_queue(dev, skb);
2244		queue_index = dev_cap_txqueue(dev, queue_index);
2245	} else {
2246		struct sock *sk = skb->sk;
2247		queue_index = sk_tx_queue_get(sk);
2248
2249		if (queue_index < 0 || skb->ooo_okay ||
2250		    queue_index >= dev->real_num_tx_queues) {
2251			int old_index = queue_index;
2252
2253			queue_index = get_xps_queue(dev, skb);
2254			if (queue_index < 0)
2255				queue_index = skb_tx_hash(dev, skb);
2256
2257			if (queue_index != old_index && sk) {
2258				struct dst_entry *dst =
2259				    rcu_dereference_check(sk->sk_dst_cache, 1);
2260
2261				if (dst && skb_dst(skb) == dst)
2262					sk_tx_queue_set(sk, queue_index);
2263			}
2264		}
2265	}
2266
2267	skb_set_queue_mapping(skb, queue_index);
2268	return netdev_get_tx_queue(dev, queue_index);
2269}
2270
/*
 * Pass @skb to qdisc @q of TX queue @txq on @dev.
 *
 * Under the qdisc root lock one of three things happens:
 *  - the qdisc is deactivated: the skb is freed, NET_XMIT_DROP is returned;
 *  - the qdisc allows bypass, is empty and not running: the skb is sent
 *    directly via sch_direct_xmit(), NET_XMIT_SUCCESS is returned;
 *  - otherwise the skb is enqueued and the result of qdisc_enqueue_root()
 *    is returned; the qdisc is run if this context can start it.
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	/* Sampled before any lock is taken; decides whether busylock is used. */
	bool contended = qdisc_is_running(q);
	int rc;

	/*
	 * Heuristic to force contended enqueues to serialize on a
	 * separate lock before trying to get qdisc main lock.
	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
	 * and dequeue packets faster.
	 */
	if (unlikely(contended))
		spin_lock(&q->busylock);

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		kfree_skb(skb);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   qdisc_run_begin(q)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */
		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
			skb_dst_force(skb);

		qdisc_skb_cb(skb)->pkt_len = skb->len;
		qdisc_bstats_update(q, skb);

		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
			/* Drop busylock before the qdisc run below. */
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		} else
			qdisc_run_end(q);

		rc = NET_XMIT_SUCCESS;
	} else {
		skb_dst_force(skb);
		rc = qdisc_enqueue_root(skb, q);
		if (qdisc_run_begin(q)) {
			/* Drop busylock before the qdisc run below. */
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		}
	}
	spin_unlock(root_lock);
	/* Still set only if busylock was taken and not released above. */
	if (unlikely(contended))
		spin_unlock(&q->busylock);
	return rc;
}
2331
/*
 * Per-CPU count of nested dev_hard_start_xmit() calls made from
 * dev_queue_xmit() on queueless devices; once it exceeds RECURSION_LIMIT
 * the packet is dropped and a "Dead loop" message is logged.
 */
static DEFINE_PER_CPU(int, xmit_recursion);
#define RECURSION_LIMIT 10
2334
/**
 *	dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 *	Additional notes:
 *
 *	This function can also return errors from the queue disciplines,
 *	including NET_XMIT_DROP, which is a positive value.  So, errors can
 *	also be positive.
 *
 *	Regardless of the return value, the skb is consumed, so it is
 *	currently difficult to retry a send through this function.  (You can
 *	bump the ref count before sending to hold a reference for retry if
 *	you are careful.)
 *
 *	When calling this function, interrupts MUST be enabled.  This is
 *	because the BH enable code must have IRQs enabled so that it will not
 *	deadlock.
 *          --BLG
 */
int dev_queue_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	txq = dev_pick_tx(dev, skb);
	q = rcu_dereference_bh(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
	trace_net_dev_queue(skb);
	/* A qdisc with an enqueue method takes the normal queued path. */
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all the sorts of tunnels...

	   Really, it is unlikely that netif_tx_lock protection is necessary
	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	   counters.)
	   However, it is possible, that they rely on protection
	   made by us here.

	   Check this and shot the lock. It is not prone from deadlocks.
	   Either shot noqueue qdisc, it is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		/* The same CPU already owning the xmit lock means recursion. */
		if (txq->xmit_lock_owner != cpu) {

			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
				goto recursion_alert;

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_tx_queue_stopped(txq)) {
				__this_cpu_inc(xmit_recursion);
				rc = dev_hard_start_xmit(skb, dev, txq);
				__this_cpu_dec(xmit_recursion);
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			if (net_ratelimit())
				printk(KERN_CRIT "Virtual device %s asks to "
				       "queue packet!\n", dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
recursion_alert:
			if (net_ratelimit())
				printk(KERN_CRIT "Dead loop on virtual device "
				       "%s, fix it urgently!\n", dev->name);
		}
	}

	/* Every path that reaches here drops the packet. */
	rc = -ENETDOWN;
	rcu_read_unlock_bh();

	kfree_skb(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
2439EXPORT_SYMBOL(dev_queue_xmit);
2440
2441
2442/*=======================================================================
2443			Receiver routines
2444  =======================================================================*/
2445
/* Length limit for a CPU's input_pkt_queue, enforced in enqueue_to_backlog(). */
int netdev_max_backlog __read_mostly = 1000;
/*
 * Non-zero: netif_rx()/netif_receive_skb() call net_timestamp_check() before
 * the packet is queued; zero: __netif_receive_skb() does it instead.
 */
int netdev_tstamp_prequeue __read_mostly = 1;
/* NOTE(review): not used in this part of the file; presumably the softirq poll budget - confirm. */
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */
2450
2451/* Called with irq disabled */
2452static inline void ____napi_schedule(struct softnet_data *sd,
2453				     struct napi_struct *napi)
2454{
2455	list_add_tail(&napi->poll_list, &sd->poll_list);
2456	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2457}
2458
2459/*
2460 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2461 * and src/dst port numbers. Returns a non-zero hash number on success
2462 * and 0 on failure.
2463 */
2464__u32 __skb_get_rxhash(struct sk_buff *skb)
2465{
2466	int nhoff, hash = 0, poff;
2467	struct ipv6hdr *ip6;
2468	struct iphdr *ip;
2469	u8 ip_proto;
2470	u32 addr1, addr2, ihl;
2471	union {
2472		u32 v32;
2473		u16 v16[2];
2474	} ports;
2475
2476	nhoff = skb_network_offset(skb);
2477
2478	switch (skb->protocol) {
2479	case __constant_htons(ETH_P_IP):
2480		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2481			goto done;
2482
2483		ip = (struct iphdr *) (skb->data + nhoff);
2484		if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2485			ip_proto = 0;
2486		else
2487			ip_proto = ip->protocol;
2488		addr1 = (__force u32) ip->saddr;
2489		addr2 = (__force u32) ip->daddr;
2490		ihl = ip->ihl;
2491		break;
2492	case __constant_htons(ETH_P_IPV6):
2493		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2494			goto done;
2495
2496		ip6 = (struct ipv6hdr *) (skb->data + nhoff);
2497		ip_proto = ip6->nexthdr;
2498		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2499		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2500		ihl = (40 >> 2);
2501		break;
2502	default:
2503		goto done;
2504	}
2505
2506	ports.v32 = 0;
2507	poff = proto_ports_offset(ip_proto);
2508	if (poff >= 0) {
2509		nhoff += ihl * 4 + poff;
2510		if (pskb_may_pull(skb, nhoff + 4)) {
2511			ports.v32 = * (__force u32 *) (skb->data + nhoff);
2512			if (ports.v16[1] < ports.v16[0])
2513				swap(ports.v16[0], ports.v16[1]);
2514		}
2515	}
2516
2517	/* get a consistent hash (same value on both flow directions) */
2518	if (addr2 < addr1)
2519		swap(addr1, addr2);
2520
2521	hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2522	if (!hash)
2523		hash = 1;
2524
2525done:
2526	return hash;
2527}
2528EXPORT_SYMBOL(__skb_get_rxhash);
2529
2530#ifdef CONFIG_RPS
2531
/*
 * One global table that all flow-based protocols share.
 * get_rps_cpu() reads it under RCU and uses ents[rxhash & mask] as the
 * desired CPU for a flow.
 */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2534EXPORT_SYMBOL(rps_sock_flow_table);
2535
2536/*
2537 * get_rps_cpu is called from netif_receive_skb and returns the target
2538 * CPU from the RPS map of the receiving queue for a given skb.
2539 * rcu_read_lock must be held on entry.
2540 */
2541static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2542		       struct rps_dev_flow **rflowp)
2543{
2544	struct netdev_rx_queue *rxqueue;
2545	struct rps_map *map;
2546	struct rps_dev_flow_table *flow_table;
2547	struct rps_sock_flow_table *sock_flow_table;
2548	int cpu = -1;
2549	u16 tcpu;
2550
2551	if (skb_rx_queue_recorded(skb)) {
2552		u16 index = skb_get_rx_queue(skb);
2553		if (unlikely(index >= dev->real_num_rx_queues)) {
2554			WARN_ONCE(dev->real_num_rx_queues > 1,
2555				  "%s received packet on queue %u, but number "
2556				  "of RX queues is %u\n",
2557				  dev->name, index, dev->real_num_rx_queues);
2558			goto done;
2559		}
2560		rxqueue = dev->_rx + index;
2561	} else
2562		rxqueue = dev->_rx;
2563
2564	map = rcu_dereference(rxqueue->rps_map);
2565	if (map) {
2566		if (map->len == 1) {
2567			tcpu = map->cpus[0];
2568			if (cpu_online(tcpu))
2569				cpu = tcpu;
2570			goto done;
2571		}
2572	} else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2573		goto done;
2574	}
2575
2576	skb_reset_network_header(skb);
2577	if (!skb_get_rxhash(skb))
2578		goto done;
2579
2580	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2581	sock_flow_table = rcu_dereference(rps_sock_flow_table);
2582	if (flow_table && sock_flow_table) {
2583		u16 next_cpu;
2584		struct rps_dev_flow *rflow;
2585
2586		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2587		tcpu = rflow->cpu;
2588
2589		next_cpu = sock_flow_table->ents[skb->rxhash &
2590		    sock_flow_table->mask];
2591
2592		/*
2593		 * If the desired CPU (where last recvmsg was done) is
2594		 * different from current CPU (one in the rx-queue flow
2595		 * table entry), switch if one of the following holds:
2596		 *   - Current CPU is unset (equal to RPS_NO_CPU).
2597		 *   - Current CPU is offline.
2598		 *   - The current CPU's queue tail has advanced beyond the
2599		 *     last packet that was enqueued using this table entry.
2600		 *     This guarantees that all previous packets for the flow
2601		 *     have been dequeued, thus preserving in order delivery.
2602		 */
2603		if (unlikely(tcpu != next_cpu) &&
2604		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2605		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2606		      rflow->last_qtail)) >= 0)) {
2607			tcpu = rflow->cpu = next_cpu;
2608			if (tcpu != RPS_NO_CPU)
2609				rflow->last_qtail = per_cpu(softnet_data,
2610				    tcpu).input_queue_head;
2611		}
2612		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2613			*rflowp = rflow;
2614			cpu = tcpu;
2615			goto done;
2616		}
2617	}
2618
2619	if (map) {
2620		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2621
2622		if (cpu_online(tcpu)) {
2623			cpu = tcpu;
2624			goto done;
2625		}
2626	}
2627
2628done:
2629	return cpu;
2630}
2631
2632/* Called from hardirq (IPI) context */
2633static void rps_trigger_softirq(void *data)
2634{
2635	struct softnet_data *sd = data;
2636
2637	____napi_schedule(sd, &sd->backlog);
2638	sd->received_rps++;
2639}
2640
2641#endif /* CONFIG_RPS */
2642
/*
 * Check whether @sd belongs to another CPU.
 * If it does, link it into this CPU's rps_ipi_list, raise NET_RX_SOFTIRQ
 * and return 1.  Return 0 if @sd is this CPU's own softnet_data, or when
 * CONFIG_RPS is disabled.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *mysd = &__get_cpu_var(softnet_data);

	if (sd == mysd)
		return 0;

	/* Push the remote sd onto the head of our pending-IPI list. */
	sd->rps_ipi_next = mysd->rps_ipi_list;
	mysd->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
2663
2664/*
2665 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2666 * queue (may be a remote CPU queue).
2667 */
2668static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2669			      unsigned int *qtail)
2670{
2671	struct softnet_data *sd;
2672	unsigned long flags;
2673
2674	sd = &per_cpu(softnet_data, cpu);
2675
2676	local_irq_save(flags);
2677
2678	rps_lock(sd);
2679	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2680		if (skb_queue_len(&sd->input_pkt_queue)) {
2681enqueue:
2682			__skb_queue_tail(&sd->input_pkt_queue, skb);
2683			input_queue_tail_incr_save(sd, qtail);
2684			rps_unlock(sd);
2685			local_irq_restore(flags);
2686			return NET_RX_SUCCESS;
2687		}
2688
2689		/* Schedule NAPI for backlog device
2690		 * We can use non atomic operation since we own the queue lock
2691		 */
2692		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2693			if (!rps_ipi_queued(sd))
2694				____napi_schedule(sd, &sd->backlog);
2695		}
2696		goto enqueue;
2697	}
2698
2699	sd->dropped++;
2700	rps_unlock(sd);
2701
2702	local_irq_restore(flags);
2703
2704	atomic_long_inc(&skb->dev->rx_dropped);
2705	kfree_skb(skb);
2706	return NET_RX_DROP;
2707}
2708
2709/**
2710 *	netif_rx	-	post buffer to the network code
2711 *	@skb: buffer to post
2712 *
2713 *	This function receives a packet from a device driver and queues it for
2714 *	the upper (protocol) levels to process.  It always succeeds. The buffer
2715 *	may be dropped during processing for congestion control or by the
2716 *	protocol layers.
2717 *
2718 *	return values:
2719 *	NET_RX_SUCCESS	(no congestion)
2720 *	NET_RX_DROP     (packet was dropped)
2721 *
2722 */
2723
2724int netif_rx(struct sk_buff *skb)
2725{
2726	int ret;
2727
2728	/* if netpoll wants it, pretend we never saw it */
2729	if (netpoll_rx(skb))
2730		return NET_RX_DROP;
2731
2732	if (netdev_tstamp_prequeue)
2733		net_timestamp_check(skb);
2734
2735	trace_netif_rx(skb);
2736#ifdef CONFIG_RPS
2737	{
2738		struct rps_dev_flow voidflow, *rflow = &voidflow;
2739		int cpu;
2740
2741		preempt_disable();
2742		rcu_read_lock();
2743
2744		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2745		if (cpu < 0)
2746			cpu = smp_processor_id();
2747
2748		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2749
2750		rcu_read_unlock();
2751		preempt_enable();
2752	}
2753#else
2754	{
2755		unsigned int qtail;
2756		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2757		put_cpu();
2758	}
2759#endif
2760	return ret;
2761}
2762EXPORT_SYMBOL(netif_rx);
2763
/*
 * Variant of netif_rx() that, with preemption disabled, also runs any
 * softirq left pending after the packet has been queued.
 * Returns the value returned by netif_rx().
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	preempt_disable();

	ret = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
2776EXPORT_SYMBOL(netif_rx_ni);
2777
/*
 * Softirq handler for transmit-side work on the local CPU.  It frees the
 * skbs collected on this CPU's completion_queue and then runs (or
 * reschedules) every qdisc found on this CPU's output_queue.
 */
static void net_tx_action(struct softirq_action *h)
{
	struct softnet_data *sd = &__get_cpu_var(softnet_data);

	if (sd->completion_queue) {
		struct sk_buff *clist;

		/* Detach the whole list with irqs off, then free it with irqs on. */
		local_irq_disable();
		clist = sd->completion_queue;
		sd->completion_queue = NULL;
		local_irq_enable();

		while (clist) {
			struct sk_buff *skb = clist;
			clist = clist->next;

			/* Warns if the skb still has a non-zero user count. */
			WARN_ON(atomic_read(&skb->users));
			trace_kfree_skb(skb, net_tx_action);
			__kfree_skb(skb);
		}
	}

	if (sd->output_queue) {
		struct Qdisc *head;

		/* Detach the list and reset the tail pointer to the empty state. */
		local_irq_disable();
		head = sd->output_queue;
		sd->output_queue = NULL;
		sd->output_queue_tailp = &sd->output_queue;
		local_irq_enable();

		while (head) {
			struct Qdisc *q = head;
			spinlock_t *root_lock;

			head = head->next_sched;

			root_lock = qdisc_lock(q);
			if (spin_trylock(root_lock)) {
				smp_mb__before_clear_bit();
				clear_bit(__QDISC_STATE_SCHED,
					  &q->state);
				qdisc_run(q);
				spin_unlock(root_lock);
			} else {
				/*
				 * Lock is busy: try again later unless the
				 * qdisc has been deactivated, in which case
				 * only the SCHED bit is cleared.
				 */
				if (!test_bit(__QDISC_STATE_DEACTIVATED,
					      &q->state)) {
					__netif_reschedule(q);
				} else {
					smp_mb__before_clear_bit();
					clear_bit(__QDISC_STATE_SCHED,
						  &q->state);
				}
			}
		}
	}
}
2835
2836#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2837    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/*
 * This hook is defined here for ATM LANE.
 * NOTE(review): neither the code that sets it nor the code that calls it is
 * visible in this part of the file - presumably the bridge and LANE modules;
 * confirm.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
2841EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2842#endif
2843
2844#ifdef CONFIG_NET_CLS_ACT
2845/* TODO: Maybe we should just force sch_ingress to be compiled in
2846 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2847 * a compare and 2 stores extra right now if we dont have it on
2848 * but have CONFIG_NET_CLS_ACT
2849 * NOTE: This doesnt stop any functionality; if you dont have
2850 * the ingress scheduler, you just cant add policies on ingress.
2851 *
2852 */
2853static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2854{
2855	struct net_device *dev = skb->dev;
2856	u32 ttl = G_TC_RTTL(skb->tc_verd);
2857	int result = TC_ACT_OK;
2858	struct Qdisc *q;
2859
2860	if (unlikely(MAX_RED_LOOP < ttl++)) {
2861		if (net_ratelimit())
2862			pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
2863			       skb->skb_iif, dev->ifindex);
2864		return TC_ACT_SHOT;
2865	}
2866
2867	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2868	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2869
2870	q = rxq->qdisc;
2871	if (q != &noop_qdisc) {
2872		spin_lock(qdisc_lock(q));
2873		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2874			result = qdisc_enqueue_root(skb, q);
2875		spin_unlock(qdisc_lock(q));
2876	}
2877
2878	return result;
2879}
2880
2881static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2882					 struct packet_type **pt_prev,
2883					 int *ret, struct net_device *orig_dev)
2884{
2885	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
2886
2887	if (!rxq || rxq->qdisc == &noop_qdisc)
2888		goto out;
2889
2890	if (*pt_prev) {
2891		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2892		*pt_prev = NULL;
2893	}
2894
2895	switch (ing_filter(skb, rxq)) {
2896	case TC_ACT_SHOT:
2897	case TC_ACT_STOLEN:
2898		kfree_skb(skb);
2899		return NULL;
2900	}
2901
2902out:
2903	skb->tc_verd = 0;
2904	return skb;
2905}
2906#endif
2907
/**
 *	netdev_rx_handler_register - register receive handler
 *	@dev: device to register a handler for
 *	@rx_handler: receive handler to register
 *	@rx_handler_data: data pointer that is used by rx handler
 *
 *	Register a receive handler for a device. This handler will then be
 *	called from __netif_receive_skb. A negative errno code is returned
 *	on a failure: -EBUSY when the device already has a handler.
 *	Returns 0 on success.
 *
 *	The caller must hold the rtnl_mutex.
 */
int netdev_rx_handler_register(struct net_device *dev,
			       rx_handler_func_t *rx_handler,
			       void *rx_handler_data)
{
	ASSERT_RTNL();

	if (dev->rx_handler)
		return -EBUSY;

	/*
	 * The data pointer is published before the handler itself.
	 * NOTE(review): presumably so that a reader that sees the handler
	 * also sees its data - confirm against the readers.
	 */
	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
	rcu_assign_pointer(dev->rx_handler, rx_handler);

	return 0;
}
2934EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
2935
/**
 *	netdev_rx_handler_unregister - unregister receive handler
 *	@dev: device to unregister a handler from
 *
 *	Unregister a receive handler from a device.
 *
 *	The caller must hold the rtnl_mutex.
 */
void netdev_rx_handler_unregister(struct net_device *dev)
{

	ASSERT_RTNL();
	/* Cleared in the reverse order of registration: handler first, then data. */
	rcu_assign_pointer(dev->rx_handler, NULL);
	rcu_assign_pointer(dev->rx_handler_data, NULL);
}
2951EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
2952
2953static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2954					      struct net_device *master)
2955{
2956	if (skb->pkt_type == PACKET_HOST) {
2957		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2958
2959		memcpy(dest, master->dev_addr, ETH_ALEN);
2960	}
2961}
2962
2963/* On bonding slaves other than the currently active slave, suppress
2964 * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2965 * ARP on active-backup slaves with arp_validate enabled.
2966 */
2967int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2968{
2969	struct net_device *dev = skb->dev;
2970
2971	if (master->priv_flags & IFF_MASTER_ARPMON)
2972		dev->last_rx = jiffies;
2973
2974	if ((master->priv_flags & IFF_MASTER_ALB) &&
2975	    (master->priv_flags & IFF_BRIDGE_PORT)) {
2976		/* Do address unmangle. The local destination address
2977		 * will be always the one master has. Provides the right
2978		 * functionality in a bridge.
2979		 */
2980		skb_bond_set_mac_by_master(skb, master);
2981	}
2982
2983	if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
2984		if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
2985		    skb->protocol == __cpu_to_be16(ETH_P_ARP))
2986			return 0;
2987
2988		if (master->priv_flags & IFF_MASTER_ALB) {
2989			if (skb->pkt_type != PACKET_BROADCAST &&
2990			    skb->pkt_type != PACKET_MULTICAST)
2991				return 0;
2992		}
2993		if (master->priv_flags & IFF_MASTER_8023AD &&
2994		    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
2995			return 0;
2996
2997		return 1;
2998	}
2999	return 0;
3000}
3001EXPORT_SYMBOL(__skb_bond_should_drop);
3002
/*
 * Core receive path: deliver @skb to every matching packet handler.
 *
 * In order, the packet goes through: netpoll, bonding master/slave
 * adjustment, the ptype_all taps, ingress classification (when
 * CONFIG_NET_CLS_ACT is set), the device's rx_handler, VLAN hwaccel
 * processing and finally the protocol handlers in ptype_base.
 *
 * Delivery is deferred by one handler (pt_prev): all but the last matching
 * handler receive the skb through deliver_skb(), the last one is called
 * directly.  If no handler matched the skb is freed and counted as dropped.
 *
 * Returns NET_RX_DROP or the result of the last handler invoked.
 */
static int __netif_receive_skb(struct sk_buff *skb)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct net_device *orig_dev;
	struct net_device *master;
	struct net_device *null_or_orig;
	struct net_device *orig_or_bond;
	int ret = NET_RX_DROP;
	__be16 type;

	if (!netdev_tstamp_prequeue)
		net_timestamp_check(skb);

	trace_netif_receive_skb(skb);

	/* if we've gotten here through NAPI, check netpoll */
	if (netpoll_receive_skb(skb))
		return NET_RX_DROP;

	if (!skb->skb_iif)
		skb->skb_iif = skb->dev->ifindex;

	/*
	 * bonding note: skbs received on inactive slaves should only
	 * be delivered to pkt handlers that are exact matches.  Also
	 * the deliver_no_wcard flag will be set.  If packet handlers
	 * are sensitive to duplicate packets these skbs will need to
	 * be dropped at the handler.
	 */
	null_or_orig = NULL;
	orig_dev = skb->dev;
	master = ACCESS_ONCE(orig_dev->master);
	if (skb->deliver_no_wcard)
		null_or_orig = orig_dev;
	else if (master) {
		if (skb_bond_should_drop(skb, master)) {
			skb->deliver_no_wcard = 1;
			null_or_orig = orig_dev; /* deliver only exact match */
		} else
			skb->dev = master;
	}

	__this_cpu_inc(softnet_data.processed);
	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;

	pt_prev = NULL;

	rcu_read_lock();

#ifdef CONFIG_NET_CLS_ACT
	/* TC_NCLS set: clear it and skip the taps and ingress classification. */
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	/* Handlers registered for all protocols. */
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
		    ptype->dev == orig_dev) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

#ifdef CONFIG_NET_CLS_ACT
	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
ncls:
#endif

	/* Handle special case of bridge or macvlan */
	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (rx_handler) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		/* A NULL return means the handler took the skb. */
		skb = rx_handler(skb);
		if (!skb)
			goto out;
	}

	if (vlan_tx_tag_present(skb)) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		/* On a non-zero return the skb is run through this function again. */
		if (vlan_hwaccel_do_receive(&skb)) {
			ret = __netif_receive_skb(skb);
			goto out;
		} else if (unlikely(!skb))
			goto out;
	}

	/*
	 * Make sure frames received on VLAN interfaces stacked on
	 * bonding interfaces still make their way to any base bonding
	 * device that may have registered for a specific ptype.  The
	 * handler may have to adjust skb->dev and orig_dev.
	 */
	orig_or_bond = orig_dev;
	if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
	    (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
		orig_or_bond = vlan_dev_real_dev(skb->dev);
	}

	/* Handlers registered for this packet's protocol. */
	type = skb->protocol;
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type && (ptype->dev == null_or_orig ||
		     ptype->dev == skb->dev || ptype->dev == orig_dev ||
		     ptype->dev == orig_or_bond)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	if (pt_prev) {
		/* The last matching handler is called directly. */
		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
		atomic_long_inc(&skb->dev->rx_dropped);
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

out:
	rcu_read_unlock();
	return ret;
}
3141
3142/**
3143 *	netif_receive_skb - process receive buffer from network
3144 *	@skb: buffer to process
3145 *
3146 *	netif_receive_skb() is the main receive data processing function.
3147 *	It always succeeds. The buffer may be dropped during processing
3148 *	for congestion control or by the protocol layers.
3149 *
3150 *	This function may only be called from softirq context and interrupts
3151 *	should be enabled.
3152 *
3153 *	Return values (usually ignored):
3154 *	NET_RX_SUCCESS: no congestion
3155 *	NET_RX_DROP: packet was dropped
3156 */
3157int netif_receive_skb(struct sk_buff *skb)
3158{
3159	if (netdev_tstamp_prequeue)
3160		net_timestamp_check(skb);
3161
3162	if (skb_defer_rx_timestamp(skb))
3163		return NET_RX_SUCCESS;
3164
3165#ifdef CONFIG_RPS
3166	{
3167		struct rps_dev_flow voidflow, *rflow = &voidflow;
3168		int cpu, ret;
3169
3170		rcu_read_lock();
3171
3172		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3173
3174		if (cpu >= 0) {
3175			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3176			rcu_read_unlock();
3177		} else {
3178			rcu_read_unlock();
3179			ret = __netif_receive_skb(skb);
3180		}
3181
3182		return ret;
3183	}
3184#else
3185	return __netif_receive_skb(skb);
3186#endif
3187}
3188EXPORT_SYMBOL(netif_receive_skb);
3189
3190/* Network device is going away, flush any packets still pending
3191 * Called with irqs disabled.
3192 */
3193static void flush_backlog(void *arg)
3194{
3195	struct net_device *dev = arg;
3196	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3197	struct sk_buff *skb, *tmp;
3198
3199	rps_lock(sd);
3200	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3201		if (skb->dev == dev) {
3202			__skb_unlink(skb, &sd->input_pkt_queue);
3203			kfree_skb(skb);
3204			input_queue_head_incr(sd);
3205		}
3206	}
3207	rps_unlock(sd);
3208
3209	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3210		if (skb->dev == dev) {
3211			__skb_unlink(skb, &sd->process_queue);
3212			kfree_skb(skb);
3213			input_queue_head_incr(sd);
3214		}
3215	}
3216}
3217
3218static int napi_gro_complete(struct sk_buff *skb)
3219{
3220	struct packet_type *ptype;
3221	__be16 type = skb->protocol;
3222	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3223	int err = -ENOENT;
3224
3225	if (NAPI_GRO_CB(skb)->count == 1) {
3226		skb_shinfo(skb)->gso_size = 0;
3227		goto out;
3228	}
3229
3230	rcu_read_lock();
3231	list_for_each_entry_rcu(ptype, head, list) {
3232		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3233			continue;
3234
3235		err = ptype->gro_complete(skb);
3236		break;
3237	}
3238	rcu_read_unlock();
3239
3240	if (err) {
3241		WARN_ON(&ptype->list == head);
3242		kfree_skb(skb);
3243		return NET_RX_SUCCESS;
3244	}
3245
3246out:
3247	return netif_receive_skb(skb);
3248}
3249
3250inline void napi_gro_flush(struct napi_struct *napi)
3251{
3252	struct sk_buff *skb, *next;
3253
3254	for (skb = napi->gro_list; skb; skb = next) {
3255		next = skb->next;
3256		skb->next = NULL;
3257		napi_gro_complete(skb);
3258	}
3259
3260	napi->gro_count = 0;
3261	napi->gro_list = NULL;
3262}
3263EXPORT_SYMBOL(napi_gro_flush);
3264
3265enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3266{
3267	struct sk_buff **pp = NULL;
3268	struct packet_type *ptype;
3269	__be16 type = skb->protocol;
3270	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3271	int same_flow;
3272	int mac_len;
3273	enum gro_result ret;
3274
3275	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3276		goto normal;
3277
3278	if (skb_is_gso(skb) || skb_has_frag_list(skb))
3279		goto normal;
3280
3281	rcu_read_lock();
3282	list_for_each_entry_rcu(ptype, head, list) {
3283		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3284			continue;
3285
3286		skb_set_network_header(skb, skb_gro_offset(skb));
3287		mac_len = skb->network_header - skb->mac_header;
3288		skb->mac_len = mac_len;
3289		NAPI_GRO_CB(skb)->same_flow = 0;
3290		NAPI_GRO_CB(skb)->flush = 0;
3291		NAPI_GRO_CB(skb)->free = 0;
3292
3293		pp = ptype->gro_receive(&napi->gro_list, skb);
3294		break;
3295	}
3296	rcu_read_unlock();
3297
3298	if (&ptype->list == head)
3299		goto normal;
3300
3301	same_flow = NAPI_GRO_CB(skb)->same_flow;
3302	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3303
3304	if (pp) {
3305		struct sk_buff *nskb = *pp;
3306
3307		*pp = nskb->next;
3308		nskb->next = NULL;
3309		napi_gro_complete(nskb);
3310		napi->gro_count--;
3311	}
3312
3313	if (same_flow)
3314		goto ok;
3315
3316	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3317		goto normal;
3318
3319	napi->gro_count++;
3320	NAPI_GRO_CB(skb)->count = 1;
3321	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3322	skb->next = napi->gro_list;
3323	napi->gro_list = skb;
3324	ret = GRO_HELD;
3325
3326pull:
3327	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3328		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3329
3330		BUG_ON(skb->end - skb->tail < grow);
3331
3332		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3333
3334		skb->tail += grow;
3335		skb->data_len -= grow;
3336
3337		skb_shinfo(skb)->frags[0].page_offset += grow;
3338		skb_shinfo(skb)->frags[0].size -= grow;
3339
3340		if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3341			put_page(skb_shinfo(skb)->frags[0].page);
3342			memmove(skb_shinfo(skb)->frags,
3343				skb_shinfo(skb)->frags + 1,
3344				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3345		}
3346	}
3347
3348ok:
3349	return ret;
3350
3351normal:
3352	ret = GRO_NORMAL;
3353	goto pull;
3354}
3355EXPORT_SYMBOL(dev_gro_receive);
3356
3357static inline gro_result_t
3358__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3359{
3360	struct sk_buff *p;
3361
3362	for (p = napi->gro_list; p; p = p->next) {
3363		unsigned long diffs;
3364
3365		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3366		diffs |= p->vlan_tci ^ skb->vlan_tci;
3367		diffs |= compare_ether_header(skb_mac_header(p),
3368					      skb_gro_mac_header(skb));
3369		NAPI_GRO_CB(p)->same_flow = !diffs;
3370		NAPI_GRO_CB(p)->flush = 0;
3371	}
3372
3373	return dev_gro_receive(napi, skb);
3374}
3375
3376gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3377{
3378	switch (ret) {
3379	case GRO_NORMAL:
3380		if (netif_receive_skb(skb))
3381			ret = GRO_DROP;
3382		break;
3383
3384	case GRO_DROP:
3385	case GRO_MERGED_FREE:
3386		kfree_skb(skb);
3387		break;
3388
3389	case GRO_HELD:
3390	case GRO_MERGED:
3391		break;
3392	}
3393
3394	return ret;
3395}
3396EXPORT_SYMBOL(napi_skb_finish);
3397
3398void skb_gro_reset_offset(struct sk_buff *skb)
3399{
3400	NAPI_GRO_CB(skb)->data_offset = 0;
3401	NAPI_GRO_CB(skb)->frag0 = NULL;
3402	NAPI_GRO_CB(skb)->frag0_len = 0;
3403
3404	if (skb->mac_header == skb->tail &&
3405	    !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3406		NAPI_GRO_CB(skb)->frag0 =
3407			page_address(skb_shinfo(skb)->frags[0].page) +
3408			skb_shinfo(skb)->frags[0].page_offset;
3409		NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3410	}
3411}
3412EXPORT_SYMBOL(skb_gro_reset_offset);
3413
3414gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3415{
3416	skb_gro_reset_offset(skb);
3417
3418	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3419}
3420EXPORT_SYMBOL(napi_gro_receive);
3421
3422static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3423{
3424	__skb_pull(skb, skb_headlen(skb));
3425	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3426	skb->vlan_tci = 0;
3427
3428	napi->skb = skb;
3429}
3430
3431struct sk_buff *napi_get_frags(struct napi_struct *napi)
3432{
3433	struct sk_buff *skb = napi->skb;
3434
3435	if (!skb) {
3436		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3437		if (skb)
3438			napi->skb = skb;
3439	}
3440	return skb;
3441}
3442EXPORT_SYMBOL(napi_get_frags);
3443
3444gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3445			       gro_result_t ret)
3446{
3447	switch (ret) {
3448	case GRO_NORMAL:
3449	case GRO_HELD:
3450		skb->protocol = eth_type_trans(skb, skb->dev);
3451
3452		if (ret == GRO_HELD)
3453			skb_gro_pull(skb, -ETH_HLEN);
3454		else if (netif_receive_skb(skb))
3455			ret = GRO_DROP;
3456		break;
3457
3458	case GRO_DROP:
3459	case GRO_MERGED_FREE:
3460		napi_reuse_skb(napi, skb);
3461		break;
3462
3463	case GRO_MERGED:
3464		break;
3465	}
3466
3467	return ret;
3468}
3469EXPORT_SYMBOL(napi_frags_finish);
3470
3471struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3472{
3473	struct sk_buff *skb = napi->skb;
3474	struct ethhdr *eth;
3475	unsigned int hlen;
3476	unsigned int off;
3477
3478	napi->skb = NULL;
3479
3480	skb_reset_mac_header(skb);
3481	skb_gro_reset_offset(skb);
3482
3483	off = skb_gro_offset(skb);
3484	hlen = off + sizeof(*eth);
3485	eth = skb_gro_header_fast(skb, off);
3486	if (skb_gro_header_hard(skb, hlen)) {
3487		eth = skb_gro_header_slow(skb, hlen, off);
3488		if (unlikely(!eth)) {
3489			napi_reuse_skb(napi, skb);
3490			skb = NULL;
3491			goto out;
3492		}
3493	}
3494
3495	skb_gro_pull(skb, sizeof(*eth));
3496
3497	/*
3498	 * This works because the only protocols we care about don't require
3499	 * special handling.  We'll fix it up properly at the end.
3500	 */
3501	skb->protocol = eth->h_proto;
3502
3503out:
3504	return skb;
3505}
3506EXPORT_SYMBOL(napi_frags_skb);
3507
3508gro_result_t napi_gro_frags(struct napi_struct *napi)
3509{
3510	struct sk_buff *skb = napi_frags_skb(napi);
3511
3512	if (!skb)
3513		return GRO_DROP;
3514
3515	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3516}
3517EXPORT_SYMBOL(napi_gro_frags);
3518
/*
 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
 * Note: called with local irqs disabled, returns with local irqs enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remsd = sd->rps_ipi_list;

	if (!remsd) {
		local_irq_enable();
		return;
	}

	/* Detach the whole list while irqs are still off. */
	sd->rps_ipi_list = NULL;

	local_irq_enable();

	/* Kick RPS processing on every remote cpu that is still online. */
	do {
		struct softnet_data *next = remsd->rps_ipi_next;

		if (cpu_online(remsd->cpu))
			__smp_call_function_single(remsd->cpu,
						   &remsd->csd, 0);
		remsd = next;
	} while (remsd);
#else
	local_irq_enable();
#endif
}
3546
3547static int process_backlog(struct napi_struct *napi, int quota)
3548{
3549	int work = 0;
3550	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3551
3552#ifdef CONFIG_RPS
3553	/* Check if we have pending ipi, its better to send them now,
3554	 * not waiting net_rx_action() end.
3555	 */
3556	if (sd->rps_ipi_list) {
3557		local_irq_disable();
3558		net_rps_action_and_irq_enable(sd);
3559	}
3560#endif
3561	napi->weight = weight_p;
3562	local_irq_disable();
3563	while (work < quota) {
3564		struct sk_buff *skb;
3565		unsigned int qlen;
3566
3567		while ((skb = __skb_dequeue(&sd->process_queue))) {
3568			local_irq_enable();
3569			__netif_receive_skb(skb);
3570			local_irq_disable();
3571			input_queue_head_incr(sd);
3572			if (++work >= quota) {
3573				local_irq_enable();
3574				return work;
3575			}
3576		}
3577
3578		rps_lock(sd);
3579		qlen = skb_queue_len(&sd->input_pkt_queue);
3580		if (qlen)
3581			skb_queue_splice_tail_init(&sd->input_pkt_queue,
3582						   &sd->process_queue);
3583
3584		if (qlen < quota - work) {
3585			/*
3586			 * Inline a custom version of __napi_complete().
3587			 * only current cpu owns and manipulates this napi,
3588			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3589			 * we can use a plain write instead of clear_bit(),
3590			 * and we dont need an smp_mb() memory barrier.
3591			 */
3592			list_del(&napi->poll_list);
3593			napi->state = 0;
3594
3595			quota = work + qlen;
3596		}
3597		rps_unlock(sd);
3598	}
3599	local_irq_enable();
3600
3601	return work;
3602}
3603
3604/**
3605 * __napi_schedule - schedule for receive
3606 * @n: entry to schedule
3607 *
3608 * The entry's receive function will be scheduled to run
3609 */
3610void __napi_schedule(struct napi_struct *n)
3611{
3612	unsigned long flags;
3613
3614	local_irq_save(flags);
3615	____napi_schedule(&__get_cpu_var(softnet_data), n);
3616	local_irq_restore(flags);
3617}
3618EXPORT_SYMBOL(__napi_schedule);
3619
3620void __napi_complete(struct napi_struct *n)
3621{
3622	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3623	BUG_ON(n->gro_list);
3624
3625	list_del(&n->poll_list);
3626	smp_mb__before_clear_bit();
3627	clear_bit(NAPI_STATE_SCHED, &n->state);
3628}
3629EXPORT_SYMBOL(__napi_complete);
3630
3631void napi_complete(struct napi_struct *n)
3632{
3633	unsigned long flags;
3634
3635	/*
3636	 * don't let napi dequeue from the cpu poll list
3637	 * just in case its running on a different cpu
3638	 */
3639	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3640		return;
3641
3642	napi_gro_flush(n);
3643	local_irq_save(flags);
3644	__napi_complete(n);
3645	local_irq_restore(flags);
3646}
3647EXPORT_SYMBOL(napi_complete);
3648
3649void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3650		    int (*poll)(struct napi_struct *, int), int weight)
3651{
3652	INIT_LIST_HEAD(&napi->poll_list);
3653	napi->gro_count = 0;
3654	napi->gro_list = NULL;
3655	napi->skb = NULL;
3656	napi->poll = poll;
3657	napi->weight = weight;
3658	list_add(&napi->dev_list, &dev->napi_list);
3659	napi->dev = dev;
3660#ifdef CONFIG_NETPOLL
3661	spin_lock_init(&napi->poll_lock);
3662	napi->poll_owner = -1;
3663#endif
3664	set_bit(NAPI_STATE_SCHED, &napi->state);
3665}
3666EXPORT_SYMBOL(netif_napi_add);
3667
3668void netif_napi_del(struct napi_struct *napi)
3669{
3670	struct sk_buff *skb, *next;
3671
3672	list_del_init(&napi->dev_list);
3673	napi_free_frags(napi);
3674
3675	for (skb = napi->gro_list; skb; skb = next) {
3676		next = skb->next;
3677		skb->next = NULL;
3678		kfree_skb(skb);
3679	}
3680
3681	napi->gro_list = NULL;
3682	napi->gro_count = 0;
3683}
3684EXPORT_SYMBOL(netif_napi_del);
3685
3686static void net_rx_action(struct softirq_action *h)
3687{
3688	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3689	unsigned long time_limit = jiffies + 2;
3690	int budget = netdev_budget;
3691	void *have;
3692
3693	local_irq_disable();
3694
3695	while (!list_empty(&sd->poll_list)) {
3696		struct napi_struct *n;
3697		int work, weight;
3698
3699		/* If softirq window is exhuasted then punt.
3700		 * Allow this to run for 2 jiffies since which will allow
3701		 * an average latency of 1.5/HZ.
3702		 */
3703		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3704			goto softnet_break;
3705
3706		local_irq_enable();
3707
3708		/* Even though interrupts have been re-enabled, this
3709		 * access is safe because interrupts can only add new
3710		 * entries to the tail of this list, and only ->poll()
3711		 * calls can remove this head entry from the list.
3712		 */
3713		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3714
3715		have = netpoll_poll_lock(n);
3716
3717		weight = n->weight;
3718
3719		/* This NAPI_STATE_SCHED test is for avoiding a race
3720		 * with netpoll's poll_napi().  Only the entity which
3721		 * obtains the lock and sees NAPI_STATE_SCHED set will
3722		 * actually make the ->poll() call.  Therefore we avoid
3723		 * accidently calling ->poll() when NAPI is not scheduled.
3724		 */
3725		work = 0;
3726		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3727			work = n->poll(n, weight);
3728			trace_napi_poll(n);
3729		}
3730
3731		WARN_ON_ONCE(work > weight);
3732
3733		budget -= work;
3734
3735		local_irq_disable();
3736
3737		/* Drivers must not modify the NAPI state if they
3738		 * consume the entire weight.  In such cases this code
3739		 * still "owns" the NAPI instance and therefore can
3740		 * move the instance around on the list at-will.
3741		 */
3742		if (unlikely(work == weight)) {
3743			if (unlikely(napi_disable_pending(n))) {
3744				local_irq_enable();
3745				napi_complete(n);
3746				local_irq_disable();
3747			} else
3748				list_move_tail(&n->poll_list, &sd->poll_list);
3749		}
3750
3751		netpoll_poll_unlock(have);
3752	}
3753out:
3754	net_rps_action_and_irq_enable(sd);
3755
3756#ifdef CONFIG_NET_DMA
3757	/*
3758	 * There may not be any more sk_buffs coming right now, so push
3759	 * any pending DMA copies to hardware
3760	 */
3761	dma_issue_pending_all();
3762#endif
3763
3764	return;
3765
3766softnet_break:
3767	sd->time_squeeze++;
3768	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3769	goto out;
3770}
3771
3772static gifconf_func_t *gifconf_list[NPROTO];
3773
3774/**
3775 *	register_gifconf	-	register a SIOCGIF handler
3776 *	@family: Address family
3777 *	@gifconf: Function handler
3778 *
3779 *	Register protocol dependent address dumping routines. The handler
3780 *	that is passed must not be freed or reused until it has been replaced
3781 *	by another handler.
3782 */
3783int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3784{
3785	if (family >= NPROTO)
3786		return -EINVAL;
3787	gifconf_list[family] = gifconf;
3788	return 0;
3789}
3790EXPORT_SYMBOL(register_gifconf);
3791
3792
3793/*
3794 *	Map an interface index to its name (SIOCGIFNAME)
3795 */
3796
3797/*
3798 *	We need this ioctl for efficient implementation of the
3799 *	if_indextoname() function required by the IPv6 API.  Without
3800 *	it, we would have to search all the interfaces to find a
3801 *	match.  --pb
3802 */
3803
3804static int dev_ifname(struct net *net, struct ifreq __user *arg)
3805{
3806	struct net_device *dev;
3807	struct ifreq ifr;
3808
3809	/*
3810	 *	Fetch the caller's info block.
3811	 */
3812
3813	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3814		return -EFAULT;
3815
3816	rcu_read_lock();
3817	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3818	if (!dev) {
3819		rcu_read_unlock();
3820		return -ENODEV;
3821	}
3822
3823	strcpy(ifr.ifr_name, dev->name);
3824	rcu_read_unlock();
3825
3826	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3827		return -EFAULT;
3828	return 0;
3829}
3830
3831/*
3832 *	Perform a SIOCGIFCONF call. This structure will change
3833 *	size eventually, and there is nothing I can do about it.
3834 *	Thus we will need a 'compatibility mode'.
3835 */
3836
3837static int dev_ifconf(struct net *net, char __user *arg)
3838{
3839	struct ifconf ifc;
3840	struct net_device *dev;
3841	char __user *pos;
3842	int len;
3843	int total;
3844	int i;
3845
3846	/*
3847	 *	Fetch the caller's info block.
3848	 */
3849
3850	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3851		return -EFAULT;
3852
3853	pos = ifc.ifc_buf;
3854	len = ifc.ifc_len;
3855
3856	/*
3857	 *	Loop over the interfaces, and write an info block for each.
3858	 */
3859
3860	total = 0;
3861	for_each_netdev(net, dev) {
3862		for (i = 0; i < NPROTO; i++) {
3863			if (gifconf_list[i]) {
3864				int done;
3865				if (!pos)
3866					done = gifconf_list[i](dev, NULL, 0);
3867				else
3868					done = gifconf_list[i](dev, pos + total,
3869							       len - total);
3870				if (done < 0)
3871					return -EFAULT;
3872				total += done;
3873			}
3874		}
3875	}
3876
3877	/*
3878	 *	All done.  Write the updated control block back to the caller.
3879	 */
3880	ifc.ifc_len = total;
3881
3882	/*
3883	 * 	Both BSD and Solaris return 0 here, so we do too.
3884	 */
3885	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3886}
3887
3888#ifdef CONFIG_PROC_FS
3889/*
3890 *	This is invoked by the /proc filesystem handler to display a device
3891 *	in detail.
3892 */
3893void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3894	__acquires(RCU)
3895{
3896	struct net *net = seq_file_net(seq);
3897	loff_t off;
3898	struct net_device *dev;
3899
3900	rcu_read_lock();
3901	if (!*pos)
3902		return SEQ_START_TOKEN;
3903
3904	off = 1;
3905	for_each_netdev_rcu(net, dev)
3906		if (off++ == *pos)
3907			return dev;
3908
3909	return NULL;
3910}
3911
3912void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3913{
3914	struct net_device *dev = (v == SEQ_START_TOKEN) ?
3915				  first_net_device(seq_file_net(seq)) :
3916				  next_net_device((struct net_device *)v);
3917
3918	++*pos;
3919	return rcu_dereference(dev);
3920}
3921
3922void dev_seq_stop(struct seq_file *seq, void *v)
3923	__releases(RCU)
3924{
3925	rcu_read_unlock();
3926}
3927
3928static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3929{
3930	struct rtnl_link_stats64 temp;
3931	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
3932
3933	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
3934		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
3935		   dev->name, stats->rx_bytes, stats->rx_packets,
3936		   stats->rx_errors,
3937		   stats->rx_dropped + stats->rx_missed_errors,
3938		   stats->rx_fifo_errors,
3939		   stats->rx_length_errors + stats->rx_over_errors +
3940		    stats->rx_crc_errors + stats->rx_frame_errors,
3941		   stats->rx_compressed, stats->multicast,
3942		   stats->tx_bytes, stats->tx_packets,
3943		   stats->tx_errors, stats->tx_dropped,
3944		   stats->tx_fifo_errors, stats->collisions,
3945		   stats->tx_carrier_errors +
3946		    stats->tx_aborted_errors +
3947		    stats->tx_window_errors +
3948		    stats->tx_heartbeat_errors,
3949		   stats->tx_compressed);
3950}
3951
3952/*
3953 *	Called from the PROCfs module. This now uses the new arbitrary sized
3954 *	/proc/net interface to create /proc/net/dev
3955 */
3956static int dev_seq_show(struct seq_file *seq, void *v)
3957{
3958	if (v == SEQ_START_TOKEN)
3959		seq_puts(seq, "Inter-|   Receive                            "
3960			      "                    |  Transmit\n"
3961			      " face |bytes    packets errs drop fifo frame "
3962			      "compressed multicast|bytes    packets errs "
3963			      "drop fifo colls carrier compressed\n");
3964	else
3965		dev_seq_printf_stats(seq, v);
3966	return 0;
3967}
3968
3969static struct softnet_data *softnet_get_online(loff_t *pos)
3970{
3971	struct softnet_data *sd = NULL;
3972
3973	while (*pos < nr_cpu_ids)
3974		if (cpu_online(*pos)) {
3975			sd = &per_cpu(softnet_data, *pos);
3976			break;
3977		} else
3978			++*pos;
3979	return sd;
3980}
3981
3982static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3983{
3984	return softnet_get_online(pos);
3985}
3986
3987static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3988{
3989	++*pos;
3990	return softnet_get_online(pos);
3991}
3992
/* seq_file ->stop: nothing was acquired in ->start, so nothing to release. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
3996
3997static int softnet_seq_show(struct seq_file *seq, void *v)
3998{
3999	struct softnet_data *sd = v;
4000
4001	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4002		   sd->processed, sd->dropped, sd->time_squeeze, 0,
4003		   0, 0, 0, 0, /* was fastroute */
4004		   sd->cpu_collision, sd->received_rps);
4005	return 0;
4006}
4007
4008static const struct seq_operations dev_seq_ops = {
4009	.start = dev_seq_start,
4010	.next  = dev_seq_next,
4011	.stop  = dev_seq_stop,
4012	.show  = dev_seq_show,
4013};
4014
4015static int dev_seq_open(struct inode *inode, struct file *file)
4016{
4017	return seq_open_net(inode, file, &dev_seq_ops,
4018			    sizeof(struct seq_net_private));
4019}
4020
4021static const struct file_operations dev_seq_fops = {
4022	.owner	 = THIS_MODULE,
4023	.open    = dev_seq_open,
4024	.read    = seq_read,
4025	.llseek  = seq_lseek,
4026	.release = seq_release_net,
4027};
4028
4029static const struct seq_operations softnet_seq_ops = {
4030	.start = softnet_seq_start,
4031	.next  = softnet_seq_next,
4032	.stop  = softnet_seq_stop,
4033	.show  = softnet_seq_show,
4034};
4035
4036static int softnet_seq_open(struct inode *inode, struct file *file)
4037{
4038	return seq_open(file, &softnet_seq_ops);
4039}
4040
4041static const struct file_operations softnet_seq_fops = {
4042	.owner	 = THIS_MODULE,
4043	.open    = softnet_seq_open,
4044	.read    = seq_read,
4045	.llseek  = seq_lseek,
4046	.release = seq_release,
4047};
4048
4049static void *ptype_get_idx(loff_t pos)
4050{
4051	struct packet_type *pt = NULL;
4052	loff_t i = 0;
4053	int t;
4054
4055	list_for_each_entry_rcu(pt, &ptype_all, list) {
4056		if (i == pos)
4057			return pt;
4058		++i;
4059	}
4060
4061	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4062		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4063			if (i == pos)
4064				return pt;
4065			++i;
4066		}
4067	}
4068	return NULL;
4069}
4070
4071static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4072	__acquires(RCU)
4073{
4074	rcu_read_lock();
4075	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4076}
4077
4078static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4079{
4080	struct packet_type *pt;
4081	struct list_head *nxt;
4082	int hash;
4083
4084	++*pos;
4085	if (v == SEQ_START_TOKEN)
4086		return ptype_get_idx(0);
4087
4088	pt = v;
4089	nxt = pt->list.next;
4090	if (pt->type == htons(ETH_P_ALL)) {
4091		if (nxt != &ptype_all)
4092			goto found;
4093		hash = 0;
4094		nxt = ptype_base[0].next;
4095	} else
4096		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4097
4098	while (nxt == &ptype_base[hash]) {
4099		if (++hash >= PTYPE_HASH_SIZE)
4100			return NULL;
4101		nxt = ptype_base[hash].next;
4102	}
4103found:
4104	return list_entry(nxt, struct packet_type, list);
4105}
4106
4107static void ptype_seq_stop(struct seq_file *seq, void *v)
4108	__releases(RCU)
4109{
4110	rcu_read_unlock();
4111}
4112
4113static int ptype_seq_show(struct seq_file *seq, void *v)
4114{
4115	struct packet_type *pt = v;
4116
4117	if (v == SEQ_START_TOKEN)
4118		seq_puts(seq, "Type Device      Function\n");
4119	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4120		if (pt->type == htons(ETH_P_ALL))
4121			seq_puts(seq, "ALL ");
4122		else
4123			seq_printf(seq, "%04x", ntohs(pt->type));
4124
4125		seq_printf(seq, " %-8s %pF\n",
4126			   pt->dev ? pt->dev->name : "", pt->func);
4127	}
4128
4129	return 0;
4130}
4131
4132static const struct seq_operations ptype_seq_ops = {
4133	.start = ptype_seq_start,
4134	.next  = ptype_seq_next,
4135	.stop  = ptype_seq_stop,
4136	.show  = ptype_seq_show,
4137};
4138
4139static int ptype_seq_open(struct inode *inode, struct file *file)
4140{
4141	return seq_open_net(inode, file, &ptype_seq_ops,
4142			sizeof(struct seq_net_private));
4143}
4144
4145static const struct file_operations ptype_seq_fops = {
4146	.owner	 = THIS_MODULE,
4147	.open    = ptype_seq_open,
4148	.read    = seq_read,
4149	.llseek  = seq_lseek,
4150	.release = seq_release_net,
4151};
4152
4153
4154static int __net_init dev_proc_net_init(struct net *net)
4155{
4156	int rc = -ENOMEM;
4157
4158	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4159		goto out;
4160	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4161		goto out_dev;
4162	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4163		goto out_softnet;
4164
4165	if (wext_proc_init(net))
4166		goto out_ptype;
4167	rc = 0;
4168out:
4169	return rc;
4170out_ptype:
4171	proc_net_remove(net, "ptype");
4172out_softnet:
4173	proc_net_remove(net, "softnet_stat");
4174out_dev:
4175	proc_net_remove(net, "dev");
4176	goto out;
4177}
4178
4179static void __net_exit dev_proc_net_exit(struct net *net)
4180{
4181	wext_proc_exit(net);
4182
4183	proc_net_remove(net, "ptype");
4184	proc_net_remove(net, "softnet_stat");
4185	proc_net_remove(net, "dev");
4186}
4187
4188static struct pernet_operations __net_initdata dev_proc_ops = {
4189	.init = dev_proc_net_init,
4190	.exit = dev_proc_net_exit,
4191};
4192
4193static int __init dev_proc_init(void)
4194{
4195	return register_pernet_subsys(&dev_proc_ops);
4196}
4197#else
4198#define dev_proc_init() 0
4199#endif	/* CONFIG_PROC_FS */
4200
4201
4202/**
4203 *	netdev_set_master	-	set up master/slave pair
4204 *	@slave: slave device
4205 *	@master: new master device
4206 *
4207 *	Changes the master device of the slave. Pass %NULL to break the
4208 *	bonding. The caller must hold the RTNL semaphore. On a failure
4209 *	a negative errno code is returned. On success the reference counts
4210 *	are adjusted, %RTM_NEWLINK is sent to the routing socket and the
4211 *	function returns zero.
4212 */
4213int netdev_set_master(struct net_device *slave, struct net_device *master)
4214{
4215	struct net_device *old = slave->master;
4216
4217	ASSERT_RTNL();
4218
4219	if (master) {
4220		if (old)
4221			return -EBUSY;
4222		dev_hold(master);
4223	}
4224
4225	slave->master = master;
4226
4227	if (old) {
4228		synchronize_net();
4229		dev_put(old);
4230	}
4231	if (master)
4232		slave->flags |= IFF_SLAVE;
4233	else
4234		slave->flags &= ~IFF_SLAVE;
4235
4236	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4237	return 0;
4238}
4239EXPORT_SYMBOL(netdev_set_master);
4240
4241static void dev_change_rx_flags(struct net_device *dev, int flags)
4242{
4243	const struct net_device_ops *ops = dev->netdev_ops;
4244
4245	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4246		ops->ndo_change_rx_flags(dev, flags);
4247}
4248
4249static int __dev_set_promiscuity(struct net_device *dev, int inc)
4250{
4251	unsigned short old_flags = dev->flags;
4252	uid_t uid;
4253	gid_t gid;
4254
4255	ASSERT_RTNL();
4256
4257	dev->flags |= IFF_PROMISC;
4258	dev->promiscuity += inc;
4259	if (dev->promiscuity == 0) {
4260		/*
4261		 * Avoid overflow.
4262		 * If inc causes overflow, untouch promisc and return error.
4263		 */
4264		if (inc < 0)
4265			dev->flags &= ~IFF_PROMISC;
4266		else {
4267			dev->promiscuity -= inc;
4268			printk(KERN_WARNING "%s: promiscuity touches roof, "
4269				"set promiscuity failed, promiscuity feature "
4270				"of device might be broken.\n", dev->name);
4271			return -EOVERFLOW;
4272		}
4273	}
4274	if (dev->flags != old_flags) {
4275		printk(KERN_INFO "device %s %s promiscuous mode\n",
4276		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4277							       "left");
4278		if (audit_enabled) {
4279			current_uid_gid(&uid, &gid);
4280			audit_log(current->audit_context, GFP_ATOMIC,
4281				AUDIT_ANOM_PROMISCUOUS,
4282				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4283				dev->name, (dev->flags & IFF_PROMISC),
4284				(old_flags & IFF_PROMISC),
4285				audit_get_loginuid(current),
4286				uid, gid,
4287				audit_get_sessionid(current));
4288		}
4289
4290		dev_change_rx_flags(dev, IFF_PROMISC);
4291	}
4292	return 0;
4293}
4294
4295/**
4296 *	dev_set_promiscuity	- update promiscuity count on a device
4297 *	@dev: device
4298 *	@inc: modifier
4299 *
4300 *	Add or remove promiscuity from a device. While the count in the device
4301 *	remains above zero the interface remains promiscuous. Once it hits zero
4302 *	the device reverts back to normal filtering operation. A negative inc
4303 *	value is used to drop promiscuity on the device.
4304 *	Return 0 if successful or a negative errno code on error.
4305 */
4306int dev_set_promiscuity(struct net_device *dev, int inc)
4307{
4308	unsigned short old_flags = dev->flags;
4309	int err;
4310
4311	err = __dev_set_promiscuity(dev, inc);
4312	if (err < 0)
4313		return err;
4314	if (dev->flags != old_flags)
4315		dev_set_rx_mode(dev);
4316	return err;
4317}
4318EXPORT_SYMBOL(dev_set_promiscuity);
4319
4320/**
4321 *	dev_set_allmulti	- update allmulti count on a device
4322 *	@dev: device
4323 *	@inc: modifier
4324 *
4325 *	Add or remove reception of all multicast frames to a device. While the
4326 *	count in the device remains above zero the interface remains listening
4327 *	to all interfaces. Once it hits zero the device reverts back to normal
4328 *	filtering operation. A negative @inc value is used to drop the counter
4329 *	when releasing a resource needing all multicasts.
4330 *	Return 0 if successful or a negative errno code on error.
4331 */
4332
4333int dev_set_allmulti(struct net_device *dev, int inc)
4334{
4335	unsigned short old_flags = dev->flags;
4336
4337	ASSERT_RTNL();
4338
4339	dev->flags |= IFF_ALLMULTI;
4340	dev->allmulti += inc;
4341	if (dev->allmulti == 0) {
4342		/*
4343		 * Avoid overflow.
4344		 * If inc causes overflow, untouch allmulti and return error.
4345		 */
4346		if (inc < 0)
4347			dev->flags &= ~IFF_ALLMULTI;
4348		else {
4349			dev->allmulti -= inc;
4350			printk(KERN_WARNING "%s: allmulti touches roof, "
4351				"set allmulti failed, allmulti feature of "
4352				"device might be broken.\n", dev->name);
4353			return -EOVERFLOW;
4354		}
4355	}
4356	if (dev->flags ^ old_flags) {
4357		dev_change_rx_flags(dev, IFF_ALLMULTI);
4358		dev_set_rx_mode(dev);
4359	}
4360	return 0;
4361}
4362EXPORT_SYMBOL(dev_set_allmulti);
4363
4364/*
4365 *	Upload unicast and multicast address lists to device and
4366 *	configure RX filtering. When the device doesn't support unicast
4367 *	filtering it is put in promiscuous mode while unicast addresses
4368 *	are present.
4369 */
4370void __dev_set_rx_mode(struct net_device *dev)
4371{
4372	const struct net_device_ops *ops = dev->netdev_ops;
4373
4374	/* dev_open will call this function so the list will stay sane. */
4375	if (!(dev->flags&IFF_UP))
4376		return;
4377
4378	if (!netif_device_present(dev))
4379		return;
4380
4381	if (ops->ndo_set_rx_mode)
4382		ops->ndo_set_rx_mode(dev);
4383	else {
4384		/* Unicast addresses changes may only happen under the rtnl,
4385		 * therefore calling __dev_set_promiscuity here is safe.
4386		 */
4387		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4388			__dev_set_promiscuity(dev, 1);
4389			dev->uc_promisc = 1;
4390		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4391			__dev_set_promiscuity(dev, -1);
4392			dev->uc_promisc = 0;
4393		}
4394
4395		if (ops->ndo_set_multicast_list)
4396			ops->ndo_set_multicast_list(dev);
4397	}
4398}
4399
/* Locked wrapper: runs __dev_set_rx_mode() under the device address lock. */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
4406
4407/**
4408 *	dev_get_flags - get flags reported to userspace
4409 *	@dev: device
4410 *
4411 *	Get the combination of flag bits exported through APIs to userspace.
4412 */
4413unsigned dev_get_flags(const struct net_device *dev)
4414{
4415	unsigned flags;
4416
4417	flags = (dev->flags & ~(IFF_PROMISC |
4418				IFF_ALLMULTI |
4419				IFF_RUNNING |
4420				IFF_LOWER_UP |
4421				IFF_DORMANT)) |
4422		(dev->gflags & (IFF_PROMISC |
4423				IFF_ALLMULTI));
4424
4425	if (netif_running(dev)) {
4426		if (netif_oper_up(dev))
4427			flags |= IFF_RUNNING;
4428		if (netif_carrier_ok(dev))
4429			flags |= IFF_LOWER_UP;
4430		if (netif_dormant(dev))
4431			flags |= IFF_DORMANT;
4432	}
4433
4434	return flags;
4435}
4436EXPORT_SYMBOL(dev_get_flags);
4437
4438int __dev_change_flags(struct net_device *dev, unsigned int flags)
4439{
4440	int old_flags = dev->flags;
4441	int ret;
4442
4443	ASSERT_RTNL();
4444
4445	/*
4446	 *	Set the flags on our device.
4447	 */
4448
4449	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4450			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4451			       IFF_AUTOMEDIA)) |
4452		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4453				    IFF_ALLMULTI));
4454
4455	/*
4456	 *	Load in the correct multicast list now the flags have changed.
4457	 */
4458
4459	if ((old_flags ^ flags) & IFF_MULTICAST)
4460		dev_change_rx_flags(dev, IFF_MULTICAST);
4461
4462	dev_set_rx_mode(dev);
4463
4464	/*
4465	 *	Have we downed the interface. We handle IFF_UP ourselves
4466	 *	according to user attempts to set it, rather than blindly
4467	 *	setting it.
4468	 */
4469
4470	ret = 0;
4471	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4472		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4473
4474		if (!ret)
4475			dev_set_rx_mode(dev);
4476	}
4477
4478	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4479		int inc = (flags & IFF_PROMISC) ? 1 : -1;
4480
4481		dev->gflags ^= IFF_PROMISC;
4482		dev_set_promiscuity(dev, inc);
4483	}
4484
4485	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4486	   is important. Some (broken) drivers set IFF_PROMISC, when
4487	   IFF_ALLMULTI is requested not asking us and not reporting.
4488	 */
4489	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4490		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4491
4492		dev->gflags ^= IFF_ALLMULTI;
4493		dev_set_allmulti(dev, inc);
4494	}
4495
4496	return ret;
4497}
4498
4499void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4500{
4501	unsigned int changes = dev->flags ^ old_flags;
4502
4503	if (changes & IFF_UP) {
4504		if (dev->flags & IFF_UP)
4505			call_netdevice_notifiers(NETDEV_UP, dev);
4506		else
4507			call_netdevice_notifiers(NETDEV_DOWN, dev);
4508	}
4509
4510	if (dev->flags & IFF_UP &&
4511	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4512		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4513}
4514
4515/**
4516 *	dev_change_flags - change device settings
4517 *	@dev: device
4518 *	@flags: device state flags
4519 *
4520 *	Change settings on device based state flags. The flags are
4521 *	in the userspace exported format.
4522 */
4523int dev_change_flags(struct net_device *dev, unsigned flags)
4524{
4525	int ret, changes;
4526	int old_flags = dev->flags;
4527
4528	ret = __dev_change_flags(dev, flags);
4529	if (ret < 0)
4530		return ret;
4531
4532	changes = old_flags ^ dev->flags;
4533	if (changes)
4534		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4535
4536	__dev_notify_flags(dev, old_flags);
4537	return ret;
4538}
4539EXPORT_SYMBOL(dev_change_flags);
4540
4541/**
4542 *	dev_set_mtu - Change maximum transfer unit
4543 *	@dev: device
4544 *	@new_mtu: new transfer unit
4545 *
4546 *	Change the maximum transfer size of the network device.
4547 */
4548int dev_set_mtu(struct net_device *dev, int new_mtu)
4549{
4550	const struct net_device_ops *ops = dev->netdev_ops;
4551	int err;
4552
4553	if (new_mtu == dev->mtu)
4554		return 0;
4555
4556	/*	MTU must be positive.	 */
4557	if (new_mtu < 0)
4558		return -EINVAL;
4559
4560	if (!netif_device_present(dev))
4561		return -ENODEV;
4562
4563	err = 0;
4564	if (ops->ndo_change_mtu)
4565		err = ops->ndo_change_mtu(dev, new_mtu);
4566	else
4567		dev->mtu = new_mtu;
4568
4569	if (!err && dev->flags & IFF_UP)
4570		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4571	return err;
4572}
4573EXPORT_SYMBOL(dev_set_mtu);
4574
4575/**
4576 *	dev_set_mac_address - Change Media Access Control Address
4577 *	@dev: device
4578 *	@sa: new address
4579 *
4580 *	Change the hardware (MAC) address of the device
4581 */
4582int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4583{
4584	const struct net_device_ops *ops = dev->netdev_ops;
4585	int err;
4586
4587	if (!ops->ndo_set_mac_address)
4588		return -EOPNOTSUPP;
4589	if (sa->sa_family != dev->type)
4590		return -EINVAL;
4591	if (!netif_device_present(dev))
4592		return -ENODEV;
4593	err = ops->ndo_set_mac_address(dev, sa);
4594	if (!err)
4595		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4596	return err;
4597}
4598EXPORT_SYMBOL(dev_set_mac_address);
4599
4600/*
4601 *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4602 */
4603static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4604{
4605	int err;
4606	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4607
4608	if (!dev)
4609		return -ENODEV;
4610
4611	switch (cmd) {
4612	case SIOCGIFFLAGS:	/* Get interface flags */
4613		ifr->ifr_flags = (short) dev_get_flags(dev);
4614		return 0;
4615
4616	case SIOCGIFMETRIC:	/* Get the metric on the interface
4617				   (currently unused) */
4618		ifr->ifr_metric = 0;
4619		return 0;
4620
4621	case SIOCGIFMTU:	/* Get the MTU of a device */
4622		ifr->ifr_mtu = dev->mtu;
4623		return 0;
4624
4625	case SIOCGIFHWADDR:
4626		if (!dev->addr_len)
4627			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4628		else
4629			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4630			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4631		ifr->ifr_hwaddr.sa_family = dev->type;
4632		return 0;
4633
4634	case SIOCGIFSLAVE:
4635		err = -EINVAL;
4636		break;
4637
4638	case SIOCGIFMAP:
4639		ifr->ifr_map.mem_start = dev->mem_start;
4640		ifr->ifr_map.mem_end   = dev->mem_end;
4641		ifr->ifr_map.base_addr = dev->base_addr;
4642		ifr->ifr_map.irq       = dev->irq;
4643		ifr->ifr_map.dma       = dev->dma;
4644		ifr->ifr_map.port      = dev->if_port;
4645		return 0;
4646
4647	case SIOCGIFINDEX:
4648		ifr->ifr_ifindex = dev->ifindex;
4649		return 0;
4650
4651	case SIOCGIFTXQLEN:
4652		ifr->ifr_qlen = dev->tx_queue_len;
4653		return 0;
4654
4655	default:
4656		/* dev_ioctl() should ensure this case
4657		 * is never reached
4658		 */
4659		WARN_ON(1);
4660		err = -EINVAL;
4661		break;
4662
4663	}
4664	return err;
4665}
4666
4667/*
4668 *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
4669 */
4670static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4671{
4672	int err;
4673	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4674	const struct net_device_ops *ops;
4675
4676	if (!dev)
4677		return -ENODEV;
4678
4679	ops = dev->netdev_ops;
4680
4681	switch (cmd) {
4682	case SIOCSIFFLAGS:	/* Set interface flags */
4683		return dev_change_flags(dev, ifr->ifr_flags);
4684
4685	case SIOCSIFMETRIC:	/* Set the metric on the interface
4686				   (currently unused) */
4687		return -EOPNOTSUPP;
4688
4689	case SIOCSIFMTU:	/* Set the MTU of a device */
4690		return dev_set_mtu(dev, ifr->ifr_mtu);
4691
4692	case SIOCSIFHWADDR:
4693		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4694
4695	case SIOCSIFHWBROADCAST:
4696		if (ifr->ifr_hwaddr.sa_family != dev->type)
4697			return -EINVAL;
4698		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4699		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4700		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4701		return 0;
4702
4703	case SIOCSIFMAP:
4704		if (ops->ndo_set_config) {
4705			if (!netif_device_present(dev))
4706				return -ENODEV;
4707			return ops->ndo_set_config(dev, &ifr->ifr_map);
4708		}
4709		return -EOPNOTSUPP;
4710
4711	case SIOCADDMULTI:
4712		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4713		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4714			return -EINVAL;
4715		if (!netif_device_present(dev))
4716			return -ENODEV;
4717		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4718
4719	case SIOCDELMULTI:
4720		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4721		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4722			return -EINVAL;
4723		if (!netif_device_present(dev))
4724			return -ENODEV;
4725		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4726
4727	case SIOCSIFTXQLEN:
4728		if (ifr->ifr_qlen < 0)
4729			return -EINVAL;
4730		dev->tx_queue_len = ifr->ifr_qlen;
4731		return 0;
4732
4733	case SIOCSIFNAME:
4734		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4735		return dev_change_name(dev, ifr->ifr_newname);
4736
4737	/*
4738	 *	Unknown or private ioctl
4739	 */
4740	default:
4741		if ((cmd >= SIOCDEVPRIVATE &&
4742		    cmd <= SIOCDEVPRIVATE + 15) ||
4743		    cmd == SIOCBONDENSLAVE ||
4744		    cmd == SIOCBONDRELEASE ||
4745		    cmd == SIOCBONDSETHWADDR ||
4746		    cmd == SIOCBONDSLAVEINFOQUERY ||
4747		    cmd == SIOCBONDINFOQUERY ||
4748		    cmd == SIOCBONDCHANGEACTIVE ||
4749		    cmd == SIOCGMIIPHY ||
4750		    cmd == SIOCGMIIREG ||
4751		    cmd == SIOCSMIIREG ||
4752		    cmd == SIOCBRADDIF ||
4753		    cmd == SIOCBRDELIF ||
4754		    cmd == SIOCSHWTSTAMP ||
4755		    cmd == SIOCWANDEV) {
4756			err = -EOPNOTSUPP;
4757			if (ops->ndo_do_ioctl) {
4758				if (netif_device_present(dev))
4759					err = ops->ndo_do_ioctl(dev, ifr, cmd);
4760				else
4761					err = -ENODEV;
4762			}
4763		} else
4764			err = -EINVAL;
4765
4766	}
4767	return err;
4768}
4769
4770/*
4771 *	This function handles all "interface"-type I/O control requests. The actual
4772 *	'doing' part of this is dev_ifsioc above.
4773 */
4774
4775/**
4776 *	dev_ioctl	-	network device ioctl
4777 *	@net: the applicable net namespace
4778 *	@cmd: command to issue
4779 *	@arg: pointer to a struct ifreq in user space
4780 *
4781 *	Issue ioctl functions to devices. This is normally called by the
4782 *	user space syscall interfaces but can sometimes be useful for
4783 *	other purposes. The return value is the return from the syscall if
4784 *	positive or a negative errno code on error.
4785 */
4786
4787int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4788{
4789	struct ifreq ifr;
4790	int ret;
4791	char *colon;
4792
4793	/* One special case: SIOCGIFCONF takes ifconf argument
4794	   and requires shared lock, because it sleeps writing
4795	   to user space.
4796	 */
4797
4798	if (cmd == SIOCGIFCONF) {
4799		rtnl_lock();
4800		ret = dev_ifconf(net, (char __user *) arg);
4801		rtnl_unlock();
4802		return ret;
4803	}
4804	if (cmd == SIOCGIFNAME)
4805		return dev_ifname(net, (struct ifreq __user *)arg);
4806
4807	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4808		return -EFAULT;
4809
4810	ifr.ifr_name[IFNAMSIZ-1] = 0;
4811
4812	colon = strchr(ifr.ifr_name, ':');
4813	if (colon)
4814		*colon = 0;
4815
4816	/*
4817	 *	See which interface the caller is talking about.
4818	 */
4819
4820	switch (cmd) {
4821	/*
4822	 *	These ioctl calls:
4823	 *	- can be done by all.
4824	 *	- atomic and do not require locking.
4825	 *	- return a value
4826	 */
4827	case SIOCGIFFLAGS:
4828	case SIOCGIFMETRIC:
4829	case SIOCGIFMTU:
4830	case SIOCGIFHWADDR:
4831	case SIOCGIFSLAVE:
4832	case SIOCGIFMAP:
4833	case SIOCGIFINDEX:
4834	case SIOCGIFTXQLEN:
4835		dev_load(net, ifr.ifr_name);
4836		rcu_read_lock();
4837		ret = dev_ifsioc_locked(net, &ifr, cmd);
4838		rcu_read_unlock();
4839		if (!ret) {
4840			if (colon)
4841				*colon = ':';
4842			if (copy_to_user(arg, &ifr,
4843					 sizeof(struct ifreq)))
4844				ret = -EFAULT;
4845		}
4846		return ret;
4847
4848	case SIOCETHTOOL:
4849		dev_load(net, ifr.ifr_name);
4850		rtnl_lock();
4851		ret = dev_ethtool(net, &ifr);
4852		rtnl_unlock();
4853		if (!ret) {
4854			if (colon)
4855				*colon = ':';
4856			if (copy_to_user(arg, &ifr,
4857					 sizeof(struct ifreq)))
4858				ret = -EFAULT;
4859		}
4860		return ret;
4861
4862	/*
4863	 *	These ioctl calls:
4864	 *	- require superuser power.
4865	 *	- require strict serialization.
4866	 *	- return a value
4867	 */
4868	case SIOCGMIIPHY:
4869	case SIOCGMIIREG:
4870	case SIOCSIFNAME:
4871		if (!capable(CAP_NET_ADMIN))
4872			return -EPERM;
4873		dev_load(net, ifr.ifr_name);
4874		rtnl_lock();
4875		ret = dev_ifsioc(net, &ifr, cmd);
4876		rtnl_unlock();
4877		if (!ret) {
4878			if (colon)
4879				*colon = ':';
4880			if (copy_to_user(arg, &ifr,
4881					 sizeof(struct ifreq)))
4882				ret = -EFAULT;
4883		}
4884		return ret;
4885
4886	/*
4887	 *	These ioctl calls:
4888	 *	- require superuser power.
4889	 *	- require strict serialization.
4890	 *	- do not return a value
4891	 */
4892	case SIOCSIFFLAGS:
4893	case SIOCSIFMETRIC:
4894	case SIOCSIFMTU:
4895	case SIOCSIFMAP:
4896	case SIOCSIFHWADDR:
4897	case SIOCSIFSLAVE:
4898	case SIOCADDMULTI:
4899	case SIOCDELMULTI:
4900	case SIOCSIFHWBROADCAST:
4901	case SIOCSIFTXQLEN:
4902	case SIOCSMIIREG:
4903	case SIOCBONDENSLAVE:
4904	case SIOCBONDRELEASE:
4905	case SIOCBONDSETHWADDR:
4906	case SIOCBONDCHANGEACTIVE:
4907	case SIOCBRADDIF:
4908	case SIOCBRDELIF:
4909	case SIOCSHWTSTAMP:
4910		if (!capable(CAP_NET_ADMIN))
4911			return -EPERM;
4912		/* fall through */
4913	case SIOCBONDSLAVEINFOQUERY:
4914	case SIOCBONDINFOQUERY:
4915		dev_load(net, ifr.ifr_name);
4916		rtnl_lock();
4917		ret = dev_ifsioc(net, &ifr, cmd);
4918		rtnl_unlock();
4919		return ret;
4920
4921	case SIOCGIFMEM:
4922		/* Get the per device memory space. We can add this but
4923		 * currently do not support it */
4924	case SIOCSIFMEM:
4925		/* Set the per device memory buffer space.
4926		 * Not applicable in our case */
4927	case SIOCSIFLINK:
4928		return -EINVAL;
4929
4930	/*
4931	 *	Unknown or private ioctl.
4932	 */
4933	default:
4934		if (cmd == SIOCWANDEV ||
4935		    (cmd >= SIOCDEVPRIVATE &&
4936		     cmd <= SIOCDEVPRIVATE + 15)) {
4937			dev_load(net, ifr.ifr_name);
4938			rtnl_lock();
4939			ret = dev_ifsioc(net, &ifr, cmd);
4940			rtnl_unlock();
4941			if (!ret && copy_to_user(arg, &ifr,
4942						 sizeof(struct ifreq)))
4943				ret = -EFAULT;
4944			return ret;
4945		}
4946		/* Take care of Wireless Extensions */
4947		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4948			return wext_handle_ioctl(net, &ifr, cmd, arg);
4949		return -EINVAL;
4950	}
4951}
4952
4953
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns a suitable unique value for a new device interface
 *	number.  The caller must hold the rtnl semaphore or the
 *	dev_base_lock to be sure it remains unique.
 *
 *	Candidates come from a function-static counter that restarts at 1
 *	whenever the increment leaves it at or below zero; the first value
 *	for which __dev_get_by_index() finds no device in @net is returned.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	do {
		++ifindex;
		if (ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
4972
4973/* Delayed registration/unregisteration */
4974static LIST_HEAD(net_todo_list);
4975
4976static void net_set_todo(struct net_device *dev)
4977{
4978	list_add_tail(&dev->todo_list, &net_todo_list);
4979}
4980
4981static void rollback_registered_many(struct list_head *head)
4982{
4983	struct net_device *dev, *tmp;
4984
4985	BUG_ON(dev_boot_phase);
4986	ASSERT_RTNL();
4987
4988	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4989		/* Some devices call without registering
4990		 * for initialization unwind. Remove those
4991		 * devices and proceed with the remaining.
4992		 */
4993		if (dev->reg_state == NETREG_UNINITIALIZED) {
4994			pr_debug("unregister_netdevice: device %s/%p never "
4995				 "was registered\n", dev->name, dev);
4996
4997			WARN_ON(1);
4998			list_del(&dev->unreg_list);
4999			continue;
5000		}
5001
5002		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5003	}
5004
5005	/* If device is running, close it first. */
5006	dev_close_many(head);
5007
5008	list_for_each_entry(dev, head, unreg_list) {
5009		/* And unlink it from device chain. */
5010		unlist_netdevice(dev);
5011
5012		dev->reg_state = NETREG_UNREGISTERING;
5013	}
5014
5015	synchronize_net();
5016
5017	list_for_each_entry(dev, head, unreg_list) {
5018		/* Shutdown queueing discipline. */
5019		dev_shutdown(dev);
5020
5021
5022		/* Notify protocols, that we are about to destroy
5023		   this device. They should clean all the things.
5024		*/
5025		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5026
5027		if (!dev->rtnl_link_ops ||
5028		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5029			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5030
5031		/*
5032		 *	Flush the unicast and multicast chains
5033		 */
5034		dev_uc_flush(dev);
5035		dev_mc_flush(dev);
5036
5037		if (dev->netdev_ops->ndo_uninit)
5038			dev->netdev_ops->ndo_uninit(dev);
5039
5040		/* Notifier chain MUST detach us from master device. */
5041		WARN_ON(dev->master);
5042
5043		/* Remove entries from kobject tree */
5044		netdev_unregister_kobject(dev);
5045	}
5046
5047	/* Process any work delayed until the end of the batch */
5048	dev = list_first_entry(head, struct net_device, unreg_list);
5049	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5050
5051	rcu_barrier();
5052
5053	list_for_each_entry(dev, head, unreg_list)
5054		dev_put(dev);
5055}
5056
5057static void rollback_registered(struct net_device *dev)
5058{
5059	LIST_HEAD(single);
5060
5061	list_add(&dev->unreg_list, &single);
5062	rollback_registered_many(&single);
5063}
5064
5065unsigned long netdev_fix_features(unsigned long features, const char *name)
5066{
5067	/* Fix illegal SG+CSUM combinations. */
5068	if ((features & NETIF_F_SG) &&
5069	    !(features & NETIF_F_ALL_CSUM)) {
5070		if (name)
5071			printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
5072			       "checksum feature.\n", name);
5073		features &= ~NETIF_F_SG;
5074	}
5075
5076	/* TSO requires that SG is present as well. */
5077	if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
5078		if (name)
5079			printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
5080			       "SG feature.\n", name);
5081		features &= ~NETIF_F_TSO;
5082	}
5083
5084	if (features & NETIF_F_UFO) {
5085		/* maybe split UFO into V4 and V6? */
5086		if (!((features & NETIF_F_GEN_CSUM) ||
5087		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5088			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5089			if (name)
5090				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
5091				       "since no checksum offload features.\n",
5092				       name);
5093			features &= ~NETIF_F_UFO;
5094		}
5095
5096		if (!(features & NETIF_F_SG)) {
5097			if (name)
5098				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
5099				       "since no NETIF_F_SG feature.\n", name);
5100			features &= ~NETIF_F_UFO;
5101		}
5102	}
5103
5104	return features;
5105}
5106EXPORT_SYMBOL(netdev_fix_features);
5107
5108/**
5109 *	netif_stacked_transfer_operstate -	transfer operstate
5110 *	@rootdev: the root or lower level device to transfer state from
5111 *	@dev: the device to transfer operstate to
5112 *
5113 *	Transfer operational state from root to device. This is normally
5114 *	called when a stacking relationship exists between the root
5115 *	device and the device(a leaf device).
5116 */
5117void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5118					struct net_device *dev)
5119{
5120	if (rootdev->operstate == IF_OPER_DORMANT)
5121		netif_dormant_on(dev);
5122	else
5123		netif_dormant_off(dev);
5124
5125	if (netif_carrier_ok(rootdev)) {
5126		if (!netif_carrier_ok(dev))
5127			netif_carrier_on(dev);
5128	} else {
5129		if (netif_carrier_ok(dev))
5130			netif_carrier_off(dev);
5131	}
5132}
5133EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5134
#ifdef CONFIG_RPS
/*
 *	Allocate the zeroed array of dev->num_rx_queues RX queue structures,
 *	store it in dev->_rx and point every entry back at @dev.
 *	Returns 0, or -ENOMEM when the allocation fails.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int nr = dev->num_rx_queues;
	struct netdev_rx_queue *rx, *q;

	BUG_ON(nr < 1);

	rx = kcalloc(nr, sizeof(*rx), GFP_KERNEL);
	if (!rx) {
		pr_err("netdev: Unable to allocate %u rx queues.\n", nr);
		return -ENOMEM;
	}
	dev->_rx = rx;

	for (q = rx; q < rx + nr; q++)
		q->dev = dev;
	return 0;
}
#endif
5155
5156static void netdev_init_one_queue(struct net_device *dev,
5157				  struct netdev_queue *queue, void *_unused)
5158{
5159	/* Initialize queue lock */
5160	spin_lock_init(&queue->_xmit_lock);
5161	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5162	queue->xmit_lock_owner = -1;
5163	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5164	queue->dev = dev;
5165}
5166
5167static int netif_alloc_netdev_queues(struct net_device *dev)
5168{
5169	unsigned int count = dev->num_tx_queues;
5170	struct netdev_queue *tx;
5171
5172	BUG_ON(count < 1);
5173
5174	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5175	if (!tx) {
5176		pr_err("netdev: Unable to allocate %u tx queues.\n",
5177		       count);
5178		return -ENOMEM;
5179	}
5180	dev->_tx = tx;
5181
5182	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5183	spin_lock_init(&dev->tx_global_lock);
5184
5185	return 0;
5186}
5187
5188/**
5189 *	register_netdevice	- register a network device
5190 *	@dev: device to register
5191 *
5192 *	Take a completed network device structure and add it to the kernel
5193 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5194 *	chain. 0 is returned on success. A negative errno code is returned
5195 *	on a failure to set up the device, or if the name is a duplicate.
5196 *
5197 *	Callers must hold the rtnl semaphore. You may want
5198 *	register_netdev() instead of this.
5199 *
5200 *	BUGS:
5201 *	The locking appears insufficient to guarantee two parallel registers
5202 *	will not get the same name.
5203 */
5204
5205int register_netdevice(struct net_device *dev)
5206{
5207	int ret;
5208	struct net *net = dev_net(dev);
5209
5210	BUG_ON(dev_boot_phase);
5211	ASSERT_RTNL();
5212
5213	might_sleep();
5214
5215	/* When net_device's are persistent, this will be fatal. */
5216	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5217	BUG_ON(!net);
5218
5219	spin_lock_init(&dev->addr_list_lock);
5220	netdev_set_addr_lockdep_class(dev);
5221
5222	dev->iflink = -1;
5223
5224	/* Init, if this function is available */
5225	if (dev->netdev_ops->ndo_init) {
5226		ret = dev->netdev_ops->ndo_init(dev);
5227		if (ret) {
5228			if (ret > 0)
5229				ret = -EIO;
5230			goto out;
5231		}
5232	}
5233
5234	ret = dev_get_valid_name(dev, dev->name, 0);
5235	if (ret)
5236		goto err_uninit;
5237
5238	dev->ifindex = dev_new_index(net);
5239	if (dev->iflink == -1)
5240		dev->iflink = dev->ifindex;
5241
5242	/* Fix illegal checksum combinations */
5243	if ((dev->features & NETIF_F_HW_CSUM) &&
5244	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5245		printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
5246		       dev->name);
5247		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5248	}
5249
5250	if ((dev->features & NETIF_F_NO_CSUM) &&
5251	    (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5252		printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
5253		       dev->name);
5254		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5255	}
5256
5257	dev->features = netdev_fix_features(dev->features, dev->name);
5258
5259	/* Enable software GSO if SG is supported. */
5260	if (dev->features & NETIF_F_SG)
5261		dev->features |= NETIF_F_GSO;
5262
5263	/* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
5264	 * vlan_dev_init() will do the dev->features check, so these features
5265	 * are enabled only if supported by underlying device.
5266	 */
5267	dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5268
5269	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5270	ret = notifier_to_errno(ret);
5271	if (ret)
5272		goto err_uninit;
5273
5274	ret = netdev_register_kobject(dev);
5275	if (ret)
5276		goto err_uninit;
5277	dev->reg_state = NETREG_REGISTERED;
5278
5279	/*
5280	 *	Default initial state at registry is that the
5281	 *	device is present.
5282	 */
5283
5284	set_bit(__LINK_STATE_PRESENT, &dev->state);
5285
5286	dev_init_scheduler(dev);
5287	dev_hold(dev);
5288	list_netdevice(dev);
5289
5290	/* Notify protocols, that a new device appeared. */
5291	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5292	ret = notifier_to_errno(ret);
5293	if (ret) {
5294		rollback_registered(dev);
5295		dev->reg_state = NETREG_UNREGISTERED;
5296	}
5297	/*
5298	 *	Prevent userspace races by waiting until the network
5299	 *	device is fully setup before sending notifications.
5300	 */
5301	if (!dev->rtnl_link_ops ||
5302	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5303		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5304
5305out:
5306	return ret;
5307
5308err_uninit:
5309	if (dev->netdev_ops->ndo_uninit)
5310		dev->netdev_ops->ndo_uninit(dev);
5311	goto out;
5312}
5313EXPORT_SYMBOL(register_netdevice);
5314
5315/**
5316 *	init_dummy_netdev	- init a dummy network device for NAPI
5317 *	@dev: device to init
5318 *
5319 *	This takes a network device structure and initialize the minimum
5320 *	amount of fields so it can be used to schedule NAPI polls without
5321 *	registering a full blown interface. This is to be used by drivers
5322 *	that need to tie several hardware interfaces to a single NAPI
5323 *	poll scheduler due to HW limitations.
5324 */
5325int init_dummy_netdev(struct net_device *dev)
5326{
5327	/* Clear everything. Note we don't initialize spinlocks
5328	 * are they aren't supposed to be taken by any of the
5329	 * NAPI code and this dummy netdev is supposed to be
5330	 * only ever used for NAPI polls
5331	 */
5332	memset(dev, 0, sizeof(struct net_device));
5333
5334	/* make sure we BUG if trying to hit standard
5335	 * register/unregister code path
5336	 */
5337	dev->reg_state = NETREG_DUMMY;
5338
5339	/* NAPI wants this */
5340	INIT_LIST_HEAD(&dev->napi_list);
5341
5342	/* a dummy interface is started by default */
5343	set_bit(__LINK_STATE_PRESENT, &dev->state);
5344	set_bit(__LINK_STATE_START, &dev->state);
5345
5346	/* Note : We dont allocate pcpu_refcnt for dummy devices,
5347	 * because users of this 'device' dont need to change
5348	 * its refcount.
5349	 */
5350
5351	return 0;
5352}
5353EXPORT_SYMBOL_GPL(init_dummy_netdev);
5354
5355
5356/**
5357 *	register_netdev	- register a network device
5358 *	@dev: device to register
5359 *
5360 *	Take a completed network device structure and add it to the kernel
5361 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5362 *	chain. 0 is returned on success. A negative errno code is returned
5363 *	on a failure to set up the device, or if the name is a duplicate.
5364 *
5365 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
5366 *	and expands the device name if you passed a format string to
5367 *	alloc_netdev.
5368 */
5369int register_netdev(struct net_device *dev)
5370{
5371	int err;
5372
5373	rtnl_lock();
5374
5375	/*
5376	 * If the name is a format string the caller wants us to do a
5377	 * name allocation.
5378	 */
5379	if (strchr(dev->name, '%')) {
5380		err = dev_alloc_name(dev, dev->name);
5381		if (err < 0)
5382			goto out;
5383	}
5384
5385	err = register_netdevice(dev);
5386out:
5387	rtnl_unlock();
5388	return err;
5389}
5390EXPORT_SYMBOL(register_netdev);
5391
5392int netdev_refcnt_read(const struct net_device *dev)
5393{
5394	int i, refcnt = 0;
5395
5396	for_each_possible_cpu(i)
5397		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5398	return refcnt;
5399}
5400EXPORT_SYMBOL(netdev_refcnt_read);
5401
5402/*
5403 * netdev_wait_allrefs - wait until all references are gone.
5404 *
5405 * This is called when unregistering network devices.
5406 *
5407 * Any protocol or device that holds a reference should register
5408 * for netdevice notification, and cleanup and put back the
5409 * reference if they receive an UNREGISTER event.
5410 * We can get stuck here if buggy protocols don't correctly
5411 * call dev_put.
5412 */
5413static void netdev_wait_allrefs(struct net_device *dev)
5414{
5415	unsigned long rebroadcast_time, warning_time;
5416	int refcnt;
5417
5418	linkwatch_forget_dev(dev);
5419
5420	rebroadcast_time = warning_time = jiffies;
5421	refcnt = netdev_refcnt_read(dev);
5422
5423	while (refcnt != 0) {
5424		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5425			rtnl_lock();
5426
5427			/* Rebroadcast unregister notification */
5428			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5429			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5430			 * should have already handle it the first time */
5431
5432			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5433				     &dev->state)) {
5434				/* We must not have linkwatch events
5435				 * pending on unregister. If this
5436				 * happens, we simply run the queue
5437				 * unscheduled, resulting in a noop
5438				 * for this device.
5439				 */
5440				linkwatch_run_queue();
5441			}
5442
5443			__rtnl_unlock();
5444
5445			rebroadcast_time = jiffies;
5446		}
5447
5448		msleep(250);
5449
5450		refcnt = netdev_refcnt_read(dev);
5451
5452		if (time_after(jiffies, warning_time + 10 * HZ)) {
5453			printk(KERN_EMERG "unregister_netdevice: "
5454			       "waiting for %s to become free. Usage "
5455			       "count = %d\n",
5456			       dev->name, refcnt);
5457			warning_time = jiffies;
5458		}
5459	}
5460}
5461
5462/* The sequence is:
5463 *
5464 *	rtnl_lock();
5465 *	...
5466 *	register_netdevice(x1);
5467 *	register_netdevice(x2);
5468 *	...
5469 *	unregister_netdevice(y1);
5470 *	unregister_netdevice(y2);
5471 *      ...
5472 *	rtnl_unlock();
5473 *	free_netdev(y1);
5474 *	free_netdev(y2);
5475 *
5476 * We are invoked by rtnl_unlock().
5477 * This allows us to deal with problems:
5478 * 1) We can delete sysfs objects which invoke hotplug
5479 *    without deadlocking with linkwatch via keventd.
5480 * 2) Since we run with the RTNL semaphore not held, we can sleep
5481 *    safely in order to wait for the netdev refcnt to drop to zero.
5482 *
5483 * We must not return until all unregister events added during
5484 * the interval the lock was held have been completed.
5485 */
5486void netdev_run_todo(void)
5487{
5488	struct list_head list;
5489
5490	/* Snapshot list, allow later requests */
5491	list_replace_init(&net_todo_list, &list);
5492
5493	__rtnl_unlock();
5494
5495	while (!list_empty(&list)) {
5496		struct net_device *dev
5497			= list_first_entry(&list, struct net_device, todo_list);
5498		list_del(&dev->todo_list);
5499
5500		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5501			printk(KERN_ERR "network todo '%s' but state %d\n",
5502			       dev->name, dev->reg_state);
5503			dump_stack();
5504			continue;
5505		}
5506
5507		dev->reg_state = NETREG_UNREGISTERED;
5508
5509		on_each_cpu(flush_backlog, dev, 1);
5510
5511		netdev_wait_allrefs(dev);
5512
5513		/* paranoia */
5514		BUG_ON(netdev_refcnt_read(dev));
5515		WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5516		WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5517		WARN_ON(dev->dn_ptr);
5518
5519		if (dev->destructor)
5520			dev->destructor(dev);
5521
5522		/* Free network device */
5523		kobject_put(&dev->dev.kobj);
5524	}
5525}
5526
5527/* Convert net_device_stats to rtnl_link_stats64.  They have the same
5528 * fields in the same order, with only the type differing.
5529 */
5530static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5531				    const struct net_device_stats *netdev_stats)
5532{
5533#if BITS_PER_LONG == 64
5534        BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5535        memcpy(stats64, netdev_stats, sizeof(*stats64));
5536#else
5537	size_t i, n = sizeof(*stats64) / sizeof(u64);
5538	const unsigned long *src = (const unsigned long *)netdev_stats;
5539	u64 *dst = (u64 *)stats64;
5540
5541	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5542		     sizeof(*stats64) / sizeof(u64));
5543	for (i = 0; i < n; i++)
5544		dst[i] = src[i];
5545#endif
5546}
5547
5548/**
5549 *	dev_get_stats	- get network device statistics
5550 *	@dev: device to get statistics from
5551 *	@storage: place to store stats
5552 *
5553 *	Get network statistics from device. Return @storage.
5554 *	The device driver may provide its own method by setting
5555 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5556 *	otherwise the internal statistics structure is used.
5557 */
5558struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5559					struct rtnl_link_stats64 *storage)
5560{
5561	const struct net_device_ops *ops = dev->netdev_ops;
5562
5563	if (ops->ndo_get_stats64) {
5564		memset(storage, 0, sizeof(*storage));
5565		ops->ndo_get_stats64(dev, storage);
5566	} else if (ops->ndo_get_stats) {
5567		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5568	} else {
5569		netdev_stats_to_stats64(storage, &dev->stats);
5570	}
5571	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5572	return storage;
5573}
5574EXPORT_SYMBOL(dev_get_stats);
5575
/*
 * Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT the queue is
 * created on first use: it is zero-allocated, initialised, attached to
 * noop_qdisc and then published in dev->ingress_queue.  Returns NULL when
 * the allocation fails; without CONFIG_NET_CLS_ACT the result is simply
 * whatever dev_ingress_queue() reports.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			ingress->qdisc = &noop_qdisc;
			ingress->qdisc_sleeping = &noop_qdisc;
			/* Publish only once the queue is fully set up. */
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
5593
5594/**
5595 *	alloc_netdev_mqs - allocate network device
5596 *	@sizeof_priv:	size of private data to allocate space for
5597 *	@name:		device name format string
5598 *	@setup:		callback to initialize device
5599 *	@txqs:		the number of TX subqueues to allocate
5600 *	@rxqs:		the number of RX subqueues to allocate
5601 *
5602 *	Allocates a struct net_device with private data area for driver use
5603 *	and performs basic initialization.  Also allocates subquue structs
5604 *	for each queue on the device.
5605 */
5606struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5607		void (*setup)(struct net_device *),
5608		unsigned int txqs, unsigned int rxqs)
5609{
5610	struct net_device *dev;
5611	size_t alloc_size;
5612	struct net_device *p;
5613
5614	BUG_ON(strlen(name) >= sizeof(dev->name));
5615
5616	if (txqs < 1) {
5617		pr_err("alloc_netdev: Unable to allocate device "
5618		       "with zero queues.\n");
5619		return NULL;
5620	}
5621
5622#ifdef CONFIG_RPS
5623	if (rxqs < 1) {
5624		pr_err("alloc_netdev: Unable to allocate device "
5625		       "with zero RX queues.\n");
5626		return NULL;
5627	}
5628#endif
5629
5630	alloc_size = sizeof(struct net_device);
5631	if (sizeof_priv) {
5632		/* ensure 32-byte alignment of private area */
5633		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5634		alloc_size += sizeof_priv;
5635	}
5636	/* ensure 32-byte alignment of whole construct */
5637	alloc_size += NETDEV_ALIGN - 1;
5638
5639	p = kzalloc(alloc_size, GFP_KERNEL);
5640	if (!p) {
5641		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5642		return NULL;
5643	}
5644
5645	dev = PTR_ALIGN(p, NETDEV_ALIGN);
5646	dev->padded = (char *)dev - (char *)p;
5647
5648	dev->pcpu_refcnt = alloc_percpu(int);
5649	if (!dev->pcpu_refcnt)
5650		goto free_p;
5651
5652	if (dev_addr_init(dev))
5653		goto free_pcpu;
5654
5655	dev_mc_init(dev);
5656	dev_uc_init(dev);
5657
5658	dev_net_set(dev, &init_net);
5659
5660	dev->num_tx_queues = txqs;
5661	dev->real_num_tx_queues = txqs;
5662	if (netif_alloc_netdev_queues(dev))
5663		goto free_pcpu;
5664
5665#ifdef CONFIG_RPS
5666	dev->num_rx_queues = rxqs;
5667	dev->real_num_rx_queues = rxqs;
5668	if (netif_alloc_rx_queues(dev))
5669		goto free_pcpu;
5670#endif
5671
5672	dev->gso_max_size = GSO_MAX_SIZE;
5673
5674	INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5675	dev->ethtool_ntuple_list.count = 0;
5676	INIT_LIST_HEAD(&dev->napi_list);
5677	INIT_LIST_HEAD(&dev->unreg_list);
5678	INIT_LIST_HEAD(&dev->link_watch_list);
5679	dev->priv_flags = IFF_XMIT_DST_RELEASE;
5680	setup(dev);
5681	strcpy(dev->name, name);
5682	return dev;
5683
5684free_pcpu:
5685	free_percpu(dev->pcpu_refcnt);
5686	kfree(dev->_tx);
5687#ifdef CONFIG_RPS
5688	kfree(dev->_rx);
5689#endif
5690
5691free_p:
5692	kfree(p);
5693	return NULL;
5694}
5695EXPORT_SYMBOL(alloc_netdev_mqs);
5696
5697/**
5698 *	free_netdev - free network device
5699 *	@dev: device
5700 *
5701 *	This function does the last stage of destroying an allocated device
5702 * 	interface. The reference to the device object is released.
5703 *	If this is the last reference then it will be freed.
5704 */
5705void free_netdev(struct net_device *dev)
5706{
5707	struct napi_struct *p, *n;
5708
5709	release_net(dev_net(dev));
5710
5711	kfree(dev->_tx);
5712#ifdef CONFIG_RPS
5713	kfree(dev->_rx);
5714#endif
5715
5716	kfree(rcu_dereference_raw(dev->ingress_queue));
5717
5718	/* Flush device addresses */
5719	dev_addr_flush(dev);
5720
5721	/* Clear ethtool n-tuple list */
5722	ethtool_ntuple_flush(dev);
5723
5724	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5725		netif_napi_del(p);
5726
5727	free_percpu(dev->pcpu_refcnt);
5728	dev->pcpu_refcnt = NULL;
5729
5730	/*  Compatibility with error handling in drivers */
5731	if (dev->reg_state == NETREG_UNINITIALIZED) {
5732		kfree((char *)dev - dev->padded);
5733		return;
5734	}
5735
5736	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5737	dev->reg_state = NETREG_RELEASED;
5738
5739	/* will free via device release */
5740	put_device(&dev->dev);
5741}
5742EXPORT_SYMBOL(free_netdev);
5743
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Block until packets that are being received right now have been
 *	fully processed; implemented as an RCU grace period.  Packets whose
 *	processing starts later are not held back.  May sleep.
 */
void synchronize_net(void)
{
	might_sleep();
	synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
5756
5757/**
5758 *	unregister_netdevice_queue - remove device from the kernel
5759 *	@dev: device
5760 *	@head: list
5761 *
5762 *	This function shuts down a device interface and removes it
5763 *	from the kernel tables.
5764 *	If head not NULL, device is queued to be unregistered later.
5765 *
5766 *	Callers must hold the rtnl semaphore.  You may want
5767 *	unregister_netdev() instead of this.
5768 */
5769
5770void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5771{
5772	ASSERT_RTNL();
5773
5774	if (head) {
5775		list_move_tail(&dev->unreg_list, head);
5776	} else {
5777		rollback_registered(dev);
5778		/* Finish processing unregister after unlock */
5779		net_set_todo(dev);
5780	}
5781}
5782EXPORT_SYMBOL(unregister_netdevice_queue);
5783
5784/**
5785 *	unregister_netdevice_many - unregister many devices
5786 *	@head: list of devices
5787 */
5788void unregister_netdevice_many(struct list_head *head)
5789{
5790	struct net_device *dev;
5791
5792	if (!list_empty(head)) {
5793		rollback_registered_many(head);
5794		list_for_each_entry(dev, head, unreg_list)
5795			net_set_todo(dev);
5796	}
5797}
5798EXPORT_SYMBOL(unregister_netdevice_many);
5799
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shut down a device interface and remove it from the kernel tables.
 *
 *	Thin wrapper that takes the rtnl semaphore around
 *	unregister_netdevice().  In general this is the function to use
 *	rather than unregister_netdevice() itself.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
5818
5819/**
5820 *	dev_change_net_namespace - move device to different nethost namespace
5821 *	@dev: device
5822 *	@net: network namespace
5823 *	@pat: If not NULL name pattern to try if the current device name
5824 *	      is already taken in the destination network namespace.
5825 *
5826 *	This function shuts down a device interface and moves it
5827 *	to a new network namespace. On success 0 is returned, on
5828 *	a failure a netagive errno code is returned.
5829 *
5830 *	Callers must hold the rtnl semaphore.
5831 */
5832
5833int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5834{
5835	int err;
5836
5837	ASSERT_RTNL();
5838
5839	/* Don't allow namespace local devices to be moved. */
5840	err = -EINVAL;
5841	if (dev->features & NETIF_F_NETNS_LOCAL)
5842		goto out;
5843
5844	/* Ensure the device has been registrered */
5845	err = -EINVAL;
5846	if (dev->reg_state != NETREG_REGISTERED)
5847		goto out;
5848
5849	/* Get out if there is nothing todo */
5850	err = 0;
5851	if (net_eq(dev_net(dev), net))
5852		goto out;
5853
5854	/* Pick the destination device name, and ensure
5855	 * we can use it in the destination network namespace.
5856	 */
5857	err = -EEXIST;
5858	if (__dev_get_by_name(net, dev->name)) {
5859		/* We get here if we can't use the current device name */
5860		if (!pat)
5861			goto out;
5862		if (dev_get_valid_name(dev, pat, 1))
5863			goto out;
5864	}
5865
5866	/*
5867	 * And now a mini version of register_netdevice unregister_netdevice.
5868	 */
5869
5870	/* If device is running close it first. */
5871	dev_close(dev);
5872
5873	/* And unlink it from device chain */
5874	err = -ENODEV;
5875	unlist_netdevice(dev);
5876
5877	synchronize_net();
5878
5879	/* Shutdown queueing discipline. */
5880	dev_shutdown(dev);
5881
5882	/* Notify protocols, that we are about to destroy
5883	   this device. They should clean all the things.
5884
5885	   Note that dev->reg_state stays at NETREG_REGISTERED.
5886	   This is wanted because this way 8021q and macvlan know
5887	   the device is just moving and can keep their slaves up.
5888	*/
5889	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5890	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5891
5892	/*
5893	 *	Flush the unicast and multicast chains
5894	 */
5895	dev_uc_flush(dev);
5896	dev_mc_flush(dev);
5897
5898	/* Actually switch the network namespace */
5899	dev_net_set(dev, net);
5900
5901	/* If there is an ifindex conflict assign a new one */
5902	if (__dev_get_by_index(net, dev->ifindex)) {
5903		int iflink = (dev->iflink == dev->ifindex);
5904		dev->ifindex = dev_new_index(net);
5905		if (iflink)
5906			dev->iflink = dev->ifindex;
5907	}
5908
5909	/* Fixup kobjects */
5910	err = device_rename(&dev->dev, dev->name);
5911	WARN_ON(err);
5912
5913	/* Add the device back in the hashes */
5914	list_netdevice(dev);
5915
5916	/* Notify protocols, that a new device appeared. */
5917	call_netdevice_notifiers(NETDEV_REGISTER, dev);
5918
5919	/*
5920	 *	Prevent userspace races by waiting until the network
5921	 *	device is fully setup before sending notifications.
5922	 */
5923	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5924
5925	synchronize_net();
5926	err = 0;
5927out:
5928	return err;
5929}
5930EXPORT_SYMBOL_GPL(dev_change_net_namespace);
5931
5932static int dev_cpu_callback(struct notifier_block *nfb,
5933			    unsigned long action,
5934			    void *ocpu)
5935{
5936	struct sk_buff **list_skb;
5937	struct sk_buff *skb;
5938	unsigned int cpu, oldcpu = (unsigned long)ocpu;
5939	struct softnet_data *sd, *oldsd;
5940
5941	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5942		return NOTIFY_OK;
5943
5944	local_irq_disable();
5945	cpu = smp_processor_id();
5946	sd = &per_cpu(softnet_data, cpu);
5947	oldsd = &per_cpu(softnet_data, oldcpu);
5948
5949	/* Find end of our completion_queue. */
5950	list_skb = &sd->completion_queue;
5951	while (*list_skb)
5952		list_skb = &(*list_skb)->next;
5953	/* Append completion queue from offline CPU. */
5954	*list_skb = oldsd->completion_queue;
5955	oldsd->completion_queue = NULL;
5956
5957	/* Append output queue from offline CPU. */
5958	if (oldsd->output_queue) {
5959		*sd->output_queue_tailp = oldsd->output_queue;
5960		sd->output_queue_tailp = oldsd->output_queue_tailp;
5961		oldsd->output_queue = NULL;
5962		oldsd->output_queue_tailp = &oldsd->output_queue;
5963	}
5964
5965	raise_softirq_irqoff(NET_TX_SOFTIRQ);
5966	local_irq_enable();
5967
5968	/* Process offline CPU's input_pkt_queue */
5969	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
5970		netif_rx(skb);
5971		input_queue_head_incr(oldsd);
5972	}
5973	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
5974		netif_rx(skb);
5975		input_queue_head_incr(oldsd);
5976	}
5977
5978	return NOTIFY_OK;
5979}
5980
5981
5982/**
5983 *	netdev_increment_features - increment feature set by one
5984 *	@all: current feature set
5985 *	@one: new feature set
5986 *	@mask: mask feature set
5987 *
5988 *	Computes a new feature set after adding a device with feature set
5989 *	@one to the master device with current feature set @all.  Will not
5990 *	enable anything that is off in @mask. Returns the new feature set.
5991 */
5992unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5993					unsigned long mask)
5994{
5995	/* If device needs checksumming, downgrade to it. */
5996	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5997		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5998	else if (mask & NETIF_F_ALL_CSUM) {
5999		/* If one device supports v4/v6 checksumming, set for all. */
6000		if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
6001		    !(all & NETIF_F_GEN_CSUM)) {
6002			all &= ~NETIF_F_ALL_CSUM;
6003			all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
6004		}
6005
6006		/* If one device supports hw checksumming, set for all. */
6007		if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
6008			all &= ~NETIF_F_ALL_CSUM;
6009			all |= NETIF_F_HW_CSUM;
6010		}
6011	}
6012
6013	one |= NETIF_F_ALL_CSUM;
6014
6015	one |= all & NETIF_F_ONE_FOR_ALL;
6016	all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
6017	all |= one & mask & NETIF_F_ONE_FOR_ALL;
6018
6019	return all;
6020}
6021EXPORT_SYMBOL(netdev_increment_features);
6022
6023static struct hlist_head *netdev_create_hash(void)
6024{
6025	int i;
6026	struct hlist_head *hash;
6027
6028	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6029	if (hash != NULL)
6030		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6031			INIT_HLIST_HEAD(&hash[i]);
6032
6033	return hash;
6034}
6035
6036/* Initialize per network namespace state */
6037static int __net_init netdev_init(struct net *net)
6038{
6039	INIT_LIST_HEAD(&net->dev_base_head);
6040
6041	net->dev_name_head = netdev_create_hash();
6042	if (net->dev_name_head == NULL)
6043		goto err_name;
6044
6045	net->dev_index_head = netdev_create_hash();
6046	if (net->dev_index_head == NULL)
6047		goto err_idx;
6048
6049	return 0;
6050
6051err_idx:
6052	kfree(net->dev_name_head);
6053err_name:
6054	return -ENOMEM;
6055}
6056
6057/**
6058 *	netdev_drivername - network driver for the device
6059 *	@dev: network device
6060 *	@buffer: buffer for resulting name
6061 *	@len: size of buffer
6062 *
6063 *	Determine network driver for device.
6064 */
6065char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
6066{
6067	const struct device_driver *driver;
6068	const struct device *parent;
6069
6070	if (len <= 0 || !buffer)
6071		return buffer;
6072	buffer[0] = 0;
6073
6074	parent = dev->dev.parent;
6075
6076	if (!parent)
6077		return buffer;
6078
6079	driver = parent->driver;
6080	if (driver && driver->name)
6081		strlcpy(buffer, driver->name, len);
6082	return buffer;
6083}
6084
6085static int __netdev_printk(const char *level, const struct net_device *dev,
6086			   struct va_format *vaf)
6087{
6088	int r;
6089
6090	if (dev && dev->dev.parent)
6091		r = dev_printk(level, dev->dev.parent, "%s: %pV",
6092			       netdev_name(dev), vaf);
6093	else if (dev)
6094		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6095	else
6096		r = printk("%s(NULL net_device): %pV", level, vaf);
6097
6098	return r;
6099}
6100
6101int netdev_printk(const char *level, const struct net_device *dev,
6102		  const char *format, ...)
6103{
6104	struct va_format vaf;
6105	va_list args;
6106	int r;
6107
6108	va_start(args, format);
6109
6110	vaf.fmt = format;
6111	vaf.va = &args;
6112
6113	r = __netdev_printk(level, dev, &vaf);
6114	va_end(args);
6115
6116	return r;
6117}
6118EXPORT_SYMBOL(netdev_printk);
6119
6120#define define_netdev_printk_level(func, level)			\
6121int func(const struct net_device *dev, const char *fmt, ...)	\
6122{								\
6123	int r;							\
6124	struct va_format vaf;					\
6125	va_list args;						\
6126								\
6127	va_start(args, fmt);					\
6128								\
6129	vaf.fmt = fmt;						\
6130	vaf.va = &args;						\
6131								\
6132	r = __netdev_printk(level, dev, &vaf);			\
6133	va_end(args);						\
6134								\
6135	return r;						\
6136}								\
6137EXPORT_SYMBOL(func);
6138
6139define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6140define_netdev_printk_level(netdev_alert, KERN_ALERT);
6141define_netdev_printk_level(netdev_crit, KERN_CRIT);
6142define_netdev_printk_level(netdev_err, KERN_ERR);
6143define_netdev_printk_level(netdev_warn, KERN_WARNING);
6144define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6145define_netdev_printk_level(netdev_info, KERN_INFO);
6146
6147static void __net_exit netdev_exit(struct net *net)
6148{
6149	kfree(net->dev_name_head);
6150	kfree(net->dev_index_head);
6151}
6152
6153static struct pernet_operations __net_initdata netdev_net_ops = {
6154	.init = netdev_init,
6155	.exit = netdev_exit,
6156};
6157
6158static void __net_exit default_device_exit(struct net *net)
6159{
6160	struct net_device *dev, *aux;
6161	/*
6162	 * Push all migratable network devices back to the
6163	 * initial network namespace
6164	 */
6165	rtnl_lock();
6166	for_each_netdev_safe(net, dev, aux) {
6167		int err;
6168		char fb_name[IFNAMSIZ];
6169
6170		/* Ignore unmoveable devices (i.e. loopback) */
6171		if (dev->features & NETIF_F_NETNS_LOCAL)
6172			continue;
6173
6174		/* Leave virtual devices for the generic cleanup */
6175		if (dev->rtnl_link_ops)
6176			continue;
6177
6178		/* Push remaing network devices to init_net */
6179		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6180		err = dev_change_net_namespace(dev, &init_net, fb_name);
6181		if (err) {
6182			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6183				__func__, dev->name, err);
6184			BUG();
6185		}
6186	}
6187	rtnl_unlock();
6188}
6189
6190static void __net_exit default_device_exit_batch(struct list_head *net_list)
6191{
6192	/* At exit all network devices most be removed from a network
6193	 * namespace.  Do this in the reverse order of registration.
6194	 * Do this across as many network namespaces as possible to
6195	 * improve batching efficiency.
6196	 */
6197	struct net_device *dev;
6198	struct net *net;
6199	LIST_HEAD(dev_kill_list);
6200
6201	rtnl_lock();
6202	list_for_each_entry(net, net_list, exit_list) {
6203		for_each_netdev_reverse(net, dev) {
6204			if (dev->rtnl_link_ops)
6205				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6206			else
6207				unregister_netdevice_queue(dev, &dev_kill_list);
6208		}
6209	}
6210	unregister_netdevice_many(&dev_kill_list);
6211	rtnl_unlock();
6212}
6213
6214static struct pernet_operations __net_initdata default_device_ops = {
6215	.exit = default_device_exit,
6216	.exit_batch = default_device_exit_batch,
6217};
6218
6219/*
6220 *	Initialize the DEV module. At boot time this walks the device list and
6221 *	unhooks any devices that fail to initialise (normally hardware not
6222 *	present) and leaves us with a valid list of present and active devices.
6223 *
6224 */
6225
6226/*
6227 *       This is called single threaded during boot, so no need
6228 *       to take the rtnl semaphore.
6229 */
6230static int __init net_dev_init(void)
6231{
6232	int i, rc = -ENOMEM;
6233
6234	BUG_ON(!dev_boot_phase);
6235
6236	if (dev_proc_init())
6237		goto out;
6238
6239	if (netdev_kobject_init())
6240		goto out;
6241
6242	INIT_LIST_HEAD(&ptype_all);
6243	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6244		INIT_LIST_HEAD(&ptype_base[i]);
6245
6246	if (register_pernet_subsys(&netdev_net_ops))
6247		goto out;
6248
6249	/*
6250	 *	Initialise the packet receive queues.
6251	 */
6252
6253	for_each_possible_cpu(i) {
6254		struct softnet_data *sd = &per_cpu(softnet_data, i);
6255
6256		memset(sd, 0, sizeof(*sd));
6257		skb_queue_head_init(&sd->input_pkt_queue);
6258		skb_queue_head_init(&sd->process_queue);
6259		sd->completion_queue = NULL;
6260		INIT_LIST_HEAD(&sd->poll_list);
6261		sd->output_queue = NULL;
6262		sd->output_queue_tailp = &sd->output_queue;
6263#ifdef CONFIG_RPS
6264		sd->csd.func = rps_trigger_softirq;
6265		sd->csd.info = sd;
6266		sd->csd.flags = 0;
6267		sd->cpu = i;
6268#endif
6269
6270		sd->backlog.poll = process_backlog;
6271		sd->backlog.weight = weight_p;
6272		sd->backlog.gro_list = NULL;
6273		sd->backlog.gro_count = 0;
6274	}
6275
6276	dev_boot_phase = 0;
6277
6278	/* The loopback device is special if any other network devices
6279	 * is present in a network namespace the loopback device must
6280	 * be present. Since we now dynamically allocate and free the
6281	 * loopback device ensure this invariant is maintained by
6282	 * keeping the loopback device as the first device on the
6283	 * list of network devices.  Ensuring the loopback devices
6284	 * is the first device that appears and the last network device
6285	 * that disappears.
6286	 */
6287	if (register_pernet_device(&loopback_net_ops))
6288		goto out;
6289
6290	if (register_pernet_device(&default_device_ops))
6291		goto out;
6292
6293	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6294	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6295
6296	hotcpu_notifier(dev_cpu_callback, 0);
6297	dst_init();
6298	dev_mcast_init();
6299	rc = 0;
6300out:
6301	return rc;
6302}
6303
6304subsys_initcall(net_dev_init);
6305
6306static int __init initialize_hashrnd(void)
6307{
6308	get_random_bytes(&hashrnd, sizeof(hashrnd));
6309	return 0;
6310}
6311
6312late_initcall_sync(initialize_hashrnd);
6313