net/core/dev.c at v2.6.29-rc2 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / core / dev.c
at v2.6.29-rc2 5250 lines 129 kB view raw
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <asm/system.h>
  77#include <linux/bitops.h>
  78#include <linux/capability.h>
  79#include <linux/cpu.h>
  80#include <linux/types.h>
  81#include <linux/kernel.h>
  82#include <linux/sched.h>
  83#include <linux/mutex.h>
  84#include <linux/string.h>
  85#include <linux/mm.h>
  86#include <linux/socket.h>
  87#include <linux/sockios.h>
  88#include <linux/errno.h>
  89#include <linux/interrupt.h>
  90#include <linux/if_ether.h>
  91#include <linux/netdevice.h>
  92#include <linux/etherdevice.h>
  93#include <linux/ethtool.h>
  94#include <linux/notifier.h>
  95#include <linux/skbuff.h>
  96#include <net/net_namespace.h>
  97#include <net/sock.h>
  98#include <linux/rtnetlink.h>
  99#include <linux/proc_fs.h>
 100#include <linux/seq_file.h>
 101#include <linux/stat.h>
 102#include <linux/if_bridge.h>
 103#include <linux/if_macvlan.h>
 104#include <net/dst.h>
 105#include <net/pkt_sched.h>
 106#include <net/checksum.h>
 107#include <linux/highmem.h>
 108#include <linux/init.h>
 109#include <linux/kmod.h>
 110#include <linux/module.h>
 111#include <linux/netpoll.h>
 112#include <linux/rcupdate.h>
 113#include <linux/delay.h>
 114#include <net/wext.h>
 115#include <net/iw_handler.h>
 116#include <asm/current.h>
 117#include <linux/audit.h>
 118#include <linux/dmaengine.h>
 119#include <linux/err.h>
 120#include <linux/ctype.h>
 121#include <linux/if_arp.h>
 122#include <linux/if_vlan.h>
 123#include <linux/ip.h>
 124#include <net/ip.h>
 125#include <linux/ipv6.h>
 126#include <linux/in.h>
 127#include <linux/jhash.h>
 128#include <linux/random.h>
 129
 130#include "net-sysfs.h"
 131
 132/* Instead of increasing this, you should create a hash table. */
 133#define MAX_GRO_SKBS 8
 134
 135/* This should be increased if a protocol with a bigger head is added. */
 136#define GRO_MAX_HEAD (MAX_HEADER + 128)
 137
 138/*
 139 *	The list of packet types we will receive (as opposed to discard)
 140 *	and the routines to invoke.
 141 *
 142 *	Why 16. Because with 16 the only overlap we get on a hash of the
 143 *	low nibble of the protocol value is RARP/SNAP/X.25.
 144 *
 145 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 146 *             sure which should go first, but I bet it won't make much
 147 *             difference if we are running VLANs.  The good news is that
 148 *             this protocol won't be in the list unless compiled in, so
 149 *             the average user (w/out VLANs) will not be adversely affected.
 150 *             --BLG
 151 *
 152 *		0800	IP
 153 *		8100    802.1Q VLAN
 154 *		0001	802.3
 155 *		0002	AX.25
 156 *		0004	802.2
 157 *		8035	RARP
 158 *		0005	SNAP
 159 *		0805	X.25
 160 *		0806	ARP
 161 *		8137	IPX
 162 *		0009	Localtalk
 163 *		86DD	IPv6
 164 */
 165
 166#define PTYPE_HASH_SIZE	(16)
 167#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
 168
 169static DEFINE_SPINLOCK(ptype_lock);
 170static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 171static struct list_head ptype_all __read_mostly;	/* Taps */
 172
 173/*
 174 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 175 * semaphore.
 176 *
 177 * Pure readers hold dev_base_lock for reading.
 178 *
 179 * Writers must hold the rtnl semaphore while they loop through the
 180 * dev_base_head list, and hold dev_base_lock for writing when they do the
 181 * actual updates.  This allows pure readers to access the list even
 182 * while a writer is preparing to update it.
 183 *
 184 * To put it another way, dev_base_lock is held for writing only to
 185 * protect against pure readers; the rtnl semaphore provides the
 186 * protection against other writers.
 187 *
 188 * See, for example usages, register_netdevice() and
 189 * unregister_netdevice(), which must be called with the rtnl
 190 * semaphore held.
 191 */
 192DEFINE_RWLOCK(dev_base_lock);
 193
 194EXPORT_SYMBOL(dev_base_lock);
 195
 196#define NETDEV_HASHBITS	8
 197#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
 198
 199static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 200{
 201	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 202	return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
 203}
 204
 205static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 206{
 207	return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
 208}
 209
 210/* Device list insertion */
 211static int list_netdevice(struct net_device *dev)
 212{
 213	struct net *net = dev_net(dev);
 214
 215	ASSERT_RTNL();
 216
 217	write_lock_bh(&dev_base_lock);
 218	list_add_tail(&dev->dev_list, &net->dev_base_head);
 219	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 220	hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
 221	write_unlock_bh(&dev_base_lock);
 222	return 0;
 223}
 224
 225/* Device list removal */
 226static void unlist_netdevice(struct net_device *dev)
 227{
 228	ASSERT_RTNL();
 229
 230	/* Unlink dev from the device chain */
 231	write_lock_bh(&dev_base_lock);
 232	list_del(&dev->dev_list);
 233	hlist_del(&dev->name_hlist);
 234	hlist_del(&dev->index_hlist);
 235	write_unlock_bh(&dev_base_lock);
 236}
 237
 238/*
 239 *	Our notifier list
 240 */
 241
 242static RAW_NOTIFIER_HEAD(netdev_chain);
 243
 244/*
 245 *	Device drivers call our routines to queue packets here. We empty the
 246 *	queue in the local softnet handler.
 247 */
 248
 249DEFINE_PER_CPU(struct softnet_data, softnet_data);
 250
 251#ifdef CONFIG_LOCKDEP
 252/*
 253 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 254 * according to dev->type
 255 */
 256static const unsigned short netdev_lock_type[] =
 257	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 258	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 259	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 260	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 261	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 262	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 263	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 264	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 265	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 266	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 267	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 268	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 269	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 270	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 271	 ARPHRD_PHONET_PIPE, ARPHRD_VOID, ARPHRD_NONE};
 272
 273static const char *netdev_lock_name[] =
 274	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 275	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 276	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 277	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 278	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 279	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 280	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 281	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 282	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 283	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 284	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 285	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 286	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 287	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 288	 "_xmit_PHONET_PIPE", "_xmit_VOID", "_xmit_NONE"};
 289
 290static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 291static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 292
 293static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 294{
 295	int i;
 296
 297	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 298		if (netdev_lock_type[i] == dev_type)
 299			return i;
 300	/* the last key is used by default */
 301	return ARRAY_SIZE(netdev_lock_type) - 1;
 302}
 303
 304static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 305						 unsigned short dev_type)
 306{
 307	int i;
 308
 309	i = netdev_lock_pos(dev_type);
 310	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 311				   netdev_lock_name[i]);
 312}
 313
 314static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 315{
 316	int i;
 317
 318	i = netdev_lock_pos(dev->type);
 319	lockdep_set_class_and_name(&dev->addr_list_lock,
 320				   &netdev_addr_lock_key[i],
 321				   netdev_lock_name[i]);
 322}
 323#else
 324static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 325						 unsigned short dev_type)
 326{
 327}
 328static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 329{
 330}
 331#endif
 332
 333/*******************************************************************************
 334
 335		Protocol management and registration routines
 336
 337*******************************************************************************/
 338
 339/*
 340 *	Add a protocol ID to the list. Now that the input handler is
 341 *	smarter we can dispense with all the messy stuff that used to be
 342 *	here.
 343 *
 344 *	BEWARE!!! Protocol handlers, mangling input packets,
 345 *	MUST BE last in hash buckets and checking protocol handlers
 346 *	MUST start from promiscuous ptype_all chain in net_bh.
 347 *	It is true now, do not change it.
 348 *	Explanation follows: if protocol handler, mangling packet, will
 349 *	be the first on list, it is not able to sense, that packet
 350 *	is cloned and should be copied-on-write, so that it will
 351 *	change it and subsequent readers will get broken packet.
 352 *							--ANK (980803)
 353 */
 354
 355/**
 356 *	dev_add_pack - add packet handler
 357 *	@pt: packet type declaration
 358 *
 359 *	Add a protocol handler to the networking stack. The passed &packet_type
 360 *	is linked into kernel lists and may not be freed until it has been
 361 *	removed from the kernel lists.
 362 *
 *	This call does not sleep, therefore it cannot guarantee that
 *	all CPUs that are in the middle of receiving packets will see
 *	the new packet type (until the next received packet).
 366 */
 367
 368void dev_add_pack(struct packet_type *pt)
 369{
 370	int hash;
 371
 372	spin_lock_bh(&ptype_lock);
 373	if (pt->type == htons(ETH_P_ALL))
 374		list_add_rcu(&pt->list, &ptype_all);
 375	else {
 376		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
 377		list_add_rcu(&pt->list, &ptype_base[hash]);
 378	}
 379	spin_unlock_bh(&ptype_lock);
 380}
 381
 382/**
 383 *	__dev_remove_pack	 - remove packet handler
 384 *	@pt: packet type declaration
 385 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed or reused until after all the CPUs have gone
 *	through a quiescent state.
 394 */
 395void __dev_remove_pack(struct packet_type *pt)
 396{
 397	struct list_head *head;
 398	struct packet_type *pt1;
 399
 400	spin_lock_bh(&ptype_lock);
 401
 402	if (pt->type == htons(ETH_P_ALL))
 403		head = &ptype_all;
 404	else
 405		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 406
 407	list_for_each_entry(pt1, head, list) {
 408		if (pt == pt1) {
 409			list_del_rcu(&pt->list);
 410			goto out;
 411		}
 412	}
 413
 414	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 415out:
 416	spin_unlock_bh(&ptype_lock);
 417}
 418/**
 419 *	dev_remove_pack	 - remove packet handler
 420 *	@pt: packet type declaration
 421 *
 422 *	Remove a protocol handler that was previously added to the kernel
 423 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 424 *	from the kernel lists and can be freed or reused once this function
 425 *	returns.
 426 *
 427 *	This call sleeps to guarantee that no CPU is looking at the packet
 428 *	type after return.
 429 */
 430void dev_remove_pack(struct packet_type *pt)
 431{
 432	__dev_remove_pack(pt);
 433
 434	synchronize_net();
 435}
 436
 437/******************************************************************************
 438
 439		      Device Boot-time Settings Routines
 440
 441*******************************************************************************/
 442
 443/* Boot time configuration table */
 444static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 445
 446/**
 447 *	netdev_boot_setup_add	- add new setup entry
 448 *	@name: name of the device
 449 *	@map: configured settings for the device
 450 *
 451 *	Adds new setup entry to the dev_boot_setup list.  The function
 452 *	returns 0 on error and 1 on success.  This is a generic routine to
 453 *	all netdevices.
 454 */
 455static int netdev_boot_setup_add(char *name, struct ifmap *map)
 456{
 457	struct netdev_boot_setup *s;
 458	int i;
 459
 460	s = dev_boot_setup;
 461	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 462		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 463			memset(s[i].name, 0, sizeof(s[i].name));
 464			strlcpy(s[i].name, name, IFNAMSIZ);
 465			memcpy(&s[i].map, map, sizeof(s[i].map));
 466			break;
 467		}
 468	}
 469
 470	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 471}
 472
 473/**
 474 *	netdev_boot_setup_check	- check boot time settings
 475 *	@dev: the netdevice
 476 *
 477 * 	Check boot time settings for the device.
 478 *	The found settings are set for the device to be used
 479 *	later in the device probing.
 480 *	Returns 0 if no settings found, 1 if they are.
 481 */
 482int netdev_boot_setup_check(struct net_device *dev)
 483{
 484	struct netdev_boot_setup *s = dev_boot_setup;
 485	int i;
 486
 487	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 488		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 489		    !strcmp(dev->name, s[i].name)) {
 490			dev->irq 	= s[i].map.irq;
 491			dev->base_addr 	= s[i].map.base_addr;
 492			dev->mem_start 	= s[i].map.mem_start;
 493			dev->mem_end 	= s[i].map.mem_end;
 494			return 1;
 495		}
 496	}
 497	return 0;
 498}
 499
 500
 501/**
 502 *	netdev_boot_base	- get address from boot time settings
 503 *	@prefix: prefix for network device
 504 *	@unit: id for network device
 505 *
 506 * 	Check boot time settings for the base address of device.
 507 *	The found settings are set for the device to be used
 508 *	later in the device probing.
 509 *	Returns 0 if no settings found.
 510 */
 511unsigned long netdev_boot_base(const char *prefix, int unit)
 512{
 513	const struct netdev_boot_setup *s = dev_boot_setup;
 514	char name[IFNAMSIZ];
 515	int i;
 516
 517	sprintf(name, "%s%d", prefix, unit);
 518
 519	/*
 520	 * If device already registered then return base of 1
 521	 * to indicate not to probe for this interface
 522	 */
 523	if (__dev_get_by_name(&init_net, name))
 524		return 1;
 525
 526	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 527		if (!strcmp(name, s[i].name))
 528			return s[i].map.base_addr;
 529	return 0;
 530}
 531
 532/*
 533 * Saves at boot time configured settings for any netdevice.
 534 */
 535int __init netdev_boot_setup(char *str)
 536{
 537	int ints[5];
 538	struct ifmap map;
 539
 540	str = get_options(str, ARRAY_SIZE(ints), ints);
 541	if (!str || !*str)
 542		return 0;
 543
 544	/* Save settings */
 545	memset(&map, 0, sizeof(map));
 546	if (ints[0] > 0)
 547		map.irq = ints[1];
 548	if (ints[0] > 1)
 549		map.base_addr = ints[2];
 550	if (ints[0] > 2)
 551		map.mem_start = ints[3];
 552	if (ints[0] > 3)
 553		map.mem_end = ints[4];
 554
 555	/* Add new entry to the list */
 556	return netdev_boot_setup_add(str, &map);
 557}
 558
 559__setup("netdev=", netdev_boot_setup);
 560
 561/*******************************************************************************
 562
 563			    Device Interface Subroutines
 564
 565*******************************************************************************/
 566
 567/**
 568 *	__dev_get_by_name	- find a device by its name
 569 *	@net: the applicable net namespace
 570 *	@name: name to find
 571 *
 572 *	Find an interface by name. Must be called under RTNL semaphore
 573 *	or @dev_base_lock. If the name is found a pointer to the device
 574 *	is returned. If the name is not found then %NULL is returned. The
 575 *	reference counters are not incremented so the caller must be
 576 *	careful with locks.
 577 */
 578
 579struct net_device *__dev_get_by_name(struct net *net, const char *name)
 580{
 581	struct hlist_node *p;
 582
 583	hlist_for_each(p, dev_name_hash(net, name)) {
 584		struct net_device *dev
 585			= hlist_entry(p, struct net_device, name_hlist);
 586		if (!strncmp(dev->name, name, IFNAMSIZ))
 587			return dev;
 588	}
 589	return NULL;
 590}
 591
 592/**
 593 *	dev_get_by_name		- find a device by its name
 594 *	@net: the applicable net namespace
 595 *	@name: name to find
 596 *
 597 *	Find an interface by name. This can be called from any
 598 *	context and does its own locking. The returned handle has
 599 *	the usage count incremented and the caller must use dev_put() to
 600 *	release it when it is no longer needed. %NULL is returned if no
 601 *	matching device is found.
 602 */
 603
 604struct net_device *dev_get_by_name(struct net *net, const char *name)
 605{
 606	struct net_device *dev;
 607
 608	read_lock(&dev_base_lock);
 609	dev = __dev_get_by_name(net, name);
 610	if (dev)
 611		dev_hold(dev);
 612	read_unlock(&dev_base_lock);
 613	return dev;
 614}
 615
 616/**
 617 *	__dev_get_by_index - find a device by its ifindex
 618 *	@net: the applicable net namespace
 619 *	@ifindex: index of device
 620 *
 621 *	Search for an interface by index. Returns %NULL if the device
 622 *	is not found or a pointer to the device. The device has not
 623 *	had its reference counter increased so the caller must be careful
 624 *	about locking. The caller must hold either the RTNL semaphore
 625 *	or @dev_base_lock.
 626 */
 627
 628struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 629{
 630	struct hlist_node *p;
 631
 632	hlist_for_each(p, dev_index_hash(net, ifindex)) {
 633		struct net_device *dev
 634			= hlist_entry(p, struct net_device, index_hlist);
 635		if (dev->ifindex == ifindex)
 636			return dev;
 637	}
 638	return NULL;
 639}
 640
 641
 642/**
 643 *	dev_get_by_index - find a device by its ifindex
 644 *	@net: the applicable net namespace
 645 *	@ifindex: index of device
 646 *
 647 *	Search for an interface by index. Returns NULL if the device
 648 *	is not found or a pointer to the device. The device returned has
 649 *	had a reference added and the pointer is safe until the user calls
 650 *	dev_put to indicate they have finished with it.
 651 */
 652
 653struct net_device *dev_get_by_index(struct net *net, int ifindex)
 654{
 655	struct net_device *dev;
 656
 657	read_lock(&dev_base_lock);
 658	dev = __dev_get_by_index(net, ifindex);
 659	if (dev)
 660		dev_hold(dev);
 661	read_unlock(&dev_base_lock);
 662	return dev;
 663}
 664
 665/**
 666 *	dev_getbyhwaddr - find a device by its hardware address
 667 *	@net: the applicable net namespace
 668 *	@type: media type of device
 669 *	@ha: hardware address
 670 *
 671 *	Search for an interface by MAC address. Returns NULL if the device
 672 *	is not found or a pointer to the device. The caller must hold the
 673 *	rtnl semaphore. The returned device has not had its ref count increased
 674 *	and the caller must therefore be careful about locking
 675 *
 676 *	BUGS:
 677 *	If the API was consistent this would be __dev_get_by_hwaddr
 678 */
 679
 680struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
 681{
 682	struct net_device *dev;
 683
 684	ASSERT_RTNL();
 685
 686	for_each_netdev(net, dev)
 687		if (dev->type == type &&
 688		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 689			return dev;
 690
 691	return NULL;
 692}
 693
 694EXPORT_SYMBOL(dev_getbyhwaddr);
 695
 696struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 697{
 698	struct net_device *dev;
 699
 700	ASSERT_RTNL();
 701	for_each_netdev(net, dev)
 702		if (dev->type == type)
 703			return dev;
 704
 705	return NULL;
 706}
 707
 708EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 709
 710struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 711{
 712	struct net_device *dev;
 713
 714	rtnl_lock();
 715	dev = __dev_getfirstbyhwtype(net, type);
 716	if (dev)
 717		dev_hold(dev);
 718	rtnl_unlock();
 719	return dev;
 720}
 721
 722EXPORT_SYMBOL(dev_getfirstbyhwtype);
 723
 724/**
 725 *	dev_get_by_flags - find any device with given flags
 726 *	@net: the applicable net namespace
 727 *	@if_flags: IFF_* values
 728 *	@mask: bitmask of bits in if_flags to check
 729 *
 730 *	Search for any interface with the given flags. Returns NULL if a device
 731 *	is not found or a pointer to the device. The device returned has
 732 *	had a reference added and the pointer is safe until the user calls
 733 *	dev_put to indicate they have finished with it.
 734 */
 735
 736struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask)
 737{
 738	struct net_device *dev, *ret;
 739
 740	ret = NULL;
 741	read_lock(&dev_base_lock);
 742	for_each_netdev(net, dev) {
 743		if (((dev->flags ^ if_flags) & mask) == 0) {
 744			dev_hold(dev);
 745			ret = dev;
 746			break;
 747		}
 748	}
 749	read_unlock(&dev_base_lock);
 750	return ret;
 751}
 752
 753/**
 754 *	dev_valid_name - check if name is okay for network device
 755 *	@name: name string
 756 *
 *	Network device names need to be valid file names
 *	to allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 760 */
 761int dev_valid_name(const char *name)
 762{
 763	if (*name == '\0')
 764		return 0;
 765	if (strlen(name) >= IFNAMSIZ)
 766		return 0;
 767	if (!strcmp(name, ".") || !strcmp(name, ".."))
 768		return 0;
 769
 770	while (*name) {
 771		if (*name == '/' || isspace(*name))
 772			return 0;
 773		name++;
 774	}
 775	return 1;
 776}
 777
 778/**
 779 *	__dev_alloc_name - allocate a name for a device
 780 *	@net: network namespace to allocate the device name in
 781 *	@name: name format string
 782 *	@buf:  scratch buffer and result name string
 783 *
 784 *	Passed a format string - eg "lt%d" it will try and find a suitable
 785 *	id. It scans list of devices to build up a free map, then chooses
 786 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 787 *	while allocating the name and adding the device in order to avoid
 788 *	duplicates.
 789 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 790 *	Returns the number of the unit assigned or a negative errno code.
 791 */
 792
 793static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 794{
 795	int i = 0;
 796	const char *p;
 797	const int max_netdevices = 8*PAGE_SIZE;
 798	unsigned long *inuse;
 799	struct net_device *d;
 800
 801	p = strnchr(name, IFNAMSIZ-1, '%');
 802	if (p) {
 803		/*
 804		 * Verify the string as this thing may have come from
 805		 * the user.  There must be either one "%d" and no other "%"
 806		 * characters.
 807		 */
 808		if (p[1] != 'd' || strchr(p + 2, '%'))
 809			return -EINVAL;
 810
 811		/* Use one page as a bit array of possible slots */
 812		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 813		if (!inuse)
 814			return -ENOMEM;
 815
 816		for_each_netdev(net, d) {
 817			if (!sscanf(d->name, name, &i))
 818				continue;
 819			if (i < 0 || i >= max_netdevices)
 820				continue;
 821
 822			/*  avoid cases where sscanf is not exact inverse of printf */
 823			snprintf(buf, IFNAMSIZ, name, i);
 824			if (!strncmp(buf, d->name, IFNAMSIZ))
 825				set_bit(i, inuse);
 826		}
 827
 828		i = find_first_zero_bit(inuse, max_netdevices);
 829		free_page((unsigned long) inuse);
 830	}
 831
 832	snprintf(buf, IFNAMSIZ, name, i);
 833	if (!__dev_get_by_name(net, buf))
 834		return i;
 835
 836	/* It is possible to run out of possible slots
 837	 * when the name is long and there isn't enough space left
 838	 * for the digits, or if all bits are used.
 839	 */
 840	return -ENFILE;
 841}
 842
 843/**
 844 *	dev_alloc_name - allocate a name for a device
 845 *	@dev: device
 846 *	@name: name format string
 847 *
 848 *	Passed a format string - eg "lt%d" it will try and find a suitable
 849 *	id. It scans list of devices to build up a free map, then chooses
 850 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 851 *	while allocating the name and adding the device in order to avoid
 852 *	duplicates.
 853 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 854 *	Returns the number of the unit assigned or a negative errno code.
 855 */
 856
 857int dev_alloc_name(struct net_device *dev, const char *name)
 858{
 859	char buf[IFNAMSIZ];
 860	struct net *net;
 861	int ret;
 862
 863	BUG_ON(!dev_net(dev));
 864	net = dev_net(dev);
 865	ret = __dev_alloc_name(net, name, buf);
 866	if (ret >= 0)
 867		strlcpy(dev->name, buf, IFNAMSIZ);
 868	return ret;
 869}
 870
 871
 872/**
 873 *	dev_change_name - change name of a device
 874 *	@dev: device
 875 *	@newname: name (or format string) must be at least IFNAMSIZ
 876 *
 877 *	Change name of a device, can pass format strings "eth%d".
 878 *	for wildcarding.
 879 */
 880int dev_change_name(struct net_device *dev, const char *newname)
 881{
 882	char oldname[IFNAMSIZ];
 883	int err = 0;
 884	int ret;
 885	struct net *net;
 886
 887	ASSERT_RTNL();
 888	BUG_ON(!dev_net(dev));
 889
 890	net = dev_net(dev);
 891	if (dev->flags & IFF_UP)
 892		return -EBUSY;
 893
 894	if (!dev_valid_name(newname))
 895		return -EINVAL;
 896
 897	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 898		return 0;
 899
 900	memcpy(oldname, dev->name, IFNAMSIZ);
 901
 902	if (strchr(newname, '%')) {
 903		err = dev_alloc_name(dev, newname);
 904		if (err < 0)
 905			return err;
 906	}
 907	else if (__dev_get_by_name(net, newname))
 908		return -EEXIST;
 909	else
 910		strlcpy(dev->name, newname, IFNAMSIZ);
 911
 912rollback:
 913	/* For now only devices in the initial network namespace
 914	 * are in sysfs.
 915	 */
 916	if (net == &init_net) {
 917		ret = device_rename(&dev->dev, dev->name);
 918		if (ret) {
 919			memcpy(dev->name, oldname, IFNAMSIZ);
 920			return ret;
 921		}
 922	}
 923
 924	write_lock_bh(&dev_base_lock);
 925	hlist_del(&dev->name_hlist);
 926	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 927	write_unlock_bh(&dev_base_lock);
 928
 929	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
 930	ret = notifier_to_errno(ret);
 931
 932	if (ret) {
 933		if (err) {
 934			printk(KERN_ERR
 935			       "%s: name change rollback failed: %d.\n",
 936			       dev->name, ret);
 937		} else {
 938			err = ret;
 939			memcpy(dev->name, oldname, IFNAMSIZ);
 940			goto rollback;
 941		}
 942	}
 943
 944	return err;
 945}
 946
 947/**
 948 *	dev_set_alias - change ifalias of a device
 949 *	@dev: device
 950 *	@alias: name up to IFALIASZ
 951 *	@len: limit of bytes to copy from info
 952 *
 953 *	Set ifalias for a device,
 954 */
 955int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
 956{
 957	ASSERT_RTNL();
 958
 959	if (len >= IFALIASZ)
 960		return -EINVAL;
 961
 962	if (!len) {
 963		if (dev->ifalias) {
 964			kfree(dev->ifalias);
 965			dev->ifalias = NULL;
 966		}
 967		return 0;
 968	}
 969
 970	dev->ifalias = krealloc(dev->ifalias, len+1, GFP_KERNEL);
 971	if (!dev->ifalias)
 972		return -ENOMEM;
 973
 974	strlcpy(dev->ifalias, alias, len+1);
 975	return len;
 976}
 977
 978
 979/**
 980 *	netdev_features_change - device changes features
 981 *	@dev: device to cause notification
 982 *
 983 *	Called to indicate a device has changed features.
 984 */
 985void netdev_features_change(struct net_device *dev)
 986{
 987	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
 988}
 989EXPORT_SYMBOL(netdev_features_change);
 990
 991/**
 992 *	netdev_state_change - device changes state
 993 *	@dev: device to cause notification
 994 *
 995 *	Called to indicate a device has changed state. This function calls
 996 *	the notifier chains for netdev_chain and sends a NEWLINK message
 997 *	to the routing socket.
 998 */
 999void netdev_state_change(struct net_device *dev)
1000{
1001	if (dev->flags & IFF_UP) {
1002		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1003		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1004	}
1005}
1006
/*
 * Run a NETDEV_BONDING_FAILOVER event for @dev through the netdev
 * notifier chain.  The notifier result is discarded.
 */
void netdev_bonding_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);
1012
1013/**
1014 *	dev_load 	- load a network module
1015 *	@net: the applicable net namespace
1016 *	@name: name of interface
1017 *
1018 *	If a network interface is not present and the process has suitable
1019 *	privileges this function loads the module. If module loading is not
1020 *	available in this kernel then it becomes a nop.
1021 */
1022
1023void dev_load(struct net *net, const char *name)
1024{
1025	struct net_device *dev;
1026
1027	read_lock(&dev_base_lock);
1028	dev = __dev_get_by_name(net, name);
1029	read_unlock(&dev_base_lock);
1030
1031	if (!dev && capable(CAP_SYS_MODULE))
1032		request_module("%s", name);
1033}
1034
1035/**
1036 *	dev_open	- prepare an interface for use.
1037 *	@dev:	device to open
1038 *
1039 *	Takes a device from down to up state. The device's private open
1040 *	function is invoked and then the multicast lists are loaded. Finally
1041 *	the device is moved into the up state and a %NETDEV_UP message is
1042 *	sent to the netdev notifier chain.
1043 *
1044 *	Calling this function on an active interface is a nop. On a failure
1045 *	a negative errno code is returned.
1046 */
1047int dev_open(struct net_device *dev)
1048{
1049	const struct net_device_ops *ops = dev->netdev_ops;
1050	int ret = 0;
1051
1052	ASSERT_RTNL();
1053
1054	/*
1055	 *	Is it already up?
1056	 */
1057
1058	if (dev->flags & IFF_UP)
1059		return 0;
1060
1061	/*
1062	 *	Is it even present?
1063	 */
1064	if (!netif_device_present(dev))
1065		return -ENODEV;
1066
1067	/*
1068	 *	Call device private open method
1069	 */
1070	set_bit(__LINK_STATE_START, &dev->state);
1071
1072	if (ops->ndo_validate_addr)
1073		ret = ops->ndo_validate_addr(dev);
1074
1075	if (!ret && ops->ndo_open)
1076		ret = ops->ndo_open(dev);
1077
1078	/*
1079	 *	If it went open OK then:
1080	 */
1081
1082	if (ret)
1083		clear_bit(__LINK_STATE_START, &dev->state);
1084	else {
1085		/*
1086		 *	Set the flags.
1087		 */
1088		dev->flags |= IFF_UP;
1089
1090		/*
1091		 *	Enable NET_DMA
1092		 */
1093		dmaengine_get();
1094
1095		/*
1096		 *	Initialize multicasting status
1097		 */
1098		dev_set_rx_mode(dev);
1099
1100		/*
1101		 *	Wakeup transmit queue engine
1102		 */
1103		dev_activate(dev);
1104
1105		/*
1106		 *	... and announce new interface.
1107		 */
1108		call_netdevice_notifiers(NETDEV_UP, dev);
1109	}
1110
1111	return ret;
1112}
1113
/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 *
 *	Calling this function on a device that is not up is a nop.
 *	Always returns 0.  RTNL must be held and the caller must be
 *	allowed to sleep.
 */
int dev_close(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	ASSERT_RTNL();

	might_sleep();

	if (!(dev->flags & IFF_UP))
		return 0;

	/*
	 *	Tell people we are going down, so that they can
	 *	prepare for it while the device is still operating.
	 */
	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

	clear_bit(__LINK_STATE_START, &dev->state);

	/* Synchronize to scheduled poll. We cannot touch poll list,
	 * it can be even on different cpu. So just clear netif_running().
	 *
	 * dev->stop() will invoke napi_disable() on all of its
	 * napi_struct instances on this device.
	 */
	smp_mb__after_clear_bit(); /* Commit netif_running(). */

	dev_deactivate(dev);

	/*
	 *	Call the device specific close. Its return value is
	 *	ignored here.  Only reached if the device is UP.
	 *
	 *	We allow it to be called even after a DETACH hot-plug
	 *	event.
	 */
	if (ops->ndo_stop)
		ops->ndo_stop(dev);

	/*
	 *	Device is now down.
	 */

	dev->flags &= ~IFF_UP;

	/*
	 * Tell people we are down
	 */
	call_netdevice_notifiers(NETDEV_DOWN, dev);

	/*
	 *	Shutdown NET_DMA
	 */
	dmaengine_put();

	return 0;
}
1179
1180
1181/**
1182 *	dev_disable_lro - disable Large Receive Offload on a device
1183 *	@dev: device
1184 *
1185 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1186 *	called under RTNL.  This is needed if received packets may be
1187 *	forwarded to another interface.
1188 */
1189void dev_disable_lro(struct net_device *dev)
1190{
1191	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1192	    dev->ethtool_ops->set_flags) {
1193		u32 flags = dev->ethtool_ops->get_flags(dev);
1194		if (flags & ETH_FLAG_LRO) {
1195			flags &= ~ETH_FLAG_LRO;
1196			dev->ethtool_ops->set_flags(dev, flags);
1197		}
1198	}
1199	WARN_ON(dev->features & NETIF_F_LRO);
1200}
1201EXPORT_SYMBOL(dev_disable_lro);
1202
1203
/*
 * Starts out as 1.  While non-zero, register_netdevice_notifier() does
 * not replay events for existing devices to a newly added notifier.
 */
static int dev_boot_phase = 1;
1205
1206/*
1207 *	Device change register/unregister. These are not inline or static
1208 *	as we export them to the world.
1209 */
1210
1211/**
1212 *	register_netdevice_notifier - register a network notifier block
1213 *	@nb: notifier
1214 *
1215 *	Register a notifier to be called when network device events occur.
1216 *	The notifier passed is linked into the kernel structures and must
1217 *	not be reused until it has been unregistered. A negative errno code
1218 *	is returned on a failure.
1219 *
1220 * 	When registered all registration and up events are replayed
1221 *	to the new notifier to allow device to have a race free
1222 *	view of the network device list.
1223 */
1224
1225int register_netdevice_notifier(struct notifier_block *nb)
1226{
1227	struct net_device *dev;
1228	struct net_device *last;
1229	struct net *net;
1230	int err;
1231
1232	rtnl_lock();
1233	err = raw_notifier_chain_register(&netdev_chain, nb);
1234	if (err)
1235		goto unlock;
1236	if (dev_boot_phase)
1237		goto unlock;
1238	for_each_net(net) {
1239		for_each_netdev(net, dev) {
1240			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1241			err = notifier_to_errno(err);
1242			if (err)
1243				goto rollback;
1244
1245			if (!(dev->flags & IFF_UP))
1246				continue;
1247
1248			nb->notifier_call(nb, NETDEV_UP, dev);
1249		}
1250	}
1251
1252unlock:
1253	rtnl_unlock();
1254	return err;
1255
1256rollback:
1257	last = dev;
1258	for_each_net(net) {
1259		for_each_netdev(net, dev) {
1260			if (dev == last)
1261				break;
1262
1263			if (dev->flags & IFF_UP) {
1264				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1265				nb->notifier_call(nb, NETDEV_DOWN, dev);
1266			}
1267			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1268		}
1269	}
1270
1271	raw_notifier_chain_unregister(&netdev_chain, nb);
1272	goto unlock;
1273}
1274
/**
 *	unregister_netdevice_notifier - unregister a network notifier block
 *	@nb: notifier
 *
 *	Unregister a notifier previously registered by
 *	register_netdevice_notifier(). The notifier is unlinked from the
 *	kernel structures and may then be reused. A negative errno code
 *	is returned on a failure.
 *
 *	Takes and releases the RTNL lock around the removal.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
	int err;

	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	rtnl_unlock();
	return err;
}
1294
/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *      @val: value passed unmodified to notifier function
 *      @dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks registered on netdev_chain.
 *	Parameters and return value are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	return raw_notifier_call_chain(&netdev_chain, val, dev);
}
1308
/*
 * When > 0 there are consumers of rx skb time stamps.  Starts at zero
 * and is adjusted by net_enable_timestamp()/net_disable_timestamp().
 */
static atomic_t netstamp_needed = ATOMIC_INIT(0);
1311
/* Account for one more consumer of skb time stamps. */
void net_enable_timestamp(void)
{
	atomic_inc(&netstamp_needed);
}
1316
/* Drop one consumer of skb time stamps. */
void net_disable_timestamp(void)
{
	atomic_dec(&netstamp_needed);
}
1321
1322static inline void net_timestamp(struct sk_buff *skb)
1323{
1324	if (atomic_read(&netstamp_needed))
1325		__net_timestamp(skb);
1326	else
1327		skb->tstamp.tv64 = 0;
1328}
1329
1330/*
1331 *	Support routine. Sends outgoing frames to any network
1332 *	taps currently in use.
1333 */
1334
1335static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1336{
1337	struct packet_type *ptype;
1338
1339	net_timestamp(skb);
1340
1341	rcu_read_lock();
1342	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1343		/* Never send packets back to the socket
1344		 * they originated from - MvS (miquels@drinkel.ow.org)
1345		 */
1346		if ((ptype->dev == dev || !ptype->dev) &&
1347		    (ptype->af_packet_priv == NULL ||
1348		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1349			struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1350			if (!skb2)
1351				break;
1352
1353			/* skb->nh should be correctly
1354			   set by sender, so that the second statement is
1355			   just protection against buggy protocols.
1356			 */
1357			skb_reset_mac_header(skb2);
1358
1359			if (skb_network_header(skb2) < skb2->data ||
1360			    skb2->network_header > skb2->tail) {
1361				if (net_ratelimit())
1362					printk(KERN_CRIT "protocol %04x is "
1363					       "buggy, dev %s\n",
1364					       skb2->protocol, dev->name);
1365				skb_reset_network_header(skb2);
1366			}
1367
1368			skb2->transport_header = skb2->network_header;
1369			skb2->pkt_type = PACKET_OUTGOING;
1370			ptype->func(skb2, skb->dev, ptype, skb->dev);
1371		}
1372	}
1373	rcu_read_unlock();
1374}
1375
1376
1377static inline void __netif_reschedule(struct Qdisc *q)
1378{
1379	struct softnet_data *sd;
1380	unsigned long flags;
1381
1382	local_irq_save(flags);
1383	sd = &__get_cpu_var(softnet_data);
1384	q->next_sched = sd->output_queue;
1385	sd->output_queue = q;
1386	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1387	local_irq_restore(flags);
1388}
1389
1390void __netif_schedule(struct Qdisc *q)
1391{
1392	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1393		__netif_reschedule(q);
1394}
1395EXPORT_SYMBOL(__netif_schedule);
1396
1397void dev_kfree_skb_irq(struct sk_buff *skb)
1398{
1399	if (atomic_dec_and_test(&skb->users)) {
1400		struct softnet_data *sd;
1401		unsigned long flags;
1402
1403		local_irq_save(flags);
1404		sd = &__get_cpu_var(softnet_data);
1405		skb->next = sd->completion_queue;
1406		sd->completion_queue = skb;
1407		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1408		local_irq_restore(flags);
1409	}
1410}
1411EXPORT_SYMBOL(dev_kfree_skb_irq);
1412
/*
 * Free @skb from any context: use dev_kfree_skb_irq() in hard interrupt
 * context or with interrupts disabled, dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (in_irq() || irqs_disabled()) {
		dev_kfree_skb_irq(skb);
		return;
	}

	dev_kfree_skb(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1421
1422
1423/**
1424 * netif_device_detach - mark device as removed
1425 * @dev: network device
1426 *
1427 * Mark device as removed from system and therefore no longer available.
1428 */
1429void netif_device_detach(struct net_device *dev)
1430{
1431	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1432	    netif_running(dev)) {
1433		netif_stop_queue(dev);
1434	}
1435}
1436EXPORT_SYMBOL(netif_device_detach);
1437
1438/**
1439 * netif_device_attach - mark device as attached
1440 * @dev: network device
1441 *
1442 * Mark device as attached from system and restart if needed.
1443 */
1444void netif_device_attach(struct net_device *dev)
1445{
1446	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1447	    netif_running(dev)) {
1448		netif_wake_queue(dev);
1449		__netdev_watchdog_up(dev);
1450	}
1451}
1452EXPORT_SYMBOL(netif_device_attach);
1453
1454static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1455{
1456	return ((features & NETIF_F_GEN_CSUM) ||
1457		((features & NETIF_F_IP_CSUM) &&
1458		 protocol == htons(ETH_P_IP)) ||
1459		((features & NETIF_F_IPV6_CSUM) &&
1460		 protocol == htons(ETH_P_IPV6)));
1461}
1462
1463static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1464{
1465	if (can_checksum_protocol(dev->features, skb->protocol))
1466		return true;
1467
1468	if (skb->protocol == htons(ETH_P_8021Q)) {
1469		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1470		if (can_checksum_protocol(dev->features & dev->vlan_features,
1471					  veh->h_vlan_encapsulated_proto))
1472			return true;
1473	}
1474
1475	return false;
1476}
1477
/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 *
 * For a CHECKSUM_COMPLETE skb, or one that still has to go through GSO,
 * only ip_summed is reset to CHECKSUM_NONE.  Otherwise the checksum is
 * computed from csum_start to the end of the packet and the folded
 * result is stored at csum_offset behind csum_start.
 *
 * Returns 0 on success, or the error from pskb_expand_head() when a
 * private copy of the header was needed and could not be made; in that
 * case ip_summed is left unchanged.
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	if (unlikely(skb_shinfo(skb)->gso_size)) {
		/* Let GSO fix up the checksum. */
		goto out_set_summed;
	}

	/* csum_start is counted from skb->head; make it relative to data. */
	offset = skb->csum_start - skb_headroom(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	/* From here on, offset is where the checksum field itself lives. */
	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	/* The field is about to be written: unshare the header if needed. */
	if (skb_cloned(skb) &&
	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}
1515
/**
 *	skb_gso_segment - Perform segmentation on skb.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *
 *	This function segments the given skb and returns a list of segments.
 *
 *	It may return NULL if the skb requires no segmentation.  This is
 *	only possible when GSO is used for verifying header integrity.
 *
 *	On failure an ERR_PTR() is returned; it is -EPROTONOSUPPORT when
 *	no protocol handler with a gso_segment method is registered for
 *	the skb's protocol.
 */
struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
{
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_type *ptype;
	__be16 type = skb->protocol;
	int err;

	/* Record the MAC header and step past it to the network header. */
	skb_reset_mac_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;
	__skb_pull(skb, skb->mac_len);

	/* Not CHECKSUM_PARTIAL: warn, and unshare the header if cloned. */
	if (WARN_ON(skb->ip_summed != CHECKSUM_PARTIAL)) {
		if (skb_header_cloned(skb) &&
		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
			return ERR_PTR(err);
	}

	rcu_read_lock();
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
				err = ptype->gso_send_check(skb);
				segs = ERR_PTR(err);
				/*
				 * Stop on error, or (with segs == NULL) when
				 * skb_gso_ok() reports no segmentation is
				 * needed for these features.
				 */
				if (err || skb_gso_ok(skb, features))
					break;
				/* Move data back to the network header. */
				__skb_push(skb, (skb->data -
						 skb_network_header(skb)));
			}
			segs = ptype->gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	/* Restore data to point at the MAC header again. */
	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}

EXPORT_SYMBOL(skb_gso_segment);
1567
1568/* Take action when hardware reception checksum errors are detected. */
1569#ifdef CONFIG_BUG
1570void netdev_rx_csum_fault(struct net_device *dev)
1571{
1572	if (net_ratelimit()) {
1573		printk(KERN_ERR "%s: hw csum failure.\n",
1574			dev ? dev->name : "<unknown>");
1575		dump_stack();
1576	}
1577}
1578EXPORT_SYMBOL(netdev_rx_csum_fault);
1579#endif
1580
/*
 * Return 1 when @skb carries a page fragment in high memory while @dev
 * lacks NETIF_F_HIGHDMA, 0 otherwise.  Without CONFIG_HIGHMEM the
 * answer is always 0.
 *
 * Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */

static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		int frag;

		for (frag = 0; frag < skb_shinfo(skb)->nr_frags; frag++) {
			if (PageHighMem(skb_shinfo(skb)->frags[frag].page))
				return 1;
		}
	}
#endif
	return 0;
}
1601
/*
 * Data kept in skb->cb of an skb that carries software GSO segments:
 * the destructor the skb had before dev_gso_segment() installed
 * dev_gso_skb_destructor in its place.
 */
struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

/* View the control buffer of @skb as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1607
1608static void dev_gso_skb_destructor(struct sk_buff *skb)
1609{
1610	struct dev_gso_cb *cb;
1611
1612	do {
1613		struct sk_buff *nskb = skb->next;
1614
1615		skb->next = nskb->next;
1616		nskb->next = NULL;
1617		kfree_skb(nskb);
1618	} while (skb->next);
1619
1620	cb = DEV_GSO_CB(skb);
1621	if (cb->destructor)
1622		cb->destructor(skb);
1623}
1624
1625/**
1626 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1627 *	@skb: buffer to segment
1628 *
1629 *	This function segments the given skb and stores the list of segments
1630 *	in skb->next.
1631 */
1632static int dev_gso_segment(struct sk_buff *skb)
1633{
1634	struct net_device *dev = skb->dev;
1635	struct sk_buff *segs;
1636	int features = dev->features & ~(illegal_highdma(dev, skb) ?
1637					 NETIF_F_SG : 0);
1638
1639	segs = skb_gso_segment(skb, features);
1640
1641	/* Verifying header integrity only. */
1642	if (!segs)
1643		return 0;
1644
1645	if (IS_ERR(segs))
1646		return PTR_ERR(segs);
1647
1648	skb->next = segs;
1649	DEV_GSO_CB(skb)->destructor = skb->destructor;
1650	skb->destructor = dev_gso_skb_destructor;
1651
1652	return 0;
1653}
1654
/*
 * Hand @skb to the driver of @dev for transmission on queue @txq.
 *
 * An skb without chained segments is first shown to the network taps
 * (if any), then segmented in software when the device needs GSO for
 * it, and otherwise passed straight to ndo_start_xmit().
 *
 * An skb with segments chained off skb->next has those segments sent
 * one by one.  When the driver refuses a segment, that segment is put
 * back at the head of the chain and the driver's code is returned.
 * When the queue gets stopped with segments left, NETDEV_TX_BUSY is
 * returned.  Once every segment is sent, the original destructor is
 * restored and the skb is freed.
 *
 * Returns 0 when the skb was consumed (this includes the skb being
 * dropped because software segmentation failed), otherwise a non-zero
 * code as described above.
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
			struct netdev_queue *txq)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	prefetch(&dev->netdev_ops->ndo_start_xmit);
	if (likely(!skb->next)) {
		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(skb, dev);

		if (netif_needs_gso(dev, skb)) {
			if (unlikely(dev_gso_segment(skb)))
				goto out_kfree_skb;
			/* Segments were produced: send them one by one. */
			if (skb->next)
				goto gso;
		}

		return ops->ndo_start_xmit(skb, dev);
	}

gso:
	do {
		struct sk_buff *nskb = skb->next;
		int rc;

		/* Detach the first segment from the chain before sending. */
		skb->next = nskb->next;
		nskb->next = NULL;
		rc = ops->ndo_start_xmit(nskb, dev);
		if (unlikely(rc)) {
			/* Refused: put the segment back at the head. */
			nskb->next = skb->next;
			skb->next = nskb;
			return rc;
		}
		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
			return NETDEV_TX_BUSY;
	} while (skb->next);

	/* All segments are gone: restore the original destructor. */
	skb->destructor = DEV_GSO_CB(skb)->destructor;

out_kfree_skb:
	kfree_skb(skb);
	return 0;
}
1698
1699static u32 simple_tx_hashrnd;
1700static int simple_tx_hashrnd_initialized = 0;
1701
1702static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb)
1703{
1704	u32 addr1, addr2, ports;
1705	u32 hash, ihl;
1706	u8 ip_proto = 0;
1707
1708	if (unlikely(!simple_tx_hashrnd_initialized)) {
1709		get_random_bytes(&simple_tx_hashrnd, 4);
1710		simple_tx_hashrnd_initialized = 1;
1711	}
1712
1713	switch (skb->protocol) {
1714	case htons(ETH_P_IP):
1715		if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)))
1716			ip_proto = ip_hdr(skb)->protocol;
1717		addr1 = ip_hdr(skb)->saddr;
1718		addr2 = ip_hdr(skb)->daddr;
1719		ihl = ip_hdr(skb)->ihl;
1720		break;
1721	case htons(ETH_P_IPV6):
1722		ip_proto = ipv6_hdr(skb)->nexthdr;
1723		addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
1724		addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
1725		ihl = (40 >> 2);
1726		break;
1727	default:
1728		return 0;
1729	}
1730
1731
1732	switch (ip_proto) {
1733	case IPPROTO_TCP:
1734	case IPPROTO_UDP:
1735	case IPPROTO_DCCP:
1736	case IPPROTO_ESP:
1737	case IPPROTO_AH:
1738	case IPPROTO_SCTP:
1739	case IPPROTO_UDPLITE:
1740		ports = *((u32 *) (skb_network_header(skb) + (ihl * 4)));
1741		break;
1742
1743	default:
1744		ports = 0;
1745		break;
1746	}
1747
1748	hash = jhash_3words(addr1, addr2, ports, simple_tx_hashrnd);
1749
1750	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
1751}
1752
1753static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1754					struct sk_buff *skb)
1755{
1756	const struct net_device_ops *ops = dev->netdev_ops;
1757	u16 queue_index = 0;
1758
1759	if (ops->ndo_select_queue)
1760		queue_index = ops->ndo_select_queue(dev, skb);
1761	else if (dev->real_num_tx_queues > 1)
1762		queue_index = simple_tx_hash(dev, skb);
1763
1764	skb_set_queue_mapping(skb, queue_index);
1765	return netdev_get_tx_queue(dev, queue_index);
1766}
1767
/**
 *	dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 *	In this function, -ENOMEM is returned when the skb could not be
 *	linearized or checksummed, and -ENETDOWN when a queueless device
 *	is not up or did not take the packet.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 */
int dev_queue_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;

	/* GSO will handle the following emulations directly. */
	if (netif_needs_gso(dev, skb))
		goto gso;

	/* Linearize a frag_list skb the device cannot handle. */
	if (skb_shinfo(skb)->frag_list &&
	    !(dev->features & NETIF_F_FRAGLIST) &&
	    __skb_linearize(skb))
		goto out_kfree_skb;

	/* Fragmented skb is linearized if device does not support SG,
	 * or if at least one of fragments is in highmem and device
	 * does not support DMA from it.
	 */
	if (skb_shinfo(skb)->nr_frags &&
	    (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
	    __skb_linearize(skb))
		goto out_kfree_skb;

	/* If packet is not checksummed and device does not support
	 * checksumming for this protocol, complete checksumming here.
	 */
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		skb_set_transport_header(skb, skb->csum_start -
					      skb_headroom(skb));
		if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
			goto out_kfree_skb;
	}

gso:
	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	txq = dev_pick_tx(dev, skb);
	q = rcu_dereference(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
#endif
	if (q->enqueue) {
		spinlock_t *root_lock = qdisc_lock(q);

		spin_lock(root_lock);

		/* A deactivated qdisc takes no packets: drop. */
		if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
			kfree_skb(skb);
			rc = NET_XMIT_DROP;
		} else {
			rc = qdisc_enqueue_root(skb, q);
			qdisc_run(q);
		}
		spin_unlock(root_lock);

		goto out;
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all sorts of tunnels...

	   Really, it is unlikely that netif_tx_lock protection is necessary
	   here (e.g. loopback and IP tunnels are clean, ignoring statistics
	   counters).
	   However, it is possible that they rely on the protection
	   we provide here.

	   So check for that and take the lock. This cannot deadlock.
	   Alternatively, get rid of the noqueue qdisc, which would be
	   even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		/* Owner == this CPU means we re-entered from the driver. */
		if (txq->xmit_lock_owner != cpu) {

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_tx_queue_stopped(txq)) {
				rc = 0;
				if (!dev_hard_start_xmit(skb, dev, txq)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			if (net_ratelimit())
				printk(KERN_CRIT "Virtual device %s asks to "
				       "queue packet!\n", dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately */
			if (net_ratelimit())
				printk(KERN_CRIT "Dead loop on virtual device "
				       "%s, fix it urgently!\n", dev->name);
		}
	}

	/* Nothing took the packet: report the device as down and drop. */
	rc = -ENETDOWN;
	rcu_read_unlock_bh();

out_kfree_skb:
	kfree_skb(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
1906
1907
1908/*=======================================================================
1909			Receiver routines
1910  =======================================================================*/
1911
/* netif_rx() drops packets once a CPU's input queue is longer than this. */
int netdev_max_backlog __read_mostly = 1000;
/* NOTE(review): not used in this part of the file; presumably a softirq
 * packet budget - confirm against its users. */
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */

/* Per-CPU receive statistics (netif_rx() counts total and dropped). */
DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1917
1918
1919/**
1920 *	netif_rx	-	post buffer to the network code
1921 *	@skb: buffer to post
1922 *
1923 *	This function receives a packet from a device driver and queues it for
1924 *	the upper (protocol) levels to process.  It always succeeds. The buffer
1925 *	may be dropped during processing for congestion control or by the
1926 *	protocol layers.
1927 *
1928 *	return values:
1929 *	NET_RX_SUCCESS	(no congestion)
1930 *	NET_RX_DROP     (packet was dropped)
1931 *
1932 */
1933
1934int netif_rx(struct sk_buff *skb)
1935{
1936	struct softnet_data *queue;
1937	unsigned long flags;
1938
1939	/* if netpoll wants it, pretend we never saw it */
1940	if (netpoll_rx(skb))
1941		return NET_RX_DROP;
1942
1943	if (!skb->tstamp.tv64)
1944		net_timestamp(skb);
1945
1946	/*
1947	 * The code is rearranged so that the path is the most
1948	 * short when CPU is congested, but is still operating.
1949	 */
1950	local_irq_save(flags);
1951	queue = &__get_cpu_var(softnet_data);
1952
1953	__get_cpu_var(netdev_rx_stat).total++;
1954	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1955		if (queue->input_pkt_queue.qlen) {
1956enqueue:
1957			__skb_queue_tail(&queue->input_pkt_queue, skb);
1958			local_irq_restore(flags);
1959			return NET_RX_SUCCESS;
1960		}
1961
1962		napi_schedule(&queue->backlog);
1963		goto enqueue;
1964	}
1965
1966	__get_cpu_var(netdev_rx_stat).dropped++;
1967	local_irq_restore(flags);
1968
1969	kfree_skb(skb);
1970	return NET_RX_DROP;
1971}
1972
/*
 * Variant of netif_rx() that, with preemption disabled, also runs any
 * softirqs left pending after the packet was queued.  Returns what
 * netif_rx() returned.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int verdict;

	preempt_disable();

	verdict = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return verdict;
}

EXPORT_SYMBOL(netif_rx_ni);
1987
/*
 * NET_TX softirq handler.  Works on this CPU's softnet data in two
 * steps: free the skbs parked on the completion queue, then run the
 * qdiscs queued on the output queue.
 */
static void net_tx_action(struct softirq_action *h)
{
	struct softnet_data *sd = &__get_cpu_var(softnet_data);

	if (sd->completion_queue) {
		struct sk_buff *clist;

		/* Detach the whole list with interrupts off. */
		local_irq_disable();
		clist = sd->completion_queue;
		sd->completion_queue = NULL;
		local_irq_enable();

		while (clist) {
			struct sk_buff *skb = clist;
			clist = clist->next;

			/* Only skbs whose last reference is gone belong here. */
			WARN_ON(atomic_read(&skb->users));
			__kfree_skb(skb);
		}
	}

	if (sd->output_queue) {
		struct Qdisc *head;

		/* Detach the whole list with interrupts off. */
		local_irq_disable();
		head = sd->output_queue;
		sd->output_queue = NULL;
		local_irq_enable();

		while (head) {
			struct Qdisc *q = head;
			spinlock_t *root_lock;

			head = head->next_sched;

			root_lock = qdisc_lock(q);
			if (spin_trylock(root_lock)) {
				/* Clear SCHED before running the qdisc. */
				smp_mb__before_clear_bit();
				clear_bit(__QDISC_STATE_SCHED,
					  &q->state);
				qdisc_run(q);
				spin_unlock(root_lock);
			} else {
				/*
				 * Lock is busy: queue the qdisc again,
				 * unless it has been deactivated, in
				 * which case just drop the SCHED bit.
				 */
				if (!test_bit(__QDISC_STATE_DEACTIVATED,
					      &q->state)) {
					__netif_reschedule(q);
				} else {
					smp_mb__before_clear_bit();
					clear_bit(__QDISC_STATE_SCHED,
						  &q->state);
				}
			}
		}
	}
}
2043
/*
 * Pass @skb to the handler of @pt_prev, taking an extra reference on
 * the skb first.  Returns whatever the handler returns.
 */
static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	atomic_inc(&skb->users);
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
2051
2052#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
/* These hooks are defined here for ATM */
struct net_bridge;
struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
						unsigned char *addr);
void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly;

/*
 * Bridging hook, called by handle_bridge() for packets received on a
 * device that is a bridge port.
 *  Returns NULL if the packet was consumed.
 */
struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
					struct sk_buff *skb) __read_mostly;
2065static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
2066					    struct packet_type **pt_prev, int *ret,
2067					    struct net_device *orig_dev)
2068{
2069	struct net_bridge_port *port;
2070
2071	if (skb->pkt_type == PACKET_LOOPBACK ||
2072	    (port = rcu_dereference(skb->dev->br_port)) == NULL)
2073		return skb;
2074
2075	if (*pt_prev) {
2076		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2077		*pt_prev = NULL;
2078	}
2079
2080	return br_handle_frame_hook(port, skb);
2081}
2082#else
2083#define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
2084#endif
2085
2086#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
/*
 * macvlan receive hook, called by handle_macvlan() for packets received
 * on a device that has a macvlan port.
 */
struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
2089
2090static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
2091					     struct packet_type **pt_prev,
2092					     int *ret,
2093					     struct net_device *orig_dev)
2094{
2095	if (skb->dev->macvlan_port == NULL)
2096		return skb;
2097
2098	if (*pt_prev) {
2099		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2100		*pt_prev = NULL;
2101	}
2102	return macvlan_handle_frame_hook(skb);
2103}
2104#else
2105#define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
2106#endif
2107
2108#ifdef CONFIG_NET_CLS_ACT
/* TODO: Maybe we should just force sch_ingress to be compiled in
 * when CONFIG_NET_CLS_ACT is? Otherwise there are some useless
 * instructions (a compare and 2 stores extra right now) if we don't
 * have it on but have CONFIG_NET_CLS_ACT.
 * NOTE: This doesn't stop any functionality; if you don't have
 * the ingress scheduler, you just can't add policies on ingress.
 *
 */
2117static int ing_filter(struct sk_buff *skb)
2118{
2119	struct net_device *dev = skb->dev;
2120	u32 ttl = G_TC_RTTL(skb->tc_verd);
2121	struct netdev_queue *rxq;
2122	int result = TC_ACT_OK;
2123	struct Qdisc *q;
2124
2125	if (MAX_RED_LOOP < ttl++) {
2126		printk(KERN_WARNING
2127		       "Redir loop detected Dropping packet (%d->%d)\n",
2128		       skb->iif, dev->ifindex);
2129		return TC_ACT_SHOT;
2130	}
2131
2132	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2133	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2134
2135	rxq = &dev->rx_queue;
2136
2137	q = rxq->qdisc;
2138	if (q != &noop_qdisc) {
2139		spin_lock(qdisc_lock(q));
2140		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2141			result = qdisc_enqueue_root(skb, q);
2142		spin_unlock(qdisc_lock(q));
2143	}
2144
2145	return result;
2146}
2147
2148static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2149					 struct packet_type **pt_prev,
2150					 int *ret, struct net_device *orig_dev)
2151{
2152	if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2153		goto out;
2154
2155	if (*pt_prev) {
2156		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2157		*pt_prev = NULL;
2158	} else {
2159		/* Huh? Why does turning on AF_PACKET affect this? */
2160		skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2161	}
2162
2163	switch (ing_filter(skb)) {
2164	case TC_ACT_SHOT:
2165	case TC_ACT_STOLEN:
2166		kfree_skb(skb);
2167		return NULL;
2168	}
2169
2170out:
2171	skb->tc_verd = 0;
2172	return skb;
2173}
2174#endif
2175
2176/*
2177 * 	netif_nit_deliver - deliver received packets to network taps
2178 * 	@skb: buffer
2179 *
2180 * 	This function is used to deliver incoming packets to network
2181 * 	taps. It should be used when the normal netif_receive_skb path
2182 * 	is bypassed, for example because of VLAN acceleration.
2183 */
2184void netif_nit_deliver(struct sk_buff *skb)
2185{
2186	struct packet_type *ptype;
2187
2188	if (list_empty(&ptype_all))
2189		return;
2190
2191	skb_reset_network_header(skb);
2192	skb_reset_transport_header(skb);
2193	skb->mac_len = skb->network_header - skb->mac_header;
2194
2195	rcu_read_lock();
2196	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2197		if (!ptype->dev || ptype->dev == skb->dev)
2198			deliver_skb(skb, ptype, skb->dev);
2199	}
2200	rcu_read_unlock();
2201}
2202
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	netif_receive_skb() is the main receive data processing function.
 *	It always succeeds. The buffer may be dropped during processing
 *	for congestion control or by the protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Processing order: taps on ptype_all, ingress classification
 *	(CONFIG_NET_CLS_ACT only), the bridge hook, the macvlan hook and
 *	finally the protocol handlers hashed in ptype_base.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	struct packet_type *ptype, *pt_prev;
	struct net_device *orig_dev;
	struct net_device *null_or_orig;
	int ret = NET_RX_DROP;
	__be16 type;

	/* tagged skb for which vlan_hwaccel_do_receive() returns non-zero:
	 * nothing more to do here
	 */
	if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
		return NET_RX_SUCCESS;

	/* if we've gotten here through NAPI, check netpoll */
	if (netpoll_receive_skb(skb))
		return NET_RX_DROP;

	/* timestamp the skb unless it already carries one */
	if (!skb->tstamp.tv64)
		net_timestamp(skb);

	/* record the receiving interface unless already set */
	if (!skb->iif)
		skb->iif = skb->dev->ifindex;

	/*
	 * Device with a master: either restrict delivery to handlers bound
	 * to exactly this device, or account the skb to the master.
	 */
	null_or_orig = NULL;
	orig_dev = skb->dev;
	if (orig_dev->master) {
		if (skb_bond_should_drop(skb))
			null_or_orig = orig_dev; /* deliver only exact match */
		else
			skb->dev = orig_dev->master;
	}

	__get_cpu_var(netdev_rx_stat).total++;

	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;

	/*
	 * pt_prev always trails one matching handler behind: earlier matches
	 * go through deliver_skb(), the final one is invoked directly at the
	 * end of the function.
	 */
	pt_prev = NULL;

	rcu_read_lock();

	/* Don't receive packets in an exiting network namespace */
	if (!net_alive(dev_net(skb->dev))) {
		kfree_skb(skb);
		goto out;
	}

#ifdef CONFIG_NET_CLS_ACT
	/* TC_NCLS set: clear it and skip both the taps and ingress classification */
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	/* taps: every ptype_all handler matching the device rules */
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
		    ptype->dev == orig_dev) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

#ifdef CONFIG_NET_CLS_ACT
	/* a NULL result means handle_ing() already freed the skb */
	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
ncls:
#endif

	/* each hook returns NULL when the skb must not be processed further */
	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;

	/* protocol handlers: hash bucket selected by the skb's protocol */
	type = skb->protocol;
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type &&
		    (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
		     ptype->dev == orig_dev)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	if (pt_prev) {
		/* last matching handler is called directly with the skb */
		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

out:
	rcu_read_unlock();
	return ret;
}
2320
2321/* Network device is going away, flush any packets still pending  */
2322static void flush_backlog(void *arg)
2323{
2324	struct net_device *dev = arg;
2325	struct softnet_data *queue = &__get_cpu_var(softnet_data);
2326	struct sk_buff *skb, *tmp;
2327
2328	skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
2329		if (skb->dev == dev) {
2330			__skb_unlink(skb, &queue->input_pkt_queue);
2331			kfree_skb(skb);
2332		}
2333}
2334
/*
 * Finish GRO processing of one held skb and pass it up the stack.
 *
 * An skb whose GRO count is 1 skips the protocol callback.  Otherwise the
 * first ptype_base handler for the skb's protocol that is not bound to a
 * device and provides ->gro_complete is called.  If that fails, or no such
 * handler exists (err stays -ENOENT), the skb is freed and NET_RX_SUCCESS
 * is returned.  On success gso_size is cleared and the skb goes to
 * netif_receive_skb(), whose return value is returned.
 */
static int napi_gro_complete(struct sk_buff *skb)
{
	struct packet_type *ptype;
	__be16 type = skb->protocol;
	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
	int err = -ENOENT;

	if (NAPI_GRO_CB(skb)->count == 1)
		goto out;

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, head, list) {
		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
			continue;

		err = ptype->gro_complete(skb);
		break;
	}
	rcu_read_unlock();

	if (err) {
		/* cursor back at the list head: the loop found no handler */
		WARN_ON(&ptype->list == head);
		kfree_skb(skb);
		return NET_RX_SUCCESS;
	}

out:
	skb_shinfo(skb)->gso_size = 0;
	/* pushes by the negated network offset - presumably rewinds
	 * skb->data to where it was before GRO pulled headers; confirm
	 */
	__skb_push(skb, -skb_network_offset(skb));
	return netif_receive_skb(skb);
}
2366
2367void napi_gro_flush(struct napi_struct *napi)
2368{
2369	struct sk_buff *skb, *next;
2370
2371	for (skb = napi->gro_list; skb; skb = next) {
2372		next = skb->next;
2373		skb->next = NULL;
2374		napi_gro_complete(skb);
2375	}
2376
2377	napi->gro_list = NULL;
2378}
2379EXPORT_SYMBOL(napi_gro_flush);
2380
/*
 * Try to merge @skb into, or add it to, @napi's list of held GRO packets.
 *
 * Returns -1 when the skb must take the normal receive path (GRO disabled
 * on the device, skb already GSO or carrying a frag_list, no protocol
 * handler with ->gro_receive, flush requested, or the list is full).
 * Otherwise returns NAPI_GRO_CB(skb)->free as left by the protocol's
 * ->gro_receive callback; napi_gro_receive() frees the skb when that is 1.
 */
int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct packet_type *ptype;
	__be16 type = skb->protocol;
	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
	int count = 0;
	int same_flow;
	int mac_len;
	int free;

	if (!(skb->dev->features & NETIF_F_GRO))
		goto normal;

	if (skb_is_gso(skb) || skb_shinfo(skb)->frag_list)
		goto normal;

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, head, list) {
		struct sk_buff *p;

		/* only device-independent handlers that implement GRO */
		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
			continue;

		skb_reset_network_header(skb);
		mac_len = skb->network_header - skb->mac_header;
		skb->mac_len = mac_len;
		NAPI_GRO_CB(skb)->same_flow = 0;
		NAPI_GRO_CB(skb)->flush = 0;
		NAPI_GRO_CB(skb)->free = 0;

		/*
		 * Count the held packets and rule out, as same-flow
		 * candidates, those whose MAC header differs from the
		 * new skb's.
		 */
		for (p = napi->gro_list; p; p = p->next) {
			count++;

			if (!NAPI_GRO_CB(p)->same_flow)
				continue;

			if (p->mac_len != mac_len ||
			    memcmp(skb_mac_header(p), skb_mac_header(skb),
				   mac_len))
				NAPI_GRO_CB(p)->same_flow = 0;
		}

		pp = ptype->gro_receive(&napi->gro_list, skb);
		break;
	}
	rcu_read_unlock();

	/* cursor back at the list head: no handler was found */
	if (&ptype->list == head)
		goto normal;

	same_flow = NAPI_GRO_CB(skb)->same_flow;
	free = NAPI_GRO_CB(skb)->free;

	/* the callback pointed at a held skb: unlink and complete it now */
	if (pp) {
		struct sk_buff *nskb = *pp;

		*pp = nskb->next;
		nskb->next = NULL;
		napi_gro_complete(nskb);
		count--;
	}

	if (same_flow)
		goto ok;

	if (NAPI_GRO_CB(skb)->flush || count >= MAX_GRO_SKBS) {
		__skb_push(skb, -skb_network_offset(skb));
		goto normal;
	}

	/* start holding this skb at the front of the GRO list */
	NAPI_GRO_CB(skb)->count = 1;
	skb_shinfo(skb)->gso_size = skb->len;
	skb->next = napi->gro_list;
	napi->gro_list = skb;

ok:
	return free;

normal:
	return -1;
}
EXPORT_SYMBOL(dev_gro_receive);
2464
2465static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2466{
2467	struct sk_buff *p;
2468
2469	for (p = napi->gro_list; p; p = p->next) {
2470		NAPI_GRO_CB(p)->same_flow = 1;
2471		NAPI_GRO_CB(p)->flush = 0;
2472	}
2473
2474	return dev_gro_receive(napi, skb);
2475}
2476
2477int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2478{
2479	switch (__napi_gro_receive(napi, skb)) {
2480	case -1:
2481		return netif_receive_skb(skb);
2482
2483	case 1:
2484		kfree_skb(skb);
2485		break;
2486	}
2487
2488	return NET_RX_SUCCESS;
2489}
2490EXPORT_SYMBOL(napi_gro_receive);
2491
2492void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
2493{
2494	__skb_pull(skb, skb_headlen(skb));
2495	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
2496
2497	napi->skb = skb;
2498}
2499EXPORT_SYMBOL(napi_reuse_skb);
2500
2501struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
2502				  struct napi_gro_fraginfo *info)
2503{
2504	struct net_device *dev = napi->dev;
2505	struct sk_buff *skb = napi->skb;
2506
2507	napi->skb = NULL;
2508
2509	if (!skb) {
2510		skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN);
2511		if (!skb)
2512			goto out;
2513
2514		skb_reserve(skb, NET_IP_ALIGN);
2515	}
2516
2517	BUG_ON(info->nr_frags > MAX_SKB_FRAGS);
2518	skb_shinfo(skb)->nr_frags = info->nr_frags;
2519	memcpy(skb_shinfo(skb)->frags, info->frags, sizeof(info->frags));
2520
2521	skb->data_len = info->len;
2522	skb->len += info->len;
2523	skb->truesize += info->len;
2524
2525	if (!pskb_may_pull(skb, ETH_HLEN)) {
2526		napi_reuse_skb(napi, skb);
2527		goto out;
2528	}
2529
2530	skb->protocol = eth_type_trans(skb, dev);
2531
2532	skb->ip_summed = info->ip_summed;
2533	skb->csum = info->csum;
2534
2535out:
2536	return skb;
2537}
2538EXPORT_SYMBOL(napi_fraginfo_skb);
2539
2540int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info)
2541{
2542	struct sk_buff *skb = napi_fraginfo_skb(napi, info);
2543	int err = NET_RX_DROP;
2544
2545	if (!skb)
2546		goto out;
2547
2548	err = NET_RX_SUCCESS;
2549
2550	switch (__napi_gro_receive(napi, skb)) {
2551	case -1:
2552		return netif_receive_skb(skb);
2553
2554	case 0:
2555		goto out;
2556	}
2557
2558	napi_reuse_skb(napi, skb);
2559
2560out:
2561	return err;
2562}
2563EXPORT_SYMBOL(napi_gro_frags);
2564
2565static int process_backlog(struct napi_struct *napi, int quota)
2566{
2567	int work = 0;
2568	struct softnet_data *queue = &__get_cpu_var(softnet_data);
2569	unsigned long start_time = jiffies;
2570
2571	napi->weight = weight_p;
2572	do {
2573		struct sk_buff *skb;
2574
2575		local_irq_disable();
2576		skb = __skb_dequeue(&queue->input_pkt_queue);
2577		if (!skb) {
2578			__napi_complete(napi);
2579			local_irq_enable();
2580			break;
2581		}
2582		local_irq_enable();
2583
2584		napi_gro_receive(napi, skb);
2585	} while (++work < quota && jiffies == start_time);
2586
2587	napi_gro_flush(napi);
2588
2589	return work;
2590}
2591
2592/**
2593 * __napi_schedule - schedule for receive
2594 * @n: entry to schedule
2595 *
2596 * The entry's receive function will be scheduled to run
2597 */
2598void __napi_schedule(struct napi_struct *n)
2599{
2600	unsigned long flags;
2601
2602	local_irq_save(flags);
2603	list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2604	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2605	local_irq_restore(flags);
2606}
2607EXPORT_SYMBOL(__napi_schedule);
2608
/*
 * Take @n off the poll list and clear NAPI_STATE_SCHED.
 *
 * The instance must currently be scheduled and must not hold any GRO
 * packets; both are enforced with BUG_ON().  Callers in this file invoke
 * it with local interrupts disabled.
 */
void __napi_complete(struct napi_struct *n)
{
	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
	BUG_ON(n->gro_list);

	list_del(&n->poll_list);
	/* order the list removal before the SCHED bit is seen cleared */
	smp_mb__before_clear_bit();
	clear_bit(NAPI_STATE_SCHED, &n->state);
}
EXPORT_SYMBOL(__napi_complete);
2619
2620void napi_complete(struct napi_struct *n)
2621{
2622	unsigned long flags;
2623
2624	/*
2625	 * don't let napi dequeue from the cpu poll list
2626	 * just in case its running on a different cpu
2627	 */
2628	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
2629		return;
2630
2631	napi_gro_flush(n);
2632	local_irq_save(flags);
2633	__napi_complete(n);
2634	local_irq_restore(flags);
2635}
2636EXPORT_SYMBOL(napi_complete);
2637
2638void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
2639		    int (*poll)(struct napi_struct *, int), int weight)
2640{
2641	INIT_LIST_HEAD(&napi->poll_list);
2642	napi->gro_list = NULL;
2643	napi->skb = NULL;
2644	napi->poll = poll;
2645	napi->weight = weight;
2646	list_add(&napi->dev_list, &dev->napi_list);
2647	napi->dev = dev;
2648#ifdef CONFIG_NETPOLL
2649	spin_lock_init(&napi->poll_lock);
2650	napi->poll_owner = -1;
2651#endif
2652	set_bit(NAPI_STATE_SCHED, &napi->state);
2653}
2654EXPORT_SYMBOL(netif_napi_add);
2655
2656void netif_napi_del(struct napi_struct *napi)
2657{
2658	struct sk_buff *skb, *next;
2659
2660	list_del_init(&napi->dev_list);
2661	kfree(napi->skb);
2662
2663	for (skb = napi->gro_list; skb; skb = next) {
2664		next = skb->next;
2665		skb->next = NULL;
2666		kfree_skb(skb);
2667	}
2668
2669	napi->gro_list = NULL;
2670}
2671EXPORT_SYMBOL(netif_napi_del);
2672
2673
/*
 * NET_RX_SOFTIRQ handler: poll the NAPI instances queued on this CPU's
 * poll list until the list is empty, the packet budget (netdev_budget)
 * is used up, or more than two jiffies have passed.  If it stops early,
 * the squeeze counter is bumped and the softirq is raised again.
 */
static void net_rx_action(struct softirq_action *h)
{
	struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
	unsigned long time_limit = jiffies + 2;
	int budget = netdev_budget;
	void *have;

	local_irq_disable();

	while (!list_empty(list)) {
		struct napi_struct *n;
		int work, weight;

		/* If softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies, which will allow
		 * an average latency of 1.5/HZ.
		 */
		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
			goto softnet_break;

		local_irq_enable();

		/* Even though interrupts have been re-enabled, this
		 * access is safe because interrupts can only add new
		 * entries to the tail of this list, and only ->poll()
		 * calls can remove this head entry from the list.
		 */
		n = list_entry(list->next, struct napi_struct, poll_list);

		have = netpoll_poll_lock(n);

		weight = n->weight;

		/* This NAPI_STATE_SCHED test is for avoiding a race
		 * with netpoll's poll_napi().  Only the entity which
		 * obtains the lock and sees NAPI_STATE_SCHED set will
		 * actually make the ->poll() call.  Therefore we avoid
		 * accidentally calling ->poll() when NAPI is not scheduled.
		 */
		work = 0;
		if (test_bit(NAPI_STATE_SCHED, &n->state))
			work = n->poll(n, weight);

		/* a poll handler must never report more work than its weight */
		WARN_ON_ONCE(work > weight);

		budget -= work;

		local_irq_disable();

		/* Drivers must not modify the NAPI state if they
		 * consume the entire weight.  In such cases this code
		 * still "owns" the NAPI instance and therefore can
		 * move the instance around on the list at-will.
		 */
		if (unlikely(work == weight)) {
			if (unlikely(napi_disable_pending(n)))
				__napi_complete(n);
			else
				list_move_tail(&n->poll_list, list);
		}

		netpoll_poll_unlock(have);
	}
out:
	local_irq_enable();

#ifdef CONFIG_NET_DMA
	/*
	 * There may not be any more sk_buffs coming right now, so push
	 * any pending DMA copies to hardware
	 */
	dma_issue_pending_all();
#endif

	return;

softnet_break:
	/* ran out of budget or time: count it and ask to be run again */
	__get_cpu_var(netdev_rx_stat).time_squeeze++;
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	goto out;
}
2755
2756static gifconf_func_t * gifconf_list [NPROTO];
2757
2758/**
2759 *	register_gifconf	-	register a SIOCGIF handler
2760 *	@family: Address family
2761 *	@gifconf: Function handler
2762 *
2763 *	Register protocol dependent address dumping routines. The handler
2764 *	that is passed must not be freed or reused until it has been replaced
2765 *	by another handler.
2766 */
2767int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
2768{
2769	if (family >= NPROTO)
2770		return -EINVAL;
2771	gifconf_list[family] = gifconf;
2772	return 0;
2773}
2774
2775
2776/*
2777 *	Map an interface index to its name (SIOCGIFNAME)
2778 */
2779
2780/*
2781 *	We need this ioctl for efficient implementation of the
2782 *	if_indextoname() function required by the IPv6 API.  Without
2783 *	it, we would have to search all the interfaces to find a
2784 *	match.  --pb
2785 */
2786
2787static int dev_ifname(struct net *net, struct ifreq __user *arg)
2788{
2789	struct net_device *dev;
2790	struct ifreq ifr;
2791
2792	/*
2793	 *	Fetch the caller's info block.
2794	 */
2795
2796	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2797		return -EFAULT;
2798
2799	read_lock(&dev_base_lock);
2800	dev = __dev_get_by_index(net, ifr.ifr_ifindex);
2801	if (!dev) {
2802		read_unlock(&dev_base_lock);
2803		return -ENODEV;
2804	}
2805
2806	strcpy(ifr.ifr_name, dev->name);
2807	read_unlock(&dev_base_lock);
2808
2809	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2810		return -EFAULT;
2811	return 0;
2812}
2813
2814/*
2815 *	Perform a SIOCGIFCONF call. This structure will change
2816 *	size eventually, and there is nothing I can do about it.
2817 *	Thus we will need a 'compatibility mode'.
2818 */
2819
2820static int dev_ifconf(struct net *net, char __user *arg)
2821{
2822	struct ifconf ifc;
2823	struct net_device *dev;
2824	char __user *pos;
2825	int len;
2826	int total;
2827	int i;
2828
2829	/*
2830	 *	Fetch the caller's info block.
2831	 */
2832
2833	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2834		return -EFAULT;
2835
2836	pos = ifc.ifc_buf;
2837	len = ifc.ifc_len;
2838
2839	/*
2840	 *	Loop over the interfaces, and write an info block for each.
2841	 */
2842
2843	total = 0;
2844	for_each_netdev(net, dev) {
2845		for (i = 0; i < NPROTO; i++) {
2846			if (gifconf_list[i]) {
2847				int done;
2848				if (!pos)
2849					done = gifconf_list[i](dev, NULL, 0);
2850				else
2851					done = gifconf_list[i](dev, pos + total,
2852							       len - total);
2853				if (done < 0)
2854					return -EFAULT;
2855				total += done;
2856			}
2857		}
2858	}
2859
2860	/*
2861	 *	All done.  Write the updated control block back to the caller.
2862	 */
2863	ifc.ifc_len = total;
2864
2865	/*
2866	 * 	Both BSD and Solaris return 0 here, so we do too.
2867	 */
2868	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
2869}
2870
2871#ifdef CONFIG_PROC_FS
2872/*
2873 *	This is invoked by the /proc filesystem handler to display a device
2874 *	in detail.
2875 */
2876void *dev_seq_start(struct seq_file *seq, loff_t *pos)
2877	__acquires(dev_base_lock)
2878{
2879	struct net *net = seq_file_net(seq);
2880	loff_t off;
2881	struct net_device *dev;
2882
2883	read_lock(&dev_base_lock);
2884	if (!*pos)
2885		return SEQ_START_TOKEN;
2886
2887	off = 1;
2888	for_each_netdev(net, dev)
2889		if (off++ == *pos)
2890			return dev;
2891
2892	return NULL;
2893}
2894
2895void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2896{
2897	struct net *net = seq_file_net(seq);
2898	++*pos;
2899	return v == SEQ_START_TOKEN ?
2900		first_net_device(net) : next_net_device((struct net_device *)v);
2901}
2902
/* seq_file stop callback: drop the dev_base_lock taken in dev_seq_start(). */
void dev_seq_stop(struct seq_file *seq, void *v)
	__releases(dev_base_lock)
{
	read_unlock(&dev_base_lock);
}
2908
/*
 * Print one line of statistics for @dev.  Several of the printed columns
 * are sums of related counters: "drop" adds rx_missed_errors to
 * rx_dropped, "frame" combines the rx length, overrun, CRC and frame
 * errors, and "carrier" combines the tx carrier, aborted, window and
 * heartbeat errors.  Column order matches the header in dev_seq_show().
 */
static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
{
	const struct net_device_stats *stats = dev_get_stats(dev);

	seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
		   "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
		   dev->name, stats->rx_bytes, stats->rx_packets,
		   stats->rx_errors,
		   stats->rx_dropped + stats->rx_missed_errors,
		   stats->rx_fifo_errors,
		   stats->rx_length_errors + stats->rx_over_errors +
		    stats->rx_crc_errors + stats->rx_frame_errors,
		   stats->rx_compressed, stats->multicast,
		   stats->tx_bytes, stats->tx_packets,
		   stats->tx_errors, stats->tx_dropped,
		   stats->tx_fifo_errors, stats->collisions,
		   stats->tx_carrier_errors +
		    stats->tx_aborted_errors +
		    stats->tx_window_errors +
		    stats->tx_heartbeat_errors,
		   stats->tx_compressed);
}
2931
2932/*
2933 *	Called from the PROCfs module. This now uses the new arbitrary sized
2934 *	/proc/net interface to create /proc/net/dev
2935 */
2936static int dev_seq_show(struct seq_file *seq, void *v)
2937{
2938	if (v == SEQ_START_TOKEN)
2939		seq_puts(seq, "Inter-|   Receive                            "
2940			      "                    |  Transmit\n"
2941			      " face |bytes    packets errs drop fifo frame "
2942			      "compressed multicast|bytes    packets errs "
2943			      "drop fifo colls carrier compressed\n");
2944	else
2945		dev_seq_printf_stats(seq, v);
2946	return 0;
2947}
2948
2949static struct netif_rx_stats *softnet_get_online(loff_t *pos)
2950{
2951	struct netif_rx_stats *rc = NULL;
2952
2953	while (*pos < nr_cpu_ids)
2954		if (cpu_online(*pos)) {
2955			rc = &per_cpu(netdev_rx_stat, *pos);
2956			break;
2957		} else
2958			++*pos;
2959	return rc;
2960}
2961
/* seq_file start callback: stats of the first online CPU at or after *pos. */
static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
{
	return softnet_get_online(pos);
}
2966
/* seq_file next callback: step past the current CPU to the next online one. */
static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return softnet_get_online(pos);
}
2972
/* seq_file stop callback: nothing to release, start takes no lock. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
2976
2977static int softnet_seq_show(struct seq_file *seq, void *v)
2978{
2979	struct netif_rx_stats *s = v;
2980
2981	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
2982		   s->total, s->dropped, s->time_squeeze, 0,
2983		   0, 0, 0, 0, /* was fastroute */
2984		   s->cpu_collision );
2985	return 0;
2986}
2987
/* seq_file iterator for the device listing ("dev" proc entry). */
static const struct seq_operations dev_seq_ops = {
	.start = dev_seq_start,
	.next  = dev_seq_next,
	.stop  = dev_seq_stop,
	.show  = dev_seq_show,
};
2994
/* Open handler for the "dev" proc entry: namespace-aware seq_file open. */
static int dev_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &dev_seq_ops,
			    sizeof(struct seq_net_private));
}
3000
/* File operations of the "dev" proc entry; release pairs with seq_open_net(). */
static const struct file_operations dev_seq_fops = {
	.owner	 = THIS_MODULE,
	.open    = dev_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net,
};
3008
/* seq_file iterator over the per-CPU receive statistics ("softnet_stat"). */
static const struct seq_operations softnet_seq_ops = {
	.start = softnet_seq_start,
	.next  = softnet_seq_next,
	.stop  = softnet_seq_stop,
	.show  = softnet_seq_show,
};
3015
/* Open handler for "softnet_stat": plain seq_open(), no namespace state. */
static int softnet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &softnet_seq_ops);
}
3020
/* File operations of the "softnet_stat" proc entry. */
static const struct file_operations softnet_seq_fops = {
	.owner	 = THIS_MODULE,
	.open    = softnet_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};
3028
3029static void *ptype_get_idx(loff_t pos)
3030{
3031	struct packet_type *pt = NULL;
3032	loff_t i = 0;
3033	int t;
3034
3035	list_for_each_entry_rcu(pt, &ptype_all, list) {
3036		if (i == pos)
3037			return pt;
3038		++i;
3039	}
3040
3041	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3042		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3043			if (i == pos)
3044				return pt;
3045			++i;
3046		}
3047	}
3048	return NULL;
3049}
3050
3051static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3052	__acquires(RCU)
3053{
3054	rcu_read_lock();
3055	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3056}
3057
/*
 * seq_file next callback for the packet type listing.
 *
 * Walks from the current entry to its list successor.  Entries of type
 * ETH_P_ALL live on ptype_all; when that list is exhausted the walk
 * continues with bucket 0 of ptype_base.  Empty or exhausted buckets are
 * skipped; NULL is returned after the last bucket.
 */
static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct packet_type *pt;
	struct list_head *nxt;
	int hash;

	++*pos;
	if (v == SEQ_START_TOKEN)
		return ptype_get_idx(0);

	pt = v;
	nxt = pt->list.next;
	if (pt->type == htons(ETH_P_ALL)) {
		/* still inside ptype_all? */
		if (nxt != &ptype_all)
			goto found;
		hash = 0;
		nxt = ptype_base[0].next;
	} else
		hash = ntohs(pt->type) & PTYPE_HASH_MASK;

	/* nxt pointing at a bucket head means that bucket is used up */
	while (nxt == &ptype_base[hash]) {
		if (++hash >= PTYPE_HASH_SIZE)
			return NULL;
		nxt = ptype_base[hash].next;
	}
found:
	return list_entry(nxt, struct packet_type, list);
}
3086
/* seq_file stop callback: leave the RCU section entered in ptype_seq_start(). */
static void ptype_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
3092
3093static int ptype_seq_show(struct seq_file *seq, void *v)
3094{
3095	struct packet_type *pt = v;
3096
3097	if (v == SEQ_START_TOKEN)
3098		seq_puts(seq, "Type Device      Function\n");
3099	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3100		if (pt->type == htons(ETH_P_ALL))
3101			seq_puts(seq, "ALL ");
3102		else
3103			seq_printf(seq, "%04x", ntohs(pt->type));
3104
3105		seq_printf(seq, " %-8s %pF\n",
3106			   pt->dev ? pt->dev->name : "", pt->func);
3107	}
3108
3109	return 0;
3110}
3111
/* seq_file iterator over the registered packet types ("ptype" proc entry). */
static const struct seq_operations ptype_seq_ops = {
	.start = ptype_seq_start,
	.next  = ptype_seq_next,
	.stop  = ptype_seq_stop,
	.show  = ptype_seq_show,
};
3118
/* Open handler for the "ptype" proc entry: namespace-aware seq_file open. */
static int ptype_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &ptype_seq_ops,
			sizeof(struct seq_net_private));
}
3124
/* File operations of the "ptype" proc entry; release pairs with seq_open_net(). */
static const struct file_operations ptype_seq_fops = {
	.owner	 = THIS_MODULE,
	.open    = ptype_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net,
};
3132
3133
3134static int __net_init dev_proc_net_init(struct net *net)
3135{
3136	int rc = -ENOMEM;
3137
3138	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3139		goto out;
3140	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3141		goto out_dev;
3142	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3143		goto out_softnet;
3144
3145	if (wext_proc_init(net))
3146		goto out_ptype;
3147	rc = 0;
3148out:
3149	return rc;
3150out_ptype:
3151	proc_net_remove(net, "ptype");
3152out_softnet:
3153	proc_net_remove(net, "softnet_stat");
3154out_dev:
3155	proc_net_remove(net, "dev");
3156	goto out;
3157}
3158
/*
 * Per-namespace teardown: undo dev_proc_net_init() in reverse order
 * (wireless extensions first, then the three proc entries).
 */
static void __net_exit dev_proc_net_exit(struct net *net)
{
	wext_proc_exit(net);

	proc_net_remove(net, "ptype");
	proc_net_remove(net, "softnet_stat");
	proc_net_remove(net, "dev");
}
3167
/* Per network namespace init/exit hooks for the proc entries above. */
static struct pernet_operations __net_initdata dev_proc_ops = {
	.init = dev_proc_net_init,
	.exit = dev_proc_net_exit,
};
3172
/* Register the proc setup/teardown hooks; returns register_pernet_subsys()'s result. */
static int __init dev_proc_init(void)
{
	return register_pernet_subsys(&dev_proc_ops);
}
3177#else
3178#define dev_proc_init() 0
3179#endif	/* CONFIG_PROC_FS */
3180
3181
3182/**
3183 *	netdev_set_master	-	set up master/slave pair
3184 *	@slave: slave device
3185 *	@master: new master device
3186 *
3187 *	Changes the master device of the slave. Pass %NULL to break the
3188 *	bonding. The caller must hold the RTNL semaphore. On a failure
3189 *	a negative errno code is returned. On success the reference counts
3190 *	are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3191 *	function returns zero.
3192 */
3193int netdev_set_master(struct net_device *slave, struct net_device *master)
3194{
3195	struct net_device *old = slave->master;
3196
3197	ASSERT_RTNL();
3198
3199	if (master) {
3200		if (old)
3201			return -EBUSY;
3202		dev_hold(master);
3203	}
3204
3205	slave->master = master;
3206
3207	synchronize_net();
3208
3209	if (old)
3210		dev_put(old);
3211
3212	if (master)
3213		slave->flags |= IFF_SLAVE;
3214	else
3215		slave->flags &= ~IFF_SLAVE;
3216
3217	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
3218	return 0;
3219}
3220
3221static void dev_change_rx_flags(struct net_device *dev, int flags)
3222{
3223	const struct net_device_ops *ops = dev->netdev_ops;
3224
3225	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
3226		ops->ndo_change_rx_flags(dev, flags);
3227}
3228
/*
 * Adjust the promiscuity count of @dev by @inc and keep IFF_PROMISC in
 * step with it.  Must be called under RTNL.
 *
 * IFF_PROMISC is set up front and cleared again only when a negative
 * @inc brings the count to zero.  A positive @inc that lands on zero is
 * treated as a counter wrap: the count is restored and -EOVERFLOW is
 * returned.  When the flag actually changes, the transition is logged
 * (and audited if auditing is enabled) and the driver is notified.
 * Returns 0 on success.
 */
static int __dev_set_promiscuity(struct net_device *dev, int inc)
{
	unsigned short old_flags = dev->flags;
	uid_t uid;
	gid_t gid;

	ASSERT_RTNL();

	dev->flags |= IFF_PROMISC;
	dev->promiscuity += inc;
	if (dev->promiscuity == 0) {
		/*
		 * Avoid overflow.
		 * If inc causes overflow, untouch promisc and return error.
		 */
		if (inc < 0)
			dev->flags &= ~IFF_PROMISC;
		else {
			dev->promiscuity -= inc;
			printk(KERN_WARNING "%s: promiscuity touches roof, "
				"set promiscuity failed, promiscuity feature "
				"of device might be broken.\n", dev->name);
			return -EOVERFLOW;
		}
	}
	/* only act on a real off<->on transition of the flag */
	if (dev->flags != old_flags) {
		printk(KERN_INFO "device %s %s promiscuous mode\n",
		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
							       "left");
		if (audit_enabled) {
			current_uid_gid(&uid, &gid);
			audit_log(current->audit_context, GFP_ATOMIC,
				AUDIT_ANOM_PROMISCUOUS,
				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
				dev->name, (dev->flags & IFF_PROMISC),
				(old_flags & IFF_PROMISC),
				audit_get_loginuid(current),
				uid, gid,
				audit_get_sessionid(current));
		}

		dev_change_rx_flags(dev, IFF_PROMISC);
	}
	return 0;
}
3274
3275/**
3276 *	dev_set_promiscuity	- update promiscuity count on a device
3277 *	@dev: device
3278 *	@inc: modifier
3279 *
3280 *	Add or remove promiscuity from a device. While the count in the device
3281 *	remains above zero the interface remains promiscuous. Once it hits zero
3282 *	the device reverts back to normal filtering operation. A negative inc
3283 *	value is used to drop promiscuity on the device.
3284 *	Return 0 if successful or a negative errno code on error.
3285 */
3286int dev_set_promiscuity(struct net_device *dev, int inc)
3287{
3288	unsigned short old_flags = dev->flags;
3289	int err;
3290
3291	err = __dev_set_promiscuity(dev, inc);
3292	if (err < 0)
3293		return err;
3294	if (dev->flags != old_flags)
3295		dev_set_rx_mode(dev);
3296	return err;
3297}
3298
/**
 *	dev_set_allmulti	- update allmulti count on a device
 *	@dev: device
 *	@inc: modifier
 *
 *	Add or remove reception of all multicast frames to a device. While the
 *	count in the device remains above zero the interface keeps receiving
 *	all multicast frames. Once it hits zero the device reverts back to
 *	normal filtering operation. A negative @inc value is used to drop the
 *	counter when releasing a resource needing all multicasts.
 *	The caller must hold the RTNL semaphore.
 *	Return 0 if successful or a negative errno code on error.
 */

int dev_set_allmulti(struct net_device *dev, int inc)
{
	unsigned short old_flags = dev->flags;

	ASSERT_RTNL();

	dev->flags |= IFF_ALLMULTI;
	dev->allmulti += inc;
	if (dev->allmulti == 0) {
		/*
		 * Avoid overflow.
		 * If inc causes overflow, untouch allmulti and return error.
		 */
		if (inc < 0)
			dev->flags &= ~IFF_ALLMULTI;
		else {
			dev->allmulti -= inc;
			printk(KERN_WARNING "%s: allmulti touches roof, "
				"set allmulti failed, allmulti feature of "
				"device might be broken.\n", dev->name);
			return -EOVERFLOW;
		}
	}
	/* flag really changed: notify the driver and reprogram the filter */
	if (dev->flags ^ old_flags) {
		dev_change_rx_flags(dev, IFF_ALLMULTI);
		dev_set_rx_mode(dev);
	}
	return 0;
}
3341
3342/*
3343 *	Upload unicast and multicast address lists to device and
3344 *	configure RX filtering. When the device doesn't support unicast
3345 *	filtering it is put in promiscuous mode while unicast addresses
3346 *	are present.
3347 */
3348void __dev_set_rx_mode(struct net_device *dev)
3349{
3350	const struct net_device_ops *ops = dev->netdev_ops;
3351
3352	/* dev_open will call this function so the list will stay sane. */
3353	if (!(dev->flags&IFF_UP))
3354		return;
3355
3356	if (!netif_device_present(dev))
3357		return;
3358
3359	if (ops->ndo_set_rx_mode)
3360		ops->ndo_set_rx_mode(dev);
3361	else {
3362		/* Unicast addresses changes may only happen under the rtnl,
3363		 * therefore calling __dev_set_promiscuity here is safe.
3364		 */
3365		if (dev->uc_count > 0 && !dev->uc_promisc) {
3366			__dev_set_promiscuity(dev, 1);
3367			dev->uc_promisc = 1;
3368		} else if (dev->uc_count == 0 && dev->uc_promisc) {
3369			__dev_set_promiscuity(dev, -1);
3370			dev->uc_promisc = 0;
3371		}
3372
3373		if (ops->ndo_set_multicast_list)
3374			ops->ndo_set_multicast_list(dev);
3375	}
3376}
3377
/*
 *	Locked wrapper: run __dev_set_rx_mode() with the device's
 *	address-list lock held (bottom halves disabled).
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
3384
3385int __dev_addr_delete(struct dev_addr_list **list, int *count,
3386		      void *addr, int alen, int glbl)
3387{
3388	struct dev_addr_list *da;
3389
3390	for (; (da = *list) != NULL; list = &da->next) {
3391		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3392		    alen == da->da_addrlen) {
3393			if (glbl) {
3394				int old_glbl = da->da_gusers;
3395				da->da_gusers = 0;
3396				if (old_glbl == 0)
3397					break;
3398			}
3399			if (--da->da_users)
3400				return 0;
3401
3402			*list = da->next;
3403			kfree(da);
3404			(*count)--;
3405			return 0;
3406		}
3407	}
3408	return -ENOENT;
3409}
3410
3411int __dev_addr_add(struct dev_addr_list **list, int *count,
3412		   void *addr, int alen, int glbl)
3413{
3414	struct dev_addr_list *da;
3415
3416	for (da = *list; da != NULL; da = da->next) {
3417		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3418		    da->da_addrlen == alen) {
3419			if (glbl) {
3420				int old_glbl = da->da_gusers;
3421				da->da_gusers = 1;
3422				if (old_glbl)
3423					return 0;
3424			}
3425			da->da_users++;
3426			return 0;
3427		}
3428	}
3429
3430	da = kzalloc(sizeof(*da), GFP_ATOMIC);
3431	if (da == NULL)
3432		return -ENOMEM;
3433	memcpy(da->da_addr, addr, alen);
3434	da->da_addrlen = alen;
3435	da->da_users = 1;
3436	da->da_gusers = glbl ? 1 : 0;
3437	da->next = *list;
3438	*list = da;
3439	(*count)++;
3440	return 0;
3441}
3442
3443/**
3444 *	dev_unicast_delete	- Release secondary unicast address.
3445 *	@dev: device
3446 *	@addr: address to delete
3447 *	@alen: length of @addr
3448 *
3449 *	Release reference to a secondary unicast address and remove it
3450 *	from the device if the reference count drops to zero.
3451 *
3452 * 	The caller must hold the rtnl_mutex.
3453 */
3454int dev_unicast_delete(struct net_device *dev, void *addr, int alen)
3455{
3456	int err;
3457
3458	ASSERT_RTNL();
3459
3460	netif_addr_lock_bh(dev);
3461	err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3462	if (!err)
3463		__dev_set_rx_mode(dev);
3464	netif_addr_unlock_bh(dev);
3465	return err;
3466}
3467EXPORT_SYMBOL(dev_unicast_delete);
3468
3469/**
3470 *	dev_unicast_add		- add a secondary unicast address
3471 *	@dev: device
3472 *	@addr: address to add
3473 *	@alen: length of @addr
3474 *
3475 *	Add a secondary unicast address to the device or increase
3476 *	the reference count if it already exists.
3477 *
3478 *	The caller must hold the rtnl_mutex.
3479 */
3480int dev_unicast_add(struct net_device *dev, void *addr, int alen)
3481{
3482	int err;
3483
3484	ASSERT_RTNL();
3485
3486	netif_addr_lock_bh(dev);
3487	err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3488	if (!err)
3489		__dev_set_rx_mode(dev);
3490	netif_addr_unlock_bh(dev);
3491	return err;
3492}
3493EXPORT_SYMBOL(dev_unicast_add);
3494
3495int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
3496		    struct dev_addr_list **from, int *from_count)
3497{
3498	struct dev_addr_list *da, *next;
3499	int err = 0;
3500
3501	da = *from;
3502	while (da != NULL) {
3503		next = da->next;
3504		if (!da->da_synced) {
3505			err = __dev_addr_add(to, to_count,
3506					     da->da_addr, da->da_addrlen, 0);
3507			if (err < 0)
3508				break;
3509			da->da_synced = 1;
3510			da->da_users++;
3511		} else if (da->da_users == 1) {
3512			__dev_addr_delete(to, to_count,
3513					  da->da_addr, da->da_addrlen, 0);
3514			__dev_addr_delete(from, from_count,
3515					  da->da_addr, da->da_addrlen, 0);
3516		}
3517		da = next;
3518	}
3519	return err;
3520}
3521
3522void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
3523		       struct dev_addr_list **from, int *from_count)
3524{
3525	struct dev_addr_list *da, *next;
3526
3527	da = *from;
3528	while (da != NULL) {
3529		next = da->next;
3530		if (da->da_synced) {
3531			__dev_addr_delete(to, to_count,
3532					  da->da_addr, da->da_addrlen, 0);
3533			da->da_synced = 0;
3534			__dev_addr_delete(from, from_count,
3535					  da->da_addr, da->da_addrlen, 0);
3536		}
3537		da = next;
3538	}
3539}
3540
3541/**
3542 *	dev_unicast_sync - Synchronize device's unicast list to another device
3543 *	@to: destination device
3544 *	@from: source device
3545 *
3546 *	Add newly added addresses to the destination device and release
3547 *	addresses that have no users left. The source device must be
3548 *	locked by netif_tx_lock_bh.
3549 *
3550 *	This function is intended to be called from the dev->set_rx_mode
3551 *	function of layered software devices.
3552 */
3553int dev_unicast_sync(struct net_device *to, struct net_device *from)
3554{
3555	int err = 0;
3556
3557	netif_addr_lock_bh(to);
3558	err = __dev_addr_sync(&to->uc_list, &to->uc_count,
3559			      &from->uc_list, &from->uc_count);
3560	if (!err)
3561		__dev_set_rx_mode(to);
3562	netif_addr_unlock_bh(to);
3563	return err;
3564}
3565EXPORT_SYMBOL(dev_unicast_sync);
3566
3567/**
3568 *	dev_unicast_unsync - Remove synchronized addresses from the destination device
3569 *	@to: destination device
3570 *	@from: source device
3571 *
3572 *	Remove all addresses that were added to the destination device by
3573 *	dev_unicast_sync(). This function is intended to be called from the
3574 *	dev->stop function of layered software devices.
3575 */
3576void dev_unicast_unsync(struct net_device *to, struct net_device *from)
3577{
3578	netif_addr_lock_bh(from);
3579	netif_addr_lock(to);
3580
3581	__dev_addr_unsync(&to->uc_list, &to->uc_count,
3582			  &from->uc_list, &from->uc_count);
3583	__dev_set_rx_mode(to);
3584
3585	netif_addr_unlock(to);
3586	netif_addr_unlock_bh(from);
3587}
3588EXPORT_SYMBOL(dev_unicast_unsync);
3589
3590static void __dev_addr_discard(struct dev_addr_list **list)
3591{
3592	struct dev_addr_list *tmp;
3593
3594	while (*list != NULL) {
3595		tmp = *list;
3596		*list = tmp->next;
3597		if (tmp->da_users > tmp->da_gusers)
3598			printk("__dev_addr_discard: address leakage! "
3599			       "da_users=%d\n", tmp->da_users);
3600		kfree(tmp);
3601	}
3602}
3603
3604static void dev_addr_discard(struct net_device *dev)
3605{
3606	netif_addr_lock_bh(dev);
3607
3608	__dev_addr_discard(&dev->uc_list);
3609	dev->uc_count = 0;
3610
3611	__dev_addr_discard(&dev->mc_list);
3612	dev->mc_count = 0;
3613
3614	netif_addr_unlock_bh(dev);
3615}
3616
3617/**
3618 *	dev_get_flags - get flags reported to userspace
3619 *	@dev: device
3620 *
3621 *	Get the combination of flag bits exported through APIs to userspace.
3622 */
3623unsigned dev_get_flags(const struct net_device *dev)
3624{
3625	unsigned flags;
3626
3627	flags = (dev->flags & ~(IFF_PROMISC |
3628				IFF_ALLMULTI |
3629				IFF_RUNNING |
3630				IFF_LOWER_UP |
3631				IFF_DORMANT)) |
3632		(dev->gflags & (IFF_PROMISC |
3633				IFF_ALLMULTI));
3634
3635	if (netif_running(dev)) {
3636		if (netif_oper_up(dev))
3637			flags |= IFF_RUNNING;
3638		if (netif_carrier_ok(dev))
3639			flags |= IFF_LOWER_UP;
3640		if (netif_dormant(dev))
3641			flags |= IFF_DORMANT;
3642	}
3643
3644	return flags;
3645}
3646
3647/**
3648 *	dev_change_flags - change device settings
3649 *	@dev: device
3650 *	@flags: device state flags
3651 *
3652 *	Change settings on device based state flags. The flags are
3653 *	in the userspace exported format.
3654 */
3655int dev_change_flags(struct net_device *dev, unsigned flags)
3656{
3657	int ret, changes;
3658	int old_flags = dev->flags;
3659
3660	ASSERT_RTNL();
3661
3662	/*
3663	 *	Set the flags on our device.
3664	 */
3665
3666	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
3667			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
3668			       IFF_AUTOMEDIA)) |
3669		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
3670				    IFF_ALLMULTI));
3671
3672	/*
3673	 *	Load in the correct multicast list now the flags have changed.
3674	 */
3675
3676	if ((old_flags ^ flags) & IFF_MULTICAST)
3677		dev_change_rx_flags(dev, IFF_MULTICAST);
3678
3679	dev_set_rx_mode(dev);
3680
3681	/*
3682	 *	Have we downed the interface. We handle IFF_UP ourselves
3683	 *	according to user attempts to set it, rather than blindly
3684	 *	setting it.
3685	 */
3686
3687	ret = 0;
3688	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
3689		ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
3690
3691		if (!ret)
3692			dev_set_rx_mode(dev);
3693	}
3694
3695	if (dev->flags & IFF_UP &&
3696	    ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
3697					  IFF_VOLATILE)))
3698		call_netdevice_notifiers(NETDEV_CHANGE, dev);
3699
3700	if ((flags ^ dev->gflags) & IFF_PROMISC) {
3701		int inc = (flags & IFF_PROMISC) ? +1 : -1;
3702		dev->gflags ^= IFF_PROMISC;
3703		dev_set_promiscuity(dev, inc);
3704	}
3705
3706	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
3707	   is important. Some (broken) drivers set IFF_PROMISC, when
3708	   IFF_ALLMULTI is requested not asking us and not reporting.
3709	 */
3710	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
3711		int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
3712		dev->gflags ^= IFF_ALLMULTI;
3713		dev_set_allmulti(dev, inc);
3714	}
3715
3716	/* Exclude state transition flags, already notified */
3717	changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
3718	if (changes)
3719		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
3720
3721	return ret;
3722}
3723
3724/**
3725 *	dev_set_mtu - Change maximum transfer unit
3726 *	@dev: device
3727 *	@new_mtu: new transfer unit
3728 *
3729 *	Change the maximum transfer size of the network device.
3730 */
3731int dev_set_mtu(struct net_device *dev, int new_mtu)
3732{
3733	const struct net_device_ops *ops = dev->netdev_ops;
3734	int err;
3735
3736	if (new_mtu == dev->mtu)
3737		return 0;
3738
3739	/*	MTU must be positive.	 */
3740	if (new_mtu < 0)
3741		return -EINVAL;
3742
3743	if (!netif_device_present(dev))
3744		return -ENODEV;
3745
3746	err = 0;
3747	if (ops->ndo_change_mtu)
3748		err = ops->ndo_change_mtu(dev, new_mtu);
3749	else
3750		dev->mtu = new_mtu;
3751
3752	if (!err && dev->flags & IFF_UP)
3753		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
3754	return err;
3755}
3756
3757/**
3758 *	dev_set_mac_address - Change Media Access Control Address
3759 *	@dev: device
3760 *	@sa: new address
3761 *
3762 *	Change the hardware (MAC) address of the device
3763 */
3764int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
3765{
3766	const struct net_device_ops *ops = dev->netdev_ops;
3767	int err;
3768
3769	if (!ops->ndo_set_mac_address)
3770		return -EOPNOTSUPP;
3771	if (sa->sa_family != dev->type)
3772		return -EINVAL;
3773	if (!netif_device_present(dev))
3774		return -ENODEV;
3775	err = ops->ndo_set_mac_address(dev, sa);
3776	if (!err)
3777		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3778	return err;
3779}
3780
3781/*
3782 *	Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
3783 */
3784static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
3785{
3786	int err;
3787	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3788
3789	if (!dev)
3790		return -ENODEV;
3791
3792	switch (cmd) {
3793		case SIOCGIFFLAGS:	/* Get interface flags */
3794			ifr->ifr_flags = dev_get_flags(dev);
3795			return 0;
3796
3797		case SIOCGIFMETRIC:	/* Get the metric on the interface
3798					   (currently unused) */
3799			ifr->ifr_metric = 0;
3800			return 0;
3801
3802		case SIOCGIFMTU:	/* Get the MTU of a device */
3803			ifr->ifr_mtu = dev->mtu;
3804			return 0;
3805
3806		case SIOCGIFHWADDR:
3807			if (!dev->addr_len)
3808				memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
3809			else
3810				memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
3811				       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3812			ifr->ifr_hwaddr.sa_family = dev->type;
3813			return 0;
3814
3815		case SIOCGIFSLAVE:
3816			err = -EINVAL;
3817			break;
3818
3819		case SIOCGIFMAP:
3820			ifr->ifr_map.mem_start = dev->mem_start;
3821			ifr->ifr_map.mem_end   = dev->mem_end;
3822			ifr->ifr_map.base_addr = dev->base_addr;
3823			ifr->ifr_map.irq       = dev->irq;
3824			ifr->ifr_map.dma       = dev->dma;
3825			ifr->ifr_map.port      = dev->if_port;
3826			return 0;
3827
3828		case SIOCGIFINDEX:
3829			ifr->ifr_ifindex = dev->ifindex;
3830			return 0;
3831
3832		case SIOCGIFTXQLEN:
3833			ifr->ifr_qlen = dev->tx_queue_len;
3834			return 0;
3835
3836		default:
3837			/* dev_ioctl() should ensure this case
3838			 * is never reached
3839			 */
3840			WARN_ON(1);
3841			err = -EINVAL;
3842			break;
3843
3844	}
3845	return err;
3846}
3847
3848/*
3849 *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
3850 */
3851static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
3852{
3853	int err;
3854	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3855	const struct net_device_ops *ops;
3856
3857	if (!dev)
3858		return -ENODEV;
3859
3860	ops = dev->netdev_ops;
3861
3862	switch (cmd) {
3863		case SIOCSIFFLAGS:	/* Set interface flags */
3864			return dev_change_flags(dev, ifr->ifr_flags);
3865
3866		case SIOCSIFMETRIC:	/* Set the metric on the interface
3867					   (currently unused) */
3868			return -EOPNOTSUPP;
3869
3870		case SIOCSIFMTU:	/* Set the MTU of a device */
3871			return dev_set_mtu(dev, ifr->ifr_mtu);
3872
3873		case SIOCSIFHWADDR:
3874			return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
3875
3876		case SIOCSIFHWBROADCAST:
3877			if (ifr->ifr_hwaddr.sa_family != dev->type)
3878				return -EINVAL;
3879			memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
3880			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3881			call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3882			return 0;
3883
3884		case SIOCSIFMAP:
3885			if (ops->ndo_set_config) {
3886				if (!netif_device_present(dev))
3887					return -ENODEV;
3888				return ops->ndo_set_config(dev, &ifr->ifr_map);
3889			}
3890			return -EOPNOTSUPP;
3891
3892		case SIOCADDMULTI:
3893			if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
3894			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3895				return -EINVAL;
3896			if (!netif_device_present(dev))
3897				return -ENODEV;
3898			return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
3899					  dev->addr_len, 1);
3900
3901		case SIOCDELMULTI:
3902			if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
3903			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3904				return -EINVAL;
3905			if (!netif_device_present(dev))
3906				return -ENODEV;
3907			return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
3908					     dev->addr_len, 1);
3909
3910		case SIOCSIFTXQLEN:
3911			if (ifr->ifr_qlen < 0)
3912				return -EINVAL;
3913			dev->tx_queue_len = ifr->ifr_qlen;
3914			return 0;
3915
3916		case SIOCSIFNAME:
3917			ifr->ifr_newname[IFNAMSIZ-1] = '\0';
3918			return dev_change_name(dev, ifr->ifr_newname);
3919
3920		/*
3921		 *	Unknown or private ioctl
3922		 */
3923
3924		default:
3925			if ((cmd >= SIOCDEVPRIVATE &&
3926			    cmd <= SIOCDEVPRIVATE + 15) ||
3927			    cmd == SIOCBONDENSLAVE ||
3928			    cmd == SIOCBONDRELEASE ||
3929			    cmd == SIOCBONDSETHWADDR ||
3930			    cmd == SIOCBONDSLAVEINFOQUERY ||
3931			    cmd == SIOCBONDINFOQUERY ||
3932			    cmd == SIOCBONDCHANGEACTIVE ||
3933			    cmd == SIOCGMIIPHY ||
3934			    cmd == SIOCGMIIREG ||
3935			    cmd == SIOCSMIIREG ||
3936			    cmd == SIOCBRADDIF ||
3937			    cmd == SIOCBRDELIF ||
3938			    cmd == SIOCWANDEV) {
3939				err = -EOPNOTSUPP;
3940				if (ops->ndo_do_ioctl) {
3941					if (netif_device_present(dev))
3942						err = ops->ndo_do_ioctl(dev, ifr, cmd);
3943					else
3944						err = -ENODEV;
3945				}
3946			} else
3947				err = -EINVAL;
3948
3949	}
3950	return err;
3951}
3952
3953/*
3954 *	This function handles all "interface"-type I/O control requests. The actual
3955 *	'doing' part of this is dev_ifsioc above.
3956 */
3957
3958/**
3959 *	dev_ioctl	-	network device ioctl
3960 *	@net: the applicable net namespace
3961 *	@cmd: command to issue
3962 *	@arg: pointer to a struct ifreq in user space
3963 *
3964 *	Issue ioctl functions to devices. This is normally called by the
3965 *	user space syscall interfaces but can sometimes be useful for
3966 *	other purposes. The return value is the return from the syscall if
3967 *	positive or a negative errno code on error.
3968 */
3969
3970int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3971{
3972	struct ifreq ifr;
3973	int ret;
3974	char *colon;
3975
3976	/* One special case: SIOCGIFCONF takes ifconf argument
3977	   and requires shared lock, because it sleeps writing
3978	   to user space.
3979	 */
3980
3981	if (cmd == SIOCGIFCONF) {
3982		rtnl_lock();
3983		ret = dev_ifconf(net, (char __user *) arg);
3984		rtnl_unlock();
3985		return ret;
3986	}
3987	if (cmd == SIOCGIFNAME)
3988		return dev_ifname(net, (struct ifreq __user *)arg);
3989
3990	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3991		return -EFAULT;
3992
3993	ifr.ifr_name[IFNAMSIZ-1] = 0;
3994
3995	colon = strchr(ifr.ifr_name, ':');
3996	if (colon)
3997		*colon = 0;
3998
3999	/*
4000	 *	See which interface the caller is talking about.
4001	 */
4002
4003	switch (cmd) {
4004		/*
4005		 *	These ioctl calls:
4006		 *	- can be done by all.
4007		 *	- atomic and do not require locking.
4008		 *	- return a value
4009		 */
4010		case SIOCGIFFLAGS:
4011		case SIOCGIFMETRIC:
4012		case SIOCGIFMTU:
4013		case SIOCGIFHWADDR:
4014		case SIOCGIFSLAVE:
4015		case SIOCGIFMAP:
4016		case SIOCGIFINDEX:
4017		case SIOCGIFTXQLEN:
4018			dev_load(net, ifr.ifr_name);
4019			read_lock(&dev_base_lock);
4020			ret = dev_ifsioc_locked(net, &ifr, cmd);
4021			read_unlock(&dev_base_lock);
4022			if (!ret) {
4023				if (colon)
4024					*colon = ':';
4025				if (copy_to_user(arg, &ifr,
4026						 sizeof(struct ifreq)))
4027					ret = -EFAULT;
4028			}
4029			return ret;
4030
4031		case SIOCETHTOOL:
4032			dev_load(net, ifr.ifr_name);
4033			rtnl_lock();
4034			ret = dev_ethtool(net, &ifr);
4035			rtnl_unlock();
4036			if (!ret) {
4037				if (colon)
4038					*colon = ':';
4039				if (copy_to_user(arg, &ifr,
4040						 sizeof(struct ifreq)))
4041					ret = -EFAULT;
4042			}
4043			return ret;
4044
4045		/*
4046		 *	These ioctl calls:
4047		 *	- require superuser power.
4048		 *	- require strict serialization.
4049		 *	- return a value
4050		 */
4051		case SIOCGMIIPHY:
4052		case SIOCGMIIREG:
4053		case SIOCSIFNAME:
4054			if (!capable(CAP_NET_ADMIN))
4055				return -EPERM;
4056			dev_load(net, ifr.ifr_name);
4057			rtnl_lock();
4058			ret = dev_ifsioc(net, &ifr, cmd);
4059			rtnl_unlock();
4060			if (!ret) {
4061				if (colon)
4062					*colon = ':';
4063				if (copy_to_user(arg, &ifr,
4064						 sizeof(struct ifreq)))
4065					ret = -EFAULT;
4066			}
4067			return ret;
4068
4069		/*
4070		 *	These ioctl calls:
4071		 *	- require superuser power.
4072		 *	- require strict serialization.
4073		 *	- do not return a value
4074		 */
4075		case SIOCSIFFLAGS:
4076		case SIOCSIFMETRIC:
4077		case SIOCSIFMTU:
4078		case SIOCSIFMAP:
4079		case SIOCSIFHWADDR:
4080		case SIOCSIFSLAVE:
4081		case SIOCADDMULTI:
4082		case SIOCDELMULTI:
4083		case SIOCSIFHWBROADCAST:
4084		case SIOCSIFTXQLEN:
4085		case SIOCSMIIREG:
4086		case SIOCBONDENSLAVE:
4087		case SIOCBONDRELEASE:
4088		case SIOCBONDSETHWADDR:
4089		case SIOCBONDCHANGEACTIVE:
4090		case SIOCBRADDIF:
4091		case SIOCBRDELIF:
4092			if (!capable(CAP_NET_ADMIN))
4093				return -EPERM;
4094			/* fall through */
4095		case SIOCBONDSLAVEINFOQUERY:
4096		case SIOCBONDINFOQUERY:
4097			dev_load(net, ifr.ifr_name);
4098			rtnl_lock();
4099			ret = dev_ifsioc(net, &ifr, cmd);
4100			rtnl_unlock();
4101			return ret;
4102
4103		case SIOCGIFMEM:
4104			/* Get the per device memory space. We can add this but
4105			 * currently do not support it */
4106		case SIOCSIFMEM:
4107			/* Set the per device memory buffer space.
4108			 * Not applicable in our case */
4109		case SIOCSIFLINK:
4110			return -EINVAL;
4111
4112		/*
4113		 *	Unknown or private ioctl.
4114		 */
4115		default:
4116			if (cmd == SIOCWANDEV ||
4117			    (cmd >= SIOCDEVPRIVATE &&
4118			     cmd <= SIOCDEVPRIVATE + 15)) {
4119				dev_load(net, ifr.ifr_name);
4120				rtnl_lock();
4121				ret = dev_ifsioc(net, &ifr, cmd);
4122				rtnl_unlock();
4123				if (!ret && copy_to_user(arg, &ifr,
4124							 sizeof(struct ifreq)))
4125					ret = -EFAULT;
4126				return ret;
4127			}
4128			/* Take care of Wireless Extensions */
4129			if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4130				return wext_handle_ioctl(net, &ifr, cmd, arg);
4131			return -EINVAL;
4132	}
4133}
4134
4135
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns a suitable unique value for a new device interface
 *	number.  The caller must hold the rtnl semaphore or the
 *	dev_base_lock to be sure it remains unique.
 *
 *	Candidates are taken from a single static counter that restarts
 *	at 1 after the largest positive int; values already in use in
 *	@net are skipped.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	for (;;) {
		/*
		 * Step in unsigned arithmetic: incrementing the signed
		 * counter past its maximum would be undefined behaviour,
		 * whereas the unsigned sum is well defined and lets the
		 * wrap be detected explicitly.
		 */
		unsigned int next = (unsigned int)ifindex + 1U;

		if (next == 0 || next > (~0U >> 1))
			ifindex = 1;
		else
			ifindex = (int)next;

		if (!__dev_get_by_index(net, ifindex))
			return ifindex;
	}
}
4154
4155/* Delayed registration/unregisteration */
4156static LIST_HEAD(net_todo_list);
4157
4158static void net_set_todo(struct net_device *dev)
4159{
4160	list_add_tail(&dev->todo_list, &net_todo_list);
4161}
4162
/*
 *	Take a registered device back out of the system: close it, unlink
 *	it, shut down its queueing discipline, notify listeners, flush its
 *	address lists, call the driver's ndo_uninit hook, remove its
 *	kobject entries and finally drop one device reference.
 *
 *	Must be called with the rtnl held and not during the boot phase.
 *	A device that was never registered is reported and left alone.
 */
static void rollback_registered(struct net_device *dev)
{
	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	/* Some devices call without registering for initialization unwind. */
	if (dev->reg_state == NETREG_UNINITIALIZED) {
		printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
				  "was registered\n", dev->name, dev);

		WARN_ON(1);
		return;
	}

	/* Any state other than "registered" is a bug at this point. */
	BUG_ON(dev->reg_state != NETREG_REGISTERED);

	/* If device is running, close it first. */
	dev_close(dev);

	/* And unlink it from device chain. */
	unlist_netdevice(dev);

	dev->reg_state = NETREG_UNREGISTERING;

	synchronize_net();

	/* Shutdown queueing discipline. */
	dev_shutdown(dev);


	/* Notify protocols, that we are about to destroy
	   this device. They should clean all the things.
	*/
	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

	/*
	 *	Flush the unicast and multicast chains
	 */
	dev_addr_discard(dev);

	/* Give the driver its chance to undo ndo_init, if it has a hook. */
	if (dev->netdev_ops->ndo_uninit)
		dev->netdev_ops->ndo_uninit(dev);

	/* Notifier chain MUST detach us from master device. */
	WARN_ON(dev->master);

	/* Remove entries from kobject tree */
	netdev_unregister_kobject(dev);

	synchronize_net();

	/* Drop one reference on the device. */
	dev_put(dev);
}
4216
4217static void __netdev_init_queue_locks_one(struct net_device *dev,
4218					  struct netdev_queue *dev_queue,
4219					  void *_unused)
4220{
4221	spin_lock_init(&dev_queue->_xmit_lock);
4222	netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4223	dev_queue->xmit_lock_owner = -1;
4224}
4225
4226static void netdev_init_queue_locks(struct net_device *dev)
4227{
4228	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4229	__netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
4230}
4231
4232unsigned long netdev_fix_features(unsigned long features, const char *name)
4233{
4234	/* Fix illegal SG+CSUM combinations. */
4235	if ((features & NETIF_F_SG) &&
4236	    !(features & NETIF_F_ALL_CSUM)) {
4237		if (name)
4238			printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4239			       "checksum feature.\n", name);
4240		features &= ~NETIF_F_SG;
4241	}
4242
4243	/* TSO requires that SG is present as well. */
4244	if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4245		if (name)
4246			printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4247			       "SG feature.\n", name);
4248		features &= ~NETIF_F_TSO;
4249	}
4250
4251	if (features & NETIF_F_UFO) {
4252		if (!(features & NETIF_F_GEN_CSUM)) {
4253			if (name)
4254				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4255				       "since no NETIF_F_HW_CSUM feature.\n",
4256				       name);
4257			features &= ~NETIF_F_UFO;
4258		}
4259
4260		if (!(features & NETIF_F_SG)) {
4261			if (name)
4262				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4263				       "since no NETIF_F_SG feature.\n", name);
4264			features &= ~NETIF_F_UFO;
4265		}
4266	}
4267
4268	return features;
4269}
4270EXPORT_SYMBOL(netdev_fix_features);
4271
/**
 *	register_netdevice	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	Callers must hold the rtnl semaphore. You may want
 *	register_netdev() instead of this.
 *
 *	BUGS:
 *	The locking appears insufficient to guarantee two parallel registers
 *	will not get the same name.
 */

int register_netdevice(struct net_device *dev)
{
	struct hlist_head *head;
	struct hlist_node *p;
	int ret;
	struct net *net = dev_net(dev);

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	might_sleep();

	/* When net_device's are persistent, this will be fatal. */
	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
	BUG_ON(!net);

	/* Set up the address-list lock and the per-queue xmit locks. */
	spin_lock_init(&dev->addr_list_lock);
	netdev_set_addr_lockdep_class(dev);
	netdev_init_queue_locks(dev);

	/* -1 means "not set"; replaced by the ifindex below if left alone. */
	dev->iflink = -1;

#ifdef CONFIG_COMPAT_NET_DEV_OPS
	/* Netdevice_ops API compatibility support.
	 * This is temporary until all network devices are converted.
	 */
	if (dev->netdev_ops) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/* Mirror the ops table into the old per-device pointers. */
		dev->init = ops->ndo_init;
		dev->uninit = ops->ndo_uninit;
		dev->open = ops->ndo_open;
		dev->change_rx_flags = ops->ndo_change_rx_flags;
		dev->set_rx_mode = ops->ndo_set_rx_mode;
		dev->set_multicast_list = ops->ndo_set_multicast_list;
		dev->set_mac_address = ops->ndo_set_mac_address;
		dev->validate_addr = ops->ndo_validate_addr;
		dev->do_ioctl = ops->ndo_do_ioctl;
		dev->set_config = ops->ndo_set_config;
		dev->change_mtu = ops->ndo_change_mtu;
		dev->tx_timeout = ops->ndo_tx_timeout;
		dev->get_stats = ops->ndo_get_stats;
		dev->vlan_rx_register = ops->ndo_vlan_rx_register;
		dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid;
		dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid;
#ifdef CONFIG_NET_POLL_CONTROLLER
		dev->poll_controller = ops->ndo_poll_controller;
#endif
	} else {
		char drivername[64];
		pr_info("%s (%s): not using net_device_ops yet\n",
			dev->name, netdev_drivername(dev, drivername, 64));

		/* This works only because net_device_ops and the
		   compatibility structure are the same. */
		dev->netdev_ops = (void *) &(dev->init);
	}
#endif

	/* Init, if this function is available */
	if (dev->netdev_ops->ndo_init) {
		ret = dev->netdev_ops->ndo_init(dev);
		if (ret) {
			/* Positive driver results are mapped to -EIO. */
			if (ret > 0)
				ret = -EIO;
			goto out;
		}
	}

	/* From here on, failures must undo ndo_init via err_uninit. */
	if (!dev_valid_name(dev->name)) {
		ret = -EINVAL;
		goto err_uninit;
	}

	dev->ifindex = dev_new_index(net);
	if (dev->iflink == -1)
		dev->iflink = dev->ifindex;

	/* Check for existence of name */
	head = dev_name_hash(net, dev->name);
	hlist_for_each(p, head) {
		struct net_device *d
			= hlist_entry(p, struct net_device, name_hlist);
		if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
			ret = -EEXIST;
			goto err_uninit;
		}
	}

	/* Fix illegal checksum combinations */
	if ((dev->features & NETIF_F_HW_CSUM) &&
	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
		printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
		       dev->name);
		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
	}

	if ((dev->features & NETIF_F_NO_CSUM) &&
	    (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
		printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
		       dev->name);
		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
	}

	/* Drop SG/TSO/UFO bits whose prerequisites are missing. */
	dev->features = netdev_fix_features(dev->features, dev->name);

	/* Enable software GSO if SG is supported. */
	if (dev->features & NETIF_F_SG)
		dev->features |= NETIF_F_GSO;

	netdev_initialize_kobject(dev);
	ret = netdev_register_kobject(dev);
	if (ret)
		goto err_uninit;
	dev->reg_state = NETREG_REGISTERED;

	/*
	 *	Default initial state at registry is that the
	 *	device is present.
	 */

	set_bit(__LINK_STATE_PRESENT, &dev->state);

	/* Take a device reference and add the device to the lists. */
	dev_init_scheduler(dev);
	dev_hold(dev);
	list_netdevice(dev);

	/* Notify protocols, that a new device appeared. */
	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
	ret = notifier_to_errno(ret);
	if (ret) {
		/* A notifier vetoed the device: undo the registration. */
		rollback_registered(dev);
		dev->reg_state = NETREG_UNREGISTERED;
	}

out:
	return ret;

err_uninit:
	/* Let the driver undo ndo_init before reporting the error. */
	if (dev->netdev_ops->ndo_uninit)
		dev->netdev_ops->ndo_uninit(dev);
	goto out;
}
4432
4433/**
4434 *	init_dummy_netdev	- init a dummy network device for NAPI
4435 *	@dev: device to init
4436 *
4437 *	This takes a network device structure and initialize the minimum
4438 *	amount of fields so it can be used to schedule NAPI polls without
4439 *	registering a full blown interface. This is to be used by drivers
4440 *	that need to tie several hardware interfaces to a single NAPI
4441 *	poll scheduler due to HW limitations.
4442 */
4443int init_dummy_netdev(struct net_device *dev)
4444{
4445	/* Clear everything. Note we don't initialize spinlocks
4446	 * are they aren't supposed to be taken by any of the
4447	 * NAPI code and this dummy netdev is supposed to be
4448	 * only ever used for NAPI polls
4449	 */
4450	memset(dev, 0, sizeof(struct net_device));
4451
4452	/* make sure we BUG if trying to hit standard
4453	 * register/unregister code path
4454	 */
4455	dev->reg_state = NETREG_DUMMY;
4456
4457	/* initialize the ref count */
4458	atomic_set(&dev->refcnt, 1);
4459
4460	/* NAPI wants this */
4461	INIT_LIST_HEAD(&dev->napi_list);
4462
4463	/* a dummy interface is started by default */
4464	set_bit(__LINK_STATE_PRESENT, &dev->state);
4465	set_bit(__LINK_STATE_START, &dev->state);
4466
4467	return 0;
4468}
4469EXPORT_SYMBOL_GPL(init_dummy_netdev);
4470
4471
4472/**
4473 *	register_netdev	- register a network device
4474 *	@dev: device to register
4475 *
4476 *	Take a completed network device structure and add it to the kernel
4477 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4478 *	chain. 0 is returned on success. A negative errno code is returned
4479 *	on a failure to set up the device, or if the name is a duplicate.
4480 *
4481 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
4482 *	and expands the device name if you passed a format string to
4483 *	alloc_netdev.
4484 */
4485int register_netdev(struct net_device *dev)
4486{
4487	int err;
4488
4489	rtnl_lock();
4490
4491	/*
4492	 * If the name is a format string the caller wants us to do a
4493	 * name allocation.
4494	 */
4495	if (strchr(dev->name, '%')) {
4496		err = dev_alloc_name(dev, dev->name);
4497		if (err < 0)
4498			goto out;
4499	}
4500
4501	err = register_netdevice(dev);
4502out:
4503	rtnl_unlock();
4504	return err;
4505}
4506EXPORT_SYMBOL(register_netdev);
4507
4508/*
4509 * netdev_wait_allrefs - wait until all references are gone.
4510 *
4511 * This is called when unregistering network devices.
4512 *
4513 * Any protocol or device that holds a reference should register
4514 * for netdevice notification, and cleanup and put back the
4515 * reference if they receive an UNREGISTER event.
4516 * We can get stuck here if buggy protocols don't correctly
4517 * call dev_put.
4518 */
4519static void netdev_wait_allrefs(struct net_device *dev)
4520{
4521	unsigned long rebroadcast_time, warning_time;
4522
4523	rebroadcast_time = warning_time = jiffies;
4524	while (atomic_read(&dev->refcnt) != 0) {
4525		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
4526			rtnl_lock();
4527
4528			/* Rebroadcast unregister notification */
4529			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4530
4531			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
4532				     &dev->state)) {
4533				/* We must not have linkwatch events
4534				 * pending on unregister. If this
4535				 * happens, we simply run the queue
4536				 * unscheduled, resulting in a noop
4537				 * for this device.
4538				 */
4539				linkwatch_run_queue();
4540			}
4541
4542			__rtnl_unlock();
4543
4544			rebroadcast_time = jiffies;
4545		}
4546
4547		msleep(250);
4548
4549		if (time_after(jiffies, warning_time + 10 * HZ)) {
4550			printk(KERN_EMERG "unregister_netdevice: "
4551			       "waiting for %s to become free. Usage "
4552			       "count = %d\n",
4553			       dev->name, atomic_read(&dev->refcnt));
4554			warning_time = jiffies;
4555		}
4556	}
4557}
4558
4559/* The sequence is:
4560 *
4561 *	rtnl_lock();
4562 *	...
4563 *	register_netdevice(x1);
4564 *	register_netdevice(x2);
4565 *	...
4566 *	unregister_netdevice(y1);
4567 *	unregister_netdevice(y2);
4568 *      ...
4569 *	rtnl_unlock();
4570 *	free_netdev(y1);
4571 *	free_netdev(y2);
4572 *
4573 * We are invoked by rtnl_unlock().
4574 * This allows us to deal with problems:
4575 * 1) We can delete sysfs objects which invoke hotplug
4576 *    without deadlocking with linkwatch via keventd.
4577 * 2) Since we run with the RTNL semaphore not held, we can sleep
4578 *    safely in order to wait for the netdev refcnt to drop to zero.
4579 *
4580 * We must not return until all unregister events added during
4581 * the interval the lock was held have been completed.
4582 */
4583void netdev_run_todo(void)
4584{
4585	struct list_head list;
4586
4587	/* Snapshot list, allow later requests */
4588	list_replace_init(&net_todo_list, &list);
4589
4590	__rtnl_unlock();
4591
4592	while (!list_empty(&list)) {
4593		struct net_device *dev
4594			= list_entry(list.next, struct net_device, todo_list);
4595		list_del(&dev->todo_list);
4596
4597		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
4598			printk(KERN_ERR "network todo '%s' but state %d\n",
4599			       dev->name, dev->reg_state);
4600			dump_stack();
4601			continue;
4602		}
4603
4604		dev->reg_state = NETREG_UNREGISTERED;
4605
4606		on_each_cpu(flush_backlog, dev, 1);
4607
4608		netdev_wait_allrefs(dev);
4609
4610		/* paranoia */
4611		BUG_ON(atomic_read(&dev->refcnt));
4612		WARN_ON(dev->ip_ptr);
4613		WARN_ON(dev->ip6_ptr);
4614		WARN_ON(dev->dn_ptr);
4615
4616		if (dev->destructor)
4617			dev->destructor(dev);
4618
4619		/* Free network device */
4620		kobject_put(&dev->dev.kobj);
4621	}
4622}
4623
4624/**
4625 *	dev_get_stats	- get network device statistics
4626 *	@dev: device to get statistics from
4627 *
4628 *	Get network statistics from device. The device driver may provide
4629 *	its own method by setting dev->netdev_ops->get_stats; otherwise
4630 *	the internal statistics structure is used.
4631 */
4632const struct net_device_stats *dev_get_stats(struct net_device *dev)
4633 {
4634	const struct net_device_ops *ops = dev->netdev_ops;
4635
4636	if (ops->ndo_get_stats)
4637		return ops->ndo_get_stats(dev);
4638	else
4639		return &dev->stats;
4640}
4641EXPORT_SYMBOL(dev_get_stats);
4642
/*
 * Point a single queue back at the device that owns it.
 *
 * The three-argument signature lets this be passed as the callback to
 * netdev_for_each_tx_queue(); the last argument is not used.
 */
static void netdev_init_one_queue(struct net_device *dev,
				  struct netdev_queue *queue,
				  void *_unused)
{
	queue->dev = dev;
}
4649
4650static void netdev_init_queues(struct net_device *dev)
4651{
4652	netdev_init_one_queue(dev, &dev->rx_queue, NULL);
4653	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
4654	spin_lock_init(&dev->tx_global_lock);
4655}
4656
4657/**
4658 *	alloc_netdev_mq - allocate network device
4659 *	@sizeof_priv:	size of private data to allocate space for
4660 *	@name:		device name format string
4661 *	@setup:		callback to initialize device
4662 *	@queue_count:	the number of subqueues to allocate
4663 *
4664 *	Allocates a struct net_device with private data area for driver use
4665 *	and performs basic initialization.  Also allocates subquue structs
4666 *	for each queue on the device at the end of the netdevice.
4667 */
4668struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
4669		void (*setup)(struct net_device *), unsigned int queue_count)
4670{
4671	struct netdev_queue *tx;
4672	struct net_device *dev;
4673	size_t alloc_size;
4674	void *p;
4675
4676	BUG_ON(strlen(name) >= sizeof(dev->name));
4677
4678	alloc_size = sizeof(struct net_device);
4679	if (sizeof_priv) {
4680		/* ensure 32-byte alignment of private area */
4681		alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
4682		alloc_size += sizeof_priv;
4683	}
4684	/* ensure 32-byte alignment of whole construct */
4685	alloc_size += NETDEV_ALIGN_CONST;
4686
4687	p = kzalloc(alloc_size, GFP_KERNEL);
4688	if (!p) {
4689		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
4690		return NULL;
4691	}
4692
4693	tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
4694	if (!tx) {
4695		printk(KERN_ERR "alloc_netdev: Unable to allocate "
4696		       "tx qdiscs.\n");
4697		kfree(p);
4698		return NULL;
4699	}
4700
4701	dev = (struct net_device *)
4702		(((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
4703	dev->padded = (char *)dev - (char *)p;
4704	dev_net_set(dev, &init_net);
4705
4706	dev->_tx = tx;
4707	dev->num_tx_queues = queue_count;
4708	dev->real_num_tx_queues = queue_count;
4709
4710	dev->gso_max_size = GSO_MAX_SIZE;
4711
4712	netdev_init_queues(dev);
4713
4714	INIT_LIST_HEAD(&dev->napi_list);
4715	setup(dev);
4716	strcpy(dev->name, name);
4717	return dev;
4718}
4719EXPORT_SYMBOL(alloc_netdev_mq);
4720
4721/**
4722 *	free_netdev - free network device
4723 *	@dev: device
4724 *
4725 *	This function does the last stage of destroying an allocated device
4726 * 	interface. The reference to the device object is released.
4727 *	If this is the last reference then it will be freed.
4728 */
4729void free_netdev(struct net_device *dev)
4730{
4731	struct napi_struct *p, *n;
4732
4733	release_net(dev_net(dev));
4734
4735	kfree(dev->_tx);
4736
4737	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
4738		netif_napi_del(p);
4739
4740	/*  Compatibility with error handling in drivers */
4741	if (dev->reg_state == NETREG_UNINITIALIZED) {
4742		kfree((char *)dev - dev->padded);
4743		return;
4744	}
4745
4746	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
4747	dev->reg_state = NETREG_RELEASED;
4748
4749	/* will free via device release */
4750	put_device(&dev->dev);
4751}
4752
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	Implemented as a call to synchronize_rcu(), so the caller must be
 *	in a context that is allowed to sleep (checked by might_sleep()).
 */
void synchronize_net(void)
{
	might_sleep();
	synchronize_rcu();
}
4764
/**
 *	unregister_netdevice - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	Callers must hold the rtnl semaphore (this is asserted).  You may
 *	want unregister_netdev() instead of this.
 *
 *	The unregistration is not complete when this function returns:
 *	the device is handed to net_set_todo() and the rest of the work
 *	is finished after the rtnl semaphore has been released.
 */

void unregister_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	rollback_registered(dev);
	/* Finish processing unregister after unlock */
	net_set_todo(dev);
}
4784
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	This is just a wrapper for unregister_netdevice() that takes
 *	the rtnl semaphore around the call, so it must not be called
 *	with that semaphore already held.  In general you want to use
 *	this and not unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}

EXPORT_SYMBOL(unregister_netdev);
4804
/**
 *	dev_change_net_namespace - move device to different network namespace
 *	@dev: device
 *	@net: network namespace
 *	@pat: If not NULL name pattern to try if the current device name
 *	      is already taken in the destination network namespace.
 *
 *	This function shuts down a device interface and moves it
 *	to a new network namespace. On success 0 is returned, on
 *	a failure a negative errno code is returned:
 *
 *	-EINVAL if the device is namespace local, has a parent device
 *	while sysfs is enabled, or is not currently registered;
 *	-EEXIST if no usable name could be found for the device in the
 *	destination namespace.
 *
 *	If the device already lives in @net, nothing is done and 0 is
 *	returned.
 *
 *	Callers must hold the rtnl semaphore.
 */

int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
	char buf[IFNAMSIZ];
	const char *destname;
	int err;

	ASSERT_RTNL();

	/* Don't allow namespace local devices to be moved. */
	err = -EINVAL;
	if (dev->features & NETIF_F_NETNS_LOCAL)
		goto out;

#ifdef CONFIG_SYSFS
	/* Don't allow real devices to be moved when sysfs
	 * is enabled.
	 */
	err = -EINVAL;
	if (dev->dev.parent)
		goto out;
#endif

	/* Ensure the device has been registered */
	err = -EINVAL;
	if (dev->reg_state != NETREG_REGISTERED)
		goto out;

	/* Get out if there is nothing to do */
	err = 0;
	if (net_eq(dev_net(dev), net))
		goto out;

	/* Pick the destination device name, and ensure
	 * we can use it in the destination network namespace.
	 */
	err = -EEXIST;
	destname = dev->name;
	if (__dev_get_by_name(net, destname)) {
		/* We get here if we can't use the current device name */
		if (!pat)
			goto out;
		if (!dev_valid_name(pat))
			goto out;
		if (strchr(pat, '%')) {
			/* @pat is a format string: expand it into buf */
			if (__dev_alloc_name(net, pat, buf) < 0)
				goto out;
			destname = buf;
		} else
			destname = pat;
		/* The fallback name must be free as well */
		if (__dev_get_by_name(net, destname))
			goto out;
	}

	/*
	 * And now a mini version of register_netdevice unregister_netdevice.
	 * None of the steps below jumps to the error exit.
	 */

	/* If device is running close it first. */
	dev_close(dev);

	/* And unlink it from device chain */
	err = -ENODEV;
	unlist_netdevice(dev);

	synchronize_net();

	/* Shutdown queueing discipline. */
	dev_shutdown(dev);

	/* Notify protocols, that we are about to destroy
	   this device. They should clean all the things.
	*/
	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

	/*
	 *	Flush the unicast and multicast chains
	 */
	dev_addr_discard(dev);

	netdev_unregister_kobject(dev);

	/* Actually switch the network namespace */
	dev_net_set(dev, net);

	/* Assign the new device name */
	if (destname != dev->name)
		strcpy(dev->name, destname);

	/* If there is an ifindex conflict assign a new one */
	if (__dev_get_by_index(net, dev->ifindex)) {
		/* Remember whether iflink referred to the device itself,
		 * so that it can follow the new ifindex.
		 */
		int iflink = (dev->iflink == dev->ifindex);
		dev->ifindex = dev_new_index(net);
		if (iflink)
			dev->iflink = dev->ifindex;
	}

	/* Fixup kobjects; a failure here is only warned about and the
	 * function still returns 0 below.
	 */
	err = netdev_register_kobject(dev);
	WARN_ON(err);

	/* Add the device back in the hashes */
	list_netdevice(dev);

	/* Notify protocols, that a new device appeared. */
	call_netdevice_notifiers(NETDEV_REGISTER, dev);

	synchronize_net();
	err = 0;
out:
	return err;
}
4930
/*
 * CPU notifier callback (installed with hotcpu_notifier() from
 * net_dev_init()).
 *
 * Only CPU_DEAD and CPU_DEAD_FROZEN are acted upon. For those, the work
 * still sitting in the softnet_data of the dead CPU (@ocpu) is taken
 * over by the CPU running this callback: its completion queue and
 * output queue are appended to the local ones, and the packets in its
 * input queue are fed back through netif_rx().
 *
 * Always returns NOTIFY_OK.
 */
static int dev_cpu_callback(struct notifier_block *nfb,
			    unsigned long action,
			    void *ocpu)
{
	struct sk_buff **list_skb;
	struct Qdisc **list_net;
	struct sk_buff *skb;
	unsigned int cpu, oldcpu = (unsigned long)ocpu;
	struct softnet_data *sd, *oldsd;

	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	/* The local queues are manipulated with interrupts disabled */
	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Find end of our output_queue. */
	list_net = &sd->output_queue;
	while (*list_net)
		list_net = &(*list_net)->next_sched;
	/* Append output queue from offline CPU. */
	*list_net = oldsd->output_queue;
	oldsd->output_queue = NULL;

	/* Get the TX softirq to run so the inherited work is processed */
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

	/* Process offline CPU's input_pkt_queue */
	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
		netif_rx(skb);

	return NOTIFY_OK;
}
4974
4975
4976/**
4977 *	netdev_increment_features - increment feature set by one
4978 *	@all: current feature set
4979 *	@one: new feature set
4980 *	@mask: mask feature set
4981 *
4982 *	Computes a new feature set after adding a device with feature set
4983 *	@one to the master device with current feature set @all.  Will not
4984 *	enable anything that is off in @mask. Returns the new feature set.
4985 */
4986unsigned long netdev_increment_features(unsigned long all, unsigned long one,
4987					unsigned long mask)
4988{
4989	/* If device needs checksumming, downgrade to it. */
4990        if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
4991		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
4992	else if (mask & NETIF_F_ALL_CSUM) {
4993		/* If one device supports v4/v6 checksumming, set for all. */
4994		if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
4995		    !(all & NETIF_F_GEN_CSUM)) {
4996			all &= ~NETIF_F_ALL_CSUM;
4997			all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
4998		}
4999
5000		/* If one device supports hw checksumming, set for all. */
5001		if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5002			all &= ~NETIF_F_ALL_CSUM;
5003			all |= NETIF_F_HW_CSUM;
5004		}
5005	}
5006
5007	one |= NETIF_F_ALL_CSUM;
5008
5009	one |= all & NETIF_F_ONE_FOR_ALL;
5010	all &= one | NETIF_F_LLTX | NETIF_F_GSO;
5011	all |= one & mask & NETIF_F_ONE_FOR_ALL;
5012
5013	return all;
5014}
5015EXPORT_SYMBOL(netdev_increment_features);
5016
5017static struct hlist_head *netdev_create_hash(void)
5018{
5019	int i;
5020	struct hlist_head *hash;
5021
5022	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5023	if (hash != NULL)
5024		for (i = 0; i < NETDEV_HASHENTRIES; i++)
5025			INIT_HLIST_HEAD(&hash[i]);
5026
5027	return hash;
5028}
5029
5030/* Initialize per network namespace state */
5031static int __net_init netdev_init(struct net *net)
5032{
5033	INIT_LIST_HEAD(&net->dev_base_head);
5034
5035	net->dev_name_head = netdev_create_hash();
5036	if (net->dev_name_head == NULL)
5037		goto err_name;
5038
5039	net->dev_index_head = netdev_create_hash();
5040	if (net->dev_index_head == NULL)
5041		goto err_idx;
5042
5043	return 0;
5044
5045err_idx:
5046	kfree(net->dev_name_head);
5047err_name:
5048	return -ENOMEM;
5049}
5050
5051/**
5052 *	netdev_drivername - network driver for the device
5053 *	@dev: network device
5054 *	@buffer: buffer for resulting name
5055 *	@len: size of buffer
5056 *
5057 *	Determine network driver for device.
5058 */
5059char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5060{
5061	const struct device_driver *driver;
5062	const struct device *parent;
5063
5064	if (len <= 0 || !buffer)
5065		return buffer;
5066	buffer[0] = 0;
5067
5068	parent = dev->dev.parent;
5069
5070	if (!parent)
5071		return buffer;
5072
5073	driver = parent->driver;
5074	if (driver && driver->name)
5075		strlcpy(buffer, driver->name, len);
5076	return buffer;
5077}
5078
5079static void __net_exit netdev_exit(struct net *net)
5080{
5081	kfree(net->dev_name_head);
5082	kfree(net->dev_index_head);
5083}
5084
/*
 * Per network namespace hooks that create and free the device lookup
 * tables; registered from net_dev_init() with register_pernet_subsys().
 */
static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};
5089
/*
 * Namespace exit hook: empty a network namespace of its devices.
 *
 * Devices flagged NETIF_F_NETNS_LOCAL are left alone, devices whose
 * rtnl_link_ops provide dellink are deleted through it, and everything
 * else is moved to init_net. Failing to move a device is fatal (BUG).
 */
static void __net_exit default_device_exit(struct net *net)
{
	struct net_device *dev;
	/*
	 * Push all migratable network devices back to the
	 * initial network namespace
	 */
	rtnl_lock();
restart:
	/* Every deletion or move changes the list being walked, so the
	 * walk is started over from the beginning after each one.
	 */
	for_each_netdev(net, dev) {
		int err;
		char fb_name[IFNAMSIZ];

		/* Ignore unmoveable devices (i.e. loopback) */
		if (dev->features & NETIF_F_NETNS_LOCAL)
			continue;

		/* Delete virtual devices */
		if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) {
			dev->rtnl_link_ops->dellink(dev);
			goto restart;
		}

		/* Push remaining network devices to init_net; "dev<ifindex>"
		 * is the fallback name offered to dev_change_net_namespace()
		 * in case the current name is already taken there.
		 */
		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
		err = dev_change_net_namespace(dev, &init_net, fb_name);
		if (err) {
			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
				__func__, dev->name, err);
			BUG();
		}
		goto restart;
	}
	rtnl_unlock();
}
5125
/*
 * Exit-only per network namespace hook that clears the devices out of
 * a namespace; registered from net_dev_init() with
 * register_pernet_device().
 */
static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
};
5129
5130/*
5131 *	Initialize the DEV module. At boot time this walks the device list and
5132 *	unhooks any devices that fail to initialise (normally hardware not
5133 *	present) and leaves us with a valid list of present and active devices.
5134 *
5135 */
5136
5137/*
5138 *       This is called single threaded during boot, so no need
5139 *       to take the rtnl semaphore.
5140 */
/*
 * Boot-time initialization of the networking device core; run as a
 * subsys_initcall.
 *
 * Returns 0 on success and -ENOMEM if any of the registration steps
 * fails. Steps that already succeeded are not undone on failure.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	/* Empty packet type handler lists */
	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	/* Per network namespace device lists and hash tables */
	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 *	Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct softnet_data *queue;

		queue = &per_cpu(softnet_data, i);
		skb_queue_head_init(&queue->input_pkt_queue);
		queue->completion_queue = NULL;
		INIT_LIST_HEAD(&queue->poll_list);

		/* Per-CPU backlog NAPI context, polled by process_backlog */
		queue->backlog.poll = process_backlog;
		queue->backlog.weight = weight_p;
		queue->backlog.gro_list = NULL;
	}

	dev_boot_phase = 0;

	/* The loopback device is special if any other network devices
	 * is present in a network namespace the loopback device must
	 * be present. Since we now dynamically allocate and free the
	 * loopback device ensure this invariant is maintained by
	 * keeping the loopback device as the first device on the
	 * list of network devices.  Ensuring the loopback devices
	 * is the first device that appears and the last network device
	 * that disappears.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	/* Install the handlers for the TX and RX softirqs */
	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	/* Take over the queues of CPUs that go offline */
	hotcpu_notifier(dev_cpu_callback, 0);
	dst_init();
	dev_mcast_init();
	rc = 0;
out:
	return rc;
}

subsys_initcall(net_dev_init);
5206
/*
 * Symbol exports collected at the end of the file rather than placed
 * next to the individual definitions.
 */
EXPORT_SYMBOL(__dev_get_by_index);
EXPORT_SYMBOL(__dev_get_by_name);
EXPORT_SYMBOL(__dev_remove_pack);
EXPORT_SYMBOL(dev_valid_name);
EXPORT_SYMBOL(dev_add_pack);
EXPORT_SYMBOL(dev_alloc_name);
EXPORT_SYMBOL(dev_close);
EXPORT_SYMBOL(dev_get_by_flags);
EXPORT_SYMBOL(dev_get_by_index);
EXPORT_SYMBOL(dev_get_by_name);
EXPORT_SYMBOL(dev_open);
EXPORT_SYMBOL(dev_queue_xmit);
EXPORT_SYMBOL(dev_remove_pack);
EXPORT_SYMBOL(dev_set_allmulti);
EXPORT_SYMBOL(dev_set_promiscuity);
EXPORT_SYMBOL(dev_change_flags);
EXPORT_SYMBOL(dev_set_mtu);
EXPORT_SYMBOL(dev_set_mac_address);
EXPORT_SYMBOL(free_netdev);
EXPORT_SYMBOL(netdev_boot_setup_check);
EXPORT_SYMBOL(netdev_set_master);
EXPORT_SYMBOL(netdev_state_change);
EXPORT_SYMBOL(netif_receive_skb);
EXPORT_SYMBOL(netif_rx);
EXPORT_SYMBOL(register_gifconf);
EXPORT_SYMBOL(register_netdevice);
EXPORT_SYMBOL(register_netdevice_notifier);
EXPORT_SYMBOL(skb_checksum_help);
EXPORT_SYMBOL(synchronize_net);
EXPORT_SYMBOL(unregister_netdevice);
EXPORT_SYMBOL(unregister_netdevice_notifier);
EXPORT_SYMBOL(net_enable_timestamp);
EXPORT_SYMBOL(net_disable_timestamp);
EXPORT_SYMBOL(dev_get_flags);

/* The bridge hooks are only exported when bridging is built in or modular */
#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
EXPORT_SYMBOL(br_handle_frame_hook);
EXPORT_SYMBOL(br_fdb_get_hook);
EXPORT_SYMBOL(br_fdb_put_hook);
#endif

EXPORT_SYMBOL(dev_load);

EXPORT_PER_CPU_SYMBOL(softnet_data);