net/core/dev.c at v2.6.26 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / core / dev.c
at v2.6.26 115 kB view raw
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <asm/system.h>
  77#include <linux/bitops.h>
  78#include <linux/capability.h>
  79#include <linux/cpu.h>
  80#include <linux/types.h>
  81#include <linux/kernel.h>
  82#include <linux/sched.h>
  83#include <linux/mutex.h>
  84#include <linux/string.h>
  85#include <linux/mm.h>
  86#include <linux/socket.h>
  87#include <linux/sockios.h>
  88#include <linux/errno.h>
  89#include <linux/interrupt.h>
  90#include <linux/if_ether.h>
  91#include <linux/netdevice.h>
  92#include <linux/etherdevice.h>
  93#include <linux/notifier.h>
  94#include <linux/skbuff.h>
  95#include <net/net_namespace.h>
  96#include <net/sock.h>
  97#include <linux/rtnetlink.h>
  98#include <linux/proc_fs.h>
  99#include <linux/seq_file.h>
 100#include <linux/stat.h>
 101#include <linux/if_bridge.h>
 102#include <linux/if_macvlan.h>
 103#include <net/dst.h>
 104#include <net/pkt_sched.h>
 105#include <net/checksum.h>
 106#include <linux/highmem.h>
 107#include <linux/init.h>
 108#include <linux/kmod.h>
 109#include <linux/module.h>
 110#include <linux/kallsyms.h>
 111#include <linux/netpoll.h>
 112#include <linux/rcupdate.h>
 113#include <linux/delay.h>
 114#include <net/wext.h>
 115#include <net/iw_handler.h>
 116#include <asm/current.h>
 117#include <linux/audit.h>
 118#include <linux/dmaengine.h>
 119#include <linux/err.h>
 120#include <linux/ctype.h>
 121#include <linux/if_arp.h>
 122#include <linux/if_vlan.h>
 123
 124#include "net-sysfs.h"
 125
 126/*
 127 *	The list of packet types we will receive (as opposed to discard)
 128 *	and the routines to invoke.
 129 *
 130 *	Why 16. Because with 16 the only overlap we get on a hash of the
 131 *	low nibble of the protocol value is RARP/SNAP/X.25.
 132 *
 133 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 134 *             sure which should go first, but I bet it won't make much
 135 *             difference if we are running VLANs.  The good news is that
 136 *             this protocol won't be in the list unless compiled in, so
 137 *             the average user (w/out VLANs) will not be adversely affected.
 138 *             --BLG
 139 *
 140 *		0800	IP
 141 *		8100    802.1Q VLAN
 142 *		0001	802.3
 143 *		0002	AX.25
 144 *		0004	802.2
 145 *		8035	RARP
 146 *		0005	SNAP
 147 *		0805	X.25
 148 *		0806	ARP
 149 *		8137	IPX
 150 *		0009	Localtalk
 151 *		86DD	IPv6
 152 */
 153
 154#define PTYPE_HASH_SIZE	(16)
 155#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
 156
 157static DEFINE_SPINLOCK(ptype_lock);
 158static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 159static struct list_head ptype_all __read_mostly;	/* Taps */
 160
 161#ifdef CONFIG_NET_DMA
 162struct net_dma {
 163	struct dma_client client;
 164	spinlock_t lock;
 165	cpumask_t channel_mask;
 166	struct dma_chan **channels;
 167};
 168
 169static enum dma_state_client
 170netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
 171	enum dma_state state);
 172
 173static struct net_dma net_dma = {
 174	.client = {
 175		.event_callback = netdev_dma_event,
 176	},
 177};
 178#endif
 179
 180/*
 181 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 182 * semaphore.
 183 *
 184 * Pure readers hold dev_base_lock for reading.
 185 *
 186 * Writers must hold the rtnl semaphore while they loop through the
 187 * dev_base_head list, and hold dev_base_lock for writing when they do the
 188 * actual updates.  This allows pure readers to access the list even
 189 * while a writer is preparing to update it.
 190 *
 191 * To put it another way, dev_base_lock is held for writing only to
 192 * protect against pure readers; the rtnl semaphore provides the
 193 * protection against other writers.
 194 *
 195 * See, for example usages, register_netdevice() and
 196 * unregister_netdevice(), which must be called with the rtnl
 197 * semaphore held.
 198 */
 199DEFINE_RWLOCK(dev_base_lock);
 200
 201EXPORT_SYMBOL(dev_base_lock);
 202
 203#define NETDEV_HASHBITS	8
 204#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
 205
 206static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 207{
 208	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 209	return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
 210}
 211
 212static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 213{
 214	return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
 215}
 216
 217/* Device list insertion */
 218static int list_netdevice(struct net_device *dev)
 219{
 220	struct net *net = dev_net(dev);
 221
 222	ASSERT_RTNL();
 223
 224	write_lock_bh(&dev_base_lock);
 225	list_add_tail(&dev->dev_list, &net->dev_base_head);
 226	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 227	hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
 228	write_unlock_bh(&dev_base_lock);
 229	return 0;
 230}
 231
 232/* Device list removal */
 233static void unlist_netdevice(struct net_device *dev)
 234{
 235	ASSERT_RTNL();
 236
 237	/* Unlink dev from the device chain */
 238	write_lock_bh(&dev_base_lock);
 239	list_del(&dev->dev_list);
 240	hlist_del(&dev->name_hlist);
 241	hlist_del(&dev->index_hlist);
 242	write_unlock_bh(&dev_base_lock);
 243}
 244
 245/*
 246 *	Our notifier list
 247 */
 248
 249static RAW_NOTIFIER_HEAD(netdev_chain);
 250
 251/*
 252 *	Device drivers call our routines to queue packets here. We empty the
 253 *	queue in the local softnet handler.
 254 */
 255
 256DEFINE_PER_CPU(struct softnet_data, softnet_data);
 257
 258#ifdef CONFIG_DEBUG_LOCK_ALLOC
 259/*
 260 * register_netdevice() inits dev->_xmit_lock and sets lockdep class
 261 * according to dev->type
 262 */
 263static const unsigned short netdev_lock_type[] =
 264	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 265	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 266	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 267	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 268	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 269	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 270	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 271	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 272	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 273	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 274	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 275	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 276	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 277	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_VOID,
 278	 ARPHRD_NONE};
 279
 280static const char *netdev_lock_name[] =
 281	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 282	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 283	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 284	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 285	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 286	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 287	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 288	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 289	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 290	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 291	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 292	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 293	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 294	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_VOID",
 295	 "_xmit_NONE"};
 296
 297static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 298
 299static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 300{
 301	int i;
 302
 303	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 304		if (netdev_lock_type[i] == dev_type)
 305			return i;
 306	/* the last key is used by default */
 307	return ARRAY_SIZE(netdev_lock_type) - 1;
 308}
 309
 310static inline void netdev_set_lockdep_class(spinlock_t *lock,
 311					    unsigned short dev_type)
 312{
 313	int i;
 314
 315	i = netdev_lock_pos(dev_type);
 316	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 317				   netdev_lock_name[i]);
 318}
 319#else
 320static inline void netdev_set_lockdep_class(spinlock_t *lock,
 321					    unsigned short dev_type)
 322{
 323}
 324#endif
 325
 326/*******************************************************************************
 327
 328		Protocol management and registration routines
 329
 330*******************************************************************************/
 331
 332/*
 333 *	Add a protocol ID to the list. Now that the input handler is
 334 *	smarter we can dispense with all the messy stuff that used to be
 335 *	here.
 336 *
 *	BEWARE!!! Protocol handlers that mangle input packets
 *	MUST BE last in their hash buckets, and the walk over protocol
 *	handlers MUST start from the promiscuous ptype_all chain in net_bh.
 *	This is true now; do not change it.
 *	Explanation: if a protocol handler that mangles packets is
 *	first on the list, it cannot sense that the packet is cloned
 *	and should be copied-on-write, so it will change the packet
 *	in place and subsequent readers will get a broken packet.
 *							--ANK (980803)
 346 */
 347
 348/**
 349 *	dev_add_pack - add packet handler
 350 *	@pt: packet type declaration
 351 *
 352 *	Add a protocol handler to the networking stack. The passed &packet_type
 353 *	is linked into kernel lists and may not be freed until it has been
 354 *	removed from the kernel lists.
 355 *
 356 *	This call does not sleep therefore it can not
 357 *	guarantee all CPU's that are in middle of receiving packets
 358 *	will see the new packet type (until the next received packet).
 359 */
 360
 361void dev_add_pack(struct packet_type *pt)
 362{
 363	int hash;
 364
 365	spin_lock_bh(&ptype_lock);
 366	if (pt->type == htons(ETH_P_ALL))
 367		list_add_rcu(&pt->list, &ptype_all);
 368	else {
 369		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
 370		list_add_rcu(&pt->list, &ptype_base[hash]);
 371	}
 372	spin_unlock_bh(&ptype_lock);
 373}
 374
 375/**
 376 *	__dev_remove_pack	 - remove packet handler
 377 *	@pt: packet type declaration
 378 *
 379 *	Remove a protocol handler that was previously added to the kernel
 380 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 381 *	from the kernel lists and can be freed or reused once this function
 382 *	returns.
 383 *
 384 *      The packet type might still be in use by receivers
 385 *	and must not be freed until after all the CPU's have gone
 386 *	through a quiescent state.
 387 */
 388void __dev_remove_pack(struct packet_type *pt)
 389{
 390	struct list_head *head;
 391	struct packet_type *pt1;
 392
 393	spin_lock_bh(&ptype_lock);
 394
 395	if (pt->type == htons(ETH_P_ALL))
 396		head = &ptype_all;
 397	else
 398		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 399
 400	list_for_each_entry(pt1, head, list) {
 401		if (pt == pt1) {
 402			list_del_rcu(&pt->list);
 403			goto out;
 404		}
 405	}
 406
 407	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 408out:
 409	spin_unlock_bh(&ptype_lock);
 410}
 411/**
 412 *	dev_remove_pack	 - remove packet handler
 413 *	@pt: packet type declaration
 414 *
 415 *	Remove a protocol handler that was previously added to the kernel
 416 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 417 *	from the kernel lists and can be freed or reused once this function
 418 *	returns.
 419 *
 420 *	This call sleeps to guarantee that no CPU is looking at the packet
 421 *	type after return.
 422 */
 423void dev_remove_pack(struct packet_type *pt)
 424{
 425	__dev_remove_pack(pt);
 426
 427	synchronize_net();
 428}
 429
 430/******************************************************************************
 431
 432		      Device Boot-time Settings Routines
 433
 434*******************************************************************************/
 435
 436/* Boot time configuration table */
 437static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 438
 439/**
 440 *	netdev_boot_setup_add	- add new setup entry
 441 *	@name: name of the device
 442 *	@map: configured settings for the device
 443 *
 444 *	Adds new setup entry to the dev_boot_setup list.  The function
 445 *	returns 0 on error and 1 on success.  This is a generic routine to
 446 *	all netdevices.
 447 */
 448static int netdev_boot_setup_add(char *name, struct ifmap *map)
 449{
 450	struct netdev_boot_setup *s;
 451	int i;
 452
 453	s = dev_boot_setup;
 454	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 455		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 456			memset(s[i].name, 0, sizeof(s[i].name));
 457			strlcpy(s[i].name, name, IFNAMSIZ);
 458			memcpy(&s[i].map, map, sizeof(s[i].map));
 459			break;
 460		}
 461	}
 462
 463	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 464}
 465
 466/**
 467 *	netdev_boot_setup_check	- check boot time settings
 468 *	@dev: the netdevice
 469 *
 470 * 	Check boot time settings for the device.
 471 *	The found settings are set for the device to be used
 472 *	later in the device probing.
 473 *	Returns 0 if no settings found, 1 if they are.
 474 */
 475int netdev_boot_setup_check(struct net_device *dev)
 476{
 477	struct netdev_boot_setup *s = dev_boot_setup;
 478	int i;
 479
 480	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 481		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 482		    !strcmp(dev->name, s[i].name)) {
 483			dev->irq 	= s[i].map.irq;
 484			dev->base_addr 	= s[i].map.base_addr;
 485			dev->mem_start 	= s[i].map.mem_start;
 486			dev->mem_end 	= s[i].map.mem_end;
 487			return 1;
 488		}
 489	}
 490	return 0;
 491}
 492
 493
 494/**
 495 *	netdev_boot_base	- get address from boot time settings
 496 *	@prefix: prefix for network device
 497 *	@unit: id for network device
 498 *
 499 * 	Check boot time settings for the base address of device.
 500 *	The found settings are set for the device to be used
 501 *	later in the device probing.
 502 *	Returns 0 if no settings found.
 503 */
 504unsigned long netdev_boot_base(const char *prefix, int unit)
 505{
 506	const struct netdev_boot_setup *s = dev_boot_setup;
 507	char name[IFNAMSIZ];
 508	int i;
 509
 510	sprintf(name, "%s%d", prefix, unit);
 511
 512	/*
 513	 * If device already registered then return base of 1
 514	 * to indicate not to probe for this interface
 515	 */
 516	if (__dev_get_by_name(&init_net, name))
 517		return 1;
 518
 519	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 520		if (!strcmp(name, s[i].name))
 521			return s[i].map.base_addr;
 522	return 0;
 523}
 524
 525/*
 526 * Saves at boot time configured settings for any netdevice.
 527 */
 528int __init netdev_boot_setup(char *str)
 529{
 530	int ints[5];
 531	struct ifmap map;
 532
 533	str = get_options(str, ARRAY_SIZE(ints), ints);
 534	if (!str || !*str)
 535		return 0;
 536
 537	/* Save settings */
 538	memset(&map, 0, sizeof(map));
 539	if (ints[0] > 0)
 540		map.irq = ints[1];
 541	if (ints[0] > 1)
 542		map.base_addr = ints[2];
 543	if (ints[0] > 2)
 544		map.mem_start = ints[3];
 545	if (ints[0] > 3)
 546		map.mem_end = ints[4];
 547
 548	/* Add new entry to the list */
 549	return netdev_boot_setup_add(str, &map);
 550}
 551
 552__setup("netdev=", netdev_boot_setup);
 553
 554/*******************************************************************************
 555
 556			    Device Interface Subroutines
 557
 558*******************************************************************************/
 559
 560/**
 561 *	__dev_get_by_name	- find a device by its name
 562 *	@net: the applicable net namespace
 563 *	@name: name to find
 564 *
 565 *	Find an interface by name. Must be called under RTNL semaphore
 566 *	or @dev_base_lock. If the name is found a pointer to the device
 567 *	is returned. If the name is not found then %NULL is returned. The
 568 *	reference counters are not incremented so the caller must be
 569 *	careful with locks.
 570 */
 571
 572struct net_device *__dev_get_by_name(struct net *net, const char *name)
 573{
 574	struct hlist_node *p;
 575
 576	hlist_for_each(p, dev_name_hash(net, name)) {
 577		struct net_device *dev
 578			= hlist_entry(p, struct net_device, name_hlist);
 579		if (!strncmp(dev->name, name, IFNAMSIZ))
 580			return dev;
 581	}
 582	return NULL;
 583}
 584
 585/**
 586 *	dev_get_by_name		- find a device by its name
 587 *	@net: the applicable net namespace
 588 *	@name: name to find
 589 *
 590 *	Find an interface by name. This can be called from any
 591 *	context and does its own locking. The returned handle has
 592 *	the usage count incremented and the caller must use dev_put() to
 593 *	release it when it is no longer needed. %NULL is returned if no
 594 *	matching device is found.
 595 */
 596
 597struct net_device *dev_get_by_name(struct net *net, const char *name)
 598{
 599	struct net_device *dev;
 600
 601	read_lock(&dev_base_lock);
 602	dev = __dev_get_by_name(net, name);
 603	if (dev)
 604		dev_hold(dev);
 605	read_unlock(&dev_base_lock);
 606	return dev;
 607}
 608
 609/**
 610 *	__dev_get_by_index - find a device by its ifindex
 611 *	@net: the applicable net namespace
 612 *	@ifindex: index of device
 613 *
 614 *	Search for an interface by index. Returns %NULL if the device
 615 *	is not found or a pointer to the device. The device has not
 616 *	had its reference counter increased so the caller must be careful
 617 *	about locking. The caller must hold either the RTNL semaphore
 618 *	or @dev_base_lock.
 619 */
 620
 621struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 622{
 623	struct hlist_node *p;
 624
 625	hlist_for_each(p, dev_index_hash(net, ifindex)) {
 626		struct net_device *dev
 627			= hlist_entry(p, struct net_device, index_hlist);
 628		if (dev->ifindex == ifindex)
 629			return dev;
 630	}
 631	return NULL;
 632}
 633
 634
 635/**
 636 *	dev_get_by_index - find a device by its ifindex
 637 *	@net: the applicable net namespace
 638 *	@ifindex: index of device
 639 *
 640 *	Search for an interface by index. Returns NULL if the device
 641 *	is not found or a pointer to the device. The device returned has
 642 *	had a reference added and the pointer is safe until the user calls
 643 *	dev_put to indicate they have finished with it.
 644 */
 645
 646struct net_device *dev_get_by_index(struct net *net, int ifindex)
 647{
 648	struct net_device *dev;
 649
 650	read_lock(&dev_base_lock);
 651	dev = __dev_get_by_index(net, ifindex);
 652	if (dev)
 653		dev_hold(dev);
 654	read_unlock(&dev_base_lock);
 655	return dev;
 656}
 657
 658/**
 659 *	dev_getbyhwaddr - find a device by its hardware address
 660 *	@net: the applicable net namespace
 661 *	@type: media type of device
 662 *	@ha: hardware address
 663 *
 664 *	Search for an interface by MAC address. Returns NULL if the device
 665 *	is not found or a pointer to the device. The caller must hold the
 666 *	rtnl semaphore. The returned device has not had its ref count increased
 667 *	and the caller must therefore be careful about locking
 668 *
 669 *	BUGS:
 670 *	If the API was consistent this would be __dev_get_by_hwaddr
 671 */
 672
 673struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
 674{
 675	struct net_device *dev;
 676
 677	ASSERT_RTNL();
 678
 679	for_each_netdev(net, dev)
 680		if (dev->type == type &&
 681		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 682			return dev;
 683
 684	return NULL;
 685}
 686
 687EXPORT_SYMBOL(dev_getbyhwaddr);
 688
 689struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 690{
 691	struct net_device *dev;
 692
 693	ASSERT_RTNL();
 694	for_each_netdev(net, dev)
 695		if (dev->type == type)
 696			return dev;
 697
 698	return NULL;
 699}
 700
 701EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 702
 703struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 704{
 705	struct net_device *dev;
 706
 707	rtnl_lock();
 708	dev = __dev_getfirstbyhwtype(net, type);
 709	if (dev)
 710		dev_hold(dev);
 711	rtnl_unlock();
 712	return dev;
 713}
 714
 715EXPORT_SYMBOL(dev_getfirstbyhwtype);
 716
 717/**
 718 *	dev_get_by_flags - find any device with given flags
 719 *	@net: the applicable net namespace
 720 *	@if_flags: IFF_* values
 721 *	@mask: bitmask of bits in if_flags to check
 722 *
 723 *	Search for any interface with the given flags. Returns NULL if a device
 724 *	is not found or a pointer to the device. The device returned has
 725 *	had a reference added and the pointer is safe until the user calls
 726 *	dev_put to indicate they have finished with it.
 727 */
 728
 729struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask)
 730{
 731	struct net_device *dev, *ret;
 732
 733	ret = NULL;
 734	read_lock(&dev_base_lock);
 735	for_each_netdev(net, dev) {
 736		if (((dev->flags ^ if_flags) & mask) == 0) {
 737			dev_hold(dev);
 738			ret = dev;
 739			break;
 740		}
 741	}
 742	read_unlock(&dev_base_lock);
 743	return ret;
 744}
 745
 746/**
 747 *	dev_valid_name - check if name is okay for network device
 748 *	@name: name string
 749 *
 750 *	Network device names need to be valid file names to
 751 *	to allow sysfs to work.  We also disallow any kind of
 752 *	whitespace.
 753 */
 754int dev_valid_name(const char *name)
 755{
 756	if (*name == '\0')
 757		return 0;
 758	if (strlen(name) >= IFNAMSIZ)
 759		return 0;
 760	if (!strcmp(name, ".") || !strcmp(name, ".."))
 761		return 0;
 762
 763	while (*name) {
 764		if (*name == '/' || isspace(*name))
 765			return 0;
 766		name++;
 767	}
 768	return 1;
 769}
 770
 771/**
 772 *	__dev_alloc_name - allocate a name for a device
 773 *	@net: network namespace to allocate the device name in
 774 *	@name: name format string
 775 *	@buf:  scratch buffer and result name string
 776 *
 777 *	Passed a format string - eg "lt%d" it will try and find a suitable
 778 *	id. It scans list of devices to build up a free map, then chooses
 779 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 780 *	while allocating the name and adding the device in order to avoid
 781 *	duplicates.
 782 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 783 *	Returns the number of the unit assigned or a negative errno code.
 784 */
 785
 786static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 787{
 788	int i = 0;
 789	const char *p;
 790	const int max_netdevices = 8*PAGE_SIZE;
 791	unsigned long *inuse;
 792	struct net_device *d;
 793
 794	p = strnchr(name, IFNAMSIZ-1, '%');
 795	if (p) {
 796		/*
 797		 * Verify the string as this thing may have come from
 798		 * the user.  There must be either one "%d" and no other "%"
 799		 * characters.
 800		 */
 801		if (p[1] != 'd' || strchr(p + 2, '%'))
 802			return -EINVAL;
 803
 804		/* Use one page as a bit array of possible slots */
 805		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 806		if (!inuse)
 807			return -ENOMEM;
 808
 809		for_each_netdev(net, d) {
 810			if (!sscanf(d->name, name, &i))
 811				continue;
 812			if (i < 0 || i >= max_netdevices)
 813				continue;
 814
 815			/*  avoid cases where sscanf is not exact inverse of printf */
 816			snprintf(buf, IFNAMSIZ, name, i);
 817			if (!strncmp(buf, d->name, IFNAMSIZ))
 818				set_bit(i, inuse);
 819		}
 820
 821		i = find_first_zero_bit(inuse, max_netdevices);
 822		free_page((unsigned long) inuse);
 823	}
 824
 825	snprintf(buf, IFNAMSIZ, name, i);
 826	if (!__dev_get_by_name(net, buf))
 827		return i;
 828
 829	/* It is possible to run out of possible slots
 830	 * when the name is long and there isn't enough space left
 831	 * for the digits, or if all bits are used.
 832	 */
 833	return -ENFILE;
 834}
 835
 836/**
 837 *	dev_alloc_name - allocate a name for a device
 838 *	@dev: device
 839 *	@name: name format string
 840 *
 841 *	Passed a format string - eg "lt%d" it will try and find a suitable
 842 *	id. It scans list of devices to build up a free map, then chooses
 843 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 844 *	while allocating the name and adding the device in order to avoid
 845 *	duplicates.
 846 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 847 *	Returns the number of the unit assigned or a negative errno code.
 848 */
 849
 850int dev_alloc_name(struct net_device *dev, const char *name)
 851{
 852	char buf[IFNAMSIZ];
 853	struct net *net;
 854	int ret;
 855
 856	BUG_ON(!dev_net(dev));
 857	net = dev_net(dev);
 858	ret = __dev_alloc_name(net, name, buf);
 859	if (ret >= 0)
 860		strlcpy(dev->name, buf, IFNAMSIZ);
 861	return ret;
 862}
 863
 864
 865/**
 866 *	dev_change_name - change name of a device
 867 *	@dev: device
 868 *	@newname: name (or format string) must be at least IFNAMSIZ
 869 *
 870 *	Change name of a device, can pass format strings "eth%d".
 871 *	for wildcarding.
 872 */
 873int dev_change_name(struct net_device *dev, char *newname)
 874{
 875	char oldname[IFNAMSIZ];
 876	int err = 0;
 877	int ret;
 878	struct net *net;
 879
 880	ASSERT_RTNL();
 881	BUG_ON(!dev_net(dev));
 882
 883	net = dev_net(dev);
 884	if (dev->flags & IFF_UP)
 885		return -EBUSY;
 886
 887	if (!dev_valid_name(newname))
 888		return -EINVAL;
 889
 890	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 891		return 0;
 892
 893	memcpy(oldname, dev->name, IFNAMSIZ);
 894
 895	if (strchr(newname, '%')) {
 896		err = dev_alloc_name(dev, newname);
 897		if (err < 0)
 898			return err;
 899		strcpy(newname, dev->name);
 900	}
 901	else if (__dev_get_by_name(net, newname))
 902		return -EEXIST;
 903	else
 904		strlcpy(dev->name, newname, IFNAMSIZ);
 905
 906rollback:
 907	err = device_rename(&dev->dev, dev->name);
 908	if (err) {
 909		memcpy(dev->name, oldname, IFNAMSIZ);
 910		return err;
 911	}
 912
 913	write_lock_bh(&dev_base_lock);
 914	hlist_del(&dev->name_hlist);
 915	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 916	write_unlock_bh(&dev_base_lock);
 917
 918	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
 919	ret = notifier_to_errno(ret);
 920
 921	if (ret) {
 922		if (err) {
 923			printk(KERN_ERR
 924			       "%s: name change rollback failed: %d.\n",
 925			       dev->name, ret);
 926		} else {
 927			err = ret;
 928			memcpy(dev->name, oldname, IFNAMSIZ);
 929			goto rollback;
 930		}
 931	}
 932
 933	return err;
 934}
 935
 936/**
 937 *	netdev_features_change - device changes features
 938 *	@dev: device to cause notification
 939 *
 940 *	Called to indicate a device has changed features.
 941 */
 942void netdev_features_change(struct net_device *dev)
 943{
 944	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
 945}
 946EXPORT_SYMBOL(netdev_features_change);
 947
 948/**
 949 *	netdev_state_change - device changes state
 950 *	@dev: device to cause notification
 951 *
 952 *	Called to indicate a device has changed state. This function calls
 953 *	the notifier chains for netdev_chain and sends a NEWLINK message
 954 *	to the routing socket.
 955 */
 956void netdev_state_change(struct net_device *dev)
 957{
 958	if (dev->flags & IFF_UP) {
 959		call_netdevice_notifiers(NETDEV_CHANGE, dev);
 960		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
 961	}
 962}
 963
 964/**
 965 *	dev_load 	- load a network module
 966 *	@net: the applicable net namespace
 967 *	@name: name of interface
 968 *
 969 *	If a network interface is not present and the process has suitable
 970 *	privileges this function loads the module. If module loading is not
 971 *	available in this kernel then it becomes a nop.
 972 */
 973
 974void dev_load(struct net *net, const char *name)
 975{
 976	struct net_device *dev;
 977
 978	read_lock(&dev_base_lock);
 979	dev = __dev_get_by_name(net, name);
 980	read_unlock(&dev_base_lock);
 981
 982	if (!dev && capable(CAP_SYS_MODULE))
 983		request_module("%s", name);
 984}
 985
 986/**
 987 *	dev_open	- prepare an interface for use.
 988 *	@dev:	device to open
 989 *
 990 *	Takes a device from down to up state. The device's private open
 991 *	function is invoked and then the multicast lists are loaded. Finally
 992 *	the device is moved into the up state and a %NETDEV_UP message is
 993 *	sent to the netdev notifier chain.
 994 *
 995 *	Calling this function on an active interface is a nop. On a failure
 996 *	a negative errno code is returned.
 997 */
 998int dev_open(struct net_device *dev)
 999{
1000	int ret = 0;
1001
1002	ASSERT_RTNL();
1003
1004	/*
1005	 *	Is it already up?
1006	 */
1007
1008	if (dev->flags & IFF_UP)
1009		return 0;
1010
1011	/*
1012	 *	Is it even present?
1013	 */
1014	if (!netif_device_present(dev))
1015		return -ENODEV;
1016
1017	/*
1018	 *	Call device private open method
1019	 */
1020	set_bit(__LINK_STATE_START, &dev->state);
1021
1022	if (dev->validate_addr)
1023		ret = dev->validate_addr(dev);
1024
1025	if (!ret && dev->open)
1026		ret = dev->open(dev);
1027
1028	/*
1029	 *	If it went open OK then:
1030	 */
1031
1032	if (ret)
1033		clear_bit(__LINK_STATE_START, &dev->state);
1034	else {
1035		/*
1036		 *	Set the flags.
1037		 */
1038		dev->flags |= IFF_UP;
1039
1040		/*
1041		 *	Initialize multicasting status
1042		 */
1043		dev_set_rx_mode(dev);
1044
1045		/*
1046		 *	Wakeup transmit queue engine
1047		 */
1048		dev_activate(dev);
1049
1050		/*
1051		 *	... and announce new interface.
1052		 */
1053		call_netdevice_notifiers(NETDEV_UP, dev);
1054	}
1055
1056	return ret;
1057}
1058
1059/**
1060 *	dev_close - shutdown an interface.
1061 *	@dev: device to shutdown
1062 *
1063 *	This function moves an active device into down state. A
1064 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1065 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1066 *	chain.
1067 */
1068int dev_close(struct net_device *dev)
1069{
1070	ASSERT_RTNL();
1071
1072	might_sleep();
1073
1074	if (!(dev->flags & IFF_UP))
1075		return 0;
1076
1077	/*
1078	 *	Tell people we are going down, so that they can
1079	 *	prepare to death, when device is still operating.
1080	 */
1081	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1082
1083	clear_bit(__LINK_STATE_START, &dev->state);
1084
1085	/* Synchronize to scheduled poll. We cannot touch poll list,
1086	 * it can be even on different cpu. So just clear netif_running().
1087	 *
1088	 * dev->stop() will invoke napi_disable() on all of it's
1089	 * napi_struct instances on this device.
1090	 */
1091	smp_mb__after_clear_bit(); /* Commit netif_running(). */
1092
1093	dev_deactivate(dev);
1094
1095	/*
1096	 *	Call the device specific close. This cannot fail.
1097	 *	Only if device is UP
1098	 *
1099	 *	We allow it to be called even after a DETACH hot-plug
1100	 *	event.
1101	 */
1102	if (dev->stop)
1103		dev->stop(dev);
1104
1105	/*
1106	 *	Device is now down.
1107	 */
1108
1109	dev->flags &= ~IFF_UP;
1110
1111	/*
1112	 * Tell people we are down
1113	 */
1114	call_netdevice_notifiers(NETDEV_DOWN, dev);
1115
1116	return 0;
1117}
1118
1119
/*
 *	Starts out set. While it is non-zero register_netdevice_notifier()
 *	links a new notifier into the chain but does not replay any
 *	REGISTER/UP events to it.
 */
static int dev_boot_phase = 1;
1121
1122/*
1123 *	Device change register/unregister. These are not inline or static
1124 *	as we export them to the world.
1125 */
1126
1127/**
1128 *	register_netdevice_notifier - register a network notifier block
1129 *	@nb: notifier
1130 *
1131 *	Register a notifier to be called when network device events occur.
1132 *	The notifier passed is linked into the kernel structures and must
1133 *	not be reused until it has been unregistered. A negative errno code
1134 *	is returned on a failure.
1135 *
1136 * 	When registered all registration and up events are replayed
1137 *	to the new notifier to allow device to have a race free
1138 *	view of the network device list.
1139 */
1140
1141int register_netdevice_notifier(struct notifier_block *nb)
1142{
1143	struct net_device *dev;
1144	struct net_device *last;
1145	struct net *net;
1146	int err;
1147
1148	rtnl_lock();
1149	err = raw_notifier_chain_register(&netdev_chain, nb);
1150	if (err)
1151		goto unlock;
1152	if (dev_boot_phase)
1153		goto unlock;
1154	for_each_net(net) {
1155		for_each_netdev(net, dev) {
1156			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1157			err = notifier_to_errno(err);
1158			if (err)
1159				goto rollback;
1160
1161			if (!(dev->flags & IFF_UP))
1162				continue;
1163
1164			nb->notifier_call(nb, NETDEV_UP, dev);
1165		}
1166	}
1167
1168unlock:
1169	rtnl_unlock();
1170	return err;
1171
1172rollback:
1173	last = dev;
1174	for_each_net(net) {
1175		for_each_netdev(net, dev) {
1176			if (dev == last)
1177				break;
1178
1179			if (dev->flags & IFF_UP) {
1180				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1181				nb->notifier_call(nb, NETDEV_DOWN, dev);
1182			}
1183			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1184		}
1185	}
1186
1187	raw_notifier_chain_unregister(&netdev_chain, nb);
1188	goto unlock;
1189}
1190
1191/**
1192 *	unregister_netdevice_notifier - unregister a network notifier block
1193 *	@nb: notifier
1194 *
1195 *	Unregister a notifier previously registered by
1196 *	register_netdevice_notifier(). The notifier is unlinked into the
1197 *	kernel structures and may then be reused. A negative errno code
1198 *	is returned on a failure.
1199 */
1200
1201int unregister_netdevice_notifier(struct notifier_block *nb)
1202{
1203	int err;
1204
1205	rtnl_lock();
1206	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1207	rtnl_unlock();
1208	return err;
1209}
1210
1211/**
1212 *	call_netdevice_notifiers - call all network notifier blocks
1213 *      @val: value passed unmodified to notifier function
1214 *      @dev: net_device pointer passed unmodified to notifier function
1215 *
1216 *	Call all network notifier blocks.  Parameters and return value
1217 *	are as for raw_notifier_call_chain().
1218 */
1219
1220int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1221{
1222	return raw_notifier_call_chain(&netdev_chain, val, dev);
1223}
1224
1225/* When > 0 there are consumers of rx skb time stamps */
1226static atomic_t netstamp_needed = ATOMIC_INIT(0);
1227
1228void net_enable_timestamp(void)
1229{
1230	atomic_inc(&netstamp_needed);
1231}
1232
1233void net_disable_timestamp(void)
1234{
1235	atomic_dec(&netstamp_needed);
1236}
1237
1238static inline void net_timestamp(struct sk_buff *skb)
1239{
1240	if (atomic_read(&netstamp_needed))
1241		__net_timestamp(skb);
1242	else
1243		skb->tstamp.tv64 = 0;
1244}
1245
1246/*
1247 *	Support routine. Sends outgoing frames to any network
1248 *	taps currently in use.
1249 */
1250
1251static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1252{
1253	struct packet_type *ptype;
1254
1255	net_timestamp(skb);
1256
1257	rcu_read_lock();
1258	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1259		/* Never send packets back to the socket
1260		 * they originated from - MvS (miquels@drinkel.ow.org)
1261		 */
1262		if ((ptype->dev == dev || !ptype->dev) &&
1263		    (ptype->af_packet_priv == NULL ||
1264		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1265			struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1266			if (!skb2)
1267				break;
1268
1269			/* skb->nh should be correctly
1270			   set by sender, so that the second statement is
1271			   just protection against buggy protocols.
1272			 */
1273			skb_reset_mac_header(skb2);
1274
1275			if (skb_network_header(skb2) < skb2->data ||
1276			    skb2->network_header > skb2->tail) {
1277				if (net_ratelimit())
1278					printk(KERN_CRIT "protocol %04x is "
1279					       "buggy, dev %s\n",
1280					       skb2->protocol, dev->name);
1281				skb_reset_network_header(skb2);
1282			}
1283
1284			skb2->transport_header = skb2->network_header;
1285			skb2->pkt_type = PACKET_OUTGOING;
1286			ptype->func(skb2, skb->dev, ptype, skb->dev);
1287		}
1288	}
1289	rcu_read_unlock();
1290}
1291
1292
1293void __netif_schedule(struct net_device *dev)
1294{
1295	if (!test_and_set_bit(__LINK_STATE_SCHED, &dev->state)) {
1296		unsigned long flags;
1297		struct softnet_data *sd;
1298
1299		local_irq_save(flags);
1300		sd = &__get_cpu_var(softnet_data);
1301		dev->next_sched = sd->output_queue;
1302		sd->output_queue = dev;
1303		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1304		local_irq_restore(flags);
1305	}
1306}
1307EXPORT_SYMBOL(__netif_schedule);
1308
1309void dev_kfree_skb_irq(struct sk_buff *skb)
1310{
1311	if (atomic_dec_and_test(&skb->users)) {
1312		struct softnet_data *sd;
1313		unsigned long flags;
1314
1315		local_irq_save(flags);
1316		sd = &__get_cpu_var(softnet_data);
1317		skb->next = sd->completion_queue;
1318		sd->completion_queue = skb;
1319		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1320		local_irq_restore(flags);
1321	}
1322}
1323EXPORT_SYMBOL(dev_kfree_skb_irq);
1324
/*
 *	Free @skb from any context: dev_kfree_skb_irq() is used in hard
 *	interrupt context or with interrupts disabled, dev_kfree_skb()
 *	otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (!in_irq() && !irqs_disabled()) {
		dev_kfree_skb(skb);
		return;
	}

	dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1333
1334
1335/**
1336 * netif_device_detach - mark device as removed
1337 * @dev: network device
1338 *
1339 * Mark device as removed from system and therefore no longer available.
1340 */
1341void netif_device_detach(struct net_device *dev)
1342{
1343	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1344	    netif_running(dev)) {
1345		netif_stop_queue(dev);
1346	}
1347}
1348EXPORT_SYMBOL(netif_device_detach);
1349
1350/**
1351 * netif_device_attach - mark device as attached
1352 * @dev: network device
1353 *
1354 * Mark device as attached from system and restart if needed.
1355 */
1356void netif_device_attach(struct net_device *dev)
1357{
1358	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1359	    netif_running(dev)) {
1360		netif_wake_queue(dev);
1361		__netdev_watchdog_up(dev);
1362	}
1363}
1364EXPORT_SYMBOL(netif_device_attach);
1365
1366static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1367{
1368	return ((features & NETIF_F_GEN_CSUM) ||
1369		((features & NETIF_F_IP_CSUM) &&
1370		 protocol == htons(ETH_P_IP)) ||
1371		((features & NETIF_F_IPV6_CSUM) &&
1372		 protocol == htons(ETH_P_IPV6)));
1373}
1374
1375static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1376{
1377	if (can_checksum_protocol(dev->features, skb->protocol))
1378		return true;
1379
1380	if (skb->protocol == htons(ETH_P_8021Q)) {
1381		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1382		if (can_checksum_protocol(dev->features & dev->vlan_features,
1383					  veh->h_vlan_encapsulated_proto))
1384			return true;
1385	}
1386
1387	return false;
1388}
1389
1390/*
1391 * Invalidate hardware checksum when packet is to be mangled, and
1392 * complete checksum manually on outgoing path.
1393 */
1394int skb_checksum_help(struct sk_buff *skb)
1395{
1396	__wsum csum;
1397	int ret = 0, offset;
1398
1399	if (skb->ip_summed == CHECKSUM_COMPLETE)
1400		goto out_set_summed;
1401
1402	if (unlikely(skb_shinfo(skb)->gso_size)) {
1403		/* Let GSO fix up the checksum. */
1404		goto out_set_summed;
1405	}
1406
1407	offset = skb->csum_start - skb_headroom(skb);
1408	BUG_ON(offset >= skb_headlen(skb));
1409	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1410
1411	offset += skb->csum_offset;
1412	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1413
1414	if (skb_cloned(skb) &&
1415	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1416		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1417		if (ret)
1418			goto out;
1419	}
1420
1421	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1422out_set_summed:
1423	skb->ip_summed = CHECKSUM_NONE;
1424out:
1425	return ret;
1426}
1427
1428/**
1429 *	skb_gso_segment - Perform segmentation on skb.
1430 *	@skb: buffer to segment
1431 *	@features: features for the output path (see dev->features)
1432 *
1433 *	This function segments the given skb and returns a list of segments.
1434 *
1435 *	It may return NULL if the skb requires no segmentation.  This is
1436 *	only possible when GSO is used for verifying header integrity.
1437 */
1438struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1439{
1440	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1441	struct packet_type *ptype;
1442	__be16 type = skb->protocol;
1443	int err;
1444
1445	BUG_ON(skb_shinfo(skb)->frag_list);
1446
1447	skb_reset_mac_header(skb);
1448	skb->mac_len = skb->network_header - skb->mac_header;
1449	__skb_pull(skb, skb->mac_len);
1450
1451	if (WARN_ON(skb->ip_summed != CHECKSUM_PARTIAL)) {
1452		if (skb_header_cloned(skb) &&
1453		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1454			return ERR_PTR(err);
1455	}
1456
1457	rcu_read_lock();
1458	list_for_each_entry_rcu(ptype,
1459			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1460		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1461			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1462				err = ptype->gso_send_check(skb);
1463				segs = ERR_PTR(err);
1464				if (err || skb_gso_ok(skb, features))
1465					break;
1466				__skb_push(skb, (skb->data -
1467						 skb_network_header(skb)));
1468			}
1469			segs = ptype->gso_segment(skb, features);
1470			break;
1471		}
1472	}
1473	rcu_read_unlock();
1474
1475	__skb_push(skb, skb->data - skb_mac_header(skb));
1476
1477	return segs;
1478}
1479
1480EXPORT_SYMBOL(skb_gso_segment);
1481
/*
 * Report a hardware receive checksum error: rate limited, logs the
 * device name ("<unknown>" for a NULL @dev) and dumps the stack.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	printk(KERN_ERR "%s: hw csum failure.\n",
		dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1494
/*
 * Returns 1 if @skb has a page fragment in high memory while @dev
 * lacks NETIF_F_HIGHDMA, 0 otherwise. Without CONFIG_HIGHMEM this
 * always returns 0.
 *
 * Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */

static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		int frag;

		for (frag = 0; frag < skb_shinfo(skb)->nr_frags; frag++) {
			if (PageHighMem(skb_shinfo(skb)->frags[frag].page))
				return 1;
		}
	}

#endif
	return 0;
}
1515
/*
 * Scratch data kept in skb->cb of an skb that dev_gso_segment() has
 * segmented: the destructor the skb had before it was replaced by
 * dev_gso_skb_destructor().
 */
struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

/* Access the control buffer of @skb as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1521
1522static void dev_gso_skb_destructor(struct sk_buff *skb)
1523{
1524	struct dev_gso_cb *cb;
1525
1526	do {
1527		struct sk_buff *nskb = skb->next;
1528
1529		skb->next = nskb->next;
1530		nskb->next = NULL;
1531		kfree_skb(nskb);
1532	} while (skb->next);
1533
1534	cb = DEV_GSO_CB(skb);
1535	if (cb->destructor)
1536		cb->destructor(skb);
1537}
1538
1539/**
1540 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1541 *	@skb: buffer to segment
1542 *
1543 *	This function segments the given skb and stores the list of segments
1544 *	in skb->next.
1545 */
1546static int dev_gso_segment(struct sk_buff *skb)
1547{
1548	struct net_device *dev = skb->dev;
1549	struct sk_buff *segs;
1550	int features = dev->features & ~(illegal_highdma(dev, skb) ?
1551					 NETIF_F_SG : 0);
1552
1553	segs = skb_gso_segment(skb, features);
1554
1555	/* Verifying header integrity only. */
1556	if (!segs)
1557		return 0;
1558
1559	if (IS_ERR(segs))
1560		return PTR_ERR(segs);
1561
1562	skb->next = segs;
1563	DEV_GSO_CB(skb)->destructor = skb->destructor;
1564	skb->destructor = dev_gso_skb_destructor;
1565
1566	return 0;
1567}
1568
1569int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
1570{
1571	if (likely(!skb->next)) {
1572		if (!list_empty(&ptype_all))
1573			dev_queue_xmit_nit(skb, dev);
1574
1575		if (netif_needs_gso(dev, skb)) {
1576			if (unlikely(dev_gso_segment(skb)))
1577				goto out_kfree_skb;
1578			if (skb->next)
1579				goto gso;
1580		}
1581
1582		return dev->hard_start_xmit(skb, dev);
1583	}
1584
1585gso:
1586	do {
1587		struct sk_buff *nskb = skb->next;
1588		int rc;
1589
1590		skb->next = nskb->next;
1591		nskb->next = NULL;
1592		rc = dev->hard_start_xmit(nskb, dev);
1593		if (unlikely(rc)) {
1594			nskb->next = skb->next;
1595			skb->next = nskb;
1596			return rc;
1597		}
1598		if (unlikely((netif_queue_stopped(dev) ||
1599			     netif_subqueue_stopped(dev, skb)) &&
1600			     skb->next))
1601			return NETDEV_TX_BUSY;
1602	} while (skb->next);
1603
1604	skb->destructor = DEV_GSO_CB(skb)->destructor;
1605
1606out_kfree_skb:
1607	kfree_skb(skb);
1608	return 0;
1609}
1610
1611/**
1612 *	dev_queue_xmit - transmit a buffer
1613 *	@skb: buffer to transmit
1614 *
1615 *	Queue a buffer for transmission to a network device. The caller must
1616 *	have set the device and priority and built the buffer before calling
1617 *	this function. The function can be called from an interrupt.
1618 *
1619 *	A negative errno code is returned on a failure. A success does not
1620 *	guarantee the frame will be transmitted as it may be dropped due
1621 *	to congestion or traffic shaping.
1622 *
1623 * -----------------------------------------------------------------------------------
1624 *      I notice this method can also return errors from the queue disciplines,
1625 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
1626 *      be positive.
1627 *
1628 *      Regardless of the return value, the skb is consumed, so it is currently
1629 *      difficult to retry a send to this method.  (You can bump the ref count
1630 *      before sending to hold a reference for retry if you are careful.)
1631 *
1632 *      When calling this method, interrupts MUST be enabled.  This is because
1633 *      the BH enable code must have IRQs enabled so that it will not deadlock.
1634 *          --BLG
1635 */
1636
1637int dev_queue_xmit(struct sk_buff *skb)
1638{
1639	struct net_device *dev = skb->dev;
1640	struct Qdisc *q;
1641	int rc = -ENOMEM;
1642
1643	/* GSO will handle the following emulations directly. */
1644	if (netif_needs_gso(dev, skb))
1645		goto gso;
1646
1647	if (skb_shinfo(skb)->frag_list &&
1648	    !(dev->features & NETIF_F_FRAGLIST) &&
1649	    __skb_linearize(skb))
1650		goto out_kfree_skb;
1651
1652	/* Fragmented skb is linearized if device does not support SG,
1653	 * or if at least one of fragments is in highmem and device
1654	 * does not support DMA from it.
1655	 */
1656	if (skb_shinfo(skb)->nr_frags &&
1657	    (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
1658	    __skb_linearize(skb))
1659		goto out_kfree_skb;
1660
1661	/* If packet is not checksummed and device does not support
1662	 * checksumming for this protocol, complete checksumming here.
1663	 */
1664	if (skb->ip_summed == CHECKSUM_PARTIAL) {
1665		skb_set_transport_header(skb, skb->csum_start -
1666					      skb_headroom(skb));
1667		if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
1668			goto out_kfree_skb;
1669	}
1670
1671gso:
1672	spin_lock_prefetch(&dev->queue_lock);
1673
1674	/* Disable soft irqs for various locks below. Also
1675	 * stops preemption for RCU.
1676	 */
1677	rcu_read_lock_bh();
1678
1679	/* Updates of qdisc are serialized by queue_lock.
1680	 * The struct Qdisc which is pointed to by qdisc is now a
1681	 * rcu structure - it may be accessed without acquiring
1682	 * a lock (but the structure may be stale.) The freeing of the
1683	 * qdisc will be deferred until it's known that there are no
1684	 * more references to it.
1685	 *
1686	 * If the qdisc has an enqueue function, we still need to
1687	 * hold the queue_lock before calling it, since queue_lock
1688	 * also serializes access to the device queue.
1689	 */
1690
1691	q = rcu_dereference(dev->qdisc);
1692#ifdef CONFIG_NET_CLS_ACT
1693	skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
1694#endif
1695	if (q->enqueue) {
1696		/* Grab device queue */
1697		spin_lock(&dev->queue_lock);
1698		q = dev->qdisc;
1699		if (q->enqueue) {
1700			/* reset queue_mapping to zero */
1701			skb_set_queue_mapping(skb, 0);
1702			rc = q->enqueue(skb, q);
1703			qdisc_run(dev);
1704			spin_unlock(&dev->queue_lock);
1705
1706			rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
1707			goto out;
1708		}
1709		spin_unlock(&dev->queue_lock);
1710	}
1711
1712	/* The device has no queue. Common case for software devices:
1713	   loopback, all the sorts of tunnels...
1714
1715	   Really, it is unlikely that netif_tx_lock protection is necessary
1716	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
1717	   counters.)
1718	   However, it is possible, that they rely on protection
1719	   made by us here.
1720
1721	   Check this and shot the lock. It is not prone from deadlocks.
1722	   Either shot noqueue qdisc, it is even simpler 8)
1723	 */
1724	if (dev->flags & IFF_UP) {
1725		int cpu = smp_processor_id(); /* ok because BHs are off */
1726
1727		if (dev->xmit_lock_owner != cpu) {
1728
1729			HARD_TX_LOCK(dev, cpu);
1730
1731			if (!netif_queue_stopped(dev) &&
1732			    !netif_subqueue_stopped(dev, skb)) {
1733				rc = 0;
1734				if (!dev_hard_start_xmit(skb, dev)) {
1735					HARD_TX_UNLOCK(dev);
1736					goto out;
1737				}
1738			}
1739			HARD_TX_UNLOCK(dev);
1740			if (net_ratelimit())
1741				printk(KERN_CRIT "Virtual device %s asks to "
1742				       "queue packet!\n", dev->name);
1743		} else {
1744			/* Recursion is detected! It is possible,
1745			 * unfortunately */
1746			if (net_ratelimit())
1747				printk(KERN_CRIT "Dead loop on virtual device "
1748				       "%s, fix it urgently!\n", dev->name);
1749		}
1750	}
1751
1752	rc = -ENETDOWN;
1753	rcu_read_unlock_bh();
1754
1755out_kfree_skb:
1756	kfree_skb(skb);
1757	return rc;
1758out:
1759	rcu_read_unlock_bh();
1760	return rc;
1761}
1762
1763
1764/*=======================================================================
1765			Receiver routines
1766  =======================================================================*/
1767
1768int netdev_max_backlog __read_mostly = 1000;
1769int netdev_budget __read_mostly = 300;
1770int weight_p __read_mostly = 64;            /* old backlog weight */
1771
1772DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1773
1774
1775/**
1776 *	netif_rx	-	post buffer to the network code
1777 *	@skb: buffer to post
1778 *
1779 *	This function receives a packet from a device driver and queues it for
1780 *	the upper (protocol) levels to process.  It always succeeds. The buffer
1781 *	may be dropped during processing for congestion control or by the
1782 *	protocol layers.
1783 *
1784 *	return values:
1785 *	NET_RX_SUCCESS	(no congestion)
1786 *	NET_RX_DROP     (packet was dropped)
1787 *
1788 */
1789
1790int netif_rx(struct sk_buff *skb)
1791{
1792	struct softnet_data *queue;
1793	unsigned long flags;
1794
1795	/* if netpoll wants it, pretend we never saw it */
1796	if (netpoll_rx(skb))
1797		return NET_RX_DROP;
1798
1799	if (!skb->tstamp.tv64)
1800		net_timestamp(skb);
1801
1802	/*
1803	 * The code is rearranged so that the path is the most
1804	 * short when CPU is congested, but is still operating.
1805	 */
1806	local_irq_save(flags);
1807	queue = &__get_cpu_var(softnet_data);
1808
1809	__get_cpu_var(netdev_rx_stat).total++;
1810	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1811		if (queue->input_pkt_queue.qlen) {
1812enqueue:
1813			dev_hold(skb->dev);
1814			__skb_queue_tail(&queue->input_pkt_queue, skb);
1815			local_irq_restore(flags);
1816			return NET_RX_SUCCESS;
1817		}
1818
1819		napi_schedule(&queue->backlog);
1820		goto enqueue;
1821	}
1822
1823	__get_cpu_var(netdev_rx_stat).dropped++;
1824	local_irq_restore(flags);
1825
1826	kfree_skb(skb);
1827	return NET_RX_DROP;
1828}
1829
/*
 *	Variant of netif_rx() that, with preemption disabled, posts @skb
 *	and then runs any softirqs that are pending. Returns the result
 *	of netif_rx().
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	preempt_disable();

	ret = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}

EXPORT_SYMBOL(netif_rx_ni);
1844
1845static inline struct net_device *skb_bond(struct sk_buff *skb)
1846{
1847	struct net_device *dev = skb->dev;
1848
1849	if (dev->master) {
1850		if (skb_bond_should_drop(skb)) {
1851			kfree_skb(skb);
1852			return NULL;
1853		}
1854		skb->dev = dev->master;
1855	}
1856
1857	return dev;
1858}
1859
1860
1861static void net_tx_action(struct softirq_action *h)
1862{
1863	struct softnet_data *sd = &__get_cpu_var(softnet_data);
1864
1865	if (sd->completion_queue) {
1866		struct sk_buff *clist;
1867
1868		local_irq_disable();
1869		clist = sd->completion_queue;
1870		sd->completion_queue = NULL;
1871		local_irq_enable();
1872
1873		while (clist) {
1874			struct sk_buff *skb = clist;
1875			clist = clist->next;
1876
1877			BUG_TRAP(!atomic_read(&skb->users));
1878			__kfree_skb(skb);
1879		}
1880	}
1881
1882	if (sd->output_queue) {
1883		struct net_device *head;
1884
1885		local_irq_disable();
1886		head = sd->output_queue;
1887		sd->output_queue = NULL;
1888		local_irq_enable();
1889
1890		while (head) {
1891			struct net_device *dev = head;
1892			head = head->next_sched;
1893
1894			smp_mb__before_clear_bit();
1895			clear_bit(__LINK_STATE_SCHED, &dev->state);
1896
1897			if (spin_trylock(&dev->queue_lock)) {
1898				qdisc_run(dev);
1899				spin_unlock(&dev->queue_lock);
1900			} else {
1901				netif_schedule(dev);
1902			}
1903		}
1904	}
1905}
1906
1907static inline int deliver_skb(struct sk_buff *skb,
1908			      struct packet_type *pt_prev,
1909			      struct net_device *orig_dev)
1910{
1911	atomic_inc(&skb->users);
1912	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1913}
1914
#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
/* These hooks defined here for ATM */
struct net_bridge;
struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
						unsigned char *addr);
void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly;

/*
 * Bridging hook, called by handle_bridge() for frames arriving on a
 * bridge port; returns NULL if the packet was consumed.
 */
struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
					struct sk_buff *skb) __read_mostly;

/*
 * Pass @skb through the bridging hook if its device is a bridge port.
 * Loopback packets and packets from non-port devices are returned
 * untouched. A pending handler in *pt_prev is served first and then
 * cleared. Returns the hook's result otherwise.
 */
static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
					    struct packet_type **pt_prev, int *ret,
					    struct net_device *orig_dev)
{
	struct net_bridge_port *port;

	if (skb->pkt_type == PACKET_LOOPBACK)
		return skb;

	port = rcu_dereference(skb->dev->br_port);
	if (port == NULL)
		return skb;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	return br_handle_frame_hook(port, skb);
}
#else
#define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
#endif
1948
#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
/* Hook called by handle_macvlan() for devices that have a macvlan port. */
struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);

/*
 * Pass @skb through the macvlan hook if its device has a macvlan port;
 * otherwise return it untouched. A pending handler in *pt_prev is
 * served first and then cleared. Returns the hook's result.
 */
static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
					     struct packet_type **pt_prev,
					     int *ret,
					     struct net_device *orig_dev)
{
	struct packet_type *pending = *pt_prev;

	if (skb->dev->macvlan_port == NULL)
		return skb;

	if (pending) {
		*ret = deliver_skb(skb, pending, orig_dev);
		*pt_prev = NULL;
	}

	return macvlan_handle_frame_hook(skb);
}
#else
#define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
#endif
1970
1971#ifdef CONFIG_NET_CLS_ACT
1972/* TODO: Maybe we should just force sch_ingress to be compiled in
1973 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
1974 * a compare and 2 stores extra right now if we dont have it on
1975 * but have CONFIG_NET_CLS_ACT
1976 * NOTE: This doesnt stop any functionality; if you dont have
1977 * the ingress scheduler, you just cant add policies on ingress.
1978 *
1979 */
1980static int ing_filter(struct sk_buff *skb)
1981{
1982	struct Qdisc *q;
1983	struct net_device *dev = skb->dev;
1984	int result = TC_ACT_OK;
1985	u32 ttl = G_TC_RTTL(skb->tc_verd);
1986
1987	if (MAX_RED_LOOP < ttl++) {
1988		printk(KERN_WARNING
1989		       "Redir loop detected Dropping packet (%d->%d)\n",
1990		       skb->iif, dev->ifindex);
1991		return TC_ACT_SHOT;
1992	}
1993
1994	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
1995	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
1996
1997	spin_lock(&dev->ingress_lock);
1998	if ((q = dev->qdisc_ingress) != NULL)
1999		result = q->enqueue(skb, q);
2000	spin_unlock(&dev->ingress_lock);
2001
2002	return result;
2003}
2004
2005static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2006					 struct packet_type **pt_prev,
2007					 int *ret, struct net_device *orig_dev)
2008{
2009	if (!skb->dev->qdisc_ingress)
2010		goto out;
2011
2012	if (*pt_prev) {
2013		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2014		*pt_prev = NULL;
2015	} else {
2016		/* Huh? Why does turning on AF_PACKET affect this? */
2017		skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2018	}
2019
2020	switch (ing_filter(skb)) {
2021	case TC_ACT_SHOT:
2022	case TC_ACT_STOLEN:
2023		kfree_skb(skb);
2024		return NULL;
2025	}
2026
2027out:
2028	skb->tc_verd = 0;
2029	return skb;
2030}
2031#endif
2032
2033/**
2034 *	netif_receive_skb - process receive buffer from network
2035 *	@skb: buffer to process
2036 *
2037 *	netif_receive_skb() is the main receive data processing function.
2038 *	It always succeeds. The buffer may be dropped during processing
2039 *	for congestion control or by the protocol layers.
2040 *
2041 *	This function may only be called from softirq context and interrupts
2042 *	should be enabled.
2043 *
2044 *	Return values (usually ignored):
2045 *	NET_RX_SUCCESS: no congestion
2046 *	NET_RX_DROP: packet was dropped
2047 */
2048int netif_receive_skb(struct sk_buff *skb)
2049{
2050	struct packet_type *ptype, *pt_prev;
2051	struct net_device *orig_dev;
2052	int ret = NET_RX_DROP;
2053	__be16 type;
2054
2055	/* if we've gotten here through NAPI, check netpoll */
2056	if (netpoll_receive_skb(skb))
2057		return NET_RX_DROP;
2058
2059	if (!skb->tstamp.tv64)
2060		net_timestamp(skb);
2061
2062	if (!skb->iif)
2063		skb->iif = skb->dev->ifindex;
2064
2065	orig_dev = skb_bond(skb);
2066
2067	if (!orig_dev)
2068		return NET_RX_DROP;
2069
2070	__get_cpu_var(netdev_rx_stat).total++;
2071
2072	skb_reset_network_header(skb);
2073	skb_reset_transport_header(skb);
2074	skb->mac_len = skb->network_header - skb->mac_header;
2075
2076	pt_prev = NULL;
2077
2078	rcu_read_lock();
2079
2080	/* Don't receive packets in an exiting network namespace */
2081	if (!net_alive(dev_net(skb->dev)))
2082		goto out;
2083
2084#ifdef CONFIG_NET_CLS_ACT
2085	if (skb->tc_verd & TC_NCLS) {
2086		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2087		goto ncls;
2088	}
2089#endif
2090
2091	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2092		if (!ptype->dev || ptype->dev == skb->dev) {
2093			if (pt_prev)
2094				ret = deliver_skb(skb, pt_prev, orig_dev);
2095			pt_prev = ptype;
2096		}
2097	}
2098
2099#ifdef CONFIG_NET_CLS_ACT
2100	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2101	if (!skb)
2102		goto out;
2103ncls:
2104#endif
2105
2106	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2107	if (!skb)
2108		goto out;
2109	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2110	if (!skb)
2111		goto out;
2112
2113	type = skb->protocol;
2114	list_for_each_entry_rcu(ptype,
2115			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2116		if (ptype->type == type &&
2117		    (!ptype->dev || ptype->dev == skb->dev)) {
2118			if (pt_prev)
2119				ret = deliver_skb(skb, pt_prev, orig_dev);
2120			pt_prev = ptype;
2121		}
2122	}
2123
2124	if (pt_prev) {
2125		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2126	} else {
2127		kfree_skb(skb);
2128		/* Jamal, now you will not able to escape explaining
2129		 * me how you were going to use this. :-)
2130		 */
2131		ret = NET_RX_DROP;
2132	}
2133
2134out:
2135	rcu_read_unlock();
2136	return ret;
2137}
2138
2139static int process_backlog(struct napi_struct *napi, int quota)
2140{
2141	int work = 0;
2142	struct softnet_data *queue = &__get_cpu_var(softnet_data);
2143	unsigned long start_time = jiffies;
2144
2145	napi->weight = weight_p;
2146	do {
2147		struct sk_buff *skb;
2148		struct net_device *dev;
2149
2150		local_irq_disable();
2151		skb = __skb_dequeue(&queue->input_pkt_queue);
2152		if (!skb) {
2153			__napi_complete(napi);
2154			local_irq_enable();
2155			break;
2156		}
2157
2158		local_irq_enable();
2159
2160		dev = skb->dev;
2161
2162		netif_receive_skb(skb);
2163
2164		dev_put(dev);
2165	} while (++work < quota && jiffies == start_time);
2166
2167	return work;
2168}
2169
2170/**
2171 * __napi_schedule - schedule for receive
2172 * @n: entry to schedule
2173 *
2174 * The entry's receive function will be scheduled to run
2175 */
2176void __napi_schedule(struct napi_struct *n)
2177{
2178	unsigned long flags;
2179
2180	local_irq_save(flags);
2181	list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2182	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2183	local_irq_restore(flags);
2184}
2185EXPORT_SYMBOL(__napi_schedule);
2186
2187
2188static void net_rx_action(struct softirq_action *h)
2189{
2190	struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
2191	unsigned long start_time = jiffies;
2192	int budget = netdev_budget;
2193	void *have;
2194
2195	local_irq_disable();
2196
2197	while (!list_empty(list)) {
2198		struct napi_struct *n;
2199		int work, weight;
2200
2201		/* If softirq window is exhuasted then punt.
2202		 *
2203		 * Note that this is a slight policy change from the
2204		 * previous NAPI code, which would allow up to 2
2205		 * jiffies to pass before breaking out.  The test
2206		 * used to be "jiffies - start_time > 1".
2207		 */
2208		if (unlikely(budget <= 0 || jiffies != start_time))
2209			goto softnet_break;
2210
2211		local_irq_enable();
2212
2213		/* Even though interrupts have been re-enabled, this
2214		 * access is safe because interrupts can only add new
2215		 * entries to the tail of this list, and only ->poll()
2216		 * calls can remove this head entry from the list.
2217		 */
2218		n = list_entry(list->next, struct napi_struct, poll_list);
2219
2220		have = netpoll_poll_lock(n);
2221
2222		weight = n->weight;
2223
2224		/* This NAPI_STATE_SCHED test is for avoiding a race
2225		 * with netpoll's poll_napi().  Only the entity which
2226		 * obtains the lock and sees NAPI_STATE_SCHED set will
2227		 * actually make the ->poll() call.  Therefore we avoid
2228		 * accidently calling ->poll() when NAPI is not scheduled.
2229		 */
2230		work = 0;
2231		if (test_bit(NAPI_STATE_SCHED, &n->state))
2232			work = n->poll(n, weight);
2233
2234		WARN_ON_ONCE(work > weight);
2235
2236		budget -= work;
2237
2238		local_irq_disable();
2239
2240		/* Drivers must not modify the NAPI state if they
2241		 * consume the entire weight.  In such cases this code
2242		 * still "owns" the NAPI instance and therefore can
2243		 * move the instance around on the list at-will.
2244		 */
2245		if (unlikely(work == weight)) {
2246			if (unlikely(napi_disable_pending(n)))
2247				__napi_complete(n);
2248			else
2249				list_move_tail(&n->poll_list, list);
2250		}
2251
2252		netpoll_poll_unlock(have);
2253	}
2254out:
2255	local_irq_enable();
2256
2257#ifdef CONFIG_NET_DMA
2258	/*
2259	 * There may not be any more sk_buffs coming right now, so push
2260	 * any pending DMA copies to hardware
2261	 */
2262	if (!cpus_empty(net_dma.channel_mask)) {
2263		int chan_idx;
2264		for_each_cpu_mask(chan_idx, net_dma.channel_mask) {
2265			struct dma_chan *chan = net_dma.channels[chan_idx];
2266			if (chan)
2267				dma_async_memcpy_issue_pending(chan);
2268		}
2269	}
2270#endif
2271
2272	return;
2273
2274softnet_break:
2275	__get_cpu_var(netdev_rx_stat).time_squeeze++;
2276	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2277	goto out;
2278}
2279
2280static gifconf_func_t * gifconf_list [NPROTO];
2281
2282/**
2283 *	register_gifconf	-	register a SIOCGIF handler
2284 *	@family: Address family
2285 *	@gifconf: Function handler
2286 *
2287 *	Register protocol dependent address dumping routines. The handler
2288 *	that is passed must not be freed or reused until it has been replaced
2289 *	by another handler.
2290 */
2291int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
2292{
2293	if (family >= NPROTO)
2294		return -EINVAL;
2295	gifconf_list[family] = gifconf;
2296	return 0;
2297}
2298
2299
2300/*
2301 *	Map an interface index to its name (SIOCGIFNAME)
2302 */
2303
2304/*
2305 *	We need this ioctl for efficient implementation of the
2306 *	if_indextoname() function required by the IPv6 API.  Without
2307 *	it, we would have to search all the interfaces to find a
2308 *	match.  --pb
2309 */
2310
2311static int dev_ifname(struct net *net, struct ifreq __user *arg)
2312{
2313	struct net_device *dev;
2314	struct ifreq ifr;
2315
2316	/*
2317	 *	Fetch the caller's info block.
2318	 */
2319
2320	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2321		return -EFAULT;
2322
2323	read_lock(&dev_base_lock);
2324	dev = __dev_get_by_index(net, ifr.ifr_ifindex);
2325	if (!dev) {
2326		read_unlock(&dev_base_lock);
2327		return -ENODEV;
2328	}
2329
2330	strcpy(ifr.ifr_name, dev->name);
2331	read_unlock(&dev_base_lock);
2332
2333	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2334		return -EFAULT;
2335	return 0;
2336}
2337
2338/*
2339 *	Perform a SIOCGIFCONF call. This structure will change
2340 *	size eventually, and there is nothing I can do about it.
2341 *	Thus we will need a 'compatibility mode'.
2342 */
2343
2344static int dev_ifconf(struct net *net, char __user *arg)
2345{
2346	struct ifconf ifc;
2347	struct net_device *dev;
2348	char __user *pos;
2349	int len;
2350	int total;
2351	int i;
2352
2353	/*
2354	 *	Fetch the caller's info block.
2355	 */
2356
2357	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2358		return -EFAULT;
2359
2360	pos = ifc.ifc_buf;
2361	len = ifc.ifc_len;
2362
2363	/*
2364	 *	Loop over the interfaces, and write an info block for each.
2365	 */
2366
2367	total = 0;
2368	for_each_netdev(net, dev) {
2369		for (i = 0; i < NPROTO; i++) {
2370			if (gifconf_list[i]) {
2371				int done;
2372				if (!pos)
2373					done = gifconf_list[i](dev, NULL, 0);
2374				else
2375					done = gifconf_list[i](dev, pos + total,
2376							       len - total);
2377				if (done < 0)
2378					return -EFAULT;
2379				total += done;
2380			}
2381		}
2382	}
2383
2384	/*
2385	 *	All done.  Write the updated control block back to the caller.
2386	 */
2387	ifc.ifc_len = total;
2388
2389	/*
2390	 * 	Both BSD and Solaris return 0 here, so we do too.
2391	 */
2392	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
2393}
2394
2395#ifdef CONFIG_PROC_FS
2396/*
2397 *	This is invoked by the /proc filesystem handler to display a device
2398 *	in detail.
2399 */
2400void *dev_seq_start(struct seq_file *seq, loff_t *pos)
2401	__acquires(dev_base_lock)
2402{
2403	struct net *net = seq_file_net(seq);
2404	loff_t off;
2405	struct net_device *dev;
2406
2407	read_lock(&dev_base_lock);
2408	if (!*pos)
2409		return SEQ_START_TOKEN;
2410
2411	off = 1;
2412	for_each_netdev(net, dev)
2413		if (off++ == *pos)
2414			return dev;
2415
2416	return NULL;
2417}
2418
2419void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2420{
2421	struct net *net = seq_file_net(seq);
2422	++*pos;
2423	return v == SEQ_START_TOKEN ?
2424		first_net_device(net) : next_net_device((struct net_device *)v);
2425}
2426
/*
 *	End a /proc/net/dev iteration: drop the dev_base_lock read lock that
 *	dev_seq_start() acquired.  @seq and @v are unused.
 */
void dev_seq_stop(struct seq_file *seq, void *v)
	__releases(dev_base_lock)
{
	read_unlock(&dev_base_lock);
}
2432
2433static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
2434{
2435	struct net_device_stats *stats = dev->get_stats(dev);
2436
2437	seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
2438		   "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
2439		   dev->name, stats->rx_bytes, stats->rx_packets,
2440		   stats->rx_errors,
2441		   stats->rx_dropped + stats->rx_missed_errors,
2442		   stats->rx_fifo_errors,
2443		   stats->rx_length_errors + stats->rx_over_errors +
2444		    stats->rx_crc_errors + stats->rx_frame_errors,
2445		   stats->rx_compressed, stats->multicast,
2446		   stats->tx_bytes, stats->tx_packets,
2447		   stats->tx_errors, stats->tx_dropped,
2448		   stats->tx_fifo_errors, stats->collisions,
2449		   stats->tx_carrier_errors +
2450		    stats->tx_aborted_errors +
2451		    stats->tx_window_errors +
2452		    stats->tx_heartbeat_errors,
2453		   stats->tx_compressed);
2454}
2455
2456/*
2457 *	Called from the PROCfs module. This now uses the new arbitrary sized
2458 *	/proc/net interface to create /proc/net/dev
2459 */
2460static int dev_seq_show(struct seq_file *seq, void *v)
2461{
2462	if (v == SEQ_START_TOKEN)
2463		seq_puts(seq, "Inter-|   Receive                            "
2464			      "                    |  Transmit\n"
2465			      " face |bytes    packets errs drop fifo frame "
2466			      "compressed multicast|bytes    packets errs "
2467			      "drop fifo colls carrier compressed\n");
2468	else
2469		dev_seq_printf_stats(seq, v);
2470	return 0;
2471}
2472
2473static struct netif_rx_stats *softnet_get_online(loff_t *pos)
2474{
2475	struct netif_rx_stats *rc = NULL;
2476
2477	while (*pos < nr_cpu_ids)
2478		if (cpu_online(*pos)) {
2479			rc = &per_cpu(netdev_rx_stat, *pos);
2480			break;
2481		} else
2482			++*pos;
2483	return rc;
2484}
2485
2486static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
2487{
2488	return softnet_get_online(pos);
2489}
2490
2491static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2492{
2493	++*pos;
2494	return softnet_get_online(pos);
2495}
2496
/* Nothing to release: the softnet_stat start/next callbacks take no lock. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
2500
2501static int softnet_seq_show(struct seq_file *seq, void *v)
2502{
2503	struct netif_rx_stats *s = v;
2504
2505	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
2506		   s->total, s->dropped, s->time_squeeze, 0,
2507		   0, 0, 0, 0, /* was fastroute */
2508		   s->cpu_collision );
2509	return 0;
2510}
2511
2512static const struct seq_operations dev_seq_ops = {
2513	.start = dev_seq_start,
2514	.next  = dev_seq_next,
2515	.stop  = dev_seq_stop,
2516	.show  = dev_seq_show,
2517};
2518
2519static int dev_seq_open(struct inode *inode, struct file *file)
2520{
2521	return seq_open_net(inode, file, &dev_seq_ops,
2522			    sizeof(struct seq_net_private));
2523}
2524
2525static const struct file_operations dev_seq_fops = {
2526	.owner	 = THIS_MODULE,
2527	.open    = dev_seq_open,
2528	.read    = seq_read,
2529	.llseek  = seq_lseek,
2530	.release = seq_release_net,
2531};
2532
2533static const struct seq_operations softnet_seq_ops = {
2534	.start = softnet_seq_start,
2535	.next  = softnet_seq_next,
2536	.stop  = softnet_seq_stop,
2537	.show  = softnet_seq_show,
2538};
2539
2540static int softnet_seq_open(struct inode *inode, struct file *file)
2541{
2542	return seq_open(file, &softnet_seq_ops);
2543}
2544
2545static const struct file_operations softnet_seq_fops = {
2546	.owner	 = THIS_MODULE,
2547	.open    = softnet_seq_open,
2548	.read    = seq_read,
2549	.llseek  = seq_lseek,
2550	.release = seq_release,
2551};
2552
2553static void *ptype_get_idx(loff_t pos)
2554{
2555	struct packet_type *pt = NULL;
2556	loff_t i = 0;
2557	int t;
2558
2559	list_for_each_entry_rcu(pt, &ptype_all, list) {
2560		if (i == pos)
2561			return pt;
2562		++i;
2563	}
2564
2565	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
2566		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
2567			if (i == pos)
2568				return pt;
2569			++i;
2570		}
2571	}
2572	return NULL;
2573}
2574
2575static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
2576	__acquires(RCU)
2577{
2578	rcu_read_lock();
2579	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
2580}
2581
2582static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2583{
2584	struct packet_type *pt;
2585	struct list_head *nxt;
2586	int hash;
2587
2588	++*pos;
2589	if (v == SEQ_START_TOKEN)
2590		return ptype_get_idx(0);
2591
2592	pt = v;
2593	nxt = pt->list.next;
2594	if (pt->type == htons(ETH_P_ALL)) {
2595		if (nxt != &ptype_all)
2596			goto found;
2597		hash = 0;
2598		nxt = ptype_base[0].next;
2599	} else
2600		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
2601
2602	while (nxt == &ptype_base[hash]) {
2603		if (++hash >= PTYPE_HASH_SIZE)
2604			return NULL;
2605		nxt = ptype_base[hash].next;
2606	}
2607found:
2608	return list_entry(nxt, struct packet_type, list);
2609}
2610
/*
 *	End a /proc/net/ptype iteration: leave the RCU read-side section
 *	entered by ptype_seq_start().  @seq and @v are unused.
 */
static void ptype_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
2616
/*
 *	Print the handler address @sym.  With CONFIG_KALLSYMS and a
 *	successful lookup the output is ":module:symbol+0xoffset" (the module
 *	part and both colons are empty when no module name is reported);
 *	otherwise the raw pointer is printed in brackets.
 */
static void ptype_seq_decode(struct seq_file *seq, void *sym)
{
#ifdef CONFIG_KALLSYMS
	unsigned long offset = 0, symsize;
	char namebuf[128];
	const char *name;
	char *modname;

	name = kallsyms_lookup((unsigned long)sym, &symsize, &offset,
			       &modname, namebuf);
	if (name != NULL) {
		const char *sep = ":";

		if (modname == NULL) {
			sep = "";
			modname = "";
		}
		seq_printf(seq, "%s%s%s%s+0x%lx", sep, modname, sep,
			   name, offset);
		return;
	}
#endif

	seq_printf(seq, "[%p]", sym);
}
2641
2642static int ptype_seq_show(struct seq_file *seq, void *v)
2643{
2644	struct packet_type *pt = v;
2645
2646	if (v == SEQ_START_TOKEN)
2647		seq_puts(seq, "Type Device      Function\n");
2648	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
2649		if (pt->type == htons(ETH_P_ALL))
2650			seq_puts(seq, "ALL ");
2651		else
2652			seq_printf(seq, "%04x", ntohs(pt->type));
2653
2654		seq_printf(seq, " %-8s ",
2655			   pt->dev ? pt->dev->name : "");
2656		ptype_seq_decode(seq,  pt->func);
2657		seq_putc(seq, '\n');
2658	}
2659
2660	return 0;
2661}
2662
2663static const struct seq_operations ptype_seq_ops = {
2664	.start = ptype_seq_start,
2665	.next  = ptype_seq_next,
2666	.stop  = ptype_seq_stop,
2667	.show  = ptype_seq_show,
2668};
2669
2670static int ptype_seq_open(struct inode *inode, struct file *file)
2671{
2672	return seq_open_net(inode, file, &ptype_seq_ops,
2673			sizeof(struct seq_net_private));
2674}
2675
2676static const struct file_operations ptype_seq_fops = {
2677	.owner	 = THIS_MODULE,
2678	.open    = ptype_seq_open,
2679	.read    = seq_read,
2680	.llseek  = seq_lseek,
2681	.release = seq_release_net,
2682};
2683
2684
2685static int __net_init dev_proc_net_init(struct net *net)
2686{
2687	int rc = -ENOMEM;
2688
2689	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
2690		goto out;
2691	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
2692		goto out_dev;
2693	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
2694		goto out_softnet;
2695
2696	if (wext_proc_init(net))
2697		goto out_ptype;
2698	rc = 0;
2699out:
2700	return rc;
2701out_ptype:
2702	proc_net_remove(net, "ptype");
2703out_softnet:
2704	proc_net_remove(net, "softnet_stat");
2705out_dev:
2706	proc_net_remove(net, "dev");
2707	goto out;
2708}
2709
/*
 *	Tear down what dev_proc_net_init() set up for @net: the wext proc
 *	state first, then the three proc entries in reverse creation order.
 */
static void __net_exit dev_proc_net_exit(struct net *net)
{
	wext_proc_exit(net);

	proc_net_remove(net, "ptype");
	proc_net_remove(net, "softnet_stat");
	proc_net_remove(net, "dev");
}
2718
2719static struct pernet_operations __net_initdata dev_proc_ops = {
2720	.init = dev_proc_net_init,
2721	.exit = dev_proc_net_exit,
2722};
2723
2724static int __init dev_proc_init(void)
2725{
2726	return register_pernet_subsys(&dev_proc_ops);
2727}
2728#else
2729#define dev_proc_init() 0
2730#endif	/* CONFIG_PROC_FS */
2731
2732
2733/**
2734 *	netdev_set_master	-	set up master/slave pair
2735 *	@slave: slave device
2736 *	@master: new master device
2737 *
2738 *	Changes the master device of the slave. Pass %NULL to break the
2739 *	bonding. The caller must hold the RTNL semaphore. On a failure
2740 *	a negative errno code is returned. On success the reference counts
2741 *	are adjusted, %RTM_NEWLINK is sent to the routing socket and the
2742 *	function returns zero.
2743 */
2744int netdev_set_master(struct net_device *slave, struct net_device *master)
2745{
2746	struct net_device *old = slave->master;
2747
2748	ASSERT_RTNL();
2749
2750	if (master) {
2751		if (old)
2752			return -EBUSY;
2753		dev_hold(master);
2754	}
2755
2756	slave->master = master;
2757
2758	synchronize_net();
2759
2760	if (old)
2761		dev_put(old);
2762
2763	if (master)
2764		slave->flags |= IFF_SLAVE;
2765	else
2766		slave->flags &= ~IFF_SLAVE;
2767
2768	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
2769	return 0;
2770}
2771
2772static void __dev_set_promiscuity(struct net_device *dev, int inc)
2773{
2774	unsigned short old_flags = dev->flags;
2775
2776	ASSERT_RTNL();
2777
2778	if ((dev->promiscuity += inc) == 0)
2779		dev->flags &= ~IFF_PROMISC;
2780	else
2781		dev->flags |= IFF_PROMISC;
2782	if (dev->flags != old_flags) {
2783		printk(KERN_INFO "device %s %s promiscuous mode\n",
2784		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
2785							       "left");
2786		if (audit_enabled)
2787			audit_log(current->audit_context, GFP_ATOMIC,
2788				AUDIT_ANOM_PROMISCUOUS,
2789				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
2790				dev->name, (dev->flags & IFF_PROMISC),
2791				(old_flags & IFF_PROMISC),
2792				audit_get_loginuid(current),
2793				current->uid, current->gid,
2794				audit_get_sessionid(current));
2795
2796		if (dev->change_rx_flags)
2797			dev->change_rx_flags(dev, IFF_PROMISC);
2798	}
2799}
2800
2801/**
2802 *	dev_set_promiscuity	- update promiscuity count on a device
2803 *	@dev: device
2804 *	@inc: modifier
2805 *
2806 *	Add or remove promiscuity from a device. While the count in the device
2807 *	remains above zero the interface remains promiscuous. Once it hits zero
2808 *	the device reverts back to normal filtering operation. A negative inc
2809 *	value is used to drop promiscuity on the device.
2810 */
2811void dev_set_promiscuity(struct net_device *dev, int inc)
2812{
2813	unsigned short old_flags = dev->flags;
2814
2815	__dev_set_promiscuity(dev, inc);
2816	if (dev->flags != old_flags)
2817		dev_set_rx_mode(dev);
2818}
2819
2820/**
2821 *	dev_set_allmulti	- update allmulti count on a device
2822 *	@dev: device
2823 *	@inc: modifier
2824 *
2825 *	Add or remove reception of all multicast frames to a device. While the
2826 *	count in the device remains above zero the interface remains listening
2827 *	to all interfaces. Once it hits zero the device reverts back to normal
2828 *	filtering operation. A negative @inc value is used to drop the counter
2829 *	when releasing a resource needing all multicasts.
2830 */
2831
2832void dev_set_allmulti(struct net_device *dev, int inc)
2833{
2834	unsigned short old_flags = dev->flags;
2835
2836	ASSERT_RTNL();
2837
2838	dev->flags |= IFF_ALLMULTI;
2839	if ((dev->allmulti += inc) == 0)
2840		dev->flags &= ~IFF_ALLMULTI;
2841	if (dev->flags ^ old_flags) {
2842		if (dev->change_rx_flags)
2843			dev->change_rx_flags(dev, IFF_ALLMULTI);
2844		dev_set_rx_mode(dev);
2845	}
2846}
2847
2848/*
2849 *	Upload unicast and multicast address lists to device and
2850 *	configure RX filtering. When the device doesn't support unicast
2851 *	filtering it is put in promiscuous mode while unicast addresses
2852 *	are present.
2853 */
2854void __dev_set_rx_mode(struct net_device *dev)
2855{
2856	/* dev_open will call this function so the list will stay sane. */
2857	if (!(dev->flags&IFF_UP))
2858		return;
2859
2860	if (!netif_device_present(dev))
2861		return;
2862
2863	if (dev->set_rx_mode)
2864		dev->set_rx_mode(dev);
2865	else {
2866		/* Unicast addresses changes may only happen under the rtnl,
2867		 * therefore calling __dev_set_promiscuity here is safe.
2868		 */
2869		if (dev->uc_count > 0 && !dev->uc_promisc) {
2870			__dev_set_promiscuity(dev, 1);
2871			dev->uc_promisc = 1;
2872		} else if (dev->uc_count == 0 && dev->uc_promisc) {
2873			__dev_set_promiscuity(dev, -1);
2874			dev->uc_promisc = 0;
2875		}
2876
2877		if (dev->set_multicast_list)
2878			dev->set_multicast_list(dev);
2879	}
2880}
2881
/*
 *	Locked wrapper around __dev_set_rx_mode(): takes the device's TX
 *	lock (bottom halves disabled) for the duration of the call.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_tx_unlock_bh(dev);
}
2888
2889int __dev_addr_delete(struct dev_addr_list **list, int *count,
2890		      void *addr, int alen, int glbl)
2891{
2892	struct dev_addr_list *da;
2893
2894	for (; (da = *list) != NULL; list = &da->next) {
2895		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
2896		    alen == da->da_addrlen) {
2897			if (glbl) {
2898				int old_glbl = da->da_gusers;
2899				da->da_gusers = 0;
2900				if (old_glbl == 0)
2901					break;
2902			}
2903			if (--da->da_users)
2904				return 0;
2905
2906			*list = da->next;
2907			kfree(da);
2908			(*count)--;
2909			return 0;
2910		}
2911	}
2912	return -ENOENT;
2913}
2914
2915int __dev_addr_add(struct dev_addr_list **list, int *count,
2916		   void *addr, int alen, int glbl)
2917{
2918	struct dev_addr_list *da;
2919
2920	for (da = *list; da != NULL; da = da->next) {
2921		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
2922		    da->da_addrlen == alen) {
2923			if (glbl) {
2924				int old_glbl = da->da_gusers;
2925				da->da_gusers = 1;
2926				if (old_glbl)
2927					return 0;
2928			}
2929			da->da_users++;
2930			return 0;
2931		}
2932	}
2933
2934	da = kzalloc(sizeof(*da), GFP_ATOMIC);
2935	if (da == NULL)
2936		return -ENOMEM;
2937	memcpy(da->da_addr, addr, alen);
2938	da->da_addrlen = alen;
2939	da->da_users = 1;
2940	da->da_gusers = glbl ? 1 : 0;
2941	da->next = *list;
2942	*list = da;
2943	(*count)++;
2944	return 0;
2945}
2946
2947/**
2948 *	dev_unicast_delete	- Release secondary unicast address.
2949 *	@dev: device
2950 *	@addr: address to delete
2951 *	@alen: length of @addr
2952 *
2953 *	Release reference to a secondary unicast address and remove it
2954 *	from the device if the reference count drops to zero.
2955 *
2956 * 	The caller must hold the rtnl_mutex.
2957 */
2958int dev_unicast_delete(struct net_device *dev, void *addr, int alen)
2959{
2960	int err;
2961
2962	ASSERT_RTNL();
2963
2964	netif_tx_lock_bh(dev);
2965	err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0);
2966	if (!err)
2967		__dev_set_rx_mode(dev);
2968	netif_tx_unlock_bh(dev);
2969	return err;
2970}
2971EXPORT_SYMBOL(dev_unicast_delete);
2972
2973/**
2974 *	dev_unicast_add		- add a secondary unicast address
2975 *	@dev: device
2976 *	@addr: address to add
2977 *	@alen: length of @addr
2978 *
2979 *	Add a secondary unicast address to the device or increase
2980 *	the reference count if it already exists.
2981 *
2982 *	The caller must hold the rtnl_mutex.
2983 */
2984int dev_unicast_add(struct net_device *dev, void *addr, int alen)
2985{
2986	int err;
2987
2988	ASSERT_RTNL();
2989
2990	netif_tx_lock_bh(dev);
2991	err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0);
2992	if (!err)
2993		__dev_set_rx_mode(dev);
2994	netif_tx_unlock_bh(dev);
2995	return err;
2996}
2997EXPORT_SYMBOL(dev_unicast_add);
2998
2999int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
3000		    struct dev_addr_list **from, int *from_count)
3001{
3002	struct dev_addr_list *da, *next;
3003	int err = 0;
3004
3005	da = *from;
3006	while (da != NULL) {
3007		next = da->next;
3008		if (!da->da_synced) {
3009			err = __dev_addr_add(to, to_count,
3010					     da->da_addr, da->da_addrlen, 0);
3011			if (err < 0)
3012				break;
3013			da->da_synced = 1;
3014			da->da_users++;
3015		} else if (da->da_users == 1) {
3016			__dev_addr_delete(to, to_count,
3017					  da->da_addr, da->da_addrlen, 0);
3018			__dev_addr_delete(from, from_count,
3019					  da->da_addr, da->da_addrlen, 0);
3020		}
3021		da = next;
3022	}
3023	return err;
3024}
3025
3026void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
3027		       struct dev_addr_list **from, int *from_count)
3028{
3029	struct dev_addr_list *da, *next;
3030
3031	da = *from;
3032	while (da != NULL) {
3033		next = da->next;
3034		if (da->da_synced) {
3035			__dev_addr_delete(to, to_count,
3036					  da->da_addr, da->da_addrlen, 0);
3037			da->da_synced = 0;
3038			__dev_addr_delete(from, from_count,
3039					  da->da_addr, da->da_addrlen, 0);
3040		}
3041		da = next;
3042	}
3043}
3044
3045/**
3046 *	dev_unicast_sync - Synchronize device's unicast list to another device
3047 *	@to: destination device
3048 *	@from: source device
3049 *
3050 *	Add newly added addresses to the destination device and release
3051 *	addresses that have no users left. The source device must be
3052 *	locked by netif_tx_lock_bh.
3053 *
3054 *	This function is intended to be called from the dev->set_rx_mode
3055 *	function of layered software devices.
3056 */
3057int dev_unicast_sync(struct net_device *to, struct net_device *from)
3058{
3059	int err = 0;
3060
3061	netif_tx_lock_bh(to);
3062	err = __dev_addr_sync(&to->uc_list, &to->uc_count,
3063			      &from->uc_list, &from->uc_count);
3064	if (!err)
3065		__dev_set_rx_mode(to);
3066	netif_tx_unlock_bh(to);
3067	return err;
3068}
3069EXPORT_SYMBOL(dev_unicast_sync);
3070
3071/**
3072 *	dev_unicast_unsync - Remove synchronized addresses from the destination device
3073 *	@to: destination device
3074 *	@from: source device
3075 *
3076 *	Remove all addresses that were added to the destination device by
3077 *	dev_unicast_sync(). This function is intended to be called from the
3078 *	dev->stop function of layered software devices.
3079 */
3080void dev_unicast_unsync(struct net_device *to, struct net_device *from)
3081{
3082	netif_tx_lock_bh(from);
3083	netif_tx_lock_bh(to);
3084
3085	__dev_addr_unsync(&to->uc_list, &to->uc_count,
3086			  &from->uc_list, &from->uc_count);
3087	__dev_set_rx_mode(to);
3088
3089	netif_tx_unlock_bh(to);
3090	netif_tx_unlock_bh(from);
3091}
3092EXPORT_SYMBOL(dev_unicast_unsync);
3093
3094static void __dev_addr_discard(struct dev_addr_list **list)
3095{
3096	struct dev_addr_list *tmp;
3097
3098	while (*list != NULL) {
3099		tmp = *list;
3100		*list = tmp->next;
3101		if (tmp->da_users > tmp->da_gusers)
3102			printk("__dev_addr_discard: address leakage! "
3103			       "da_users=%d\n", tmp->da_users);
3104		kfree(tmp);
3105	}
3106}
3107
3108static void dev_addr_discard(struct net_device *dev)
3109{
3110	netif_tx_lock_bh(dev);
3111
3112	__dev_addr_discard(&dev->uc_list);
3113	dev->uc_count = 0;
3114
3115	__dev_addr_discard(&dev->mc_list);
3116	dev->mc_count = 0;
3117
3118	netif_tx_unlock_bh(dev);
3119}
3120
3121unsigned dev_get_flags(const struct net_device *dev)
3122{
3123	unsigned flags;
3124
3125	flags = (dev->flags & ~(IFF_PROMISC |
3126				IFF_ALLMULTI |
3127				IFF_RUNNING |
3128				IFF_LOWER_UP |
3129				IFF_DORMANT)) |
3130		(dev->gflags & (IFF_PROMISC |
3131				IFF_ALLMULTI));
3132
3133	if (netif_running(dev)) {
3134		if (netif_oper_up(dev))
3135			flags |= IFF_RUNNING;
3136		if (netif_carrier_ok(dev))
3137			flags |= IFF_LOWER_UP;
3138		if (netif_dormant(dev))
3139			flags |= IFF_DORMANT;
3140	}
3141
3142	return flags;
3143}
3144
3145int dev_change_flags(struct net_device *dev, unsigned flags)
3146{
3147	int ret, changes;
3148	int old_flags = dev->flags;
3149
3150	ASSERT_RTNL();
3151
3152	/*
3153	 *	Set the flags on our device.
3154	 */
3155
3156	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
3157			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
3158			       IFF_AUTOMEDIA)) |
3159		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
3160				    IFF_ALLMULTI));
3161
3162	/*
3163	 *	Load in the correct multicast list now the flags have changed.
3164	 */
3165
3166	if (dev->change_rx_flags && (old_flags ^ flags) & IFF_MULTICAST)
3167		dev->change_rx_flags(dev, IFF_MULTICAST);
3168
3169	dev_set_rx_mode(dev);
3170
3171	/*
3172	 *	Have we downed the interface. We handle IFF_UP ourselves
3173	 *	according to user attempts to set it, rather than blindly
3174	 *	setting it.
3175	 */
3176
3177	ret = 0;
3178	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
3179		ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
3180
3181		if (!ret)
3182			dev_set_rx_mode(dev);
3183	}
3184
3185	if (dev->flags & IFF_UP &&
3186	    ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
3187					  IFF_VOLATILE)))
3188		call_netdevice_notifiers(NETDEV_CHANGE, dev);
3189
3190	if ((flags ^ dev->gflags) & IFF_PROMISC) {
3191		int inc = (flags & IFF_PROMISC) ? +1 : -1;
3192		dev->gflags ^= IFF_PROMISC;
3193		dev_set_promiscuity(dev, inc);
3194	}
3195
3196	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
3197	   is important. Some (broken) drivers set IFF_PROMISC, when
3198	   IFF_ALLMULTI is requested not asking us and not reporting.
3199	 */
3200	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
3201		int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
3202		dev->gflags ^= IFF_ALLMULTI;
3203		dev_set_allmulti(dev, inc);
3204	}
3205
3206	/* Exclude state transition flags, already notified */
3207	changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
3208	if (changes)
3209		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
3210
3211	return ret;
3212}
3213
3214int dev_set_mtu(struct net_device *dev, int new_mtu)
3215{
3216	int err;
3217
3218	if (new_mtu == dev->mtu)
3219		return 0;
3220
3221	/*	MTU must be positive.	 */
3222	if (new_mtu < 0)
3223		return -EINVAL;
3224
3225	if (!netif_device_present(dev))
3226		return -ENODEV;
3227
3228	err = 0;
3229	if (dev->change_mtu)
3230		err = dev->change_mtu(dev, new_mtu);
3231	else
3232		dev->mtu = new_mtu;
3233	if (!err && dev->flags & IFF_UP)
3234		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
3235	return err;
3236}
3237
3238int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
3239{
3240	int err;
3241
3242	if (!dev->set_mac_address)
3243		return -EOPNOTSUPP;
3244	if (sa->sa_family != dev->type)
3245		return -EINVAL;
3246	if (!netif_device_present(dev))
3247		return -ENODEV;
3248	err = dev->set_mac_address(dev, sa);
3249	if (!err)
3250		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3251	return err;
3252}
3253
3254/*
3255 *	Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
3256 */
3257static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
3258{
3259	int err;
3260	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3261
3262	if (!dev)
3263		return -ENODEV;
3264
3265	switch (cmd) {
3266		case SIOCGIFFLAGS:	/* Get interface flags */
3267			ifr->ifr_flags = dev_get_flags(dev);
3268			return 0;
3269
3270		case SIOCGIFMETRIC:	/* Get the metric on the interface
3271					   (currently unused) */
3272			ifr->ifr_metric = 0;
3273			return 0;
3274
3275		case SIOCGIFMTU:	/* Get the MTU of a device */
3276			ifr->ifr_mtu = dev->mtu;
3277			return 0;
3278
3279		case SIOCGIFHWADDR:
3280			if (!dev->addr_len)
3281				memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
3282			else
3283				memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
3284				       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3285			ifr->ifr_hwaddr.sa_family = dev->type;
3286			return 0;
3287
3288		case SIOCGIFSLAVE:
3289			err = -EINVAL;
3290			break;
3291
3292		case SIOCGIFMAP:
3293			ifr->ifr_map.mem_start = dev->mem_start;
3294			ifr->ifr_map.mem_end   = dev->mem_end;
3295			ifr->ifr_map.base_addr = dev->base_addr;
3296			ifr->ifr_map.irq       = dev->irq;
3297			ifr->ifr_map.dma       = dev->dma;
3298			ifr->ifr_map.port      = dev->if_port;
3299			return 0;
3300
3301		case SIOCGIFINDEX:
3302			ifr->ifr_ifindex = dev->ifindex;
3303			return 0;
3304
3305		case SIOCGIFTXQLEN:
3306			ifr->ifr_qlen = dev->tx_queue_len;
3307			return 0;
3308
3309		default:
3310			/* dev_ioctl() should ensure this case
3311			 * is never reached
3312			 */
3313			WARN_ON(1);
3314			err = -EINVAL;
3315			break;
3316
3317	}
3318	return err;
3319}
3320
3321/*
3322 *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
3323 */
3324static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
3325{
3326	int err;
3327	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3328
3329	if (!dev)
3330		return -ENODEV;
3331
3332	switch (cmd) {
3333		case SIOCSIFFLAGS:	/* Set interface flags */
3334			return dev_change_flags(dev, ifr->ifr_flags);
3335
3336		case SIOCSIFMETRIC:	/* Set the metric on the interface
3337					   (currently unused) */
3338			return -EOPNOTSUPP;
3339
3340		case SIOCSIFMTU:	/* Set the MTU of a device */
3341			return dev_set_mtu(dev, ifr->ifr_mtu);
3342
3343		case SIOCSIFHWADDR:
3344			return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
3345
3346		case SIOCSIFHWBROADCAST:
3347			if (ifr->ifr_hwaddr.sa_family != dev->type)
3348				return -EINVAL;
3349			memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
3350			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3351			call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3352			return 0;
3353
3354		case SIOCSIFMAP:
3355			if (dev->set_config) {
3356				if (!netif_device_present(dev))
3357					return -ENODEV;
3358				return dev->set_config(dev, &ifr->ifr_map);
3359			}
3360			return -EOPNOTSUPP;
3361
3362		case SIOCADDMULTI:
3363			if ((!dev->set_multicast_list && !dev->set_rx_mode) ||
3364			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3365				return -EINVAL;
3366			if (!netif_device_present(dev))
3367				return -ENODEV;
3368			return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
3369					  dev->addr_len, 1);
3370
3371		case SIOCDELMULTI:
3372			if ((!dev->set_multicast_list && !dev->set_rx_mode) ||
3373			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3374				return -EINVAL;
3375			if (!netif_device_present(dev))
3376				return -ENODEV;
3377			return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
3378					     dev->addr_len, 1);
3379
3380		case SIOCSIFTXQLEN:
3381			if (ifr->ifr_qlen < 0)
3382				return -EINVAL;
3383			dev->tx_queue_len = ifr->ifr_qlen;
3384			return 0;
3385
3386		case SIOCSIFNAME:
3387			ifr->ifr_newname[IFNAMSIZ-1] = '\0';
3388			return dev_change_name(dev, ifr->ifr_newname);
3389
3390		/*
3391		 *	Unknown or private ioctl
3392		 */
3393
3394		default:
3395			if ((cmd >= SIOCDEVPRIVATE &&
3396			    cmd <= SIOCDEVPRIVATE + 15) ||
3397			    cmd == SIOCBONDENSLAVE ||
3398			    cmd == SIOCBONDRELEASE ||
3399			    cmd == SIOCBONDSETHWADDR ||
3400			    cmd == SIOCBONDSLAVEINFOQUERY ||
3401			    cmd == SIOCBONDINFOQUERY ||
3402			    cmd == SIOCBONDCHANGEACTIVE ||
3403			    cmd == SIOCGMIIPHY ||
3404			    cmd == SIOCGMIIREG ||
3405			    cmd == SIOCSMIIREG ||
3406			    cmd == SIOCBRADDIF ||
3407			    cmd == SIOCBRDELIF ||
3408			    cmd == SIOCWANDEV) {
3409				err = -EOPNOTSUPP;
3410				if (dev->do_ioctl) {
3411					if (netif_device_present(dev))
3412						err = dev->do_ioctl(dev, ifr,
3413								    cmd);
3414					else
3415						err = -ENODEV;
3416				}
3417			} else
3418				err = -EINVAL;
3419
3420	}
3421	return err;
3422}
3423
/*
 *	This function handles all "interface"-type I/O control requests. The
 *	actual 'doing' part is in dev_ifsioc_locked() and dev_ifsioc() above.
 */

/**
 *	dev_ioctl	-	network device ioctl
 *	@net: the applicable net namespace
 *	@cmd: command to issue
 *	@arg: pointer to a struct ifreq in user space
 *
 *	Issue ioctl functions to devices. This is normally called by the
 *	user space syscall interfaces but can sometimes be useful for
 *	other purposes. The return value is the return from the syscall if
 *	positive or a negative errno code on error.
 *
 *	The request is copied in from @arg, dispatched according to @cmd
 *	with the locking that class of command needs, and, for commands
 *	that return a value, copied back out to @arg on success.
 */

int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct ifreq ifr;
	int ret;
	char *colon;

	/* One special case: SIOCGIFCONF takes ifconf argument
	   and is run under the rtnl lock, because it sleeps writing
	   to user space.
	 */

	if (cmd == SIOCGIFCONF) {
		rtnl_lock();
		ret = dev_ifconf(net, (char __user *) arg);
		rtnl_unlock();
		return ret;
	}
	/* SIOCGIFNAME works directly on the user pointer, no local copy */
	if (cmd == SIOCGIFNAME)
		return dev_ifname(net, (struct ifreq __user *)arg);

	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
		return -EFAULT;

	/* Force NUL termination of the user-supplied name */
	ifr.ifr_name[IFNAMSIZ-1] = 0;

	/*
	 * Cut the name at the first ':' for the device lookup; the
	 * value-returning cases below put the ':' back before the
	 * request is copied out again.
	 */
	colon = strchr(ifr.ifr_name, ':');
	if (colon)
		*colon = 0;

	/*
	 *	See which interface the caller is talking about.
	 */

	switch (cmd) {
		/*
		 *	These ioctl calls:
		 *	- can be done by all.
		 *	- atomic and do not require locking.
		 *	- return a value
		 *
		 *	(They run with dev_base_lock read-held.)
		 */
		case SIOCGIFFLAGS:
		case SIOCGIFMETRIC:
		case SIOCGIFMTU:
		case SIOCGIFHWADDR:
		case SIOCGIFSLAVE:
		case SIOCGIFMAP:
		case SIOCGIFINDEX:
		case SIOCGIFTXQLEN:
			dev_load(net, ifr.ifr_name);
			read_lock(&dev_base_lock);
			ret = dev_ifsioc_locked(net, &ifr, cmd);
			read_unlock(&dev_base_lock);
			if (!ret) {
				if (colon)
					*colon = ':';
				if (copy_to_user(arg, &ifr,
						 sizeof(struct ifreq)))
					ret = -EFAULT;
			}
			return ret;

		/* ethtool requests are handled by dev_ethtool() under rtnl */
		case SIOCETHTOOL:
			dev_load(net, ifr.ifr_name);
			rtnl_lock();
			ret = dev_ethtool(net, &ifr);
			rtnl_unlock();
			if (!ret) {
				if (colon)
					*colon = ':';
				if (copy_to_user(arg, &ifr,
						 sizeof(struct ifreq)))
					ret = -EFAULT;
			}
			return ret;

		/*
		 *	These ioctl calls:
		 *	- require superuser power.
		 *	- require strict serialization.
		 *	- return a value
		 */
		case SIOCGMIIPHY:
		case SIOCGMIIREG:
		case SIOCSIFNAME:
			if (!capable(CAP_NET_ADMIN))
				return -EPERM;
			dev_load(net, ifr.ifr_name);
			rtnl_lock();
			ret = dev_ifsioc(net, &ifr, cmd);
			rtnl_unlock();
			if (!ret) {
				if (colon)
					*colon = ':';
				if (copy_to_user(arg, &ifr,
						 sizeof(struct ifreq)))
					ret = -EFAULT;
			}
			return ret;

		/*
		 *	These ioctl calls:
		 *	- require superuser power.
		 *	- require strict serialization.
		 *	- do not return a value
		 */
		case SIOCSIFFLAGS:
		case SIOCSIFMETRIC:
		case SIOCSIFMTU:
		case SIOCSIFMAP:
		case SIOCSIFHWADDR:
		case SIOCSIFSLAVE:
		case SIOCADDMULTI:
		case SIOCDELMULTI:
		case SIOCSIFHWBROADCAST:
		case SIOCSIFTXQLEN:
		case SIOCSMIIREG:
		case SIOCBONDENSLAVE:
		case SIOCBONDRELEASE:
		case SIOCBONDSETHWADDR:
		case SIOCBONDCHANGEACTIVE:
		case SIOCBRADDIF:
		case SIOCBRDELIF:
			if (!capable(CAP_NET_ADMIN))
				return -EPERM;
			/* fall through */
		/* The two bonding queries skip the capability check above */
		case SIOCBONDSLAVEINFOQUERY:
		case SIOCBONDINFOQUERY:
			dev_load(net, ifr.ifr_name);
			rtnl_lock();
			ret = dev_ifsioc(net, &ifr, cmd);
			rtnl_unlock();
			return ret;

		case SIOCGIFMEM:
			/* Get the per device memory space. We can add this but
			 * currently do not support it */
		case SIOCSIFMEM:
			/* Set the per device memory buffer space.
			 * Not applicable in our case */
		case SIOCSIFLINK:
			return -EINVAL;

		/*
		 *	Unknown or private ioctl.
		 */
		default:
			if (cmd == SIOCWANDEV ||
			    (cmd >= SIOCDEVPRIVATE &&
			     cmd <= SIOCDEVPRIVATE + 15)) {
				dev_load(net, ifr.ifr_name);
				rtnl_lock();
				ret = dev_ifsioc(net, &ifr, cmd);
				rtnl_unlock();
				/*
				 * NOTE(review): unlike the cases above, the
				 * ':' cut out of ifr_name is not restored
				 * before the copy-out on this path.
				 */
				if (!ret && copy_to_user(arg, &ifr,
							 sizeof(struct ifreq)))
					ret = -EFAULT;
				return ret;
			}
			/* Take care of Wireless Extensions */
			if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
				return wext_handle_ioctl(net, &ifr, cmd, arg);
			return -EINVAL;
	}
}
3605
3606
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns a suitable unique value for a new device interface
 *	number.  The caller must hold the rtnl semaphore or the
 *	dev_base_lock to be sure it remains unique.
 *
 *	Candidates come from a function-static counter that is advanced
 *	on every probe and reset to 1 whenever it would become
 *	non-positive; the first value not in use in @net is returned.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	do {
		if (++ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
3625
/*
 * Delayed registration/unregistration: devices queued on net_todo_list
 * (guarded by net_todo_list_lock) are finished off by netdev_run_todo().
 */
static DEFINE_SPINLOCK(net_todo_list_lock);
static LIST_HEAD(net_todo_list);
3629
/* Append @dev to the tail of net_todo_list under net_todo_list_lock. */
static void net_set_todo(struct net_device *dev)
{
	spin_lock(&net_todo_list_lock);
	list_add_tail(&dev->todo_list, &net_todo_list);
	spin_unlock(&net_todo_list_lock);
}
3636
/*
 * Tear down a device that is in NETREG_REGISTERED state: close it,
 * unlist it, mark it NETREG_UNREGISTERING, shut down its queueing
 * discipline, send NETDEV_UNREGISTER, flush its address lists, call
 * the driver's ->uninit hook, remove its kobject entries and finally
 * drop one device reference.
 *
 * Must be called with the rtnl lock held and after boot phase.  A
 * device that was never registered only triggers a warning.
 */
static void rollback_registered(struct net_device *dev)
{
	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	/* Some devices call without registering for initialization unwind. */
	if (dev->reg_state == NETREG_UNINITIALIZED) {
		printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
				  "was registered\n", dev->name, dev);

		WARN_ON(1);
		return;
	}

	BUG_ON(dev->reg_state != NETREG_REGISTERED);

	/* If device is running, close it first. */
	dev_close(dev);

	/* And unlink it from device chain. */
	unlist_netdevice(dev);

	dev->reg_state = NETREG_UNREGISTERING;

	/* Wait (synchronize_rcu) before tearing the device down further */
	synchronize_net();

	/* Shutdown queueing discipline. */
	dev_shutdown(dev);


	/* Notify protocols, that we are about to destroy
	   this device. They should clean all the things.
	*/
	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

	/*
	 *	Flush the unicast and multicast chains
	 */
	dev_addr_discard(dev);

	if (dev->uninit)
		dev->uninit(dev);

	/* Notifier chain MUST detach us from master device. */
	BUG_TRAP(!dev->master);

	/* Remove entries from kobject tree */
	netdev_unregister_kobject(dev);

	synchronize_net();

	/* Presumably balances the dev_hold() taken in register_netdevice() */
	dev_put(dev);
}
3690
/**
 *	register_netdevice	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	Callers must hold the rtnl semaphore. You may want
 *	register_netdev() instead of this.
 *
 *	BUGS:
 *	The locking appears insufficient to guarantee two parallel registers
 *	will not get the same name.
 */

int register_netdevice(struct net_device *dev)
{
	struct hlist_head *head;
	struct hlist_node *p;
	int ret;
	struct net *net;

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	might_sleep();

	/* When net_device's are persistent, this will be fatal. */
	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
	BUG_ON(!dev_net(dev));
	net = dev_net(dev);

	spin_lock_init(&dev->queue_lock);
	spin_lock_init(&dev->_xmit_lock);
	netdev_set_lockdep_class(&dev->_xmit_lock, dev->type);
	dev->xmit_lock_owner = -1;
	spin_lock_init(&dev->ingress_lock);

	/* -1 marks "not set"; it is replaced by ifindex below if still -1 */
	dev->iflink = -1;

	/* Init, if this function is available */
	if (dev->init) {
		ret = dev->init(dev);
		if (ret) {
			/* A positive return from ->init() is reported as -EIO */
			if (ret > 0)
				ret = -EIO;
			goto out;
		}
	}

	if (!dev_valid_name(dev->name)) {
		ret = -EINVAL;
		goto err_uninit;
	}

	dev->ifindex = dev_new_index(net);
	if (dev->iflink == -1)
		dev->iflink = dev->ifindex;

	/* Check for existence of name */
	head = dev_name_hash(net, dev->name);
	hlist_for_each(p, head) {
		struct net_device *d
			= hlist_entry(p, struct net_device, name_hlist);
		if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
			ret = -EEXIST;
			goto err_uninit;
		}
	}

	/* Fix illegal checksum combinations */
	if ((dev->features & NETIF_F_HW_CSUM) &&
	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
		printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
		       dev->name);
		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
	}

	if ((dev->features & NETIF_F_NO_CSUM) &&
	    (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
		printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
		       dev->name);
		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
	}


	/* Fix illegal SG+CSUM combinations. */
	if ((dev->features & NETIF_F_SG) &&
	    !(dev->features & NETIF_F_ALL_CSUM)) {
		printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no checksum feature.\n",
		       dev->name);
		dev->features &= ~NETIF_F_SG;
	}

	/* TSO requires that SG is present as well. */
	if ((dev->features & NETIF_F_TSO) &&
	    !(dev->features & NETIF_F_SG)) {
		printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no SG feature.\n",
		       dev->name);
		dev->features &= ~NETIF_F_TSO;
	}
	/* UFO is kept only when both HW_CSUM and SG are present. */
	if (dev->features & NETIF_F_UFO) {
		if (!(dev->features & NETIF_F_HW_CSUM)) {
			printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
					"NETIF_F_HW_CSUM feature.\n",
							dev->name);
			dev->features &= ~NETIF_F_UFO;
		}
		if (!(dev->features & NETIF_F_SG)) {
			printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
					"NETIF_F_SG feature.\n",
					dev->name);
			dev->features &= ~NETIF_F_UFO;
		}
	}

	netdev_initialize_kobject(dev);
	ret = netdev_register_kobject(dev);
	if (ret)
		goto err_uninit;
	dev->reg_state = NETREG_REGISTERED;

	/*
	 *	Default initial state at registry is that the
	 *	device is present.
	 */

	set_bit(__LINK_STATE_PRESENT, &dev->state);

	dev_init_scheduler(dev);
	dev_hold(dev);
	list_netdevice(dev);

	/* Notify protocols, that a new device appeared. */
	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
	ret = notifier_to_errno(ret);
	if (ret) {
		/* A notifier vetoed the device: undo the registration */
		rollback_registered(dev);
		dev->reg_state = NETREG_UNREGISTERED;
	}

out:
	return ret;

err_uninit:
	/* Failure after ->init() ran: give the driver its ->uninit() call */
	if (dev->uninit)
		dev->uninit(dev);
	goto out;
}
3842
3843/**
3844 *	register_netdev	- register a network device
3845 *	@dev: device to register
3846 *
3847 *	Take a completed network device structure and add it to the kernel
3848 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
3849 *	chain. 0 is returned on success. A negative errno code is returned
3850 *	on a failure to set up the device, or if the name is a duplicate.
3851 *
3852 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
3853 *	and expands the device name if you passed a format string to
3854 *	alloc_netdev.
3855 */
3856int register_netdev(struct net_device *dev)
3857{
3858	int err;
3859
3860	rtnl_lock();
3861
3862	/*
3863	 * If the name is a format string the caller wants us to do a
3864	 * name allocation.
3865	 */
3866	if (strchr(dev->name, '%')) {
3867		err = dev_alloc_name(dev, dev->name);
3868		if (err < 0)
3869			goto out;
3870	}
3871
3872	err = register_netdevice(dev);
3873out:
3874	rtnl_unlock();
3875	return err;
3876}
3877EXPORT_SYMBOL(register_netdev);
3878
3879/*
3880 * netdev_wait_allrefs - wait until all references are gone.
3881 *
3882 * This is called when unregistering network devices.
3883 *
3884 * Any protocol or device that holds a reference should register
3885 * for netdevice notification, and cleanup and put back the
3886 * reference if they receive an UNREGISTER event.
3887 * We can get stuck here if buggy protocols don't correctly
3888 * call dev_put.
3889 */
3890static void netdev_wait_allrefs(struct net_device *dev)
3891{
3892	unsigned long rebroadcast_time, warning_time;
3893
3894	rebroadcast_time = warning_time = jiffies;
3895	while (atomic_read(&dev->refcnt) != 0) {
3896		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
3897			rtnl_lock();
3898
3899			/* Rebroadcast unregister notification */
3900			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
3901
3902			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
3903				     &dev->state)) {
3904				/* We must not have linkwatch events
3905				 * pending on unregister. If this
3906				 * happens, we simply run the queue
3907				 * unscheduled, resulting in a noop
3908				 * for this device.
3909				 */
3910				linkwatch_run_queue();
3911			}
3912
3913			__rtnl_unlock();
3914
3915			rebroadcast_time = jiffies;
3916		}
3917
3918		msleep(250);
3919
3920		if (time_after(jiffies, warning_time + 10 * HZ)) {
3921			printk(KERN_EMERG "unregister_netdevice: "
3922			       "waiting for %s to become free. Usage "
3923			       "count = %d\n",
3924			       dev->name, atomic_read(&dev->refcnt));
3925			warning_time = jiffies;
3926		}
3927	}
3928}
3929
/* The sequence is:
 *
 *	rtnl_lock();
 *	...
 *	register_netdevice(x1);
 *	register_netdevice(x2);
 *	...
 *	unregister_netdevice(y1);
 *	unregister_netdevice(y2);
 *      ...
 *	rtnl_unlock();
 *	free_netdev(y1);
 *	free_netdev(y2);
 *
 * We are invoked by rtnl_unlock() after it drops the semaphore.
 * This allows us to deal with problems:
 * 1) We can delete sysfs objects which invoke hotplug
 *    without deadlocking with linkwatch via keventd.
 * 2) Since we run with the RTNL semaphore not held, we can sleep
 *    safely in order to wait for the netdev refcnt to drop to zero.
 */
static DEFINE_MUTEX(net_todo_run_mutex);
void netdev_run_todo(void)
{
	struct list_head list;

	/* Need to guard against multiple cpu's getting out of order. */
	mutex_lock(&net_todo_run_mutex);

	/* Not safe to do outside the semaphore.  We must not return
	 * until all unregister events invoked by the local processor
	 * have been completed (either by this todo run, or one on
	 * another cpu).
	 */
	if (list_empty(&net_todo_list))
		goto out;

	/* Snapshot list, allow later requests */
	spin_lock(&net_todo_list_lock);
	list_replace_init(&net_todo_list, &list);
	spin_unlock(&net_todo_list_lock);

	/* Finish each snapshotted device in turn */
	while (!list_empty(&list)) {
		struct net_device *dev
			= list_entry(list.next, struct net_device, todo_list);
		list_del(&dev->todo_list);

		/* Only devices left in UNREGISTERING state are expected here */
		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
			printk(KERN_ERR "network todo '%s' but state %d\n",
			       dev->name, dev->reg_state);
			dump_stack();
			continue;
		}

		dev->reg_state = NETREG_UNREGISTERED;

		/* Sleeps until the device refcount has dropped to zero */
		netdev_wait_allrefs(dev);

		/* paranoia */
		BUG_ON(atomic_read(&dev->refcnt));
		BUG_TRAP(!dev->ip_ptr);
		BUG_TRAP(!dev->ip6_ptr);
		BUG_TRAP(!dev->dn_ptr);

		if (dev->destructor)
			dev->destructor(dev);

		/* Free network device */
		kobject_put(&dev->dev.kobj);
	}

out:
	mutex_unlock(&net_todo_run_mutex);
}
4004
/*
 * Statistics accessor returning the counters embedded in the device
 * itself; alloc_netdev_mq() installs it as the ->get_stats hook.
 */
static struct net_device_stats *internal_stats(struct net_device *dev)
{
	return &dev->stats;
}
4009
4010/**
4011 *	alloc_netdev_mq - allocate network device
4012 *	@sizeof_priv:	size of private data to allocate space for
4013 *	@name:		device name format string
4014 *	@setup:		callback to initialize device
4015 *	@queue_count:	the number of subqueues to allocate
4016 *
4017 *	Allocates a struct net_device with private data area for driver use
4018 *	and performs basic initialization.  Also allocates subquue structs
4019 *	for each queue on the device at the end of the netdevice.
4020 */
4021struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
4022		void (*setup)(struct net_device *), unsigned int queue_count)
4023{
4024	void *p;
4025	struct net_device *dev;
4026	int alloc_size;
4027
4028	BUG_ON(strlen(name) >= sizeof(dev->name));
4029
4030	alloc_size = sizeof(struct net_device) +
4031		     sizeof(struct net_device_subqueue) * (queue_count - 1);
4032	if (sizeof_priv) {
4033		/* ensure 32-byte alignment of private area */
4034		alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
4035		alloc_size += sizeof_priv;
4036	}
4037	/* ensure 32-byte alignment of whole construct */
4038	alloc_size += NETDEV_ALIGN_CONST;
4039
4040	p = kzalloc(alloc_size, GFP_KERNEL);
4041	if (!p) {
4042		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
4043		return NULL;
4044	}
4045
4046	dev = (struct net_device *)
4047		(((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
4048	dev->padded = (char *)dev - (char *)p;
4049	dev_net_set(dev, &init_net);
4050
4051	if (sizeof_priv) {
4052		dev->priv = ((char *)dev +
4053			     ((sizeof(struct net_device) +
4054			       (sizeof(struct net_device_subqueue) *
4055				(queue_count - 1)) + NETDEV_ALIGN_CONST)
4056			      & ~NETDEV_ALIGN_CONST));
4057	}
4058
4059	dev->egress_subqueue_count = queue_count;
4060	dev->gso_max_size = GSO_MAX_SIZE;
4061
4062	dev->get_stats = internal_stats;
4063	netpoll_netdev_init(dev);
4064	setup(dev);
4065	strcpy(dev->name, name);
4066	return dev;
4067}
4068EXPORT_SYMBOL(alloc_netdev_mq);
4069
4070/**
4071 *	free_netdev - free network device
4072 *	@dev: device
4073 *
4074 *	This function does the last stage of destroying an allocated device
4075 * 	interface. The reference to the device object is released.
4076 *	If this is the last reference then it will be freed.
4077 */
4078void free_netdev(struct net_device *dev)
4079{
4080	release_net(dev_net(dev));
4081
4082	/*  Compatibility with error handling in drivers */
4083	if (dev->reg_state == NETREG_UNINITIALIZED) {
4084		kfree((char *)dev - dev->padded);
4085		return;
4086	}
4087
4088	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
4089	dev->reg_state = NETREG_RELEASED;
4090
4091	/* will free via device release */
4092	put_device(&dev->dev);
4093}
4094
/*
 * Synchronize with packet receive processing.  May sleep; this is a
 * might_sleep() check followed by synchronize_rcu().
 */
void synchronize_net(void)
{
	might_sleep();
	synchronize_rcu();
}
4101
/**
 *	unregister_netdevice - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.  The teardown is done by
 *	rollback_registered(); the device is then queued on the todo
 *	list so that netdev_run_todo() can finish the job.
 *
 *	Callers must hold the rtnl semaphore.  You may want
 *	unregister_netdev() instead of this.
 */

void unregister_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	rollback_registered(dev);
	/* Finish processing unregister after unlock */
	net_set_todo(dev);
}
4121
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore.  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	/* Per the netdev_run_todo() comment, rtnl_unlock() runs the todo list */
	rtnl_unlock();
}

EXPORT_SYMBOL(unregister_netdev);
4141
/**
 *	dev_change_net_namespace - move device to different network namespace
 *	@dev: device
 *	@net: network namespace
 *	@pat: If not NULL name pattern to try if the current device name
 *	      is already taken in the destination network namespace.
 *
 *	This function shuts down a device interface and moves it
 *	to a new network namespace. On success 0 is returned, on
 *	a failure a negative errno code is returned.
 *
 *	Callers must hold the rtnl semaphore.
 */

int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
	char buf[IFNAMSIZ];
	const char *destname;
	int err;

	ASSERT_RTNL();

	/* Don't allow namespace local devices to be moved. */
	err = -EINVAL;
	if (dev->features & NETIF_F_NETNS_LOCAL)
		goto out;

	/* Ensure the device has been registered */
	err = -EINVAL;
	if (dev->reg_state != NETREG_REGISTERED)
		goto out;

	/* Get out if there is nothing to do */
	err = 0;
	if (net_eq(dev_net(dev), net))
		goto out;

	/* Pick the destination device name, and ensure
	 * we can use it in the destination network namespace.
	 */
	err = -EEXIST;
	destname = dev->name;
	if (__dev_get_by_name(net, destname)) {
		/* We get here if we can't use the current device name */
		if (!pat)
			goto out;
		if (!dev_valid_name(pat))
			goto out;
		/* A '%' in @pat means it is a format to expand into buf */
		if (strchr(pat, '%')) {
			if (__dev_alloc_name(net, pat, buf) < 0)
				goto out;
			destname = buf;
		} else
			destname = pat;
		if (__dev_get_by_name(net, destname))
			goto out;
	}

	/*
	 * And now a mini version of register_netdevice unregister_netdevice.
	 */

	/* If device is running close it first. */
	dev_close(dev);

	/* And unlink it from device chain */
	err = -ENODEV;
	unlist_netdevice(dev);

	synchronize_net();

	/* Shutdown queueing discipline. */
	dev_shutdown(dev);

	/* Notify protocols, that we are about to destroy
	   this device. They should clean all the things.
	*/
	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

	/*
	 *	Flush the unicast and multicast chains
	 */
	dev_addr_discard(dev);

	/* Actually switch the network namespace */
	dev_net_set(dev, net);

	/* Assign the new device name */
	if (destname != dev->name)
		strcpy(dev->name, destname);

	/* If there is an ifindex conflict assign a new one */
	if (__dev_get_by_index(net, dev->ifindex)) {
		/* Remember whether iflink simply mirrored the old ifindex */
		int iflink = (dev->iflink == dev->ifindex);
		dev->ifindex = dev_new_index(net);
		if (iflink)
			dev->iflink = dev->ifindex;
	}

	/* Fixup kobjects */
	netdev_unregister_kobject(dev);
	err = netdev_register_kobject(dev);
	/* A failure here is only warned about; the move carries on */
	WARN_ON(err);

	/* Add the device back in the hashes */
	list_netdevice(dev);

	/* Notify protocols, that a new device appeared. */
	call_netdevice_notifiers(NETDEV_REGISTER, dev);

	synchronize_net();
	err = 0;
out:
	return err;
}
4257
/*
 * CPU notifier callback.  For CPU_DEAD / CPU_DEAD_FROZEN it moves the
 * dead CPU's softnet_data work to the CPU running this callback: the
 * completion queue and output queue are appended to the local ones
 * with interrupts disabled, NET_TX_SOFTIRQ is raised, and packets left
 * on the dead CPU's input queue are re-injected through netif_rx().
 * All other actions are ignored.  Always returns NOTIFY_OK.
 *
 * @ocpu carries the number of the CPU the notification is about.
 */
static int dev_cpu_callback(struct notifier_block *nfb,
			    unsigned long action,
			    void *ocpu)
{
	struct sk_buff **list_skb;
	struct net_device **list_net;
	struct sk_buff *skb;
	unsigned int cpu, oldcpu = (unsigned long)ocpu;
	struct softnet_data *sd, *oldsd;

	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Find end of our output_queue. */
	list_net = &sd->output_queue;
	while (*list_net)
		list_net = &(*list_net)->next_sched;
	/* Append output queue from offline CPU. */
	*list_net = oldsd->output_queue;
	oldsd->output_queue = NULL;

	/* Raise the TX softirq after splicing the queues */
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

	/* Process offline CPU's input_pkt_queue */
	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
		netif_rx(skb);

	return NOTIFY_OK;
}
4301
4302#ifdef CONFIG_NET_DMA
4303/**
4304 * net_dma_rebalance - try to maintain one DMA channel per CPU
4305 * @net_dma: DMA client and associated data (lock, channels, channel_mask)
4306 *
4307 * This is called when the number of channels allocated to the net_dma client
4308 * changes.  The net_dma client tries to have one DMA channel per CPU.
4309 */
4310
4311static void net_dma_rebalance(struct net_dma *net_dma)
4312{
4313	unsigned int cpu, i, n, chan_idx;
4314	struct dma_chan *chan;
4315
4316	if (cpus_empty(net_dma->channel_mask)) {
4317		for_each_online_cpu(cpu)
4318			rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma, NULL);
4319		return;
4320	}
4321
4322	i = 0;
4323	cpu = first_cpu(cpu_online_map);
4324
4325	for_each_cpu_mask(chan_idx, net_dma->channel_mask) {
4326		chan = net_dma->channels[chan_idx];
4327
4328		n = ((num_online_cpus() / cpus_weight(net_dma->channel_mask))
4329		   + (i < (num_online_cpus() %
4330			cpus_weight(net_dma->channel_mask)) ? 1 : 0));
4331
4332		while(n) {
4333			per_cpu(softnet_data, cpu).net_dma = chan;
4334			cpu = next_cpu(cpu, cpu_online_map);
4335			n--;
4336		}
4337		i++;
4338	}
4339}
4340
4341/**
4342 * netdev_dma_event - event callback for the net_dma_client
4343 * @client: should always be net_dma_client
4344 * @chan: DMA channel for the event
4345 * @state: DMA state to be handled
4346 */
4347static enum dma_state_client
4348netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
4349	enum dma_state state)
4350{
4351	int i, found = 0, pos = -1;
4352	struct net_dma *net_dma =
4353		container_of(client, struct net_dma, client);
4354	enum dma_state_client ack = DMA_DUP; /* default: take no action */
4355
4356	spin_lock(&net_dma->lock);
4357	switch (state) {
4358	case DMA_RESOURCE_AVAILABLE:
4359		for (i = 0; i < nr_cpu_ids; i++)
4360			if (net_dma->channels[i] == chan) {
4361				found = 1;
4362				break;
4363			} else if (net_dma->channels[i] == NULL && pos < 0)
4364				pos = i;
4365
4366		if (!found && pos >= 0) {
4367			ack = DMA_ACK;
4368			net_dma->channels[pos] = chan;
4369			cpu_set(pos, net_dma->channel_mask);
4370			net_dma_rebalance(net_dma);
4371		}
4372		break;
4373	case DMA_RESOURCE_REMOVED:
4374		for (i = 0; i < nr_cpu_ids; i++)
4375			if (net_dma->channels[i] == chan) {
4376				found = 1;
4377				pos = i;
4378				break;
4379			}
4380
4381		if (found) {
4382			ack = DMA_ACK;
4383			cpu_clear(pos, net_dma->channel_mask);
4384			net_dma->channels[i] = NULL;
4385			net_dma_rebalance(net_dma);
4386		}
4387		break;
4388	default:
4389		break;
4390	}
4391	spin_unlock(&net_dma->lock);
4392
4393	return ack;
4394}
4395
4396/**
4397 * netdev_dma_regiser - register the networking subsystem as a DMA client
4398 */
4399static int __init netdev_dma_register(void)
4400{
4401	net_dma.channels = kzalloc(nr_cpu_ids * sizeof(struct net_dma),
4402								GFP_KERNEL);
4403	if (unlikely(!net_dma.channels)) {
4404		printk(KERN_NOTICE
4405				"netdev_dma: no memory for net_dma.channels\n");
4406		return -ENOMEM;
4407	}
4408	spin_lock_init(&net_dma.lock);
4409	dma_cap_set(DMA_MEMCPY, net_dma.client.cap_mask);
4410	dma_async_client_register(&net_dma.client);
4411	dma_async_client_chan_request(&net_dma.client);
4412	return 0;
4413}
4414
4415#else
/* Stub for kernels built without CONFIG_NET_DMA: always -ENODEV. */
static int __init netdev_dma_register(void) { return -ENODEV; }
4417#endif /* CONFIG_NET_DMA */
4418
4419/**
4420 *	netdev_compute_feature - compute conjunction of two feature sets
4421 *	@all: first feature set
4422 *	@one: second feature set
4423 *
4424 *	Computes a new feature set after adding a device with feature set
4425 *	@one to the master device with current feature set @all.  Returns
4426 *	the new feature set.
4427 */
4428int netdev_compute_features(unsigned long all, unsigned long one)
4429{
4430	/* if device needs checksumming, downgrade to hw checksumming */
4431	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
4432		all ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM;
4433
4434	/* if device can't do all checksum, downgrade to ipv4/ipv6 */
4435	if (all & NETIF_F_HW_CSUM && !(one & NETIF_F_HW_CSUM))
4436		all ^= NETIF_F_HW_CSUM
4437			| NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
4438
4439	if (one & NETIF_F_GSO)
4440		one |= NETIF_F_GSO_SOFTWARE;
4441	one |= NETIF_F_GSO;
4442
4443	/* If even one device supports robust GSO, enable it for all. */
4444	if (one & NETIF_F_GSO_ROBUST)
4445		all |= NETIF_F_GSO_ROBUST;
4446
4447	all &= one | NETIF_F_LLTX;
4448
4449	if (!(all & NETIF_F_ALL_CSUM))
4450		all &= ~NETIF_F_SG;
4451	if (!(all & NETIF_F_SG))
4452		all &= ~NETIF_F_GSO_MASK;
4453
4454	return all;
4455}
4456EXPORT_SYMBOL(netdev_compute_features);
4457
4458static struct hlist_head *netdev_create_hash(void)
4459{
4460	int i;
4461	struct hlist_head *hash;
4462
4463	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
4464	if (hash != NULL)
4465		for (i = 0; i < NETDEV_HASHENTRIES; i++)
4466			INIT_HLIST_HEAD(&hash[i]);
4467
4468	return hash;
4469}
4470
4471/* Initialize per network namespace state */
4472static int __net_init netdev_init(struct net *net)
4473{
4474	INIT_LIST_HEAD(&net->dev_base_head);
4475
4476	net->dev_name_head = netdev_create_hash();
4477	if (net->dev_name_head == NULL)
4478		goto err_name;
4479
4480	net->dev_index_head = netdev_create_hash();
4481	if (net->dev_index_head == NULL)
4482		goto err_idx;
4483
4484	return 0;
4485
4486err_idx:
4487	kfree(net->dev_name_head);
4488err_name:
4489	return -ENOMEM;
4490}
4491
4492static void __net_exit netdev_exit(struct net *net)
4493{
4494	kfree(net->dev_name_head);
4495	kfree(net->dev_index_head);
4496}
4497
4498static struct pernet_operations __net_initdata netdev_net_ops = {
4499	.init = netdev_init,
4500	.exit = netdev_exit,
4501};
4502
4503static void __net_exit default_device_exit(struct net *net)
4504{
4505	struct net_device *dev, *next;
4506	/*
4507	 * Push all migratable of the network devices back to the
4508	 * initial network namespace
4509	 */
4510	rtnl_lock();
4511	for_each_netdev_safe(net, dev, next) {
4512		int err;
4513		char fb_name[IFNAMSIZ];
4514
4515		/* Ignore unmoveable devices (i.e. loopback) */
4516		if (dev->features & NETIF_F_NETNS_LOCAL)
4517			continue;
4518
4519		/* Push remaing network devices to init_net */
4520		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
4521		err = dev_change_net_namespace(dev, &init_net, fb_name);
4522		if (err) {
4523			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
4524				__func__, dev->name, err);
4525			BUG();
4526		}
4527	}
4528	rtnl_unlock();
4529}
4530
4531static struct pernet_operations __net_initdata default_device_ops = {
4532	.exit = default_device_exit,
4533};
4534
4535/*
4536 *	Initialize the DEV module. At boot time this walks the device list and
4537 *	unhooks any devices that fail to initialise (normally hardware not
4538 *	present) and leaves us with a valid list of present and active devices.
4539 *
4540 */
4541
4542/*
4543 *       This is called single threaded during boot, so no need
4544 *       to take the rtnl semaphore.
4545 */
4546static int __init net_dev_init(void)
4547{
4548	int i, rc = -ENOMEM;
4549
4550	BUG_ON(!dev_boot_phase);
4551
4552	if (dev_proc_init())
4553		goto out;
4554
4555	if (netdev_kobject_init())
4556		goto out;
4557
4558	INIT_LIST_HEAD(&ptype_all);
4559	for (i = 0; i < PTYPE_HASH_SIZE; i++)
4560		INIT_LIST_HEAD(&ptype_base[i]);
4561
4562	if (register_pernet_subsys(&netdev_net_ops))
4563		goto out;
4564
4565	if (register_pernet_device(&default_device_ops))
4566		goto out;
4567
4568	/*
4569	 *	Initialise the packet receive queues.
4570	 */
4571
4572	for_each_possible_cpu(i) {
4573		struct softnet_data *queue;
4574
4575		queue = &per_cpu(softnet_data, i);
4576		skb_queue_head_init(&queue->input_pkt_queue);
4577		queue->completion_queue = NULL;
4578		INIT_LIST_HEAD(&queue->poll_list);
4579
4580		queue->backlog.poll = process_backlog;
4581		queue->backlog.weight = weight_p;
4582	}
4583
4584	netdev_dma_register();
4585
4586	dev_boot_phase = 0;
4587
4588	open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
4589	open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
4590
4591	hotcpu_notifier(dev_cpu_callback, 0);
4592	dst_init();
4593	dev_mcast_init();
4594	rc = 0;
4595out:
4596	return rc;
4597}
4598
4599subsys_initcall(net_dev_init);
4600
/*
 * Symbols exported from this file.
 */
EXPORT_SYMBOL(__dev_get_by_index);
EXPORT_SYMBOL(__dev_get_by_name);
EXPORT_SYMBOL(__dev_remove_pack);
EXPORT_SYMBOL(dev_valid_name);
EXPORT_SYMBOL(dev_add_pack);
EXPORT_SYMBOL(dev_alloc_name);
EXPORT_SYMBOL(dev_close);
EXPORT_SYMBOL(dev_get_by_flags);
EXPORT_SYMBOL(dev_get_by_index);
EXPORT_SYMBOL(dev_get_by_name);
EXPORT_SYMBOL(dev_open);
EXPORT_SYMBOL(dev_queue_xmit);
EXPORT_SYMBOL(dev_remove_pack);
EXPORT_SYMBOL(dev_set_allmulti);
EXPORT_SYMBOL(dev_set_promiscuity);
EXPORT_SYMBOL(dev_change_flags);
EXPORT_SYMBOL(dev_set_mtu);
EXPORT_SYMBOL(dev_set_mac_address);
EXPORT_SYMBOL(free_netdev);
EXPORT_SYMBOL(netdev_boot_setup_check);
EXPORT_SYMBOL(netdev_set_master);
EXPORT_SYMBOL(netdev_state_change);
EXPORT_SYMBOL(netif_receive_skb);
EXPORT_SYMBOL(netif_rx);
EXPORT_SYMBOL(register_gifconf);
EXPORT_SYMBOL(register_netdevice);
EXPORT_SYMBOL(register_netdevice_notifier);
EXPORT_SYMBOL(skb_checksum_help);
EXPORT_SYMBOL(synchronize_net);
EXPORT_SYMBOL(unregister_netdevice);
EXPORT_SYMBOL(unregister_netdevice_notifier);
EXPORT_SYMBOL(net_enable_timestamp);
EXPORT_SYMBOL(net_disable_timestamp);
EXPORT_SYMBOL(dev_get_flags);

/* Bridge hooks: exported only when bridging is built in or as a module. */
#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
EXPORT_SYMBOL(br_handle_frame_hook);
EXPORT_SYMBOL(br_fdb_get_hook);
EXPORT_SYMBOL(br_fdb_put_hook);
#endif

/* dev_load is exported only when CONFIG_KMOD is set. */
#ifdef CONFIG_KMOD
EXPORT_SYMBOL(dev_load);
#endif

/* Per-CPU softnet_data, exported as a per-CPU symbol. */
EXPORT_PER_CPU_SYMBOL(softnet_data);