net/core/dev.c at v2.6.23 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / core / dev.c
at v2.6.23 102 kB view raw
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <asm/system.h>
  77#include <linux/bitops.h>
  78#include <linux/capability.h>
  79#include <linux/cpu.h>
  80#include <linux/types.h>
  81#include <linux/kernel.h>
  82#include <linux/sched.h>
  83#include <linux/mutex.h>
  84#include <linux/string.h>
  85#include <linux/mm.h>
  86#include <linux/socket.h>
  87#include <linux/sockios.h>
  88#include <linux/errno.h>
  89#include <linux/interrupt.h>
  90#include <linux/if_ether.h>
  91#include <linux/netdevice.h>
  92#include <linux/etherdevice.h>
  93#include <linux/notifier.h>
  94#include <linux/skbuff.h>
  95#include <net/sock.h>
  96#include <linux/rtnetlink.h>
  97#include <linux/proc_fs.h>
  98#include <linux/seq_file.h>
  99#include <linux/stat.h>
 100#include <linux/if_bridge.h>
 101#include <linux/if_macvlan.h>
 102#include <net/dst.h>
 103#include <net/pkt_sched.h>
 104#include <net/checksum.h>
 105#include <linux/highmem.h>
 106#include <linux/init.h>
 107#include <linux/kmod.h>
 108#include <linux/module.h>
 109#include <linux/kallsyms.h>
 110#include <linux/netpoll.h>
 111#include <linux/rcupdate.h>
 112#include <linux/delay.h>
 113#include <net/wext.h>
 114#include <net/iw_handler.h>
 115#include <asm/current.h>
 116#include <linux/audit.h>
 117#include <linux/dmaengine.h>
 118#include <linux/err.h>
 119#include <linux/ctype.h>
 120#include <linux/if_arp.h>
 121
 122/*
 123 *	The list of packet types we will receive (as opposed to discard)
 124 *	and the routines to invoke.
 125 *
 126 *	Why 16. Because with 16 the only overlap we get on a hash of the
 127 *	low nibble of the protocol value is RARP/SNAP/X.25.
 128 *
 129 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 130 *             sure which should go first, but I bet it won't make much
 131 *             difference if we are running VLANs.  The good news is that
 132 *             this protocol won't be in the list unless compiled in, so
 133 *             the average user (w/out VLANs) will not be adversely affected.
 134 *             --BLG
 135 *
 136 *		0800	IP
 137 *		8100    802.1Q VLAN
 138 *		0001	802.3
 139 *		0002	AX.25
 140 *		0004	802.2
 141 *		8035	RARP
 142 *		0005	SNAP
 143 *		0805	X.25
 144 *		0806	ARP
 145 *		8137	IPX
 146 *		0009	Localtalk
 147 *		86DD	IPv6
 148 */
 149
/* Taken (with bottom halves disabled) by the add/remove routines below. */
static DEFINE_SPINLOCK(ptype_lock);
/* Per-protocol handlers; the bucket index is ntohs(type) & 15. */
static struct list_head ptype_base[16] __read_mostly;	/* 16 way hashed list */
/* Handlers registered with type ETH_P_ALL. */
static struct list_head ptype_all __read_mostly;	/* Taps */
 153
#ifdef CONFIG_NET_DMA
/* Bookkeeping for this file's DMA engine client. */
struct net_dma {
	struct dma_client client;
	spinlock_t lock;
	cpumask_t channel_mask;
	/* NR_CPUS slots; presumably one channel per CPU -- TODO confirm */
	struct dma_chan *channels[NR_CPUS];
};

/* Forward declaration so the initializer below can reference the callback. */
static enum dma_state_client
netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
	enum dma_state state);

/* The single client instance; only the event callback is preset here. */
static struct net_dma net_dma = {
	.client = {
		.event_callback = netdev_dma_event,
	},
};
#endif
 172
 173/*
 174 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 175 * semaphore.
 176 *
 177 * Pure readers hold dev_base_lock for reading.
 178 *
 179 * Writers must hold the rtnl semaphore while they loop through the
 180 * dev_base_head list, and hold dev_base_lock for writing when they do the
 181 * actual updates.  This allows pure readers to access the list even
 182 * while a writer is preparing to update it.
 183 *
 184 * To put it another way, dev_base_lock is held for writing only to
 185 * protect against pure readers; the rtnl semaphore provides the
 186 * protection against other writers.
 187 *
 188 * See, for example usages, register_netdevice() and
 189 * unregister_netdevice(), which must be called with the rtnl
 190 * semaphore held.
 191 */
/* List of network devices and the rwlock that guards it for pure readers. */
LIST_HEAD(dev_base_head);
DEFINE_RWLOCK(dev_base_lock);

EXPORT_SYMBOL(dev_base_head);
EXPORT_SYMBOL(dev_base_lock);

/* Both lookup tables below have 1 << NETDEV_HASHBITS buckets. */
#define NETDEV_HASHBITS	8
/* Devices hashed by name; see dev_name_hash(). */
static struct hlist_head dev_name_head[1<<NETDEV_HASHBITS];
/* Devices hashed by ifindex; see dev_index_hash(). */
static struct hlist_head dev_index_head[1<<NETDEV_HASHBITS];
 201
 202static inline struct hlist_head *dev_name_hash(const char *name)
 203{
 204	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 205	return &dev_name_head[hash & ((1<<NETDEV_HASHBITS)-1)];
 206}
 207
 208static inline struct hlist_head *dev_index_hash(int ifindex)
 209{
 210	return &dev_index_head[ifindex & ((1<<NETDEV_HASHBITS)-1)];
 211}
 212
/*
 *	Our notifier list.  Device events (NETDEV_UP, NETDEV_DOWN,
 *	NETDEV_CHANGENAME, ...) are delivered through this chain with
 *	raw_notifier_call_chain().
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.  One instance exists per CPU.
 */
DEFINE_PER_CPU(struct softnet_data, softnet_data) = { NULL };
 224
/*
 * sysfs hooks.  With CONFIG_SYSFS they are real functions defined outside
 * this file; without it the init and register hooks evaluate to 0 and the
 * unregister hook is an empty statement.
 */
#ifdef CONFIG_SYSFS
extern int netdev_sysfs_init(void);
extern int netdev_register_sysfs(struct net_device *);
extern void netdev_unregister_sysfs(struct net_device *);
#else
#define netdev_sysfs_init()	 	(0)
#define netdev_register_sysfs(dev)	(0)
#define	netdev_unregister_sysfs(dev)	do { } while(0)
#endif
 234
 235#ifdef CONFIG_DEBUG_LOCK_ALLOC
 236/*
 237 * register_netdevice() inits dev->_xmit_lock and sets lockdep class
 238 * according to dev->type
 239 */
/*
 * Device types that get their own lockdep class.  The final entry
 * (ARPHRD_NONE) doubles as the fallback slot for types not listed here,
 * see netdev_lock_pos().
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_VOID,
	 ARPHRD_NONE};

/*
 * Lockdep class names.  Entry i names the class for netdev_lock_type[i],
 * so the two tables must be kept in the same order.
 */
static const char *netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_VOID",
	 "_xmit_NONE"};

/* One lock class key per entry of netdev_lock_type[]. */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 275
 276static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 277{
 278	int i;
 279
 280	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 281		if (netdev_lock_type[i] == dev_type)
 282			return i;
 283	/* the last key is used by default */
 284	return ARRAY_SIZE(netdev_lock_type) - 1;
 285}
 286
 287static inline void netdev_set_lockdep_class(spinlock_t *lock,
 288					    unsigned short dev_type)
 289{
 290	int i;
 291
 292	i = netdev_lock_pos(dev_type);
 293	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 294				   netdev_lock_name[i]);
 295}
 296#else
/* CONFIG_DEBUG_LOCK_ALLOC is off: there is no lock class to assign. */
static inline void netdev_set_lockdep_class(spinlock_t *lock,
					    unsigned short dev_type)
{
}
 301#endif
 302
 303/*******************************************************************************
 304
 305		Protocol management and registration routines
 306
 307*******************************************************************************/
 308
 309/*
 310 *	Add a protocol ID to the list. Now that the input handler is
 311 *	smarter we can dispense with all the messy stuff that used to be
 312 *	here.
 313 *
 314 *	BEWARE!!! Protocol handlers, mangling input packets,
 315 *	MUST BE last in hash buckets and checking protocol handlers
 316 *	MUST start from promiscuous ptype_all chain in net_bh.
 317 *	It is true now, do not change it.
 318 *	Explanation follows: if protocol handler, mangling packet, will
 319 *	be the first on list, it is not able to sense, that packet
 320 *	is cloned and should be copied-on-write, so that it will
 321 *	change it and subsequent readers will get broken packet.
 322 *							--ANK (980803)
 323 */
 324
 325/**
 326 *	dev_add_pack - add packet handler
 327 *	@pt: packet type declaration
 328 *
 329 *	Add a protocol handler to the networking stack. The passed &packet_type
 330 *	is linked into kernel lists and may not be freed until it has been
 331 *	removed from the kernel lists.
 332 *
 333 *	This call does not sleep therefore it can not
 334 *	guarantee all CPU's that are in middle of receiving packets
 335 *	will see the new packet type (until the next received packet).
 336 */
 337
 338void dev_add_pack(struct packet_type *pt)
 339{
 340	int hash;
 341
 342	spin_lock_bh(&ptype_lock);
 343	if (pt->type == htons(ETH_P_ALL))
 344		list_add_rcu(&pt->list, &ptype_all);
 345	else {
 346		hash = ntohs(pt->type) & 15;
 347		list_add_rcu(&pt->list, &ptype_base[hash]);
 348	}
 349	spin_unlock_bh(&ptype_lock);
 350}
 351
 352/**
 353 *	__dev_remove_pack	 - remove packet handler
 354 *	@pt: packet type declaration
 355 *
 356 *	Remove a protocol handler that was previously added to the kernel
 357 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 358 *	from the kernel lists and can be freed or reused once this function
 359 *	returns.
 360 *
 361 *      The packet type might still be in use by receivers
 362 *	and must not be freed until after all the CPU's have gone
 363 *	through a quiescent state.
 364 */
 365void __dev_remove_pack(struct packet_type *pt)
 366{
 367	struct list_head *head;
 368	struct packet_type *pt1;
 369
 370	spin_lock_bh(&ptype_lock);
 371
 372	if (pt->type == htons(ETH_P_ALL))
 373		head = &ptype_all;
 374	else
 375		head = &ptype_base[ntohs(pt->type) & 15];
 376
 377	list_for_each_entry(pt1, head, list) {
 378		if (pt == pt1) {
 379			list_del_rcu(&pt->list);
 380			goto out;
 381		}
 382	}
 383
 384	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 385out:
 386	spin_unlock_bh(&ptype_lock);
 387}
/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	/* unlink first ... */
	__dev_remove_pack(pt);

	/* ... then wait, so no receiver still references @pt on return */
	synchronize_net();
}
 406
 407/******************************************************************************
 408
 409		      Device Boot-time Settings Routines
 410
 411*******************************************************************************/
 412
/*
 * Boot time configuration table.  A slot whose name starts with '\0' or
 * a blank is treated as unused by the routines below.
 */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 415
 416/**
 417 *	netdev_boot_setup_add	- add new setup entry
 418 *	@name: name of the device
 419 *	@map: configured settings for the device
 420 *
 421 *	Adds new setup entry to the dev_boot_setup list.  The function
 422 *	returns 0 on error and 1 on success.  This is a generic routine to
 423 *	all netdevices.
 424 */
 425static int netdev_boot_setup_add(char *name, struct ifmap *map)
 426{
 427	struct netdev_boot_setup *s;
 428	int i;
 429
 430	s = dev_boot_setup;
 431	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 432		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 433			memset(s[i].name, 0, sizeof(s[i].name));
 434			strcpy(s[i].name, name);
 435			memcpy(&s[i].map, map, sizeof(s[i].map));
 436			break;
 437		}
 438	}
 439
 440	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 441}
 442
 443/**
 444 *	netdev_boot_setup_check	- check boot time settings
 445 *	@dev: the netdevice
 446 *
 447 * 	Check boot time settings for the device.
 448 *	The found settings are set for the device to be used
 449 *	later in the device probing.
 450 *	Returns 0 if no settings found, 1 if they are.
 451 */
 452int netdev_boot_setup_check(struct net_device *dev)
 453{
 454	struct netdev_boot_setup *s = dev_boot_setup;
 455	int i;
 456
 457	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 458		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 459		    !strncmp(dev->name, s[i].name, strlen(s[i].name))) {
 460			dev->irq 	= s[i].map.irq;
 461			dev->base_addr 	= s[i].map.base_addr;
 462			dev->mem_start 	= s[i].map.mem_start;
 463			dev->mem_end 	= s[i].map.mem_end;
 464			return 1;
 465		}
 466	}
 467	return 0;
 468}
 469
 470
 471/**
 472 *	netdev_boot_base	- get address from boot time settings
 473 *	@prefix: prefix for network device
 474 *	@unit: id for network device
 475 *
 476 * 	Check boot time settings for the base address of device.
 477 *	The found settings are set for the device to be used
 478 *	later in the device probing.
 479 *	Returns 0 if no settings found.
 480 */
 481unsigned long netdev_boot_base(const char *prefix, int unit)
 482{
 483	const struct netdev_boot_setup *s = dev_boot_setup;
 484	char name[IFNAMSIZ];
 485	int i;
 486
 487	sprintf(name, "%s%d", prefix, unit);
 488
 489	/*
 490	 * If device already registered then return base of 1
 491	 * to indicate not to probe for this interface
 492	 */
 493	if (__dev_get_by_name(name))
 494		return 1;
 495
 496	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 497		if (!strcmp(name, s[i].name))
 498			return s[i].map.base_addr;
 499	return 0;
 500}
 501
 502/*
 503 * Saves at boot time configured settings for any netdevice.
 504 */
 505int __init netdev_boot_setup(char *str)
 506{
 507	int ints[5];
 508	struct ifmap map;
 509
 510	str = get_options(str, ARRAY_SIZE(ints), ints);
 511	if (!str || !*str)
 512		return 0;
 513
 514	/* Save settings */
 515	memset(&map, 0, sizeof(map));
 516	if (ints[0] > 0)
 517		map.irq = ints[1];
 518	if (ints[0] > 1)
 519		map.base_addr = ints[2];
 520	if (ints[0] > 2)
 521		map.mem_start = ints[3];
 522	if (ints[0] > 3)
 523		map.mem_end = ints[4];
 524
 525	/* Add new entry to the list */
 526	return netdev_boot_setup_add(str, &map);
 527}
 528
 529__setup("netdev=", netdev_boot_setup);
 530
 531/*******************************************************************************
 532
 533			    Device Interface Subroutines
 534
 535*******************************************************************************/
 536
 537/**
 538 *	__dev_get_by_name	- find a device by its name
 539 *	@name: name to find
 540 *
 541 *	Find an interface by name. Must be called under RTNL semaphore
 542 *	or @dev_base_lock. If the name is found a pointer to the device
 543 *	is returned. If the name is not found then %NULL is returned. The
 544 *	reference counters are not incremented so the caller must be
 545 *	careful with locks.
 546 */
 547
 548struct net_device *__dev_get_by_name(const char *name)
 549{
 550	struct hlist_node *p;
 551
 552	hlist_for_each(p, dev_name_hash(name)) {
 553		struct net_device *dev
 554			= hlist_entry(p, struct net_device, name_hlist);
 555		if (!strncmp(dev->name, name, IFNAMSIZ))
 556			return dev;
 557	}
 558	return NULL;
 559}
 560
 561/**
 562 *	dev_get_by_name		- find a device by its name
 563 *	@name: name to find
 564 *
 565 *	Find an interface by name. This can be called from any
 566 *	context and does its own locking. The returned handle has
 567 *	the usage count incremented and the caller must use dev_put() to
 568 *	release it when it is no longer needed. %NULL is returned if no
 569 *	matching device is found.
 570 */
 571
 572struct net_device *dev_get_by_name(const char *name)
 573{
 574	struct net_device *dev;
 575
 576	read_lock(&dev_base_lock);
 577	dev = __dev_get_by_name(name);
 578	if (dev)
 579		dev_hold(dev);
 580	read_unlock(&dev_base_lock);
 581	return dev;
 582}
 583
 584/**
 585 *	__dev_get_by_index - find a device by its ifindex
 586 *	@ifindex: index of device
 587 *
 588 *	Search for an interface by index. Returns %NULL if the device
 589 *	is not found or a pointer to the device. The device has not
 590 *	had its reference counter increased so the caller must be careful
 591 *	about locking. The caller must hold either the RTNL semaphore
 592 *	or @dev_base_lock.
 593 */
 594
 595struct net_device *__dev_get_by_index(int ifindex)
 596{
 597	struct hlist_node *p;
 598
 599	hlist_for_each(p, dev_index_hash(ifindex)) {
 600		struct net_device *dev
 601			= hlist_entry(p, struct net_device, index_hlist);
 602		if (dev->ifindex == ifindex)
 603			return dev;
 604	}
 605	return NULL;
 606}
 607
 608
 609/**
 610 *	dev_get_by_index - find a device by its ifindex
 611 *	@ifindex: index of device
 612 *
 613 *	Search for an interface by index. Returns NULL if the device
 614 *	is not found or a pointer to the device. The device returned has
 615 *	had a reference added and the pointer is safe until the user calls
 616 *	dev_put to indicate they have finished with it.
 617 */
 618
 619struct net_device *dev_get_by_index(int ifindex)
 620{
 621	struct net_device *dev;
 622
 623	read_lock(&dev_base_lock);
 624	dev = __dev_get_by_index(ifindex);
 625	if (dev)
 626		dev_hold(dev);
 627	read_unlock(&dev_base_lock);
 628	return dev;
 629}
 630
 631/**
 632 *	dev_getbyhwaddr - find a device by its hardware address
 633 *	@type: media type of device
 634 *	@ha: hardware address
 635 *
 636 *	Search for an interface by MAC address. Returns NULL if the device
 637 *	is not found or a pointer to the device. The caller must hold the
 638 *	rtnl semaphore. The returned device has not had its ref count increased
 639 *	and the caller must therefore be careful about locking
 640 *
 641 *	BUGS:
 642 *	If the API was consistent this would be __dev_get_by_hwaddr
 643 */
 644
 645struct net_device *dev_getbyhwaddr(unsigned short type, char *ha)
 646{
 647	struct net_device *dev;
 648
 649	ASSERT_RTNL();
 650
 651	for_each_netdev(dev)
 652		if (dev->type == type &&
 653		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 654			return dev;
 655
 656	return NULL;
 657}
 658
 659EXPORT_SYMBOL(dev_getbyhwaddr);
 660
 661struct net_device *__dev_getfirstbyhwtype(unsigned short type)
 662{
 663	struct net_device *dev;
 664
 665	ASSERT_RTNL();
 666	for_each_netdev(dev)
 667		if (dev->type == type)
 668			return dev;
 669
 670	return NULL;
 671}
 672
 673EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 674
 675struct net_device *dev_getfirstbyhwtype(unsigned short type)
 676{
 677	struct net_device *dev;
 678
 679	rtnl_lock();
 680	dev = __dev_getfirstbyhwtype(type);
 681	if (dev)
 682		dev_hold(dev);
 683	rtnl_unlock();
 684	return dev;
 685}
 686
 687EXPORT_SYMBOL(dev_getfirstbyhwtype);
 688
 689/**
 690 *	dev_get_by_flags - find any device with given flags
 691 *	@if_flags: IFF_* values
 692 *	@mask: bitmask of bits in if_flags to check
 693 *
 694 *	Search for any interface with the given flags. Returns NULL if a device
 695 *	is not found or a pointer to the device. The device returned has
 696 *	had a reference added and the pointer is safe until the user calls
 697 *	dev_put to indicate they have finished with it.
 698 */
 699
 700struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mask)
 701{
 702	struct net_device *dev, *ret;
 703
 704	ret = NULL;
 705	read_lock(&dev_base_lock);
 706	for_each_netdev(dev) {
 707		if (((dev->flags ^ if_flags) & mask) == 0) {
 708			dev_hold(dev);
 709			ret = dev;
 710			break;
 711		}
 712	}
 713	read_unlock(&dev_base_lock);
 714	return ret;
 715}
 716
 717/**
 718 *	dev_valid_name - check if name is okay for network device
 719 *	@name: name string
 720 *
 721 *	Network device names need to be valid file names to
 722 *	to allow sysfs to work.  We also disallow any kind of
 723 *	whitespace.
 724 */
 725int dev_valid_name(const char *name)
 726{
 727	if (*name == '\0')
 728		return 0;
 729	if (strlen(name) >= IFNAMSIZ)
 730		return 0;
 731	if (!strcmp(name, ".") || !strcmp(name, ".."))
 732		return 0;
 733
 734	while (*name) {
 735		if (*name == '/' || isspace(*name))
 736			return 0;
 737		name++;
 738	}
 739	return 1;
 740}
 741
/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	If @name contains no '%' it is used as is, with unit number 0.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code:
 *	-EINVAL for a malformed format, -ENOMEM if the scratch page cannot
 *	be allocated, -ENFILE if the resulting name is already taken.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	int i = 0;
	char buf[IFNAMSIZ];
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be exactly one "%d" and no other
		 * "%" characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		/* Mark every unit number already used with this format. */
		for_each_netdev(d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/*  avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, sizeof(buf), name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		/* Lowest unit number that no existing device uses. */
		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	/* Build the candidate name and make sure it really is unused. */
	snprintf(buf, sizeof(buf), name, i);
	if (!__dev_get_by_name(buf)) {
		strlcpy(dev->name, buf, IFNAMSIZ);
		return i;
	}

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}
 808
 809
 810/**
 811 *	dev_change_name - change name of a device
 812 *	@dev: device
 813 *	@newname: name (or format string) must be at least IFNAMSIZ
 814 *
 815 *	Change name of a device, can pass format strings "eth%d".
 816 *	for wildcarding.
 817 */
 818int dev_change_name(struct net_device *dev, char *newname)
 819{
 820	char oldname[IFNAMSIZ];
 821	int err = 0;
 822	int ret;
 823
 824	ASSERT_RTNL();
 825
 826	if (dev->flags & IFF_UP)
 827		return -EBUSY;
 828
 829	if (!dev_valid_name(newname))
 830		return -EINVAL;
 831
 832	memcpy(oldname, dev->name, IFNAMSIZ);
 833
 834	if (strchr(newname, '%')) {
 835		err = dev_alloc_name(dev, newname);
 836		if (err < 0)
 837			return err;
 838		strcpy(newname, dev->name);
 839	}
 840	else if (__dev_get_by_name(newname))
 841		return -EEXIST;
 842	else
 843		strlcpy(dev->name, newname, IFNAMSIZ);
 844
 845rollback:
 846	device_rename(&dev->dev, dev->name);
 847
 848	write_lock_bh(&dev_base_lock);
 849	hlist_del(&dev->name_hlist);
 850	hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name));
 851	write_unlock_bh(&dev_base_lock);
 852
 853	ret = raw_notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev);
 854	ret = notifier_to_errno(ret);
 855
 856	if (ret) {
 857		if (err) {
 858			printk(KERN_ERR
 859			       "%s: name change rollback failed: %d.\n",
 860			       dev->name, ret);
 861		} else {
 862			err = ret;
 863			memcpy(dev->name, oldname, IFNAMSIZ);
 864			goto rollback;
 865		}
 866	}
 867
 868	return err;
 869}
 870
/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.  Sends
 *	%NETDEV_FEAT_CHANGE for @dev down the netdev notifier chain.
 */
void netdev_features_change(struct net_device *dev)
{
	raw_notifier_call_chain(&netdev_chain, NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);
 882
 883/**
 884 *	netdev_state_change - device changes state
 885 *	@dev: device to cause notification
 886 *
 887 *	Called to indicate a device has changed state. This function calls
 888 *	the notifier chains for netdev_chain and sends a NEWLINK message
 889 *	to the routing socket.
 890 */
 891void netdev_state_change(struct net_device *dev)
 892{
 893	if (dev->flags & IFF_UP) {
 894		raw_notifier_call_chain(&netdev_chain,
 895				NETDEV_CHANGE, dev);
 896		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
 897	}
 898}
 899
 900/**
 901 *	dev_load 	- load a network module
 902 *	@name: name of interface
 903 *
 904 *	If a network interface is not present and the process has suitable
 905 *	privileges this function loads the module. If module loading is not
 906 *	available in this kernel then it becomes a nop.
 907 */
 908
 909void dev_load(const char *name)
 910{
 911	struct net_device *dev;
 912
 913	read_lock(&dev_base_lock);
 914	dev = __dev_get_by_name(name);
 915	read_unlock(&dev_base_lock);
 916
 917	if (!dev && capable(CAP_SYS_MODULE))
 918		request_module("%s", name);
 919}
 920
 921static int default_rebuild_header(struct sk_buff *skb)
 922{
 923	printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n",
 924	       skb->dev ? skb->dev->name : "NULL!!!");
 925	kfree_skb(skb);
 926	return 1;
 927}
 928
 929/**
 930 *	dev_open	- prepare an interface for use.
 931 *	@dev:	device to open
 932 *
 933 *	Takes a device from down to up state. The device's private open
 934 *	function is invoked and then the multicast lists are loaded. Finally
 935 *	the device is moved into the up state and a %NETDEV_UP message is
 936 *	sent to the netdev notifier chain.
 937 *
 938 *	Calling this function on an active interface is a nop. On a failure
 939 *	a negative errno code is returned.
 940 */
 941int dev_open(struct net_device *dev)
 942{
 943	int ret = 0;
 944
 945	/*
 946	 *	Is it already up?
 947	 */
 948
 949	if (dev->flags & IFF_UP)
 950		return 0;
 951
 952	/*
 953	 *	Is it even present?
 954	 */
 955	if (!netif_device_present(dev))
 956		return -ENODEV;
 957
 958	/*
 959	 *	Call device private open method
 960	 */
 961	set_bit(__LINK_STATE_START, &dev->state);
 962	if (dev->open) {
 963		ret = dev->open(dev);
 964		if (ret)
 965			clear_bit(__LINK_STATE_START, &dev->state);
 966	}
 967
 968	/*
 969	 *	If it went open OK then:
 970	 */
 971
 972	if (!ret) {
 973		/*
 974		 *	Set the flags.
 975		 */
 976		dev->flags |= IFF_UP;
 977
 978		/*
 979		 *	Initialize multicasting status
 980		 */
 981		dev_set_rx_mode(dev);
 982
 983		/*
 984		 *	Wakeup transmit queue engine
 985		 */
 986		dev_activate(dev);
 987
 988		/*
 989		 *	... and announce new interface.
 990		 */
 991		raw_notifier_call_chain(&netdev_chain, NETDEV_UP, dev);
 992	}
 993	return ret;
 994}
 995
/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 *
 *	Calling this function on a device that is already down is a nop.
 *	Always returns 0.
 */
int dev_close(struct net_device *dev)
{
	if (!(dev->flags & IFF_UP))
		return 0;

	/*
	 *	Tell people we are going down, so that they can
	 *	prepare for it while the device is still operating.
	 */
	raw_notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev);

	dev_deactivate(dev);

	clear_bit(__LINK_STATE_START, &dev->state);

	/* Synchronize to scheduled poll. We cannot touch poll list,
	 * it can be even on different cpu. So just clear netif_running(),
	 * and wait until the poll has really happened. Actually, the best
	 * place for this is inside dev->stop() after device stopped its irq
	 * engine, but this requires more changes in devices. */

	smp_mb__after_clear_bit(); /* Commit netif_running(). */
	while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) {
		/* No hurry. */
		msleep(1);
	}

	/*
	 *	Call the device specific close. This cannot fail.
	 *	Only if device is UP
	 *
	 *	We allow it to be called even after a DETACH hot-plug
	 *	event.
	 */
	if (dev->stop)
		dev->stop(dev);

	/*
	 *	Device is now down.
	 */

	dev->flags &= ~IFF_UP;

	/*
	 * Tell people we are down
	 */
	raw_notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev);

	return 0;
}
1055
1056
1057/*
1058 *	Device change register/unregister. These are not inline or static
1059 *	as we export them to the world.
1060 */
1061
1062/**
1063 *	register_netdevice_notifier - register a network notifier block
1064 *	@nb: notifier
1065 *
1066 *	Register a notifier to be called when network device events occur.
1067 *	The notifier passed is linked into the kernel structures and must
1068 *	not be reused until it has been unregistered. A negative errno code
1069 *	is returned on a failure.
1070 *
1071 * 	When registered all registration and up events are replayed
1072 *	to the new notifier to allow device to have a race free
1073 *	view of the network device list.
1074 */
1075
1076int register_netdevice_notifier(struct notifier_block *nb)
1077{
1078	struct net_device *dev;
1079	struct net_device *last;
1080	int err;
1081
1082	rtnl_lock();
1083	err = raw_notifier_chain_register(&netdev_chain, nb);
1084	if (err)
1085		goto unlock;
1086
1087	for_each_netdev(dev) {
1088		err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1089		err = notifier_to_errno(err);
1090		if (err)
1091			goto rollback;
1092
1093		if (!(dev->flags & IFF_UP))
1094			continue;
1095
1096		nb->notifier_call(nb, NETDEV_UP, dev);
1097	}
1098
1099unlock:
1100	rtnl_unlock();
1101	return err;
1102
1103rollback:
1104	last = dev;
1105	for_each_netdev(dev) {
1106		if (dev == last)
1107			break;
1108
1109		if (dev->flags & IFF_UP) {
1110			nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1111			nb->notifier_call(nb, NETDEV_DOWN, dev);
1112		}
1113		nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1114	}
1115	goto unlock;
1116}
1117
1118/**
1119 *	unregister_netdevice_notifier - unregister a network notifier block
1120 *	@nb: notifier
1121 *
1122 *	Unregister a notifier previously registered by
1123 *	register_netdevice_notifier(). The notifier is unlinked into the
1124 *	kernel structures and may then be reused. A negative errno code
1125 *	is returned on a failure.
1126 */
1127
1128int unregister_netdevice_notifier(struct notifier_block *nb)
1129{
1130	int err;
1131
1132	rtnl_lock();
1133	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1134	rtnl_unlock();
1135	return err;
1136}
1137
1138/**
1139 *	call_netdevice_notifiers - call all network notifier blocks
1140 *      @val: value passed unmodified to notifier function
1141 *      @v:   pointer passed unmodified to notifier function
1142 *
1143 *	Call all network notifier blocks.  Parameters and return value
1144 *	are as for raw_notifier_call_chain().
1145 */
1146
1147int call_netdevice_notifiers(unsigned long val, void *v)
1148{
1149	return raw_notifier_call_chain(&netdev_chain, val, v);
1150}
1151
1152/* When > 0 there are consumers of rx skb time stamps */
1153static atomic_t netstamp_needed = ATOMIC_INIT(0);
1154
1155void net_enable_timestamp(void)
1156{
1157	atomic_inc(&netstamp_needed);
1158}
1159
1160void net_disable_timestamp(void)
1161{
1162	atomic_dec(&netstamp_needed);
1163}
1164
1165static inline void net_timestamp(struct sk_buff *skb)
1166{
1167	if (atomic_read(&netstamp_needed))
1168		__net_timestamp(skb);
1169	else
1170		skb->tstamp.tv64 = 0;
1171}
1172
1173/*
1174 *	Support routine. Sends outgoing frames to any network
1175 *	taps currently in use.
1176 */
1177
1178static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1179{
1180	struct packet_type *ptype;
1181
1182	net_timestamp(skb);
1183
1184	rcu_read_lock();
1185	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1186		/* Never send packets back to the socket
1187		 * they originated from - MvS (miquels@drinkel.ow.org)
1188		 */
1189		if ((ptype->dev == dev || !ptype->dev) &&
1190		    (ptype->af_packet_priv == NULL ||
1191		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1192			struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1193			if (!skb2)
1194				break;
1195
1196			/* skb->nh should be correctly
1197			   set by sender, so that the second statement is
1198			   just protection against buggy protocols.
1199			 */
1200			skb_reset_mac_header(skb2);
1201
1202			if (skb_network_header(skb2) < skb2->data ||
1203			    skb2->network_header > skb2->tail) {
1204				if (net_ratelimit())
1205					printk(KERN_CRIT "protocol %04x is "
1206					       "buggy, dev %s\n",
1207					       skb2->protocol, dev->name);
1208				skb_reset_network_header(skb2);
1209			}
1210
1211			skb2->transport_header = skb2->network_header;
1212			skb2->pkt_type = PACKET_OUTGOING;
1213			ptype->func(skb2, skb->dev, ptype, skb->dev);
1214		}
1215	}
1216	rcu_read_unlock();
1217}
1218
1219
1220void __netif_schedule(struct net_device *dev)
1221{
1222	if (!test_and_set_bit(__LINK_STATE_SCHED, &dev->state)) {
1223		unsigned long flags;
1224		struct softnet_data *sd;
1225
1226		local_irq_save(flags);
1227		sd = &__get_cpu_var(softnet_data);
1228		dev->next_sched = sd->output_queue;
1229		sd->output_queue = dev;
1230		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1231		local_irq_restore(flags);
1232	}
1233}
1234EXPORT_SYMBOL(__netif_schedule);
1235
1236void __netif_rx_schedule(struct net_device *dev)
1237{
1238	unsigned long flags;
1239
1240	local_irq_save(flags);
1241	dev_hold(dev);
1242	list_add_tail(&dev->poll_list, &__get_cpu_var(softnet_data).poll_list);
1243	if (dev->quota < 0)
1244		dev->quota += dev->weight;
1245	else
1246		dev->quota = dev->weight;
1247	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
1248	local_irq_restore(flags);
1249}
1250EXPORT_SYMBOL(__netif_rx_schedule);
1251
/*
 * Free @skb from any context: use the irq variant when running in
 * hard irq context or with interrupts disabled, the plain one otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (!in_irq() && !irqs_disabled()) {
		dev_kfree_skb(skb);
		return;
	}

	dev_kfree_skb_irq(skb);
}
1259EXPORT_SYMBOL(dev_kfree_skb_any);
1260
1261
1262/* Hot-plugging. */
1263void netif_device_detach(struct net_device *dev)
1264{
1265	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1266	    netif_running(dev)) {
1267		netif_stop_queue(dev);
1268	}
1269}
1270EXPORT_SYMBOL(netif_device_detach);
1271
1272void netif_device_attach(struct net_device *dev)
1273{
1274	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1275	    netif_running(dev)) {
1276		netif_wake_queue(dev);
1277		__netdev_watchdog_up(dev);
1278	}
1279}
1280EXPORT_SYMBOL(netif_device_attach);
1281
1282
/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 *
 * Returns 0 on success, or the error from pskb_expand_head() if a
 * cloned skb could not be given a private head.  On every path that
 * returns 0, skb->ip_summed is left as CHECKSUM_NONE.
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	/* Nothing to compute here; just reset ip_summed below. */
	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	if (unlikely(skb_shinfo(skb)->gso_size)) {
		/* Let GSO fix up the checksum. */
		goto out_set_summed;
	}

	/* The checksum field is written below, so the data must be ours. */
	if (skb_cloned(skb)) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	/*
	 * csum_start is used relative to skb->head (see the store below);
	 * subtracting the headroom presumably turns it into an offset from
	 * skb->data, which is what skb_checksum() is given -- TODO confirm.
	 */
	offset = skb->csum_start - skb_headroom(skb);
	BUG_ON(offset > (int)skb->len);
	csum = skb_checksum(skb, offset, skb->len-offset, 0);

	/*
	 * Reuse offset as the number of head (skb_headlen) bytes from the
	 * checksum start onwards; the 2-byte checksum field must fit in it.
	 */
	offset = skb_headlen(skb) - offset;
	BUG_ON(offset <= 0);
	BUG_ON(skb->csum_offset + 2 > offset);

	/* Store the folded checksum into the packet. */
	*(__sum16 *)(skb->head + skb->csum_start + skb->csum_offset) =
		csum_fold(csum);
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}
1321
1322/**
1323 *	skb_gso_segment - Perform segmentation on skb.
1324 *	@skb: buffer to segment
1325 *	@features: features for the output path (see dev->features)
1326 *
1327 *	This function segments the given skb and returns a list of segments.
1328 *
1329 *	It may return NULL if the skb requires no segmentation.  This is
1330 *	only possible when GSO is used for verifying header integrity.
1331 */
struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
{
	/* Result if no registered handler matches skb->protocol. */
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_type *ptype;
	__be16 type = skb->protocol;
	int err;

	BUG_ON(skb_shinfo(skb)->frag_list);

	/* Record the MAC header position/length, then step over it. */
	skb_reset_mac_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;
	__skb_pull(skb, skb->mac_len);

	/*
	 * Unexpected checksum state (warns): make the header private if it
	 * is shared before the protocol callbacks below are run.
	 * NOTE(review): this error return happens after the __skb_pull()
	 * above without the matching __skb_push() -- confirm callers cope.
	 */
	if (WARN_ON(skb->ip_summed != CHECKSUM_PARTIAL)) {
		if (skb_header_cloned(skb) &&
		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
			return ERR_PTR(err);
	}

	rcu_read_lock();
	/* Find the device-independent handler for this protocol with GSO support. */
	list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & 15], list) {
		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
				err = ptype->gso_send_check(skb);
				/* ERR_PTR(0) is NULL: "no segmentation needed". */
				segs = ERR_PTR(err);
				if (err || skb_gso_ok(skb, features))
					break;
				/* Move skb->data back to the network header. */
				__skb_push(skb, (skb->data -
						 skb_network_header(skb)));
			}
			segs = ptype->gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	/* Restore skb->data to the MAC header recorded above. */
	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}
1372
1373EXPORT_SYMBOL(skb_gso_segment);
1374
1375/* Take action when hardware reception checksum errors are detected. */
1376#ifdef CONFIG_BUG
1377void netdev_rx_csum_fault(struct net_device *dev)
1378{
1379	if (net_ratelimit()) {
1380		printk(KERN_ERR "%s: hw csum failure.\n",
1381			dev ? dev->name : "<unknown>");
1382		dump_stack();
1383	}
1384}
1385EXPORT_SYMBOL(netdev_rx_csum_fault);
1386#endif
1387
1388/* Actually, we should eliminate this check as soon as we know, that:
1389 * 1. IOMMU is present and allows to map all the memory.
1390 * 2. No high memory really exists on this machine.
1391 */
1392
/*
 * Returns 1 if @skb has a page fragment in high memory and @dev does
 * not advertise NETIF_F_HIGHDMA, 0 otherwise.  Without CONFIG_HIGHMEM
 * the answer is always 0.
 */
static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		int frag = 0;

		while (frag < skb_shinfo(skb)->nr_frags) {
			if (PageHighMem(skb_shinfo(skb)->frags[frag].page))
				return 1;
			frag++;
		}
	}

#endif
	return 0;
}
1408
/*
 * Private data kept in skb->cb (see DEV_GSO_CB): a destructor
 * callback taking the skb.
 */
struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

/* View the skb's cb area as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1414
1415static void dev_gso_skb_destructor(struct sk_buff *skb)
1416{
1417	struct dev_gso_cb *cb;
1418
1419	do {
1420		struct sk_buff *nskb = skb->next;
1421
1422		skb->next = nskb->next;
1423		nskb->next = NULL;
1424		kfree_skb(nskb);
1425	} while (skb->next);
1426
1427	cb = DEV_GSO_CB(skb);
1428	if (cb->destructor)
1429		cb->destructor(skb);
1430}
1431
1432/**
1433 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1434 *	@skb: buffer to segment
1435 *
1436 *	This function segments the given skb and stores the list of segments
1437 *	in skb->next.
1438 */
1439static int dev_gso_segment(struct sk_buff *skb)
1440{
1441	struct net_device *dev = skb->dev;
1442	struct sk_buff *segs;
1443	int features = dev->features & ~(illegal_highdma(dev, skb) ?
1444					 NETIF_F_SG : 0);
1445
1446	segs = skb_gso_segment(skb, features);
1447
1448	/* Verifying header integrity only. */
1449	if (!segs)
1450		return 0;
1451
1452	if (unlikely(IS_ERR(segs)))
1453		return PTR_ERR(segs);
1454
1455	skb->next = segs;
1456	DEV_GSO_CB(skb)->destructor = skb->destructor;
1457	skb->destructor = dev_gso_skb_destructor;
1458
1459	return 0;
1460}
1461
/*
 * Hand @skb to the driver of @dev.
 *
 * An skb without a segment list (skb->next == NULL) is first shown to
 * the taps on ptype_all, segmented in software if netif_needs_gso()
 * says so, and otherwise passed straight to dev->hard_start_xmit().
 * An skb with a segment list has its segments transmitted one by one.
 *
 * Returns the driver's hard_start_xmit() result, NETDEV_TX_BUSY when
 * the queue stopped with segments still pending, or 0 once all
 * segments have been sent (or segmentation failed) and @skb was freed.
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	if (likely(!skb->next)) {
		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(skb, dev);

		if (netif_needs_gso(dev, skb)) {
			/* Segmentation error: drop the packet, report 0. */
			if (unlikely(dev_gso_segment(skb)))
				goto out_kfree_skb;
			/* Segments were produced: send them instead of skb. */
			if (skb->next)
				goto gso;
		}

		return dev->hard_start_xmit(skb, dev);
	}

gso:
	do {
		/* Detach the first pending segment from the list. */
		struct sk_buff *nskb = skb->next;
		int rc;

		skb->next = nskb->next;
		nskb->next = NULL;
		rc = dev->hard_start_xmit(nskb, dev);
		if (unlikely(rc)) {
			/* Driver refused it: put the segment back at the head. */
			nskb->next = skb->next;
			skb->next = nskb;
			return rc;
		}
		/* Queue stopped with segments left: let the caller retry later. */
		if (unlikely((netif_queue_stopped(dev) ||
			     netif_subqueue_stopped(dev, skb->queue_mapping)) &&
			     skb->next))
			return NETDEV_TX_BUSY;
	} while (skb->next);

	/* All segments sent: restore the destructor saved in the cb. */
	skb->destructor = DEV_GSO_CB(skb)->destructor;

out_kfree_skb:
	kfree_skb(skb);
	return 0;
}
1503
/*
 * Take / release the device transmit lock around a direct call into
 * the driver, unless the device advertises NETIF_F_LLTX (presumably
 * because such drivers do their own transmit locking -- confirm).
 *
 * The macros expand to a single statement (do { } while (0)) so they
 * are safe in unbraced if/else bodies, and @dev is parenthesized so any
 * pointer expression may be passed.  @cpu is accepted for interface
 * compatibility and is not used.
 */
#define HARD_TX_LOCK(dev, cpu) do {				\
	if (((dev)->features & NETIF_F_LLTX) == 0) {		\
		netif_tx_lock(dev);				\
	}							\
} while (0)

#define HARD_TX_UNLOCK(dev) do {				\
	if (((dev)->features & NETIF_F_LLTX) == 0) {		\
		netif_tx_unlock(dev);				\
	}							\
} while (0)
1515
1516/**
1517 *	dev_queue_xmit - transmit a buffer
1518 *	@skb: buffer to transmit
1519 *
1520 *	Queue a buffer for transmission to a network device. The caller must
1521 *	have set the device and priority and built the buffer before calling
1522 *	this function. The function can be called from an interrupt.
1523 *
1524 *	A negative errno code is returned on a failure. A success does not
1525 *	guarantee the frame will be transmitted as it may be dropped due
1526 *	to congestion or traffic shaping.
1527 *
1528 * -----------------------------------------------------------------------------------
1529 *      I notice this method can also return errors from the queue disciplines,
1530 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
1531 *      be positive.
1532 *
1533 *      Regardless of the return value, the skb is consumed, so it is currently
1534 *      difficult to retry a send to this method.  (You can bump the ref count
1535 *      before sending to hold a reference for retry if you are careful.)
1536 *
1537 *      When calling this method, interrupts MUST be enabled.  This is because
1538 *      the BH enable code must have IRQs enabled so that it will not deadlock.
1539 *          --BLG
1540 */
1541
int dev_queue_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct Qdisc *q;
	/* Returned if linearizing or checksumming below fails. */
	int rc = -ENOMEM;

	/* GSO will handle the following emulations directly. */
	if (netif_needs_gso(dev, skb))
		goto gso;

	/* A frag_list the device cannot handle must be linearized. */
	if (skb_shinfo(skb)->frag_list &&
	    !(dev->features & NETIF_F_FRAGLIST) &&
	    __skb_linearize(skb))
		goto out_kfree_skb;

	/* Fragmented skb is linearized if device does not support SG,
	 * or if at least one of fragments is in highmem and device
	 * does not support DMA from it.
	 */
	if (skb_shinfo(skb)->nr_frags &&
	    (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
	    __skb_linearize(skb))
		goto out_kfree_skb;

	/* If packet is not checksummed and device does not support
	 * checksumming for this protocol, complete checksumming here.
	 */
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		skb_set_transport_header(skb, skb->csum_start -
					      skb_headroom(skb));

		if (!(dev->features & NETIF_F_GEN_CSUM) &&
		    !((dev->features & NETIF_F_IP_CSUM) &&
		      skb->protocol == htons(ETH_P_IP)) &&
		    !((dev->features & NETIF_F_IPV6_CSUM) &&
		      skb->protocol == htons(ETH_P_IPV6)))
			if (skb_checksum_help(skb))
				goto out_kfree_skb;
	}

gso:
	spin_lock_prefetch(&dev->queue_lock);

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	/* Updates of qdisc are serialized by queue_lock.
	 * The struct Qdisc which is pointed to by qdisc is now a
	 * rcu structure - it may be accessed without acquiring
	 * a lock (but the structure may be stale.) The freeing of the
	 * qdisc will be deferred until it's known that there are no
	 * more references to it.
	 *
	 * If the qdisc has an enqueue function, we still need to
	 * hold the queue_lock before calling it, since queue_lock
	 * also serializes access to the device queue.
	 */

	q = rcu_dereference(dev->qdisc);
#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
#endif
	if (q->enqueue) {
		/* Grab device queue */
		spin_lock(&dev->queue_lock);
		/* Re-read under the lock: the qdisc may have been replaced. */
		q = dev->qdisc;
		if (q->enqueue) {
			/* reset queue_mapping to zero */
			skb->queue_mapping = 0;
			rc = q->enqueue(skb, q);
			qdisc_run(dev);
			spin_unlock(&dev->queue_lock);

			/* A bypassed packet is reported as success. */
			rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
			goto out;
		}
		spin_unlock(&dev->queue_lock);
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all the sorts of tunnels...

	   Really, it is unlikely that netif_tx_lock protection is necessary
	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	   counters.)
	   However, it is possible, that they rely on protection
	   made by us here.

	   Check this and take the lock. It is not prone to deadlocks.
	   Or drop the noqueue qdisc, which is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		/* Same owner means we re-entered while already transmitting. */
		if (dev->xmit_lock_owner != cpu) {

			HARD_TX_LOCK(dev, cpu);

			if (!netif_queue_stopped(dev) &&
			    !netif_subqueue_stopped(dev, skb->queue_mapping)) {
				rc = 0;
				/* Zero from the driver path: packet is gone. */
				if (!dev_hard_start_xmit(skb, dev)) {
					HARD_TX_UNLOCK(dev);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev);
			if (net_ratelimit())
				printk(KERN_CRIT "Virtual device %s asks to "
				       "queue packet!\n", dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately */
			if (net_ratelimit())
				printk(KERN_CRIT "Dead loop on virtual device "
				       "%s, fix it urgently!\n", dev->name);
		}
	}

	/* Device down, queue stopped, transmit failed or recursion: drop. */
	rc = -ENETDOWN;
	rcu_read_unlock_bh();

	/* The early failures jump here before rcu_read_lock_bh() was taken. */
out_kfree_skb:
	kfree_skb(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
1673
1674
1675/*=======================================================================
1676			Receiver routines
1677  =======================================================================*/
1678
1679int netdev_max_backlog __read_mostly = 1000;
1680int netdev_budget __read_mostly = 300;
1681int weight_p __read_mostly = 64;            /* old backlog weight */
1682
1683DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1684
1685
1686/**
1687 *	netif_rx	-	post buffer to the network code
1688 *	@skb: buffer to post
1689 *
1690 *	This function receives a packet from a device driver and queues it for
1691 *	the upper (protocol) levels to process.  It always succeeds. The buffer
1692 *	may be dropped during processing for congestion control or by the
1693 *	protocol layers.
1694 *
1695 *	return values:
1696 *	NET_RX_SUCCESS	(no congestion)
1697 *	NET_RX_CN_LOW   (low congestion)
1698 *	NET_RX_CN_MOD   (moderate congestion)
1699 *	NET_RX_CN_HIGH  (high congestion)
1700 *	NET_RX_DROP     (packet was dropped)
1701 *
1702 */
1703
1704int netif_rx(struct sk_buff *skb)
1705{
1706	struct softnet_data *queue;
1707	unsigned long flags;
1708
1709	/* if netpoll wants it, pretend we never saw it */
1710	if (netpoll_rx(skb))
1711		return NET_RX_DROP;
1712
1713	if (!skb->tstamp.tv64)
1714		net_timestamp(skb);
1715
1716	/*
1717	 * The code is rearranged so that the path is the most
1718	 * short when CPU is congested, but is still operating.
1719	 */
1720	local_irq_save(flags);
1721	queue = &__get_cpu_var(softnet_data);
1722
1723	__get_cpu_var(netdev_rx_stat).total++;
1724	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1725		if (queue->input_pkt_queue.qlen) {
1726enqueue:
1727			dev_hold(skb->dev);
1728			__skb_queue_tail(&queue->input_pkt_queue, skb);
1729			local_irq_restore(flags);
1730			return NET_RX_SUCCESS;
1731		}
1732
1733		netif_rx_schedule(&queue->backlog_dev);
1734		goto enqueue;
1735	}
1736
1737	__get_cpu_var(netdev_rx_stat).dropped++;
1738	local_irq_restore(flags);
1739
1740	kfree_skb(skb);
1741	return NET_RX_DROP;
1742}
1743
/*
 * Like netif_rx(), but with preemption disabled around the call and
 * any softirq left pending afterwards run right away.  Returns the
 * netif_rx() result.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	preempt_disable();

	ret = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
1756
1757EXPORT_SYMBOL(netif_rx_ni);
1758
1759static inline struct net_device *skb_bond(struct sk_buff *skb)
1760{
1761	struct net_device *dev = skb->dev;
1762
1763	if (dev->master) {
1764		if (skb_bond_should_drop(skb)) {
1765			kfree_skb(skb);
1766			return NULL;
1767		}
1768		skb->dev = dev->master;
1769	}
1770
1771	return dev;
1772}
1773
/*
 * TX softirq handler.  Works on this CPU's softnet_data in two steps:
 * free the skbs collected on completion_queue, then run the qdisc of
 * every device on output_queue.
 */
static void net_tx_action(struct softirq_action *h)
{
	struct softnet_data *sd = &__get_cpu_var(softnet_data);

	if (sd->completion_queue) {
		struct sk_buff *clist;

		/* Detach the whole list with irqs off, free it with irqs on. */
		local_irq_disable();
		clist = sd->completion_queue;
		sd->completion_queue = NULL;
		local_irq_enable();

		while (clist) {
			struct sk_buff *skb = clist;
			clist = clist->next;

			/* Only skbs with no remaining users belong here. */
			BUG_TRAP(!atomic_read(&skb->users));
			__kfree_skb(skb);
		}
	}

	if (sd->output_queue) {
		struct net_device *head;

		/* Same pattern: take the device list private first. */
		local_irq_disable();
		head = sd->output_queue;
		sd->output_queue = NULL;
		local_irq_enable();

		while (head) {
			struct net_device *dev = head;
			head = head->next_sched;

			/* Allow the device to be scheduled again from now on. */
			smp_mb__before_clear_bit();
			clear_bit(__LINK_STATE_SCHED, &dev->state);

			/* If the queue lock is busy, retry via a reschedule. */
			if (spin_trylock(&dev->queue_lock)) {
				qdisc_run(dev);
				spin_unlock(&dev->queue_lock);
			} else {
				netif_schedule(dev);
			}
		}
	}
}
1819
1820static inline int deliver_skb(struct sk_buff *skb,
1821			      struct packet_type *pt_prev,
1822			      struct net_device *orig_dev)
1823{
1824	atomic_inc(&skb->users);
1825	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1826}
1827
#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
/* These hooks defined here for ATM */
struct net_bridge;
struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
						unsigned char *addr);
void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly;

/*
 * If bridge module is loaded call bridging hook.
 *  returns NULL if packet was consumed.
 */
struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
					struct sk_buff *skb) __read_mostly;

/*
 * Pass @skb to the bridge hook when its device is a bridge port.
 * Loopback packets and packets on non-port devices are returned
 * unchanged.  A handler pending in *@pt_prev is delivered first
 * (its result goes to *@ret) and *@pt_prev is cleared.
 */
static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
					    struct packet_type **pt_prev, int *ret,
					    struct net_device *orig_dev)
{
	struct net_bridge_port *port;

	if (skb->pkt_type == PACKET_LOOPBACK)
		return skb;

	port = rcu_dereference(skb->dev->br_port);
	if (port == NULL)
		return skb;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	return br_handle_frame_hook(port, skb);
}
#else
#define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
#endif
1861
#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);

/*
 * Pass @skb to the macvlan hook when its device has a macvlan port;
 * otherwise return it unchanged.  A handler pending in *@pt_prev is
 * delivered first (its result goes to *@ret) and *@pt_prev is cleared.
 */
static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
					     struct packet_type **pt_prev,
					     int *ret,
					     struct net_device *orig_dev)
{
	struct packet_type *pending;

	if (skb->dev->macvlan_port == NULL)
		return skb;

	pending = *pt_prev;
	if (pending) {
		*ret = deliver_skb(skb, pending, orig_dev);
		*pt_prev = NULL;
	}

	return macvlan_handle_frame_hook(skb);
}
#else
#define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
#endif
1883
#ifdef CONFIG_NET_CLS_ACT
/* TODO: Maybe we should just force sch_ingress to be compiled in
 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
 * a compare and 2 stores extra right now if we dont have it on
 * but have CONFIG_NET_CLS_ACT
 * NOTE: This doesnt stop any functionality; if you dont have
 * the ingress scheduler, you just cant add policies on ingress.
 *
 * Runs @skb through the ingress qdisc of its device, if there is one.
 * Returns the qdisc's verdict, TC_ACT_OK when no ingress qdisc is
 * attached, or TC_ACT_SHOT when the redirect ttl exceeds MAX_RED_LOOP.
 */
static int ing_filter(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct Qdisc *q;
	int result = TC_ACT_OK;
	__u32 ttl;

	if (!dev->qdisc_ingress)
		return result;

	/* Bump the redirect ttl kept in tc_verd; give up on a loop. */
	ttl = (__u32) G_TC_RTTL(skb->tc_verd);
	if (MAX_RED_LOOP < ttl++) {
		printk(KERN_WARNING "Redir loop detected Dropping packet (%d->%d)\n",
			skb->iif, skb->dev->ifindex);
		return TC_ACT_SHOT;
	}

	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);

	/* Re-check under the lock before using the qdisc. */
	spin_lock(&dev->ingress_lock);
	q = dev->qdisc_ingress;
	if (q != NULL)
		result = q->enqueue(skb, q);
	spin_unlock(&dev->ingress_lock);

	return result;
}
#endif
1921
/*
 * Deliver a received @skb to the registered packet handlers.
 *
 * Handlers on ptype_all are served first, then (with CONFIG_NET_CLS_ACT)
 * the ingress filter, then the bridge and macvlan hooks, and finally
 * the handlers in ptype_base matching skb->protocol.  Delivery is
 * deferred by one handler (pt_prev) so that the last handler receives
 * the skb without an extra reference being taken.
 *
 * Returns the result of the last handler run, or NET_RX_DROP if the
 * packet was dropped or nobody wanted it.
 */
int netif_receive_skb(struct sk_buff *skb)
{
	struct packet_type *ptype, *pt_prev;
	struct net_device *orig_dev;
	int ret = NET_RX_DROP;
	__be16 type;

	/* if we've gotten here through NAPI, check netpoll */
	if (skb->dev->poll && netpoll_rx(skb))
		return NET_RX_DROP;

	if (!skb->tstamp.tv64)
		net_timestamp(skb);

	/* Record the input interface unless it is already set. */
	if (!skb->iif)
		skb->iif = skb->dev->ifindex;

	/* May redirect skb->dev to the master, or free the skb (NULL). */
	orig_dev = skb_bond(skb);

	if (!orig_dev)
		return NET_RX_DROP;

	__get_cpu_var(netdev_rx_stat).total++;

	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;

	pt_prev = NULL;

	rcu_read_lock();

#ifdef CONFIG_NET_CLS_ACT
	/* TC_NCLS set: clear it and skip the taps and the ingress filter. */
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	/* Handlers registered for all protocols. */
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (!ptype->dev || ptype->dev == skb->dev) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

#ifdef CONFIG_NET_CLS_ACT
	if (pt_prev) {
		ret = deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = NULL; /* no one else should process this after */
	} else {
		skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
	}

	ret = ing_filter(skb);

	/*
	 * NOTE(review): on this path ret still holds the TC_ACT_* verdict
	 * and that is what the caller receives.
	 */
	if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) {
		kfree_skb(skb);
		goto out;
	}

	skb->tc_verd = 0;
ncls:
#endif

	/* Either hook may consume the packet and return NULL. */
	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;

	/* Protocol-specific handlers, hashed on the low 4 bits of the type. */
	type = skb->protocol;
	list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) {
		if (ptype->type == type &&
		    (!ptype->dev || ptype->dev == skb->dev)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	if (pt_prev) {
		/* Last handler: called directly, without an extra reference. */
		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
		kfree_skb(skb);
		/* Jamal, now you will not be able to escape explaining
		 * to me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

out:
	rcu_read_unlock();
	return ret;
}
2019
/*
 * Poll routine taking (@backlog_dev, @budget): feeds packets from this
 * CPU's input_pkt_queue to netif_receive_skb().
 *
 * At most min(backlog_dev->quota, *budget) packets are processed, and
 * the loop also stops once more than one jiffy has passed.  Both
 * backlog_dev->quota and *budget are reduced by the number of packets
 * handled.
 *
 * Returns 0 when the queue was found empty (the backlog device is then
 * taken off the poll list), -1 when it stopped because of quota or time.
 */
static int process_backlog(struct net_device *backlog_dev, int *budget)
{
	int work = 0;
	int quota = min(backlog_dev->quota, *budget);
	struct softnet_data *queue = &__get_cpu_var(softnet_data);
	unsigned long start_time = jiffies;

	backlog_dev->weight = weight_p;
	for (;;) {
		struct sk_buff *skb;
		struct net_device *dev;

		/* Dequeue with irqs off; they stay off on the job_done path. */
		local_irq_disable();
		skb = __skb_dequeue(&queue->input_pkt_queue);
		if (!skb)
			goto job_done;
		local_irq_enable();

		/* Remember the device now; the skb is handed off below. */
		dev = skb->dev;

		netif_receive_skb(skb);

		/* Drop the device reference (netif_rx() does a dev_hold when it queues). */
		dev_put(dev);

		work++;

		if (work >= quota || jiffies - start_time > 1)
			break;

	}

	backlog_dev->quota -= work;
	*budget -= work;
	return -1;

job_done:
	backlog_dev->quota -= work;
	*budget -= work;

	/* Queue drained: leave the poll list (irqs are still disabled). */
	list_del(&backlog_dev->poll_list);
	smp_mb__before_clear_bit();
	netif_poll_enable(backlog_dev);

	local_irq_enable();
	return 0;
}
2066
/*
 * RX softirq handler: polls the devices on this CPU's poll list until
 * the list is empty, the budget (netdev_budget) is used up, or more
 * than one jiffy has passed.  In the latter two cases time_squeeze is
 * counted and the softirq is raised again.
 */
static void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *queue = &__get_cpu_var(softnet_data);
	unsigned long start_time = jiffies;
	int budget = netdev_budget;
	void *have;

	/* The poll list is only examined and modified with irqs disabled. */
	local_irq_disable();

	while (!list_empty(&queue->poll_list)) {
		struct net_device *dev;

		if (budget <= 0 || jiffies - start_time > 1)
			goto softnet_break;

		local_irq_enable();

		dev = list_entry(queue->poll_list.next,
				 struct net_device, poll_list);
		have = netpoll_poll_lock(dev);

		if (dev->quota <= 0 || dev->poll(dev, &budget)) {
			/*
			 * Out of quota, or poll() returned non-zero: move the
			 * device to the tail and refill its quota.
			 */
			netpoll_poll_unlock(have);
			local_irq_disable();
			list_move_tail(&dev->poll_list, &queue->poll_list);
			if (dev->quota < 0)
				dev->quota += dev->weight;
			else
				dev->quota = dev->weight;
		} else {
			/*
			 * poll() returned 0: drop the device reference
			 * (__netif_rx_schedule() does a dev_hold).
			 */
			netpoll_poll_unlock(have);
			dev_put(dev);
			local_irq_disable();
		}
	}
out:
	local_irq_enable();
#ifdef CONFIG_NET_DMA
	/*
	 * There may not be any more sk_buffs coming right now, so push
	 * any pending DMA copies to hardware
	 */
	if (!cpus_empty(net_dma.channel_mask)) {
		int chan_idx;
		for_each_cpu_mask(chan_idx, net_dma.channel_mask) {
			struct dma_chan *chan = net_dma.channels[chan_idx];
			if (chan)
				dma_async_memcpy_issue_pending(chan);
		}
	}
#endif
	return;

softnet_break:
	/* Ran out of budget or time with work left: note it and re-raise. */
	__get_cpu_var(netdev_rx_stat).time_squeeze++;
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	goto out;
}
2125
2126static gifconf_func_t * gifconf_list [NPROTO];
2127
2128/**
2129 *	register_gifconf	-	register a SIOCGIF handler
2130 *	@family: Address family
2131 *	@gifconf: Function handler
2132 *
2133 *	Register protocol dependent address dumping routines. The handler
2134 *	that is passed must not be freed or reused until it has been replaced
2135 *	by another handler.
2136 */
2137int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
2138{
2139	if (family >= NPROTO)
2140		return -EINVAL;
2141	gifconf_list[family] = gifconf;
2142	return 0;
2143}
2144
2145
2146/*
2147 *	Map an interface index to its name (SIOCGIFNAME)
2148 */
2149
2150/*
2151 *	We need this ioctl for efficient implementation of the
2152 *	if_indextoname() function required by the IPv6 API.  Without
2153 *	it, we would have to search all the interfaces to find a
2154 *	match.  --pb
2155 */
2156
2157static int dev_ifname(struct ifreq __user *arg)
2158{
2159	struct net_device *dev;
2160	struct ifreq ifr;
2161
2162	/*
2163	 *	Fetch the caller's info block.
2164	 */
2165
2166	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2167		return -EFAULT;
2168
2169	read_lock(&dev_base_lock);
2170	dev = __dev_get_by_index(ifr.ifr_ifindex);
2171	if (!dev) {
2172		read_unlock(&dev_base_lock);
2173		return -ENODEV;
2174	}
2175
2176	strcpy(ifr.ifr_name, dev->name);
2177	read_unlock(&dev_base_lock);
2178
2179	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2180		return -EFAULT;
2181	return 0;
2182}
2183
2184/*
2185 *	Perform a SIOCGIFCONF call. This structure will change
2186 *	size eventually, and there is nothing I can do about it.
2187 *	Thus we will need a 'compatibility mode'.
2188 */
2189
2190static int dev_ifconf(char __user *arg)
2191{
2192	struct ifconf ifc;
2193	struct net_device *dev;
2194	char __user *pos;
2195	int len;
2196	int total;
2197	int i;
2198
2199	/*
2200	 *	Fetch the caller's info block.
2201	 */
2202
2203	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2204		return -EFAULT;
2205
2206	pos = ifc.ifc_buf;
2207	len = ifc.ifc_len;
2208
2209	/*
2210	 *	Loop over the interfaces, and write an info block for each.
2211	 */
2212
2213	total = 0;
2214	for_each_netdev(dev) {
2215		for (i = 0; i < NPROTO; i++) {
2216			if (gifconf_list[i]) {
2217				int done;
2218				if (!pos)
2219					done = gifconf_list[i](dev, NULL, 0);
2220				else
2221					done = gifconf_list[i](dev, pos + total,
2222							       len - total);
2223				if (done < 0)
2224					return -EFAULT;
2225				total += done;
2226			}
2227		}
2228	}
2229
2230	/*
2231	 *	All done.  Write the updated control block back to the caller.
2232	 */
2233	ifc.ifc_len = total;
2234
2235	/*
2236	 * 	Both BSD and Solaris return 0 here, so we do too.
2237	 */
2238	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
2239}
2240
2241#ifdef CONFIG_PROC_FS
2242/*
2243 *	This is invoked by the /proc filesystem handler to display a device
2244 *	in detail.
2245 */
2246void *dev_seq_start(struct seq_file *seq, loff_t *pos)
2247{
2248	loff_t off;
2249	struct net_device *dev;
2250
2251	read_lock(&dev_base_lock);
2252	if (!*pos)
2253		return SEQ_START_TOKEN;
2254
2255	off = 1;
2256	for_each_netdev(dev)
2257		if (off++ == *pos)
2258			return dev;
2259
2260	return NULL;
2261}
2262
2263void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2264{
2265	++*pos;
2266	return v == SEQ_START_TOKEN ?
2267		first_net_device() : next_net_device((struct net_device *)v);
2268}
2269
/* seq_file ->stop callback: drop the read lock taken by dev_seq_start(). */
void dev_seq_stop(struct seq_file *seq, void *v)
{
	read_unlock(&dev_base_lock);
}
2274
2275static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
2276{
2277	struct net_device_stats *stats = dev->get_stats(dev);
2278
2279	seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
2280		   "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
2281		   dev->name, stats->rx_bytes, stats->rx_packets,
2282		   stats->rx_errors,
2283		   stats->rx_dropped + stats->rx_missed_errors,
2284		   stats->rx_fifo_errors,
2285		   stats->rx_length_errors + stats->rx_over_errors +
2286		    stats->rx_crc_errors + stats->rx_frame_errors,
2287		   stats->rx_compressed, stats->multicast,
2288		   stats->tx_bytes, stats->tx_packets,
2289		   stats->tx_errors, stats->tx_dropped,
2290		   stats->tx_fifo_errors, stats->collisions,
2291		   stats->tx_carrier_errors +
2292		    stats->tx_aborted_errors +
2293		    stats->tx_window_errors +
2294		    stats->tx_heartbeat_errors,
2295		   stats->tx_compressed);
2296}
2297
2298/*
2299 *	Called from the PROCfs module. This now uses the new arbitrary sized
2300 *	/proc/net interface to create /proc/net/dev
2301 */
2302static int dev_seq_show(struct seq_file *seq, void *v)
2303{
2304	if (v == SEQ_START_TOKEN)
2305		seq_puts(seq, "Inter-|   Receive                            "
2306			      "                    |  Transmit\n"
2307			      " face |bytes    packets errs drop fifo frame "
2308			      "compressed multicast|bytes    packets errs "
2309			      "drop fifo colls carrier compressed\n");
2310	else
2311		dev_seq_printf_stats(seq, v);
2312	return 0;
2313}
2314
2315static struct netif_rx_stats *softnet_get_online(loff_t *pos)
2316{
2317	struct netif_rx_stats *rc = NULL;
2318
2319	while (*pos < NR_CPUS)
2320		if (cpu_online(*pos)) {
2321			rc = &per_cpu(netdev_rx_stat, *pos);
2322			break;
2323		} else
2324			++*pos;
2325	return rc;
2326}
2327
/*
 * seq_file ->start callback: return the stats of the first online CPU at
 * or after *pos (softnet_get_online() may advance *pos), or NULL if none.
 */
static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
{
	return softnet_get_online(pos);
}
2332
/*
 * seq_file ->next callback: step past the current CPU and return the stats
 * of the next online one, or NULL when there is none left.
 */
static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return softnet_get_online(pos);
}
2338
/* seq_file ->stop callback: softnet_seq_start() takes no lock, so nothing to undo. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
2342
2343static int softnet_seq_show(struct seq_file *seq, void *v)
2344{
2345	struct netif_rx_stats *s = v;
2346
2347	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
2348		   s->total, s->dropped, s->time_squeeze, 0,
2349		   0, 0, 0, 0, /* was fastroute */
2350		   s->cpu_collision );
2351	return 0;
2352}
2353
/* seq_file iterator for the per-device statistics listing (see dev_seq_open()). */
static const struct seq_operations dev_seq_ops = {
	.start = dev_seq_start,
	.next  = dev_seq_next,
	.stop  = dev_seq_stop,
	.show  = dev_seq_show,
};
2360
/* ->open handler: attach the dev_seq_ops iterator to @file; returns seq_open()'s result. */
static int dev_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &dev_seq_ops);
}
2365
/* File operations for the "dev" proc entry created in dev_proc_init(). */
static const struct file_operations dev_seq_fops = {
	.owner	 = THIS_MODULE,
	.open    = dev_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};
2373
/* seq_file iterator over the per-CPU netdev_rx_stat counters (see softnet_seq_open()). */
static const struct seq_operations softnet_seq_ops = {
	.start = softnet_seq_start,
	.next  = softnet_seq_next,
	.stop  = softnet_seq_stop,
	.show  = softnet_seq_show,
};
2380
/* ->open handler: attach the softnet_seq_ops iterator to @file; returns seq_open()'s result. */
static int softnet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &softnet_seq_ops);
}
2385
/* File operations for the "softnet_stat" proc entry created in dev_proc_init(). */
static const struct file_operations softnet_seq_fops = {
	.owner	 = THIS_MODULE,
	.open    = softnet_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};
2393
2394static void *ptype_get_idx(loff_t pos)
2395{
2396	struct packet_type *pt = NULL;
2397	loff_t i = 0;
2398	int t;
2399
2400	list_for_each_entry_rcu(pt, &ptype_all, list) {
2401		if (i == pos)
2402			return pt;
2403		++i;
2404	}
2405
2406	for (t = 0; t < 16; t++) {
2407		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
2408			if (i == pos)
2409				return pt;
2410			++i;
2411		}
2412	}
2413	return NULL;
2414}
2415
2416static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
2417{
2418	rcu_read_lock();
2419	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
2420}
2421
2422static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2423{
2424	struct packet_type *pt;
2425	struct list_head *nxt;
2426	int hash;
2427
2428	++*pos;
2429	if (v == SEQ_START_TOKEN)
2430		return ptype_get_idx(0);
2431
2432	pt = v;
2433	nxt = pt->list.next;
2434	if (pt->type == htons(ETH_P_ALL)) {
2435		if (nxt != &ptype_all)
2436			goto found;
2437		hash = 0;
2438		nxt = ptype_base[0].next;
2439	} else
2440		hash = ntohs(pt->type) & 15;
2441
2442	while (nxt == &ptype_base[hash]) {
2443		if (++hash >= 16)
2444			return NULL;
2445		nxt = ptype_base[hash].next;
2446	}
2447found:
2448	return list_entry(nxt, struct packet_type, list);
2449}
2450
/* seq_file ->stop callback: leave the RCU read-side section entered by ptype_seq_start(). */
static void ptype_seq_stop(struct seq_file *seq, void *v)
{
	rcu_read_unlock();
}
2455
/*
 *	Print the address @sym in symbolic form.  With CONFIG_KALLSYMS the
 *	output is "symbol+0xoffset", prefixed by ":module:" when
 *	kallsyms_lookup() reports a module name.  If the lookup fails, or
 *	kallsyms is not configured, the raw pointer is printed in brackets.
 */
static void ptype_seq_decode(struct seq_file *seq, void *sym)
{
#ifdef CONFIG_KALLSYMS
	char namebuf[128];
	char *modname;
	const char *symname;
	unsigned long symsize, offset = 0;

	symname = kallsyms_lookup((unsigned long)sym, &symsize, &offset,
				  &modname, namebuf);

	if (symname) {
		char *sep = ":";

		/* No module name: print neither the name nor the colons. */
		if (modname == NULL) {
			sep = "";
			modname = sep;
		}
		seq_printf(seq, "%s%s%s%s+0x%lx", sep, modname, sep,
			   symname, offset);
		return;
	}
#endif

	seq_printf(seq, "[%p]", sym);
}
2480
2481static int ptype_seq_show(struct seq_file *seq, void *v)
2482{
2483	struct packet_type *pt = v;
2484
2485	if (v == SEQ_START_TOKEN)
2486		seq_puts(seq, "Type Device      Function\n");
2487	else {
2488		if (pt->type == htons(ETH_P_ALL))
2489			seq_puts(seq, "ALL ");
2490		else
2491			seq_printf(seq, "%04x", ntohs(pt->type));
2492
2493		seq_printf(seq, " %-8s ",
2494			   pt->dev ? pt->dev->name : "");
2495		ptype_seq_decode(seq,  pt->func);
2496		seq_putc(seq, '\n');
2497	}
2498
2499	return 0;
2500}
2501
/* seq_file iterator over the registered packet type handlers (see ptype_seq_open()). */
static const struct seq_operations ptype_seq_ops = {
	.start = ptype_seq_start,
	.next  = ptype_seq_next,
	.stop  = ptype_seq_stop,
	.show  = ptype_seq_show,
};
2508
/* ->open handler: attach the ptype_seq_ops iterator to @file; returns seq_open()'s result. */
static int ptype_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &ptype_seq_ops);
}
2513
/* File operations for the "ptype" proc entry created in dev_proc_init(). */
static const struct file_operations ptype_seq_fops = {
	.owner	 = THIS_MODULE,
	.open    = ptype_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};
2521
2522
2523static int __init dev_proc_init(void)
2524{
2525	int rc = -ENOMEM;
2526
2527	if (!proc_net_fops_create("dev", S_IRUGO, &dev_seq_fops))
2528		goto out;
2529	if (!proc_net_fops_create("softnet_stat", S_IRUGO, &softnet_seq_fops))
2530		goto out_dev;
2531	if (!proc_net_fops_create("ptype", S_IRUGO, &ptype_seq_fops))
2532		goto out_dev2;
2533
2534	if (wext_proc_init())
2535		goto out_softnet;
2536	rc = 0;
2537out:
2538	return rc;
2539out_softnet:
2540	proc_net_remove("ptype");
2541out_dev2:
2542	proc_net_remove("softnet_stat");
2543out_dev:
2544	proc_net_remove("dev");
2545	goto out;
2546}
2547#else
2548#define dev_proc_init() 0
2549#endif	/* CONFIG_PROC_FS */
2550
2551
2552/**
2553 *	netdev_set_master	-	set up master/slave pair
2554 *	@slave: slave device
2555 *	@master: new master device
2556 *
2557 *	Changes the master device of the slave. Pass %NULL to break the
2558 *	bonding. The caller must hold the RTNL semaphore. On a failure
2559 *	a negative errno code is returned. On success the reference counts
2560 *	are adjusted, %RTM_NEWLINK is sent to the routing socket and the
2561 *	function returns zero.
2562 */
2563int netdev_set_master(struct net_device *slave, struct net_device *master)
2564{
2565	struct net_device *old = slave->master;
2566
2567	ASSERT_RTNL();
2568
2569	if (master) {
2570		if (old)
2571			return -EBUSY;
2572		dev_hold(master);
2573	}
2574
2575	slave->master = master;
2576
2577	synchronize_net();
2578
2579	if (old)
2580		dev_put(old);
2581
2582	if (master)
2583		slave->flags |= IFF_SLAVE;
2584	else
2585		slave->flags &= ~IFF_SLAVE;
2586
2587	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
2588	return 0;
2589}
2590
2591static void __dev_set_promiscuity(struct net_device *dev, int inc)
2592{
2593	unsigned short old_flags = dev->flags;
2594
2595	ASSERT_RTNL();
2596
2597	if ((dev->promiscuity += inc) == 0)
2598		dev->flags &= ~IFF_PROMISC;
2599	else
2600		dev->flags |= IFF_PROMISC;
2601	if (dev->flags != old_flags) {
2602		printk(KERN_INFO "device %s %s promiscuous mode\n",
2603		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
2604							       "left");
2605		audit_log(current->audit_context, GFP_ATOMIC,
2606			AUDIT_ANOM_PROMISCUOUS,
2607			"dev=%s prom=%d old_prom=%d auid=%u",
2608			dev->name, (dev->flags & IFF_PROMISC),
2609			(old_flags & IFF_PROMISC),
2610			audit_get_loginuid(current->audit_context));
2611
2612		if (dev->change_rx_flags)
2613			dev->change_rx_flags(dev, IFF_PROMISC);
2614	}
2615}
2616
2617/**
2618 *	dev_set_promiscuity	- update promiscuity count on a device
2619 *	@dev: device
2620 *	@inc: modifier
2621 *
2622 *	Add or remove promiscuity from a device. While the count in the device
2623 *	remains above zero the interface remains promiscuous. Once it hits zero
2624 *	the device reverts back to normal filtering operation. A negative inc
2625 *	value is used to drop promiscuity on the device.
2626 */
2627void dev_set_promiscuity(struct net_device *dev, int inc)
2628{
2629	unsigned short old_flags = dev->flags;
2630
2631	__dev_set_promiscuity(dev, inc);
2632	if (dev->flags != old_flags)
2633		dev_set_rx_mode(dev);
2634}
2635
2636/**
2637 *	dev_set_allmulti	- update allmulti count on a device
2638 *	@dev: device
2639 *	@inc: modifier
2640 *
2641 *	Add or remove reception of all multicast frames to a device. While the
2642 *	count in the device remains above zero the interface remains listening
2643 *	to all interfaces. Once it hits zero the device reverts back to normal
2644 *	filtering operation. A negative @inc value is used to drop the counter
2645 *	when releasing a resource needing all multicasts.
2646 */
2647
2648void dev_set_allmulti(struct net_device *dev, int inc)
2649{
2650	unsigned short old_flags = dev->flags;
2651
2652	ASSERT_RTNL();
2653
2654	dev->flags |= IFF_ALLMULTI;
2655	if ((dev->allmulti += inc) == 0)
2656		dev->flags &= ~IFF_ALLMULTI;
2657	if (dev->flags ^ old_flags) {
2658		if (dev->change_rx_flags)
2659			dev->change_rx_flags(dev, IFF_ALLMULTI);
2660		dev_set_rx_mode(dev);
2661	}
2662}
2663
2664/*
2665 *	Upload unicast and multicast address lists to device and
2666 *	configure RX filtering. When the device doesn't support unicast
2667 *	filtering it is put in promiscous mode while unicast addresses
2668 *	are present.
2669 */
2670void __dev_set_rx_mode(struct net_device *dev)
2671{
2672	/* dev_open will call this function so the list will stay sane. */
2673	if (!(dev->flags&IFF_UP))
2674		return;
2675
2676	if (!netif_device_present(dev))
2677		return;
2678
2679	if (dev->set_rx_mode)
2680		dev->set_rx_mode(dev);
2681	else {
2682		/* Unicast addresses changes may only happen under the rtnl,
2683		 * therefore calling __dev_set_promiscuity here is safe.
2684		 */
2685		if (dev->uc_count > 0 && !dev->uc_promisc) {
2686			__dev_set_promiscuity(dev, 1);
2687			dev->uc_promisc = 1;
2688		} else if (dev->uc_count == 0 && dev->uc_promisc) {
2689			__dev_set_promiscuity(dev, -1);
2690			dev->uc_promisc = 0;
2691		}
2692
2693		if (dev->set_multicast_list)
2694			dev->set_multicast_list(dev);
2695	}
2696}
2697
/*
 *	Locked wrapper: run __dev_set_rx_mode() with the device's TX lock
 *	held via netif_tx_lock_bh()/netif_tx_unlock_bh().
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_tx_unlock_bh(dev);
}
2704
2705int __dev_addr_delete(struct dev_addr_list **list, int *count,
2706		      void *addr, int alen, int glbl)
2707{
2708	struct dev_addr_list *da;
2709
2710	for (; (da = *list) != NULL; list = &da->next) {
2711		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
2712		    alen == da->da_addrlen) {
2713			if (glbl) {
2714				int old_glbl = da->da_gusers;
2715				da->da_gusers = 0;
2716				if (old_glbl == 0)
2717					break;
2718			}
2719			if (--da->da_users)
2720				return 0;
2721
2722			*list = da->next;
2723			kfree(da);
2724			(*count)--;
2725			return 0;
2726		}
2727	}
2728	return -ENOENT;
2729}
2730
2731int __dev_addr_add(struct dev_addr_list **list, int *count,
2732		   void *addr, int alen, int glbl)
2733{
2734	struct dev_addr_list *da;
2735
2736	for (da = *list; da != NULL; da = da->next) {
2737		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
2738		    da->da_addrlen == alen) {
2739			if (glbl) {
2740				int old_glbl = da->da_gusers;
2741				da->da_gusers = 1;
2742				if (old_glbl)
2743					return 0;
2744			}
2745			da->da_users++;
2746			return 0;
2747		}
2748	}
2749
2750	da = kmalloc(sizeof(*da), GFP_ATOMIC);
2751	if (da == NULL)
2752		return -ENOMEM;
2753	memcpy(da->da_addr, addr, alen);
2754	da->da_addrlen = alen;
2755	da->da_users = 1;
2756	da->da_gusers = glbl ? 1 : 0;
2757	da->next = *list;
2758	*list = da;
2759	(*count)++;
2760	return 0;
2761}
2762
2763/**
2764 *	dev_unicast_delete	- Release secondary unicast address.
2765 *	@dev: device
2766 *	@addr: address to delete
2767 *	@alen: length of @addr
2768 *
2769 *	Release reference to a secondary unicast address and remove it
2770 *	from the device if the reference count drops to zero.
2771 *
2772 * 	The caller must hold the rtnl_mutex.
2773 */
2774int dev_unicast_delete(struct net_device *dev, void *addr, int alen)
2775{
2776	int err;
2777
2778	ASSERT_RTNL();
2779
2780	netif_tx_lock_bh(dev);
2781	err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0);
2782	if (!err)
2783		__dev_set_rx_mode(dev);
2784	netif_tx_unlock_bh(dev);
2785	return err;
2786}
2787EXPORT_SYMBOL(dev_unicast_delete);
2788
2789/**
2790 *	dev_unicast_add		- add a secondary unicast address
2791 *	@dev: device
2792 *	@addr: address to delete
2793 *	@alen: length of @addr
2794 *
2795 *	Add a secondary unicast address to the device or increase
2796 *	the reference count if it already exists.
2797 *
2798 *	The caller must hold the rtnl_mutex.
2799 */
2800int dev_unicast_add(struct net_device *dev, void *addr, int alen)
2801{
2802	int err;
2803
2804	ASSERT_RTNL();
2805
2806	netif_tx_lock_bh(dev);
2807	err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0);
2808	if (!err)
2809		__dev_set_rx_mode(dev);
2810	netif_tx_unlock_bh(dev);
2811	return err;
2812}
2813EXPORT_SYMBOL(dev_unicast_add);
2814
2815static void __dev_addr_discard(struct dev_addr_list **list)
2816{
2817	struct dev_addr_list *tmp;
2818
2819	while (*list != NULL) {
2820		tmp = *list;
2821		*list = tmp->next;
2822		if (tmp->da_users > tmp->da_gusers)
2823			printk("__dev_addr_discard: address leakage! "
2824			       "da_users=%d\n", tmp->da_users);
2825		kfree(tmp);
2826	}
2827}
2828
/*
 *	Throw away both the unicast and the multicast address list of @dev
 *	and reset their counters, all under the device's TX lock.
 */
static void dev_addr_discard(struct net_device *dev)
{
	netif_tx_lock_bh(dev);

	__dev_addr_discard(&dev->uc_list);
	dev->uc_count = 0;

	__dev_addr_discard(&dev->mc_list);
	dev->mc_count = 0;

	netif_tx_unlock_bh(dev);
}
2841
2842unsigned dev_get_flags(const struct net_device *dev)
2843{
2844	unsigned flags;
2845
2846	flags = (dev->flags & ~(IFF_PROMISC |
2847				IFF_ALLMULTI |
2848				IFF_RUNNING |
2849				IFF_LOWER_UP |
2850				IFF_DORMANT)) |
2851		(dev->gflags & (IFF_PROMISC |
2852				IFF_ALLMULTI));
2853
2854	if (netif_running(dev)) {
2855		if (netif_oper_up(dev))
2856			flags |= IFF_RUNNING;
2857		if (netif_carrier_ok(dev))
2858			flags |= IFF_LOWER_UP;
2859		if (netif_dormant(dev))
2860			flags |= IFF_DORMANT;
2861	}
2862
2863	return flags;
2864}
2865
2866int dev_change_flags(struct net_device *dev, unsigned flags)
2867{
2868	int ret, changes;
2869	int old_flags = dev->flags;
2870
2871	ASSERT_RTNL();
2872
2873	/*
2874	 *	Set the flags on our device.
2875	 */
2876
2877	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
2878			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
2879			       IFF_AUTOMEDIA)) |
2880		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
2881				    IFF_ALLMULTI));
2882
2883	/*
2884	 *	Load in the correct multicast list now the flags have changed.
2885	 */
2886
2887	if (dev->change_rx_flags && (dev->flags ^ flags) & IFF_MULTICAST)
2888		dev->change_rx_flags(dev, IFF_MULTICAST);
2889
2890	dev_set_rx_mode(dev);
2891
2892	/*
2893	 *	Have we downed the interface. We handle IFF_UP ourselves
2894	 *	according to user attempts to set it, rather than blindly
2895	 *	setting it.
2896	 */
2897
2898	ret = 0;
2899	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
2900		ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
2901
2902		if (!ret)
2903			dev_set_rx_mode(dev);
2904	}
2905
2906	if (dev->flags & IFF_UP &&
2907	    ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
2908					  IFF_VOLATILE)))
2909		raw_notifier_call_chain(&netdev_chain,
2910				NETDEV_CHANGE, dev);
2911
2912	if ((flags ^ dev->gflags) & IFF_PROMISC) {
2913		int inc = (flags & IFF_PROMISC) ? +1 : -1;
2914		dev->gflags ^= IFF_PROMISC;
2915		dev_set_promiscuity(dev, inc);
2916	}
2917
2918	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
2919	   is important. Some (broken) drivers set IFF_PROMISC, when
2920	   IFF_ALLMULTI is requested not asking us and not reporting.
2921	 */
2922	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
2923		int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
2924		dev->gflags ^= IFF_ALLMULTI;
2925		dev_set_allmulti(dev, inc);
2926	}
2927
2928	/* Exclude state transition flags, already notified */
2929	changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
2930	if (changes)
2931		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
2932
2933	return ret;
2934}
2935
2936int dev_set_mtu(struct net_device *dev, int new_mtu)
2937{
2938	int err;
2939
2940	if (new_mtu == dev->mtu)
2941		return 0;
2942
2943	/*	MTU must be positive.	 */
2944	if (new_mtu < 0)
2945		return -EINVAL;
2946
2947	if (!netif_device_present(dev))
2948		return -ENODEV;
2949
2950	err = 0;
2951	if (dev->change_mtu)
2952		err = dev->change_mtu(dev, new_mtu);
2953	else
2954		dev->mtu = new_mtu;
2955	if (!err && dev->flags & IFF_UP)
2956		raw_notifier_call_chain(&netdev_chain,
2957				NETDEV_CHANGEMTU, dev);
2958	return err;
2959}
2960
2961int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
2962{
2963	int err;
2964
2965	if (!dev->set_mac_address)
2966		return -EOPNOTSUPP;
2967	if (sa->sa_family != dev->type)
2968		return -EINVAL;
2969	if (!netif_device_present(dev))
2970		return -ENODEV;
2971	err = dev->set_mac_address(dev, sa);
2972	if (!err)
2973		raw_notifier_call_chain(&netdev_chain,
2974				NETDEV_CHANGEADDR, dev);
2975	return err;
2976}
2977
/*
 *	Perform the SIOCxIFxxx calls.
 *
 *	Looks up the device named in ifr->ifr_name and carries out @cmd on
 *	it.  "Get" requests store their result in @ifr; copying it back to
 *	user space and all locking are left to the caller (dev_ioctl()
 *	wraps each call in either dev_base_lock or the RTNL).
 *
 *	Returns 0 on success, -ENODEV if the device does not exist, or a
 *	negative errno specific to the request.
 */
static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
{
	int err;
	struct net_device *dev = __dev_get_by_name(ifr->ifr_name);

	if (!dev)
		return -ENODEV;

	switch (cmd) {
		case SIOCGIFFLAGS:	/* Get interface flags */
			ifr->ifr_flags = dev_get_flags(dev);
			return 0;

		case SIOCSIFFLAGS:	/* Set interface flags */
			return dev_change_flags(dev, ifr->ifr_flags);

		case SIOCGIFMETRIC:	/* Get the metric on the interface
					   (currently unused) */
			ifr->ifr_metric = 0;
			return 0;

		case SIOCSIFMETRIC:	/* Set the metric on the interface
					   (currently unused) */
			return -EOPNOTSUPP;

		case SIOCGIFMTU:	/* Get the MTU of a device */
			ifr->ifr_mtu = dev->mtu;
			return 0;

		case SIOCSIFMTU:	/* Set the MTU of a device */
			return dev_set_mtu(dev, ifr->ifr_mtu);

		case SIOCGIFHWADDR:
			/* Zero sa_data for an address-less device, otherwise
			 * copy as much of the address as fits into sa_data. */
			if (!dev->addr_len)
				memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
			else
				memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
				       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
			ifr->ifr_hwaddr.sa_family = dev->type;
			return 0;

		case SIOCSIFHWADDR:
			return dev_set_mac_address(dev, &ifr->ifr_hwaddr);

		case SIOCSIFHWBROADCAST:
			/* The supplied family must match the device type. */
			if (ifr->ifr_hwaddr.sa_family != dev->type)
				return -EINVAL;
			memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
			raw_notifier_call_chain(&netdev_chain,
					    NETDEV_CHANGEADDR, dev);
			return 0;

		case SIOCGIFMAP:
			ifr->ifr_map.mem_start = dev->mem_start;
			ifr->ifr_map.mem_end   = dev->mem_end;
			ifr->ifr_map.base_addr = dev->base_addr;
			ifr->ifr_map.irq       = dev->irq;
			ifr->ifr_map.dma       = dev->dma;
			ifr->ifr_map.port      = dev->if_port;
			return 0;

		case SIOCSIFMAP:
			/* Only possible when the driver offers set_config. */
			if (dev->set_config) {
				if (!netif_device_present(dev))
					return -ENODEV;
				return dev->set_config(dev, &ifr->ifr_map);
			}
			return -EOPNOTSUPP;

		case SIOCADDMULTI:
			/* Needs a driver multicast hook and an AF_UNSPEC
			 * address.  NOTE(review): the trailing 1 is presumably
			 * the "global" flag of dev_mc_add() - confirm there. */
			if (!dev->set_multicast_list ||
			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
				return -EINVAL;
			if (!netif_device_present(dev))
				return -ENODEV;
			return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
					  dev->addr_len, 1);

		case SIOCDELMULTI:
			/* Same preconditions as SIOCADDMULTI. */
			if (!dev->set_multicast_list ||
			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
				return -EINVAL;
			if (!netif_device_present(dev))
				return -ENODEV;
			return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
					     dev->addr_len, 1);

		case SIOCGIFINDEX:
			ifr->ifr_ifindex = dev->ifindex;
			return 0;

		case SIOCGIFTXQLEN:
			ifr->ifr_qlen = dev->tx_queue_len;
			return 0;

		case SIOCSIFTXQLEN:
			if (ifr->ifr_qlen < 0)
				return -EINVAL;
			dev->tx_queue_len = ifr->ifr_qlen;
			return 0;

		case SIOCSIFNAME:
			/* Force NUL termination of the requested new name. */
			ifr->ifr_newname[IFNAMSIZ-1] = '\0';
			return dev_change_name(dev, ifr->ifr_newname);

		/*
		 *	Unknown or private ioctl
		 *
		 *	Only the commands listed below are handed to the
		 *	driver's do_ioctl method; anything else is -EINVAL.
		 *	A listed command yields -EOPNOTSUPP when the driver
		 *	has no do_ioctl and -ENODEV when the device is not
		 *	present.
		 */

		default:
			if ((cmd >= SIOCDEVPRIVATE &&
			    cmd <= SIOCDEVPRIVATE + 15) ||
			    cmd == SIOCBONDENSLAVE ||
			    cmd == SIOCBONDRELEASE ||
			    cmd == SIOCBONDSETHWADDR ||
			    cmd == SIOCBONDSLAVEINFOQUERY ||
			    cmd == SIOCBONDINFOQUERY ||
			    cmd == SIOCBONDCHANGEACTIVE ||
			    cmd == SIOCGMIIPHY ||
			    cmd == SIOCGMIIREG ||
			    cmd == SIOCSMIIREG ||
			    cmd == SIOCBRADDIF ||
			    cmd == SIOCBRDELIF ||
			    cmd == SIOCWANDEV) {
				err = -EOPNOTSUPP;
				if (dev->do_ioctl) {
					if (netif_device_present(dev))
						err = dev->do_ioctl(dev, ifr,
								    cmd);
					else
						err = -ENODEV;
				}
			} else
				err = -EINVAL;

	}
	/* Reached from the default branch only; every case above returns. */
	return err;
}
3120
3121/*
3122 *	This function handles all "interface"-type I/O control requests. The actual
3123 *	'doing' part of this is dev_ifsioc above.
3124 */
3125
3126/**
3127 *	dev_ioctl	-	network device ioctl
3128 *	@cmd: command to issue
3129 *	@arg: pointer to a struct ifreq in user space
3130 *
3131 *	Issue ioctl functions to devices. This is normally called by the
3132 *	user space syscall interfaces but can sometimes be useful for
3133 *	other purposes. The return value is the return from the syscall if
3134 *	positive or a negative errno code on error.
3135 */
3136
int dev_ioctl(unsigned int cmd, void __user *arg)
{
	struct ifreq ifr;
	int ret;
	char *colon;

	/* One special case: SIOCGIFCONF takes ifconf argument
	   and requires shared lock, because it sleeps writing
	   to user space.
	 */

	if (cmd == SIOCGIFCONF) {
		rtnl_lock();
		ret = dev_ifconf((char __user *) arg);
		rtnl_unlock();
		return ret;
	}
	/* SIOCGIFNAME does its own user copies and locking in dev_ifname(). */
	if (cmd == SIOCGIFNAME)
		return dev_ifname((struct ifreq __user *)arg);

	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
		return -EFAULT;

	/* Make sure the user-supplied name is NUL terminated. */
	ifr.ifr_name[IFNAMSIZ-1] = 0;

	/*
	 * Cut the name at the first ':' so that the part before it is used
	 * for the device lookup.  The branches below that copy the ifreq
	 * back put the ':' back first, except for the default branch.
	 */
	colon = strchr(ifr.ifr_name, ':');
	if (colon)
		*colon = 0;

	/*
	 *	See which interface the caller is talking about.
	 */

	switch (cmd) {
		/*
		 *	These ioctl calls:
		 *	- can be done by all.
		 *	- atomic and do not require locking.
		 *	- return a value
		 *
		 *	They run under dev_base_lock held for reading.
		 */
		case SIOCGIFFLAGS:
		case SIOCGIFMETRIC:
		case SIOCGIFMTU:
		case SIOCGIFHWADDR:
		case SIOCGIFSLAVE:
		case SIOCGIFMAP:
		case SIOCGIFINDEX:
		case SIOCGIFTXQLEN:
			dev_load(ifr.ifr_name);
			read_lock(&dev_base_lock);
			ret = dev_ifsioc(&ifr, cmd);
			read_unlock(&dev_base_lock);
			if (!ret) {
				if (colon)
					*colon = ':';
				if (copy_to_user(arg, &ifr,
						 sizeof(struct ifreq)))
					ret = -EFAULT;
			}
			return ret;

		/* ethtool requests are dispatched to dev_ethtool() under the RTNL. */
		case SIOCETHTOOL:
			dev_load(ifr.ifr_name);
			rtnl_lock();
			ret = dev_ethtool(&ifr);
			rtnl_unlock();
			if (!ret) {
				if (colon)
					*colon = ':';
				if (copy_to_user(arg, &ifr,
						 sizeof(struct ifreq)))
					ret = -EFAULT;
			}
			return ret;

		/*
		 *	These ioctl calls:
		 *	- require superuser power.
		 *	- require strict serialization.
		 *	- return a value
		 */
		case SIOCGMIIPHY:
		case SIOCGMIIREG:
		case SIOCSIFNAME:
			if (!capable(CAP_NET_ADMIN))
				return -EPERM;
			dev_load(ifr.ifr_name);
			rtnl_lock();
			ret = dev_ifsioc(&ifr, cmd);
			rtnl_unlock();
			if (!ret) {
				if (colon)
					*colon = ':';
				if (copy_to_user(arg, &ifr,
						 sizeof(struct ifreq)))
					ret = -EFAULT;
			}
			return ret;

		/*
		 *	These ioctl calls:
		 *	- require superuser power.
		 *	- require strict serialization.
		 *	- do not return a value
		 */
		case SIOCSIFFLAGS:
		case SIOCSIFMETRIC:
		case SIOCSIFMTU:
		case SIOCSIFMAP:
		case SIOCSIFHWADDR:
		case SIOCSIFSLAVE:
		case SIOCADDMULTI:
		case SIOCDELMULTI:
		case SIOCSIFHWBROADCAST:
		case SIOCSIFTXQLEN:
		case SIOCSMIIREG:
		case SIOCBONDENSLAVE:
		case SIOCBONDRELEASE:
		case SIOCBONDSETHWADDR:
		case SIOCBONDCHANGEACTIVE:
		case SIOCBRADDIF:
		case SIOCBRDELIF:
			if (!capable(CAP_NET_ADMIN))
				return -EPERM;
			/* fall through */
		/* The two bonding queries skip the capability check above. */
		case SIOCBONDSLAVEINFOQUERY:
		case SIOCBONDINFOQUERY:
			dev_load(ifr.ifr_name);
			rtnl_lock();
			ret = dev_ifsioc(&ifr, cmd);
			rtnl_unlock();
			return ret;

		case SIOCGIFMEM:
			/* Get the per device memory space. We can add this but
			 * currently do not support it */
		case SIOCSIFMEM:
			/* Set the per device memory buffer space.
			 * Not applicable in our case */
		case SIOCSIFLINK:
			return -EINVAL;

		/*
		 *	Unknown or private ioctl.
		 *
		 *	SIOCWANDEV and the SIOCDEVPRIVATE range go to
		 *	dev_ifsioc() under the RTNL and copy the ifreq back on
		 *	success; wireless extension commands are passed to
		 *	wext_handle_ioctl(); everything else is -EINVAL.
		 */
		default:
			if (cmd == SIOCWANDEV ||
			    (cmd >= SIOCDEVPRIVATE &&
			     cmd <= SIOCDEVPRIVATE + 15)) {
				dev_load(ifr.ifr_name);
				rtnl_lock();
				ret = dev_ifsioc(&ifr, cmd);
				rtnl_unlock();
				if (!ret && copy_to_user(arg, &ifr,
							 sizeof(struct ifreq)))
					ret = -EFAULT;
				return ret;
			}
			/* Take care of Wireless Extensions */
			if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
				return wext_handle_ioctl(&ifr, cmd, arg);
			return -EINVAL;
	}
}
3301
3302
/**
 *	dev_new_index	-	allocate an ifindex
 *
 *	Returns a suitable unique value for a new device interface
 *	number.  The caller must hold the rtnl semaphore or the
 *	dev_base_lock to be sure it remains unique.
 *
 *	Candidates are taken from a static counter that restarts at 1 once
 *	it is no longer positive; values already owned by a device are
 *	skipped.
 */
static int dev_new_index(void)
{
	static int ifindex;

	do {
		if (++ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(ifindex));

	return ifindex;
}
3320
/*
 * Starts out as 1; register_netdevice() BUG()s while it is still non-zero.
 * NOTE(review): presumably cleared once net_dev_init() has run - confirm.
 */
static int dev_boot_phase = 1;
3322
/* Delayed registration/unregistration */
/* net_todo_list_lock guards additions to net_todo_list below. */
static DEFINE_SPINLOCK(net_todo_list_lock);
static struct list_head net_todo_list = LIST_HEAD_INIT(net_todo_list);

/* Append @dev (via its todo_list member) to the tail of net_todo_list. */
static void net_set_todo(struct net_device *dev)
{
	spin_lock(&net_todo_list_lock);
	list_add_tail(&dev->todo_list, &net_todo_list);
	spin_unlock(&net_todo_list_lock);
}
3333
3334/**
3335 *	register_netdevice	- register a network device
3336 *	@dev: device to register
3337 *
3338 *	Take a completed network device structure and add it to the kernel
3339 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
3340 *	chain. 0 is returned on success. A negative errno code is returned
3341 *	on a failure to set up the device, or if the name is a duplicate.
3342 *
3343 *	Callers must hold the rtnl semaphore. You may want
3344 *	register_netdev() instead of this.
3345 *
3346 *	BUGS:
3347 *	The locking appears insufficient to guarantee two parallel registers
3348 *	will not get the same name.
3349 */
3350
3351int register_netdevice(struct net_device *dev)
3352{
3353	struct hlist_head *head;
3354	struct hlist_node *p;
3355	int ret;
3356
3357	BUG_ON(dev_boot_phase);
3358	ASSERT_RTNL();
3359
3360	might_sleep();
3361
3362	/* When net_device's are persistent, this will be fatal. */
3363	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
3364
3365	spin_lock_init(&dev->queue_lock);
3366	spin_lock_init(&dev->_xmit_lock);
3367	netdev_set_lockdep_class(&dev->_xmit_lock, dev->type);
3368	dev->xmit_lock_owner = -1;
3369	spin_lock_init(&dev->ingress_lock);
3370
3371	dev->iflink = -1;
3372
3373	/* Init, if this function is available */
3374	if (dev->init) {
3375		ret = dev->init(dev);
3376		if (ret) {
3377			if (ret > 0)
3378				ret = -EIO;
3379			goto out;
3380		}
3381	}
3382
3383	if (!dev_valid_name(dev->name)) {
3384		ret = -EINVAL;
3385		goto err_uninit;
3386	}
3387
3388	dev->ifindex = dev_new_index();
3389	if (dev->iflink == -1)
3390		dev->iflink = dev->ifindex;
3391
3392	/* Check for existence of name */
3393	head = dev_name_hash(dev->name);
3394	hlist_for_each(p, head) {
3395		struct net_device *d
3396			= hlist_entry(p, struct net_device, name_hlist);
3397		if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
3398			ret = -EEXIST;
3399			goto err_uninit;
3400		}
3401	}
3402
3403	/* Fix illegal checksum combinations */
3404	if ((dev->features & NETIF_F_HW_CSUM) &&
3405	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
3406		printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
3407		       dev->name);
3408		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
3409	}
3410
3411	if ((dev->features & NETIF_F_NO_CSUM) &&
3412	    (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
3413		printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
3414		       dev->name);
3415		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
3416	}
3417
3418
3419	/* Fix illegal SG+CSUM combinations. */
3420	if ((dev->features & NETIF_F_SG) &&
3421	    !(dev->features & NETIF_F_ALL_CSUM)) {
3422		printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no checksum feature.\n",
3423		       dev->name);
3424		dev->features &= ~NETIF_F_SG;
3425	}
3426
3427	/* TSO requires that SG is present as well. */
3428	if ((dev->features & NETIF_F_TSO) &&
3429	    !(dev->features & NETIF_F_SG)) {
3430		printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no SG feature.\n",
3431		       dev->name);
3432		dev->features &= ~NETIF_F_TSO;
3433	}
3434	if (dev->features & NETIF_F_UFO) {
3435		if (!(dev->features & NETIF_F_HW_CSUM)) {
3436			printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
3437					"NETIF_F_HW_CSUM feature.\n",
3438							dev->name);
3439			dev->features &= ~NETIF_F_UFO;
3440		}
3441		if (!(dev->features & NETIF_F_SG)) {
3442			printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
3443					"NETIF_F_SG feature.\n",
3444					dev->name);
3445			dev->features &= ~NETIF_F_UFO;
3446		}
3447	}
3448
3449	/*
3450	 *	nil rebuild_header routine,
3451	 *	that should be never called and used as just bug trap.
3452	 */
3453
3454	if (!dev->rebuild_header)
3455		dev->rebuild_header = default_rebuild_header;
3456
3457	ret = netdev_register_sysfs(dev);
3458	if (ret)
3459		goto err_uninit;
3460	dev->reg_state = NETREG_REGISTERED;
3461
3462	/*
3463	 *	Default initial state at registry is that the
3464	 *	device is present.
3465	 */
3466
3467	set_bit(__LINK_STATE_PRESENT, &dev->state);
3468
3469	dev_init_scheduler(dev);
3470	write_lock_bh(&dev_base_lock);
3471	list_add_tail(&dev->dev_list, &dev_base_head);
3472	hlist_add_head(&dev->name_hlist, head);
3473	hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex));
3474	dev_hold(dev);
3475	write_unlock_bh(&dev_base_lock);
3476
3477	/* Notify protocols, that a new device appeared. */
3478	ret = raw_notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
3479	ret = notifier_to_errno(ret);
3480	if (ret)
3481		unregister_netdevice(dev);
3482
3483out:
3484	return ret;
3485
3486err_uninit:
3487	if (dev->uninit)
3488		dev->uninit(dev);
3489	goto out;
3490}
3491
3492/**
3493 *	register_netdev	- register a network device
3494 *	@dev: device to register
3495 *
3496 *	Take a completed network device structure and add it to the kernel
3497 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
3498 *	chain. 0 is returned on success. A negative errno code is returned
3499 *	on a failure to set up the device, or if the name is a duplicate.
3500 *
3501 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
3502 *	and expands the device name if you passed a format string to
3503 *	alloc_netdev.
3504 */
3505int register_netdev(struct net_device *dev)
3506{
3507	int err;
3508
3509	rtnl_lock();
3510
3511	/*
3512	 * If the name is a format string the caller wants us to do a
3513	 * name allocation.
3514	 */
3515	if (strchr(dev->name, '%')) {
3516		err = dev_alloc_name(dev, dev->name);
3517		if (err < 0)
3518			goto out;
3519	}
3520
3521	err = register_netdevice(dev);
3522out:
3523	rtnl_unlock();
3524	return err;
3525}
3526EXPORT_SYMBOL(register_netdev);
3527
3528/*
3529 * netdev_wait_allrefs - wait until all references are gone.
3530 *
3531 * This is called when unregistering network devices.
3532 *
3533 * Any protocol or device that holds a reference should register
3534 * for netdevice notification, and cleanup and put back the
3535 * reference if they receive an UNREGISTER event.
3536 * We can get stuck here if buggy protocols don't correctly
3537 * call dev_put.
3538 */
3539static void netdev_wait_allrefs(struct net_device *dev)
3540{
3541	unsigned long rebroadcast_time, warning_time;
3542
3543	rebroadcast_time = warning_time = jiffies;
3544	while (atomic_read(&dev->refcnt) != 0) {
3545		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
3546			rtnl_lock();
3547
3548			/* Rebroadcast unregister notification */
3549			raw_notifier_call_chain(&netdev_chain,
3550					    NETDEV_UNREGISTER, dev);
3551
3552			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
3553				     &dev->state)) {
3554				/* We must not have linkwatch events
3555				 * pending on unregister. If this
3556				 * happens, we simply run the queue
3557				 * unscheduled, resulting in a noop
3558				 * for this device.
3559				 */
3560				linkwatch_run_queue();
3561			}
3562
3563			__rtnl_unlock();
3564
3565			rebroadcast_time = jiffies;
3566		}
3567
3568		msleep(250);
3569
3570		if (time_after(jiffies, warning_time + 10 * HZ)) {
3571			printk(KERN_EMERG "unregister_netdevice: "
3572			       "waiting for %s to become free. Usage "
3573			       "count = %d\n",
3574			       dev->name, atomic_read(&dev->refcnt));
3575			warning_time = jiffies;
3576		}
3577	}
3578}
3579
3580/* The sequence is:
3581 *
3582 *	rtnl_lock();
3583 *	...
3584 *	register_netdevice(x1);
3585 *	register_netdevice(x2);
3586 *	...
3587 *	unregister_netdevice(y1);
3588 *	unregister_netdevice(y2);
3589 *      ...
3590 *	rtnl_unlock();
3591 *	free_netdev(y1);
3592 *	free_netdev(y2);
3593 *
3594 * We are invoked by rtnl_unlock() after it drops the semaphore.
3595 * This allows us to deal with problems:
3596 * 1) We can delete sysfs objects which invoke hotplug
3597 *    without deadlocking with linkwatch via keventd.
3598 * 2) Since we run with the RTNL semaphore not held, we can sleep
3599 *    safely in order to wait for the netdev refcnt to drop to zero.
3600 */
3601static DEFINE_MUTEX(net_todo_run_mutex);
3602void netdev_run_todo(void)
3603{
3604	struct list_head list;
3605
3606	/* Need to guard against multiple cpu's getting out of order. */
3607	mutex_lock(&net_todo_run_mutex);
3608
3609	/* Not safe to do outside the semaphore.  We must not return
3610	 * until all unregister events invoked by the local processor
3611	 * have been completed (either by this todo run, or one on
3612	 * another cpu).
3613	 */
3614	if (list_empty(&net_todo_list))
3615		goto out;
3616
3617	/* Snapshot list, allow later requests */
3618	spin_lock(&net_todo_list_lock);
3619	list_replace_init(&net_todo_list, &list);
3620	spin_unlock(&net_todo_list_lock);
3621
3622	while (!list_empty(&list)) {
3623		struct net_device *dev
3624			= list_entry(list.next, struct net_device, todo_list);
3625		list_del(&dev->todo_list);
3626
3627		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
3628			printk(KERN_ERR "network todo '%s' but state %d\n",
3629			       dev->name, dev->reg_state);
3630			dump_stack();
3631			continue;
3632		}
3633
3634		dev->reg_state = NETREG_UNREGISTERED;
3635
3636		netdev_wait_allrefs(dev);
3637
3638		/* paranoia */
3639		BUG_ON(atomic_read(&dev->refcnt));
3640		BUG_TRAP(!dev->ip_ptr);
3641		BUG_TRAP(!dev->ip6_ptr);
3642		BUG_TRAP(!dev->dn_ptr);
3643
3644		if (dev->destructor)
3645			dev->destructor(dev);
3646
3647		/* Free network device */
3648		kobject_put(&dev->dev.kobj);
3649	}
3650
3651out:
3652	mutex_unlock(&net_todo_run_mutex);
3653}
3654
/*
 * get_stats handler installed by alloc_netdev_mq(): returns the
 * statistics structure embedded in @dev.
 */
static struct net_device_stats *internal_stats(struct net_device *dev)
{
	return &dev->stats;
}
3659
3660/**
3661 *	alloc_netdev_mq - allocate network device
3662 *	@sizeof_priv:	size of private data to allocate space for
3663 *	@name:		device name format string
3664 *	@setup:		callback to initialize device
3665 *	@queue_count:	the number of subqueues to allocate
3666 *
3667 *	Allocates a struct net_device with private data area for driver use
3668 *	and performs basic initialization.  Also allocates subquue structs
3669 *	for each queue on the device at the end of the netdevice.
3670 */
3671struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
3672		void (*setup)(struct net_device *), unsigned int queue_count)
3673{
3674	void *p;
3675	struct net_device *dev;
3676	int alloc_size;
3677
3678	BUG_ON(strlen(name) >= sizeof(dev->name));
3679
3680	/* ensure 32-byte alignment of both the device and private area */
3681	alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST +
3682		     (sizeof(struct net_device_subqueue) * (queue_count - 1))) &
3683		     ~NETDEV_ALIGN_CONST;
3684	alloc_size += sizeof_priv + NETDEV_ALIGN_CONST;
3685
3686	p = kzalloc(alloc_size, GFP_KERNEL);
3687	if (!p) {
3688		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
3689		return NULL;
3690	}
3691
3692	dev = (struct net_device *)
3693		(((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
3694	dev->padded = (char *)dev - (char *)p;
3695
3696	if (sizeof_priv) {
3697		dev->priv = ((char *)dev +
3698			     ((sizeof(struct net_device) +
3699			       (sizeof(struct net_device_subqueue) *
3700				(queue_count - 1)) + NETDEV_ALIGN_CONST)
3701			      & ~NETDEV_ALIGN_CONST));
3702	}
3703
3704	dev->egress_subqueue_count = queue_count;
3705
3706	dev->get_stats = internal_stats;
3707	setup(dev);
3708	strcpy(dev->name, name);
3709	return dev;
3710}
3711EXPORT_SYMBOL(alloc_netdev_mq);
3712
3713/**
3714 *	free_netdev - free network device
3715 *	@dev: device
3716 *
3717 *	This function does the last stage of destroying an allocated device
3718 * 	interface. The reference to the device object is released.
3719 *	If this is the last reference then it will be freed.
3720 */
3721void free_netdev(struct net_device *dev)
3722{
3723#ifdef CONFIG_SYSFS
3724	/*  Compatibility with error handling in drivers */
3725	if (dev->reg_state == NETREG_UNINITIALIZED) {
3726		kfree((char *)dev - dev->padded);
3727		return;
3728	}
3729
3730	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
3731	dev->reg_state = NETREG_RELEASED;
3732
3733	/* will free via device release */
3734	put_device(&dev->dev);
3735#else
3736	kfree((char *)dev - dev->padded);
3737#endif
3738}
3739
/*
 * Synchronize with packet receive processing.
 *
 * Annotated with might_sleep(); returns after synchronize_rcu() has
 * returned.
 */
void synchronize_net(void)
{
	might_sleep();
	synchronize_rcu();
}
3746
/**
 *	unregister_netdevice - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.  It does not return a value.  A device
 *	that was never registered is reported with a warning and left
 *	untouched; any state other than %NETREG_REGISTERED beyond that
 *	is a BUG.
 *
 *	The device is queued with net_set_todo() so that the rest of the
 *	unregister is completed by netdev_run_todo().
 *
 *	Callers must hold the rtnl semaphore.  You may want
 *	unregister_netdev() instead of this.
 */

void unregister_netdevice(struct net_device *dev)
{
	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	/* Some devices call without registering for initialization unwind. */
	if (dev->reg_state == NETREG_UNINITIALIZED) {
		printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
				  "was registered\n", dev->name, dev);

		WARN_ON(1);
		return;
	}

	BUG_ON(dev->reg_state != NETREG_REGISTERED);

	/* If device is running, close it first. */
	if (dev->flags & IFF_UP)
		dev_close(dev);

	/* And unlink it from device chain. */
	write_lock_bh(&dev_base_lock);
	list_del(&dev->dev_list);
	hlist_del(&dev->name_hlist);
	hlist_del(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);

	/* netdev_run_todo() only processes devices in this state. */
	dev->reg_state = NETREG_UNREGISTERING;

	synchronize_net();

	/* Shutdown queueing discipline. */
	dev_shutdown(dev);


	/* Notify protocols, that we are about to destroy
	   this device. They should clean all the things.
	*/
	raw_notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);

	/*
	 *	Flush the unicast and multicast chains
	 */
	dev_addr_discard(dev);

	if (dev->uninit)
		dev->uninit(dev);

	/* Notifier chain MUST detach us from master device. */
	BUG_TRAP(!dev->master);

	/* Remove entries from sysfs */
	netdev_unregister_sysfs(dev);

	/* Finish processing unregister after unlock */
	net_set_todo(dev);

	synchronize_net();

	/* Drop a reference; register_netdevice() took one with dev_hold(). */
	dev_put(dev);
}
3820
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.  It does not return a value.
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore.  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}

EXPORT_SYMBOL(unregister_netdev);
3841
3842static int dev_cpu_callback(struct notifier_block *nfb,
3843			    unsigned long action,
3844			    void *ocpu)
3845{
3846	struct sk_buff **list_skb;
3847	struct net_device **list_net;
3848	struct sk_buff *skb;
3849	unsigned int cpu, oldcpu = (unsigned long)ocpu;
3850	struct softnet_data *sd, *oldsd;
3851
3852	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
3853		return NOTIFY_OK;
3854
3855	local_irq_disable();
3856	cpu = smp_processor_id();
3857	sd = &per_cpu(softnet_data, cpu);
3858	oldsd = &per_cpu(softnet_data, oldcpu);
3859
3860	/* Find end of our completion_queue. */
3861	list_skb = &sd->completion_queue;
3862	while (*list_skb)
3863		list_skb = &(*list_skb)->next;
3864	/* Append completion queue from offline CPU. */
3865	*list_skb = oldsd->completion_queue;
3866	oldsd->completion_queue = NULL;
3867
3868	/* Find end of our output_queue. */
3869	list_net = &sd->output_queue;
3870	while (*list_net)
3871		list_net = &(*list_net)->next_sched;
3872	/* Append output queue from offline CPU. */
3873	*list_net = oldsd->output_queue;
3874	oldsd->output_queue = NULL;
3875
3876	raise_softirq_irqoff(NET_TX_SOFTIRQ);
3877	local_irq_enable();
3878
3879	/* Process offline CPU's input_pkt_queue */
3880	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
3881		netif_rx(skb);
3882
3883	return NOTIFY_OK;
3884}
3885
3886#ifdef CONFIG_NET_DMA
3887/**
3888 * net_dma_rebalance - try to maintain one DMA channel per CPU
3889 * @net_dma: DMA client and associated data (lock, channels, channel_mask)
3890 *
3891 * This is called when the number of channels allocated to the net_dma client
3892 * changes.  The net_dma client tries to have one DMA channel per CPU.
3893 */
3894
3895static void net_dma_rebalance(struct net_dma *net_dma)
3896{
3897	unsigned int cpu, i, n, chan_idx;
3898	struct dma_chan *chan;
3899
3900	if (cpus_empty(net_dma->channel_mask)) {
3901		for_each_online_cpu(cpu)
3902			rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma, NULL);
3903		return;
3904	}
3905
3906	i = 0;
3907	cpu = first_cpu(cpu_online_map);
3908
3909	for_each_cpu_mask(chan_idx, net_dma->channel_mask) {
3910		chan = net_dma->channels[chan_idx];
3911
3912		n = ((num_online_cpus() / cpus_weight(net_dma->channel_mask))
3913		   + (i < (num_online_cpus() %
3914			cpus_weight(net_dma->channel_mask)) ? 1 : 0));
3915
3916		while(n) {
3917			per_cpu(softnet_data, cpu).net_dma = chan;
3918			cpu = next_cpu(cpu, cpu_online_map);
3919			n--;
3920		}
3921		i++;
3922	}
3923}
3924
3925/**
3926 * netdev_dma_event - event callback for the net_dma_client
3927 * @client: should always be net_dma_client
3928 * @chan: DMA channel for the event
3929 * @state: DMA state to be handled
3930 */
3931static enum dma_state_client
3932netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
3933	enum dma_state state)
3934{
3935	int i, found = 0, pos = -1;
3936	struct net_dma *net_dma =
3937		container_of(client, struct net_dma, client);
3938	enum dma_state_client ack = DMA_DUP; /* default: take no action */
3939
3940	spin_lock(&net_dma->lock);
3941	switch (state) {
3942	case DMA_RESOURCE_AVAILABLE:
3943		for (i = 0; i < NR_CPUS; i++)
3944			if (net_dma->channels[i] == chan) {
3945				found = 1;
3946				break;
3947			} else if (net_dma->channels[i] == NULL && pos < 0)
3948				pos = i;
3949
3950		if (!found && pos >= 0) {
3951			ack = DMA_ACK;
3952			net_dma->channels[pos] = chan;
3953			cpu_set(pos, net_dma->channel_mask);
3954			net_dma_rebalance(net_dma);
3955		}
3956		break;
3957	case DMA_RESOURCE_REMOVED:
3958		for (i = 0; i < NR_CPUS; i++)
3959			if (net_dma->channels[i] == chan) {
3960				found = 1;
3961				pos = i;
3962				break;
3963			}
3964
3965		if (found) {
3966			ack = DMA_ACK;
3967			cpu_clear(pos, net_dma->channel_mask);
3968			net_dma->channels[i] = NULL;
3969			net_dma_rebalance(net_dma);
3970		}
3971		break;
3972	default:
3973		break;
3974	}
3975	spin_unlock(&net_dma->lock);
3976
3977	return ack;
3978}
3979
/**
 * netdev_dma_register - register the networking subsystem as a DMA client
 *
 * Initialises net_dma.lock, sets DMA_MEMCPY in the client's capability
 * mask, then registers the client and requests channels for it.
 * Always returns 0.
 */
static int __init netdev_dma_register(void)
{
	spin_lock_init(&net_dma.lock);
	dma_cap_set(DMA_MEMCPY, net_dma.client.cap_mask);
	dma_async_client_register(&net_dma.client);
	dma_async_client_chan_request(&net_dma.client);
	return 0;
}

#else
/* Stub for builds without CONFIG_NET_DMA: always returns -ENODEV. */
static int __init netdev_dma_register(void) { return -ENODEV; }
#endif /* CONFIG_NET_DMA */
3995
3996/**
3997 *	netdev_compute_feature - compute conjunction of two feature sets
3998 *	@all: first feature set
3999 *	@one: second feature set
4000 *
4001 *	Computes a new feature set after adding a device with feature set
4002 *	@one to the master device with current feature set @all.  Returns
4003 *	the new feature set.
4004 */
4005int netdev_compute_features(unsigned long all, unsigned long one)
4006{
4007	/* if device needs checksumming, downgrade to hw checksumming */
4008	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
4009		all ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM;
4010
4011	/* if device can't do all checksum, downgrade to ipv4/ipv6 */
4012	if (all & NETIF_F_HW_CSUM && !(one & NETIF_F_HW_CSUM))
4013		all ^= NETIF_F_HW_CSUM
4014			| NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
4015
4016	if (one & NETIF_F_GSO)
4017		one |= NETIF_F_GSO_SOFTWARE;
4018	one |= NETIF_F_GSO;
4019
4020	/* If even one device supports robust GSO, enable it for all. */
4021	if (one & NETIF_F_GSO_ROBUST)
4022		all |= NETIF_F_GSO_ROBUST;
4023
4024	all &= one | NETIF_F_LLTX;
4025
4026	if (!(all & NETIF_F_ALL_CSUM))
4027		all &= ~NETIF_F_SG;
4028	if (!(all & NETIF_F_SG))
4029		all &= ~NETIF_F_GSO_MASK;
4030
4031	return all;
4032}
4033EXPORT_SYMBOL(netdev_compute_features);
4034
4035/*
4036 *	Initialize the DEV module. At boot time this walks the device list and
4037 *	unhooks any devices that fail to initialise (normally hardware not
4038 *	present) and leaves us with a valid list of present and active devices.
4039 *
4040 */
4041
4042/*
4043 *       This is called single threaded during boot, so no need
4044 *       to take the rtnl semaphore.
4045 */
4046static int __init net_dev_init(void)
4047{
4048	int i, rc = -ENOMEM;
4049
4050	BUG_ON(!dev_boot_phase);
4051
4052	if (dev_proc_init())
4053		goto out;
4054
4055	if (netdev_sysfs_init())
4056		goto out;
4057
4058	INIT_LIST_HEAD(&ptype_all);
4059	for (i = 0; i < 16; i++)
4060		INIT_LIST_HEAD(&ptype_base[i]);
4061
4062	for (i = 0; i < ARRAY_SIZE(dev_name_head); i++)
4063		INIT_HLIST_HEAD(&dev_name_head[i]);
4064
4065	for (i = 0; i < ARRAY_SIZE(dev_index_head); i++)
4066		INIT_HLIST_HEAD(&dev_index_head[i]);
4067
4068	/*
4069	 *	Initialise the packet receive queues.
4070	 */
4071
4072	for_each_possible_cpu(i) {
4073		struct softnet_data *queue;
4074
4075		queue = &per_cpu(softnet_data, i);
4076		skb_queue_head_init(&queue->input_pkt_queue);
4077		queue->completion_queue = NULL;
4078		INIT_LIST_HEAD(&queue->poll_list);
4079		set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
4080		queue->backlog_dev.weight = weight_p;
4081		queue->backlog_dev.poll = process_backlog;
4082		atomic_set(&queue->backlog_dev.refcnt, 1);
4083	}
4084
4085	netdev_dma_register();
4086
4087	dev_boot_phase = 0;
4088
4089	open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
4090	open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
4091
4092	hotcpu_notifier(dev_cpu_callback, 0);
4093	dst_init();
4094	dev_mcast_init();
4095	rc = 0;
4096out:
4097	return rc;
4098}
4099
4100subsys_initcall(net_dev_init);
4101
/* Symbols exported by this file. */
EXPORT_SYMBOL(__dev_get_by_index);
EXPORT_SYMBOL(__dev_get_by_name);
EXPORT_SYMBOL(__dev_remove_pack);
EXPORT_SYMBOL(dev_valid_name);
EXPORT_SYMBOL(dev_add_pack);
EXPORT_SYMBOL(dev_alloc_name);
EXPORT_SYMBOL(dev_close);
EXPORT_SYMBOL(dev_get_by_flags);
EXPORT_SYMBOL(dev_get_by_index);
EXPORT_SYMBOL(dev_get_by_name);
EXPORT_SYMBOL(dev_open);
EXPORT_SYMBOL(dev_queue_xmit);
EXPORT_SYMBOL(dev_remove_pack);
EXPORT_SYMBOL(dev_set_allmulti);
EXPORT_SYMBOL(dev_set_promiscuity);
EXPORT_SYMBOL(dev_change_flags);
EXPORT_SYMBOL(dev_set_mtu);
EXPORT_SYMBOL(dev_set_mac_address);
EXPORT_SYMBOL(free_netdev);
EXPORT_SYMBOL(netdev_boot_setup_check);
EXPORT_SYMBOL(netdev_set_master);
EXPORT_SYMBOL(netdev_state_change);
EXPORT_SYMBOL(netif_receive_skb);
EXPORT_SYMBOL(netif_rx);
EXPORT_SYMBOL(register_gifconf);
EXPORT_SYMBOL(register_netdevice);
EXPORT_SYMBOL(register_netdevice_notifier);
EXPORT_SYMBOL(skb_checksum_help);
EXPORT_SYMBOL(synchronize_net);
EXPORT_SYMBOL(unregister_netdevice);
EXPORT_SYMBOL(unregister_netdevice_notifier);
EXPORT_SYMBOL(net_enable_timestamp);
EXPORT_SYMBOL(net_disable_timestamp);
EXPORT_SYMBOL(dev_get_flags);

/* Bridge hooks: exported only when bridging is built in or as a module. */
#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
EXPORT_SYMBOL(br_handle_frame_hook);
EXPORT_SYMBOL(br_fdb_get_hook);
EXPORT_SYMBOL(br_fdb_put_hook);
#endif

#ifdef CONFIG_KMOD
EXPORT_SYMBOL(dev_load);
#endif

EXPORT_PER_CPU_SYMBOL(softnet_data);