net/core/dev.c at v2.6.12-rc6 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / core / dev.c
at v2.6.12-rc6 3384 lines 84 kB view raw
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <asm/system.h>
  77#include <linux/bitops.h>
  78#include <linux/config.h>
  79#include <linux/cpu.h>
  80#include <linux/types.h>
  81#include <linux/kernel.h>
  82#include <linux/sched.h>
  83#include <linux/string.h>
  84#include <linux/mm.h>
  85#include <linux/socket.h>
  86#include <linux/sockios.h>
  87#include <linux/errno.h>
  88#include <linux/interrupt.h>
  89#include <linux/if_ether.h>
  90#include <linux/netdevice.h>
  91#include <linux/etherdevice.h>
  92#include <linux/notifier.h>
  93#include <linux/skbuff.h>
  94#include <net/sock.h>
  95#include <linux/rtnetlink.h>
  96#include <linux/proc_fs.h>
  97#include <linux/seq_file.h>
  98#include <linux/stat.h>
  99#include <linux/if_bridge.h>
 100#include <linux/divert.h>
 101#include <net/dst.h>
 102#include <net/pkt_sched.h>
 103#include <net/checksum.h>
 104#include <linux/highmem.h>
 105#include <linux/init.h>
 106#include <linux/kmod.h>
 107#include <linux/module.h>
 108#include <linux/kallsyms.h>
 109#include <linux/netpoll.h>
 110#include <linux/rcupdate.h>
 111#include <linux/delay.h>
 112#ifdef CONFIG_NET_RADIO
 113#include <linux/wireless.h>		/* Note : will define WIRELESS_EXT */
 114#include <net/iw_handler.h>
 115#endif	/* CONFIG_NET_RADIO */
 116#include <asm/current.h>
 117
 118/* This define, if set, will randomly drop a packet when congestion
 119 * is more than moderate.  It helps fairness in the multi-interface
 120 * case when one of them is a hog, but it kills performance for the
 121 * single interface case so it is off now by default.
 122 */
 123#undef RAND_LIE
 124
 125/* Setting this will sample the queue lengths and thus congestion
 126 * via a timer instead of as each packet is received.
 127 */
 128#undef OFFLINE_SAMPLE
 129
/*
 *	The list of packet types we will receive (as opposed to discard)
 *	and the routines to invoke.
 *
 *	Why 16. Because with 16 the only overlap we get on a hash of the
 *	low nibble of the protocol value is RARP/SNAP/X.25.
 *
 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 *             sure which should go first, but I bet it won't make much
 *             difference if we are running VLANs.  The good news is that
 *             this protocol won't be in the list unless compiled in, so
 *             the average user (w/out VLANs) will not be adversely affected.
 *             --BLG
 *
 *		0800	IP
 *		8100    802.1Q VLAN
 *		0001	802.3
 *		0002	AX.25
 *		0004	802.2
 *		8035	RARP
 *		0005	SNAP
 *		0805	X.25
 *		0806	ARP
 *		8137	IPX
 *		0009	Localtalk
 *		86DD	IPv6
 */

/*
 * ptype_lock is taken with spin_lock_bh() by dev_add_pack() and
 * __dev_remove_pack() around every change they make to the two lists
 * below.  dev_queue_xmit_nit() walks ptype_all under rcu_read_lock()
 * without taking this lock.
 */
static DEFINE_SPINLOCK(ptype_lock);
static struct list_head ptype_base[16];	/* 16 way hashed list */
static struct list_head ptype_all;		/* Taps */

#ifdef OFFLINE_SAMPLE
/* Only built when OFFLINE_SAMPLE is defined (it is #undef'ed above). */
static void sample_queue(unsigned long dummy);
static struct timer_list samp_timer = TIMER_INITIALIZER(sample_queue, 0, 0);
#endif
 166
/*
 * The @dev_base list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading.
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
struct net_device *dev_base;
/* Initially points at dev_base itself, i.e. at the head of the empty list. */
static struct net_device **dev_tail = &dev_base;
DEFINE_RWLOCK(dev_base_lock);

EXPORT_SYMBOL(dev_base);
EXPORT_SYMBOL(dev_base_lock);

/* Each of the two lookup tables below has 1 << NETDEV_HASHBITS buckets. */
#define NETDEV_HASHBITS	8
/* Bucket chosen by dev_name_hash(); entries are linked through name_hlist. */
static struct hlist_head dev_name_head[1<<NETDEV_HASHBITS];
/* Bucket chosen by dev_index_hash(); entries are linked through index_hlist. */
static struct hlist_head dev_index_head[1<<NETDEV_HASHBITS];
 196
 197static inline struct hlist_head *dev_name_hash(const char *name)
 198{
 199	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 200	return &dev_name_head[hash & ((1<<NETDEV_HASHBITS)-1)];
 201}
 202
 203static inline struct hlist_head *dev_index_hash(int ifindex)
 204{
 205	return &dev_index_head[ifindex & ((1<<NETDEV_HASHBITS)-1)];
 206}
 207
/*
 *	Our notifier list: head of the chain that the NETDEV_* events raised
 *	in this file are delivered to through notifier_call_chain().
 */

static struct notifier_block *netdev_chain;

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */
DEFINE_PER_CPU(struct softnet_data, softnet_data) = { 0, };

/*
 *	sysfs glue.  With CONFIG_SYSFS the real functions are declared here;
 *	without it the init/register hooks evaluate to 0 and the unregister
 *	hook is an empty statement.
 */
#ifdef CONFIG_SYSFS
extern int netdev_sysfs_init(void);
extern int netdev_register_sysfs(struct net_device *);
extern void netdev_unregister_sysfs(struct net_device *);
#else
#define netdev_sysfs_init()	 	(0)
#define netdev_register_sysfs(dev)	(0)
#define	netdev_unregister_sysfs(dev)	do { } while(0)
#endif
 229
 230
 231/*******************************************************************************
 232
 233		Protocol management and registration routines
 234
 235*******************************************************************************/
 236
/*
 *	For efficiency: counter of ETH_P_ALL handlers.  dev_add_pack()
 *	increments it and __dev_remove_pack() decrements it, both while
 *	holding ptype_lock.
 */

int netdev_nit;
 242
 243/*
 244 *	Add a protocol ID to the list. Now that the input handler is
 245 *	smarter we can dispense with all the messy stuff that used to be
 246 *	here.
 247 *
 248 *	BEWARE!!! Protocol handlers, mangling input packets,
 249 *	MUST BE last in hash buckets and checking protocol handlers
 250 *	MUST start from promiscuous ptype_all chain in net_bh.
 251 *	It is true now, do not change it.
 252 *	Explanation follows: if protocol handler, mangling packet, will
 253 *	be the first on list, it is not able to sense, that packet
 254 *	is cloned and should be copied-on-write, so that it will
 255 *	change it and subsequent readers will get broken packet.
 256 *							--ANK (980803)
 257 */
 258
 259/**
 260 *	dev_add_pack - add packet handler
 261 *	@pt: packet type declaration
 262 *
 263 *	Add a protocol handler to the networking stack. The passed &packet_type
 264 *	is linked into kernel lists and may not be freed until it has been
 265 *	removed from the kernel lists.
 266 *
 267 *	This call does not sleep therefore it can not 
 268 *	guarantee all CPU's that are in middle of receiving packets
 269 *	will see the new packet type (until the next received packet).
 270 */
 271
 272void dev_add_pack(struct packet_type *pt)
 273{
 274	int hash;
 275
 276	spin_lock_bh(&ptype_lock);
 277	if (pt->type == htons(ETH_P_ALL)) {
 278		netdev_nit++;
 279		list_add_rcu(&pt->list, &ptype_all);
 280	} else {
 281		hash = ntohs(pt->type) & 15;
 282		list_add_rcu(&pt->list, &ptype_base[hash]);
 283	}
 284	spin_unlock_bh(&ptype_lock);
 285}
 286
 287extern void linkwatch_run_queue(void);
 288
 289
 290
 291/**
 292 *	__dev_remove_pack	 - remove packet handler
 293 *	@pt: packet type declaration
 294 *
 295 *	Remove a protocol handler that was previously added to the kernel
 296 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 297 *	from the kernel lists and can be freed or reused once this function
 298 *	returns. 
 299 *
 300 *      The packet type might still be in use by receivers
 301 *	and must not be freed until after all the CPU's have gone
 302 *	through a quiescent state.
 303 */
 304void __dev_remove_pack(struct packet_type *pt)
 305{
 306	struct list_head *head;
 307	struct packet_type *pt1;
 308
 309	spin_lock_bh(&ptype_lock);
 310
 311	if (pt->type == htons(ETH_P_ALL)) {
 312		netdev_nit--;
 313		head = &ptype_all;
 314	} else
 315		head = &ptype_base[ntohs(pt->type) & 15];
 316
 317	list_for_each_entry(pt1, head, list) {
 318		if (pt == pt1) {
 319			list_del_rcu(&pt->list);
 320			goto out;
 321		}
 322	}
 323
 324	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 325out:
 326	spin_unlock_bh(&ptype_lock);
 327}
 328/**
 329 *	dev_remove_pack	 - remove packet handler
 330 *	@pt: packet type declaration
 331 *
 332 *	Remove a protocol handler that was previously added to the kernel
 333 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 334 *	from the kernel lists and can be freed or reused once this function
 335 *	returns.
 336 *
 337 *	This call sleeps to guarantee that no CPU is looking at the packet
 338 *	type after return.
 339 */
 340void dev_remove_pack(struct packet_type *pt)
 341{
 342	__dev_remove_pack(pt);
 343	
 344	synchronize_net();
 345}
 346
 347/******************************************************************************
 348
 349		      Device Boot-time Settings Routines
 350
 351*******************************************************************************/
 352
/*
 * Boot time configuration table.  Filled by netdev_boot_setup_add() and
 * consulted by netdev_boot_setup_check() and netdev_boot_base().  The add
 * and check routines treat a slot whose name begins with '\0' or ' ' as
 * unused.
 */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 355
 356/**
 357 *	netdev_boot_setup_add	- add new setup entry
 358 *	@name: name of the device
 359 *	@map: configured settings for the device
 360 *
 361 *	Adds new setup entry to the dev_boot_setup list.  The function
 362 *	returns 0 on error and 1 on success.  This is a generic routine to
 363 *	all netdevices.
 364 */
 365static int netdev_boot_setup_add(char *name, struct ifmap *map)
 366{
 367	struct netdev_boot_setup *s;
 368	int i;
 369
 370	s = dev_boot_setup;
 371	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 372		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 373			memset(s[i].name, 0, sizeof(s[i].name));
 374			strcpy(s[i].name, name);
 375			memcpy(&s[i].map, map, sizeof(s[i].map));
 376			break;
 377		}
 378	}
 379
 380	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 381}
 382
 383/**
 384 *	netdev_boot_setup_check	- check boot time settings
 385 *	@dev: the netdevice
 386 *
 387 * 	Check boot time settings for the device.
 388 *	The found settings are set for the device to be used
 389 *	later in the device probing.
 390 *	Returns 0 if no settings found, 1 if they are.
 391 */
 392int netdev_boot_setup_check(struct net_device *dev)
 393{
 394	struct netdev_boot_setup *s = dev_boot_setup;
 395	int i;
 396
 397	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 398		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 399		    !strncmp(dev->name, s[i].name, strlen(s[i].name))) {
 400			dev->irq 	= s[i].map.irq;
 401			dev->base_addr 	= s[i].map.base_addr;
 402			dev->mem_start 	= s[i].map.mem_start;
 403			dev->mem_end 	= s[i].map.mem_end;
 404			return 1;
 405		}
 406	}
 407	return 0;
 408}
 409
 410
 411/**
 412 *	netdev_boot_base	- get address from boot time settings
 413 *	@prefix: prefix for network device
 414 *	@unit: id for network device
 415 *
 416 * 	Check boot time settings for the base address of device.
 417 *	The found settings are set for the device to be used
 418 *	later in the device probing.
 419 *	Returns 0 if no settings found.
 420 */
 421unsigned long netdev_boot_base(const char *prefix, int unit)
 422{
 423	const struct netdev_boot_setup *s = dev_boot_setup;
 424	char name[IFNAMSIZ];
 425	int i;
 426
 427	sprintf(name, "%s%d", prefix, unit);
 428
 429	/*
 430	 * If device already registered then return base of 1
 431	 * to indicate not to probe for this interface
 432	 */
 433	if (__dev_get_by_name(name))
 434		return 1;
 435
 436	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 437		if (!strcmp(name, s[i].name))
 438			return s[i].map.base_addr;
 439	return 0;
 440}
 441
 442/*
 443 * Saves at boot time configured settings for any netdevice.
 444 */
 445int __init netdev_boot_setup(char *str)
 446{
 447	int ints[5];
 448	struct ifmap map;
 449
 450	str = get_options(str, ARRAY_SIZE(ints), ints);
 451	if (!str || !*str)
 452		return 0;
 453
 454	/* Save settings */
 455	memset(&map, 0, sizeof(map));
 456	if (ints[0] > 0)
 457		map.irq = ints[1];
 458	if (ints[0] > 1)
 459		map.base_addr = ints[2];
 460	if (ints[0] > 2)
 461		map.mem_start = ints[3];
 462	if (ints[0] > 3)
 463		map.mem_end = ints[4];
 464
 465	/* Add new entry to the list */
 466	return netdev_boot_setup_add(str, &map);
 467}
 468
 469__setup("netdev=", netdev_boot_setup);
 470
 471/*******************************************************************************
 472
 473			    Device Interface Subroutines
 474
 475*******************************************************************************/
 476
 477/**
 478 *	__dev_get_by_name	- find a device by its name
 479 *	@name: name to find
 480 *
 481 *	Find an interface by name. Must be called under RTNL semaphore
 482 *	or @dev_base_lock. If the name is found a pointer to the device
 483 *	is returned. If the name is not found then %NULL is returned. The
 484 *	reference counters are not incremented so the caller must be
 485 *	careful with locks.
 486 */
 487
 488struct net_device *__dev_get_by_name(const char *name)
 489{
 490	struct hlist_node *p;
 491
 492	hlist_for_each(p, dev_name_hash(name)) {
 493		struct net_device *dev
 494			= hlist_entry(p, struct net_device, name_hlist);
 495		if (!strncmp(dev->name, name, IFNAMSIZ))
 496			return dev;
 497	}
 498	return NULL;
 499}
 500
 501/**
 502 *	dev_get_by_name		- find a device by its name
 503 *	@name: name to find
 504 *
 505 *	Find an interface by name. This can be called from any
 506 *	context and does its own locking. The returned handle has
 507 *	the usage count incremented and the caller must use dev_put() to
 508 *	release it when it is no longer needed. %NULL is returned if no
 509 *	matching device is found.
 510 */
 511
 512struct net_device *dev_get_by_name(const char *name)
 513{
 514	struct net_device *dev;
 515
 516	read_lock(&dev_base_lock);
 517	dev = __dev_get_by_name(name);
 518	if (dev)
 519		dev_hold(dev);
 520	read_unlock(&dev_base_lock);
 521	return dev;
 522}
 523
 524/**
 525 *	__dev_get_by_index - find a device by its ifindex
 526 *	@ifindex: index of device
 527 *
 528 *	Search for an interface by index. Returns %NULL if the device
 529 *	is not found or a pointer to the device. The device has not
 530 *	had its reference counter increased so the caller must be careful
 531 *	about locking. The caller must hold either the RTNL semaphore
 532 *	or @dev_base_lock.
 533 */
 534
 535struct net_device *__dev_get_by_index(int ifindex)
 536{
 537	struct hlist_node *p;
 538
 539	hlist_for_each(p, dev_index_hash(ifindex)) {
 540		struct net_device *dev
 541			= hlist_entry(p, struct net_device, index_hlist);
 542		if (dev->ifindex == ifindex)
 543			return dev;
 544	}
 545	return NULL;
 546}
 547
 548
 549/**
 550 *	dev_get_by_index - find a device by its ifindex
 551 *	@ifindex: index of device
 552 *
 553 *	Search for an interface by index. Returns NULL if the device
 554 *	is not found or a pointer to the device. The device returned has
 555 *	had a reference added and the pointer is safe until the user calls
 556 *	dev_put to indicate they have finished with it.
 557 */
 558
 559struct net_device *dev_get_by_index(int ifindex)
 560{
 561	struct net_device *dev;
 562
 563	read_lock(&dev_base_lock);
 564	dev = __dev_get_by_index(ifindex);
 565	if (dev)
 566		dev_hold(dev);
 567	read_unlock(&dev_base_lock);
 568	return dev;
 569}
 570
 571/**
 572 *	dev_getbyhwaddr - find a device by its hardware address
 573 *	@type: media type of device
 574 *	@ha: hardware address
 575 *
 576 *	Search for an interface by MAC address. Returns NULL if the device
 577 *	is not found or a pointer to the device. The caller must hold the
 578 *	rtnl semaphore. The returned device has not had its ref count increased
 579 *	and the caller must therefore be careful about locking
 580 *
 581 *	BUGS:
 582 *	If the API was consistent this would be __dev_get_by_hwaddr
 583 */
 584
 585struct net_device *dev_getbyhwaddr(unsigned short type, char *ha)
 586{
 587	struct net_device *dev;
 588
 589	ASSERT_RTNL();
 590
 591	for (dev = dev_base; dev; dev = dev->next)
 592		if (dev->type == type &&
 593		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 594			break;
 595	return dev;
 596}
 597
 598struct net_device *dev_getfirstbyhwtype(unsigned short type)
 599{
 600	struct net_device *dev;
 601
 602	rtnl_lock();
 603	for (dev = dev_base; dev; dev = dev->next) {
 604		if (dev->type == type) {
 605			dev_hold(dev);
 606			break;
 607		}
 608	}
 609	rtnl_unlock();
 610	return dev;
 611}
 612
 613EXPORT_SYMBOL(dev_getfirstbyhwtype);
 614
 615/**
 616 *	dev_get_by_flags - find any device with given flags
 617 *	@if_flags: IFF_* values
 618 *	@mask: bitmask of bits in if_flags to check
 619 *
 620 *	Search for any interface with the given flags. Returns NULL if a device
 621 *	is not found or a pointer to the device. The device returned has 
 622 *	had a reference added and the pointer is safe until the user calls
 623 *	dev_put to indicate they have finished with it.
 624 */
 625
 626struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mask)
 627{
 628	struct net_device *dev;
 629
 630	read_lock(&dev_base_lock);
 631	for (dev = dev_base; dev != NULL; dev = dev->next) {
 632		if (((dev->flags ^ if_flags) & mask) == 0) {
 633			dev_hold(dev);
 634			break;
 635		}
 636	}
 637	read_unlock(&dev_base_lock);
 638	return dev;
 639}
 640
 641/**
 642 *	dev_valid_name - check if name is okay for network device
 643 *	@name: name string
 644 *
 645 *	Network device names need to be valid file names to
 646 *	to allow sysfs to work
 647 */
 648static int dev_valid_name(const char *name)
 649{
 650	return !(*name == '\0' 
 651		 || !strcmp(name, ".")
 652		 || !strcmp(name, "..")
 653		 || strchr(name, '/'));
 654}
 655
 656/**
 657 *	dev_alloc_name - allocate a name for a device
 658 *	@dev: device
 659 *	@name: name format string
 660 *
 661 *	Passed a format string - eg "lt%d" it will try and find a suitable
 662 *	id. Not efficient for many devices, not called a lot. The caller
 663 *	must hold the dev_base or rtnl lock while allocating the name and
 664 *	adding the device in order to avoid duplicates. Returns the number
 665 *	of the unit assigned or a negative errno code.
 666 */
 667
 668int dev_alloc_name(struct net_device *dev, const char *name)
 669{
 670	int i = 0;
 671	char buf[IFNAMSIZ];
 672	const char *p;
 673	const int max_netdevices = 8*PAGE_SIZE;
 674	long *inuse;
 675	struct net_device *d;
 676
 677	p = strnchr(name, IFNAMSIZ-1, '%');
 678	if (p) {
 679		/*
 680		 * Verify the string as this thing may have come from
 681		 * the user.  There must be either one "%d" and no other "%"
 682		 * characters.
 683		 */
 684		if (p[1] != 'd' || strchr(p + 2, '%'))
 685			return -EINVAL;
 686
 687		/* Use one page as a bit array of possible slots */
 688		inuse = (long *) get_zeroed_page(GFP_ATOMIC);
 689		if (!inuse)
 690			return -ENOMEM;
 691
 692		for (d = dev_base; d; d = d->next) {
 693			if (!sscanf(d->name, name, &i))
 694				continue;
 695			if (i < 0 || i >= max_netdevices)
 696				continue;
 697
 698			/*  avoid cases where sscanf is not exact inverse of printf */
 699			snprintf(buf, sizeof(buf), name, i);
 700			if (!strncmp(buf, d->name, IFNAMSIZ))
 701				set_bit(i, inuse);
 702		}
 703
 704		i = find_first_zero_bit(inuse, max_netdevices);
 705		free_page((unsigned long) inuse);
 706	}
 707
 708	snprintf(buf, sizeof(buf), name, i);
 709	if (!__dev_get_by_name(buf)) {
 710		strlcpy(dev->name, buf, IFNAMSIZ);
 711		return i;
 712	}
 713
 714	/* It is possible to run out of possible slots
 715	 * when the name is long and there isn't enough space left
 716	 * for the digits, or if all bits are used.
 717	 */
 718	return -ENFILE;
 719}
 720
 721
 722/**
 723 *	dev_change_name - change name of a device
 724 *	@dev: device
 725 *	@newname: name (or format string) must be at least IFNAMSIZ
 726 *
 727 *	Change name of a device, can pass format strings "eth%d".
 728 *	for wildcarding.
 729 */
 730int dev_change_name(struct net_device *dev, char *newname)
 731{
 732	int err = 0;
 733
 734	ASSERT_RTNL();
 735
 736	if (dev->flags & IFF_UP)
 737		return -EBUSY;
 738
 739	if (!dev_valid_name(newname))
 740		return -EINVAL;
 741
 742	if (strchr(newname, '%')) {
 743		err = dev_alloc_name(dev, newname);
 744		if (err < 0)
 745			return err;
 746		strcpy(newname, dev->name);
 747	}
 748	else if (__dev_get_by_name(newname))
 749		return -EEXIST;
 750	else
 751		strlcpy(dev->name, newname, IFNAMSIZ);
 752
 753	err = class_device_rename(&dev->class_dev, dev->name);
 754	if (!err) {
 755		hlist_del(&dev->name_hlist);
 756		hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name));
 757		notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev);
 758	}
 759
 760	return err;
 761}
 762
 763/**
 764 *	netdev_features_change - device changes fatures
 765 *	@dev: device to cause notification
 766 *
 767 *	Called to indicate a device has changed features.
 768 */
 769void netdev_features_change(struct net_device *dev)
 770{
 771	notifier_call_chain(&netdev_chain, NETDEV_FEAT_CHANGE, dev);
 772}
 773EXPORT_SYMBOL(netdev_features_change);
 774
 775/**
 776 *	netdev_state_change - device changes state
 777 *	@dev: device to cause notification
 778 *
 779 *	Called to indicate a device has changed state. This function calls
 780 *	the notifier chains for netdev_chain and sends a NEWLINK message
 781 *	to the routing socket.
 782 */
 783void netdev_state_change(struct net_device *dev)
 784{
 785	if (dev->flags & IFF_UP) {
 786		notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
 787		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
 788	}
 789}
 790
 791/**
 792 *	dev_load 	- load a network module
 793 *	@name: name of interface
 794 *
 795 *	If a network interface is not present and the process has suitable
 796 *	privileges this function loads the module. If module loading is not
 797 *	available in this kernel then it becomes a nop.
 798 */
 799
 800void dev_load(const char *name)
 801{
 802	struct net_device *dev;  
 803
 804	read_lock(&dev_base_lock);
 805	dev = __dev_get_by_name(name);
 806	read_unlock(&dev_base_lock);
 807
 808	if (!dev && capable(CAP_SYS_MODULE))
 809		request_module("%s", name);
 810}
 811
 812static int default_rebuild_header(struct sk_buff *skb)
 813{
 814	printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n",
 815	       skb->dev ? skb->dev->name : "NULL!!!");
 816	kfree_skb(skb);
 817	return 1;
 818}
 819
 820
 821/**
 822 *	dev_open	- prepare an interface for use.
 823 *	@dev:	device to open
 824 *
 825 *	Takes a device from down to up state. The device's private open
 826 *	function is invoked and then the multicast lists are loaded. Finally
 827 *	the device is moved into the up state and a %NETDEV_UP message is
 828 *	sent to the netdev notifier chain.
 829 *
 830 *	Calling this function on an active interface is a nop. On a failure
 831 *	a negative errno code is returned.
 832 */
 833int dev_open(struct net_device *dev)
 834{
 835	int ret = 0;
 836
 837	/*
 838	 *	Is it already up?
 839	 */
 840
 841	if (dev->flags & IFF_UP)
 842		return 0;
 843
 844	/*
 845	 *	Is it even present?
 846	 */
 847	if (!netif_device_present(dev))
 848		return -ENODEV;
 849
 850	/*
 851	 *	Call device private open method
 852	 */
 853	set_bit(__LINK_STATE_START, &dev->state);
 854	if (dev->open) {
 855		ret = dev->open(dev);
 856		if (ret)
 857			clear_bit(__LINK_STATE_START, &dev->state);
 858	}
 859
 860 	/*
 861	 *	If it went open OK then:
 862	 */
 863
 864	if (!ret) {
 865		/*
 866		 *	Set the flags.
 867		 */
 868		dev->flags |= IFF_UP;
 869
 870		/*
 871		 *	Initialize multicasting status
 872		 */
 873		dev_mc_upload(dev);
 874
 875		/*
 876		 *	Wakeup transmit queue engine
 877		 */
 878		dev_activate(dev);
 879
 880		/*
 881		 *	... and announce new interface.
 882		 */
 883		notifier_call_chain(&netdev_chain, NETDEV_UP, dev);
 884	}
 885	return ret;
 886}
 887
 888/**
 889 *	dev_close - shutdown an interface.
 890 *	@dev: device to shutdown
 891 *
 892 *	This function moves an active device into down state. A
 893 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 894 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 895 *	chain.
 896 */
 897int dev_close(struct net_device *dev)
 898{
 899	if (!(dev->flags & IFF_UP))
 900		return 0;
 901
 902	/*
 903	 *	Tell people we are going down, so that they can
 904	 *	prepare to death, when device is still operating.
 905	 */
 906	notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev);
 907
 908	dev_deactivate(dev);
 909
 910	clear_bit(__LINK_STATE_START, &dev->state);
 911
 912	/* Synchronize to scheduled poll. We cannot touch poll list,
 913	 * it can be even on different cpu. So just clear netif_running(),
 914	 * and wait when poll really will happen. Actually, the best place
 915	 * for this is inside dev->stop() after device stopped its irq
 916	 * engine, but this requires more changes in devices. */
 917
 918	smp_mb__after_clear_bit(); /* Commit netif_running(). */
 919	while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) {
 920		/* No hurry. */
 921		current->state = TASK_INTERRUPTIBLE;
 922		schedule_timeout(1);
 923	}
 924
 925	/*
 926	 *	Call the device specific close. This cannot fail.
 927	 *	Only if device is UP
 928	 *
 929	 *	We allow it to be called even after a DETACH hot-plug
 930	 *	event.
 931	 */
 932	if (dev->stop)
 933		dev->stop(dev);
 934
 935	/*
 936	 *	Device is now down.
 937	 */
 938
 939	dev->flags &= ~IFF_UP;
 940
 941	/*
 942	 * Tell people we are down
 943	 */
 944	notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev);
 945
 946	return 0;
 947}
 948
 949
 950/*
 951 *	Device change register/unregister. These are not inline or static
 952 *	as we export them to the world.
 953 */
 954
 955/**
 956 *	register_netdevice_notifier - register a network notifier block
 957 *	@nb: notifier
 958 *
 959 *	Register a notifier to be called when network device events occur.
 960 *	The notifier passed is linked into the kernel structures and must
 961 *	not be reused until it has been unregistered. A negative errno code
 962 *	is returned on a failure.
 963 *
 964 * 	When registered all registration and up events are replayed
 965 *	to the new notifier to allow device to have a race free 
 966 *	view of the network device list.
 967 */
 968
 969int register_netdevice_notifier(struct notifier_block *nb)
 970{
 971	struct net_device *dev;
 972	int err;
 973
 974	rtnl_lock();
 975	err = notifier_chain_register(&netdev_chain, nb);
 976	if (!err) {
 977		for (dev = dev_base; dev; dev = dev->next) {
 978			nb->notifier_call(nb, NETDEV_REGISTER, dev);
 979
 980			if (dev->flags & IFF_UP) 
 981				nb->notifier_call(nb, NETDEV_UP, dev);
 982		}
 983	}
 984	rtnl_unlock();
 985	return err;
 986}
 987
 988/**
 989 *	unregister_netdevice_notifier - unregister a network notifier block
 990 *	@nb: notifier
 991 *
 992 *	Unregister a notifier previously registered by
 993 *	register_netdevice_notifier(). The notifier is unlinked into the
 994 *	kernel structures and may then be reused. A negative errno code
 995 *	is returned on a failure.
 996 */
 997
 998int unregister_netdevice_notifier(struct notifier_block *nb)
 999{
1000	return notifier_chain_unregister(&netdev_chain, nb);
1001}
1002
1003/**
1004 *	call_netdevice_notifiers - call all network notifier blocks
1005 *      @val: value passed unmodified to notifier function
1006 *      @v:   pointer passed unmodified to notifier function
1007 *
1008 *	Call all network notifier blocks.  Parameters and return value
1009 *	are as for notifier_call_chain().
1010 */
1011
1012int call_netdevice_notifiers(unsigned long val, void *v)
1013{
1014	return notifier_call_chain(&netdev_chain, val, v);
1015}
1016
1017/* When > 0 there are consumers of rx skb time stamps */
1018static atomic_t netstamp_needed = ATOMIC_INIT(0);
1019
1020void net_enable_timestamp(void)
1021{
1022	atomic_inc(&netstamp_needed);
1023}
1024
1025void net_disable_timestamp(void)
1026{
1027	atomic_dec(&netstamp_needed);
1028}
1029
1030static inline void net_timestamp(struct timeval *stamp)
1031{
1032	if (atomic_read(&netstamp_needed))
1033		do_gettimeofday(stamp);
1034	else {
1035		stamp->tv_sec = 0;
1036		stamp->tv_usec = 0;
1037	}
1038}
1039
1040/*
1041 *	Support routine. Sends outgoing frames to any network
1042 *	taps currently in use.
1043 */
1044
1045void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1046{
1047	struct packet_type *ptype;
1048	net_timestamp(&skb->stamp);
1049
1050	rcu_read_lock();
1051	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1052		/* Never send packets back to the socket
1053		 * they originated from - MvS (miquels@drinkel.ow.org)
1054		 */
1055		if ((ptype->dev == dev || !ptype->dev) &&
1056		    (ptype->af_packet_priv == NULL ||
1057		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1058			struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1059			if (!skb2)
1060				break;
1061
1062			/* skb->nh should be correctly
1063			   set by sender, so that the second statement is
1064			   just protection against buggy protocols.
1065			 */
1066			skb2->mac.raw = skb2->data;
1067
1068			if (skb2->nh.raw < skb2->data ||
1069			    skb2->nh.raw > skb2->tail) {
1070				if (net_ratelimit())
1071					printk(KERN_CRIT "protocol %04x is "
1072					       "buggy, dev %s\n",
1073					       skb2->protocol, dev->name);
1074				skb2->nh.raw = skb2->data;
1075			}
1076
1077			skb2->h.raw = skb2->nh.raw;
1078			skb2->pkt_type = PACKET_OUTGOING;
1079			ptype->func(skb2, skb->dev, ptype);
1080		}
1081	}
1082	rcu_read_unlock();
1083}
1084
1085/*
1086 * Invalidate hardware checksum when packet is to be mangled, and
1087 * complete checksum manually on outgoing path.
1088 */
1089int skb_checksum_help(struct sk_buff *skb, int inward)
1090{
1091	unsigned int csum;
1092	int ret = 0, offset = skb->h.raw - skb->data;
1093
1094	if (inward) {
1095		skb->ip_summed = CHECKSUM_NONE;
1096		goto out;
1097	}
1098
1099	if (skb_cloned(skb)) {
1100		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1101		if (ret)
1102			goto out;
1103	}
1104
1105	if (offset > (int)skb->len)
1106		BUG();
1107	csum = skb_checksum(skb, offset, skb->len-offset, 0);
1108
1109	offset = skb->tail - skb->h.raw;
1110	if (offset <= 0)
1111		BUG();
1112	if (skb->csum + 2 > offset)
1113		BUG();
1114
1115	*(u16*)(skb->h.raw + skb->csum) = csum_fold(csum);
1116	skb->ip_summed = CHECKSUM_NONE;
1117out:	
1118	return ret;
1119}
1120
#ifdef CONFIG_HIGHMEM
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */

/*
 * Returns 1 when @dev lacks NETIF_F_HIGHDMA and at least one page
 * fragment of @skb is a highmem page, 0 otherwise.
 */
static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
	int i;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			if (PageHighMem(skb_shinfo(skb)->frags[i].page))
				return 1;
		}
	}

	return 0;
}
#else
#define illegal_highdma(dev, skb)	(0)
#endif
1143
1144extern void skb_release_data(struct sk_buff *);
1145
/*
 * Keep head the same: replace data.
 *
 * Copies the whole packet (headroom, linear part and the skb->data_len
 * bytes held outside the linear area) into one freshly allocated buffer
 * and repoints @skb at it.  Returns 0 on success or -ENOMEM when the
 * new buffer cannot be allocated.  BUG()s on a shared skb.
 */
int __skb_linearize(struct sk_buff *skb, int gfp_mask)
{
	unsigned int size;
	u8 *data;
	long offset;
	struct skb_shared_info *ninfo;
	int headerlen = skb->data - skb->head;
	/* Bytes by which the non-linear data overflows the current tailroom. */
	int expand = (skb->tail + skb->data_len) - skb->end;

	if (skb_shared(skb))
		BUG();

	if (expand <= 0)
		expand = 0;

	/* New buffer: old head..end span plus the overflow, with shinfo behind it. */
	size = skb->end - skb->head + expand;
	size = SKB_DATA_ALIGN(size);
	data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
	if (!data)
		return -ENOMEM;

	/* Copy entire thing; the negative offset starts the copy at skb->head. */
	if (skb_copy_bits(skb, -headerlen, data, headerlen + skb->len))
		BUG();

	/* Set up shinfo: carry over the TSO fields, no frags or frag_list. */
	ninfo = (struct skb_shared_info*)(data + size);
	atomic_set(&ninfo->dataref, 1);
	ninfo->tso_size = skb_shinfo(skb)->tso_size;
	ninfo->tso_segs = skb_shinfo(skb)->tso_segs;
	ninfo->nr_frags = 0;
	ninfo->frag_list = NULL;

	/* Offset between the two in bytes */
	offset = data - skb->head;

	/* Free old data. Must happen after the copy and the shinfo reads above. */
	skb_release_data(skb);

	skb->head = data;
	skb->end  = data + size;

	/* Set up new pointers: shift each by the distance between the buffers. */
	skb->h.raw   += offset;
	skb->nh.raw  += offset;
	skb->mac.raw += offset;
	skb->tail    += offset;
	skb->data    += offset;

	/* We are no longer a clone, even if we were. */
	skb->cloned    = 0;

	/* Everything is linear now: extend tail over the former non-linear bytes. */
	skb->tail     += skb->data_len;
	skb->data_len  = 0;
	return 0;
}
1203
/*
 * Take the device transmit lock and record @cpu as its owner, unless
 * the device has NETIF_F_LLTX set, in which case nothing is done.
 *
 * Wrapped in do { } while (0) so the macro behaves as one statement
 * (safe in an unbraced if/else); arguments are parenthesised so any
 * pointer expression may be passed.
 */
#define HARD_TX_LOCK(dev, cpu) do {				\
	if (((dev)->features & NETIF_F_LLTX) == 0) {		\
		spin_lock(&(dev)->xmit_lock);			\
		(dev)->xmit_lock_owner = (cpu);			\
	}							\
} while (0)

/*
 * Counterpart of HARD_TX_LOCK(): reset the owner to -1 and release the
 * transmit lock, again skipping devices with NETIF_F_LLTX set.
 */
#define HARD_TX_UNLOCK(dev) do {				\
	if (((dev)->features & NETIF_F_LLTX) == 0) {		\
		(dev)->xmit_lock_owner = -1;			\
		spin_unlock(&(dev)->xmit_lock);			\
	}							\
} while (0)
1217
1218/**
1219 *	dev_queue_xmit - transmit a buffer
1220 *	@skb: buffer to transmit
1221 *
1222 *	Queue a buffer for transmission to a network device. The caller must
1223 *	have set the device and priority and built the buffer before calling
1224 *	this function. The function can be called from an interrupt.
1225 *
1226 *	A negative errno code is returned on a failure. A success does not
1227 *	guarantee the frame will be transmitted as it may be dropped due
1228 *	to congestion or traffic shaping.
1229 *
1230 * -----------------------------------------------------------------------------------
1231 *      I notice this method can also return errors from the queue disciplines,
1232 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
1233 *      be positive.
1234 *
1235 *      Regardless of the return value, the skb is consumed, so it is currently
1236 *      difficult to retry a send to this method.  (You can bump the ref count
1237 *      before sending to hold a reference for retry if you are careful.)
1238 *
1239 *      When calling this method, interrupts MUST be enabled.  This is because
1240 *      the BH enable code must have IRQs enabled so that it will not deadlock.
1241 *          --BLG
1242 */
1243
int dev_queue_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct Qdisc *q;
	/* Returned if any of the preparation steps below fails. */
	int rc = -ENOMEM;

	/* A frag_list the device cannot handle has to be linearized. */
	if (skb_shinfo(skb)->frag_list &&
	    !(dev->features & NETIF_F_FRAGLIST) &&
	    __skb_linearize(skb, GFP_ATOMIC))
		goto out_kfree_skb;

	/* Fragmented skb is linearized if device does not support SG,
	 * or if at least one of fragments is in highmem and device
	 * does not support DMA from it.
	 */
	if (skb_shinfo(skb)->nr_frags &&
	    (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
	    __skb_linearize(skb, GFP_ATOMIC))
		goto out_kfree_skb;

	/* If packet is not checksummed and device does not support
	 * checksumming for this protocol, complete checksumming here.
	 */
	if (skb->ip_summed == CHECKSUM_HW &&
	    (!(dev->features & (NETIF_F_HW_CSUM | NETIF_F_NO_CSUM)) &&
	     (!(dev->features & NETIF_F_IP_CSUM) ||
	      skb->protocol != htons(ETH_P_IP))))
	      	if (skb_checksum_help(skb, 0))
	      		goto out_kfree_skb;

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	local_bh_disable();

	/* Updates of qdisc are serialized by queue_lock.
	 * The struct Qdisc which is pointed to by qdisc is now a
	 * rcu structure - it may be accessed without acquiring
	 * a lock (but the structure may be stale.) The freeing of the
	 * qdisc will be deferred until it's known that there are no
	 * more references to it.
	 *
	 * If the qdisc has an enqueue function, we still need to
	 * hold the queue_lock before calling it, since queue_lock
	 * also serializes access to the device queue.
	 */

	q = rcu_dereference(dev->qdisc);
#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
#endif
	if (q->enqueue) {
		/* Grab device queue */
		spin_lock(&dev->queue_lock);

		rc = q->enqueue(skb, q);

		qdisc_run(dev);

		spin_unlock(&dev->queue_lock);
		/* NET_XMIT_BYPASS is reported to the caller as success. */
		rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
		/* The skb now belongs to the qdisc: leave without freeing it. */
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all the sorts of tunnels...

	   Really, it is unlikely that xmit_lock protection is necessary here.
	   (f.e. loopback and IP tunnels are clean ignoring statistics
	   counters.)
	   However, it is possible, that they rely on protection
	   made by us here.

	   Check this and take the lock. It is not prone to deadlocks.
	   Alternatively, use the noqueue qdisc, it is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		/* Owner == this CPU means we re-entered while holding xmit_lock. */
		if (dev->xmit_lock_owner != cpu) {

			HARD_TX_LOCK(dev, cpu);

			if (!netif_queue_stopped(dev)) {
				/* Give taps a copy before the driver gets the skb. */
				if (netdev_nit)
					dev_queue_xmit_nit(skb, dev);

				rc = 0;
				/* Zero from hard_start_xmit: finished, return rc == 0. */
				if (!dev->hard_start_xmit(skb, dev)) {
					HARD_TX_UNLOCK(dev);
					goto out;
				}
			}
			/* Queue stopped or the driver returned non-zero. */
			HARD_TX_UNLOCK(dev);
			if (net_ratelimit())
				printk(KERN_CRIT "Virtual device %s asks to "
				       "queue packet!\n", dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately */
			if (net_ratelimit())
				printk(KERN_CRIT "Dead loop on virtual device "
				       "%s, fix it urgently!\n", dev->name);
		}
	}

	/* Every path that falls through to here drops the packet. */
	rc = -ENETDOWN;
	local_bh_enable();

out_kfree_skb:
	/* Reached with BHs enabled; rc is -ENOMEM (early failure) or -ENETDOWN. */
	kfree_skb(skb);
	return rc;
out:
	local_bh_enable();
	return rc;
}
1360
1361
1362/*=======================================================================
1363			Receiver routines
1364  =======================================================================*/
1365
/* Upper bound on the per-CPU input queue length checked in netif_rx();
 * also the starting budget of net_rx_action().
 */
int netdev_max_backlog = 300;
int weight_p = 64;            /* old backlog weight */
/* These numbers are selected based on intuition and some
 * experimentation, if you have more scientific way of doing this
 * please go ahead and fix things.
 *
 * no_cong, lo_cong and mod_cong are the average-backlog thresholds
 * that get_sample_stats() compares against when picking a congestion
 * level.  NOTE(review): no_cong_thresh is not referenced by the code
 * visible in this part of the file.
 */
int no_cong_thresh = 10;
int no_cong = 20;
int lo_cong = 100;
int mod_cong = 290;

/* Per-CPU receive counters, shown by softnet_seq_show(). */
DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1378
1379
1380static void get_sample_stats(int cpu)
1381{
1382#ifdef RAND_LIE
1383	unsigned long rd;
1384	int rq;
1385#endif
1386	struct softnet_data *sd = &per_cpu(softnet_data, cpu);
1387	int blog = sd->input_pkt_queue.qlen;
1388	int avg_blog = sd->avg_blog;
1389
1390	avg_blog = (avg_blog >> 1) + (blog >> 1);
1391
1392	if (avg_blog > mod_cong) {
1393		/* Above moderate congestion levels. */
1394		sd->cng_level = NET_RX_CN_HIGH;
1395#ifdef RAND_LIE
1396		rd = net_random();
1397		rq = rd % netdev_max_backlog;
1398		if (rq < avg_blog) /* unlucky bastard */
1399			sd->cng_level = NET_RX_DROP;
1400#endif
1401	} else if (avg_blog > lo_cong) {
1402		sd->cng_level = NET_RX_CN_MOD;
1403#ifdef RAND_LIE
1404		rd = net_random();
1405		rq = rd % netdev_max_backlog;
1406			if (rq < avg_blog) /* unlucky bastard */
1407				sd->cng_level = NET_RX_CN_HIGH;
1408#endif
1409	} else if (avg_blog > no_cong)
1410		sd->cng_level = NET_RX_CN_LOW;
1411	else  /* no congestion */
1412		sd->cng_level = NET_RX_SUCCESS;
1413
1414	sd->avg_blog = avg_blog;
1415}
1416
#ifdef OFFLINE_SAMPLE
/*
 * Timer callback (only built with OFFLINE_SAMPLE): sample the backlog
 * statistics of the current CPU and re-arm samp_timer one tick ahead.
 * @dummy is unused.
 */
static void sample_queue(unsigned long dummy)
{
	/* 10 ms or 1 ms -- i don't care -- JHS */
	get_sample_stats(smp_processor_id());

	/*
	 * Compute the expiry directly in jiffies arithmetic; storing it in
	 * an int first would truncate it where unsigned long is wider than int.
	 */
	mod_timer(&samp_timer, jiffies + 1);
}
#endif
1429
1430
1431/**
1432 *	netif_rx	-	post buffer to the network code
1433 *	@skb: buffer to post
1434 *
1435 *	This function receives a packet from a device driver and queues it for
1436 *	the upper (protocol) levels to process.  It always succeeds. The buffer
1437 *	may be dropped during processing for congestion control or by the
1438 *	protocol layers.
1439 *
1440 *	return values:
1441 *	NET_RX_SUCCESS	(no congestion)
1442 *	NET_RX_CN_LOW   (low congestion)
1443 *	NET_RX_CN_MOD   (moderate congestion)
1444 *	NET_RX_CN_HIGH  (high congestion)
1445 *	NET_RX_DROP     (packet was dropped)
1446 *
1447 */
1448
int netif_rx(struct sk_buff *skb)
{
	int this_cpu;
	struct softnet_data *queue;
	unsigned long flags;

	/* if netpoll wants it, pretend we never saw it */
	if (netpoll_rx(skb))
		return NET_RX_DROP;

	/* Stamp the packet unless it already carries a time stamp. */
	if (!skb->stamp.tv_sec)
		net_timestamp(&skb->stamp);

	/*
	 * The code is rearranged so that the path is the most
	 * short when CPU is congested, but is still operating.
	 */
	local_irq_save(flags);
	this_cpu = smp_processor_id();
	queue = &__get_cpu_var(softnet_data);

	__get_cpu_var(netdev_rx_stat).total++;
	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
		if (queue->input_pkt_queue.qlen) {
			/* Non-empty queue in throttled state: drop. */
			if (queue->throttle)
				goto drop;

enqueue:
			/* Hold the device while the skb is queued;
			 * process_backlog() drops the reference again.
			 */
			dev_hold(skb->dev);
			__skb_queue_tail(&queue->input_pkt_queue, skb);
#ifndef OFFLINE_SAMPLE
			get_sample_stats(this_cpu);
#endif
			local_irq_restore(flags);
			return queue->cng_level;
		}

		/* Queue is empty: clear any throttle, schedule the backlog
		 * device for polling, then queue the skb via the label above.
		 */
		if (queue->throttle)
			queue->throttle = 0;

		netif_rx_schedule(&queue->backlog_dev);
		goto enqueue;
	}

	/* Queue is over netdev_max_backlog: enter throttled state once. */
	if (!queue->throttle) {
		queue->throttle = 1;
		__get_cpu_var(netdev_rx_stat).throttled++;
	}

drop:
	__get_cpu_var(netdev_rx_stat).dropped++;
	local_irq_restore(flags);

	kfree_skb(skb);
	return NET_RX_DROP;
}
1505
/*
 * Wrapper around netif_rx() that, with preemption disabled, also runs
 * any softirq left pending after the packet was queued.  Returns
 * whatever netif_rx() returned.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	preempt_disable();

	ret = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
1518
1519EXPORT_SYMBOL(netif_rx_ni);
1520
1521static __inline__ void skb_bond(struct sk_buff *skb)
1522{
1523	struct net_device *dev = skb->dev;
1524
1525	if (dev->master) {
1526		skb->real_dev = skb->dev;
1527		skb->dev = dev->master;
1528	}
1529}
1530
1531static void net_tx_action(struct softirq_action *h)
1532{
1533	struct softnet_data *sd = &__get_cpu_var(softnet_data);
1534
1535	if (sd->completion_queue) {
1536		struct sk_buff *clist;
1537
1538		local_irq_disable();
1539		clist = sd->completion_queue;
1540		sd->completion_queue = NULL;
1541		local_irq_enable();
1542
1543		while (clist) {
1544			struct sk_buff *skb = clist;
1545			clist = clist->next;
1546
1547			BUG_TRAP(!atomic_read(&skb->users));
1548			__kfree_skb(skb);
1549		}
1550	}
1551
1552	if (sd->output_queue) {
1553		struct net_device *head;
1554
1555		local_irq_disable();
1556		head = sd->output_queue;
1557		sd->output_queue = NULL;
1558		local_irq_enable();
1559
1560		while (head) {
1561			struct net_device *dev = head;
1562			head = head->next_sched;
1563
1564			smp_mb__before_clear_bit();
1565			clear_bit(__LINK_STATE_SCHED, &dev->state);
1566
1567			if (spin_trylock(&dev->queue_lock)) {
1568				qdisc_run(dev);
1569				spin_unlock(&dev->queue_lock);
1570			} else {
1571				netif_schedule(dev);
1572			}
1573		}
1574	}
1575}
1576
1577static __inline__ int deliver_skb(struct sk_buff *skb,
1578				  struct packet_type *pt_prev)
1579{
1580	atomic_inc(&skb->users);
1581	return pt_prev->func(skb, skb->dev, pt_prev);
1582}
1583
#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
/* Function pointers through which bridge code is reached from here. */
int (*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff **pskb);
struct net_bridge;
struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
						unsigned char *addr);
void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent);

/*
 * Pass the packet to the bridge hook when its device is a bridge port.
 * Returns 0 for loopback packets and for devices without a bridge port;
 * otherwise first delivers to a pending *pt_prev handler (result stored
 * in *ret, *pt_prev cleared) and returns the hook's result.
 */
static __inline__ int handle_bridge(struct sk_buff **pskb,
				    struct packet_type **pt_prev, int *ret)
{
	struct net_bridge_port *port;

	if ((*pskb)->pkt_type == PACKET_LOOPBACK)
		return 0;

	port = rcu_dereference((*pskb)->dev->br_port);
	if (port == NULL)
		return 0;

	if (*pt_prev) {
		*ret = deliver_skb(*pskb, *pt_prev);
		*pt_prev = NULL;
	}

	return br_handle_frame_hook(port, pskb);
}
#else
#define handle_bridge(skb, pt_prev, ret)	(0)
#endif
1610
#ifdef CONFIG_NET_CLS_ACT
/* TODO: Maybe we should just force sch_ingress to be compiled in
 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
 * a compare and 2 stores extra right now if we dont have it on
 * but have CONFIG_NET_CLS_ACT
 * NOTE: This doesnt stop any functionality; if you dont have
 * the ingress scheduler, you just cant add policies on ingress.
 *
 */

/*
 * Run @skb through the ingress qdisc of its device, if one is attached.
 * Returns the qdisc's verdict, TC_ACT_OK when there is no ingress
 * qdisc, or TC_ACT_SHOT when the redirect counter kept in tc_verd
 * exceeds MAX_RED_LOOP.
 *
 * The diagnostics are emitted per packet, so they are guarded with
 * net_ratelimit() like the other per-packet messages in this file.
 */
static int ing_filter(struct sk_buff *skb)
{
	struct Qdisc *q;
	struct net_device *dev = skb->dev;
	int result = TC_ACT_OK;

	if (dev->qdisc_ingress) {
		__u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd);
		if (MAX_RED_LOOP < ttl++) {
			if (net_ratelimit())
				printk("Redir loop detected Dropping packet (%s->%s)\n",
					skb->input_dev?skb->input_dev->name:"??",skb->dev->name);
			return TC_ACT_SHOT;
		}

		/* Store the incremented counter back and mark the skb as at ingress. */
		skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl);

		skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS);
		if (NULL == skb->input_dev) {
			skb->input_dev = skb->dev;
			if (net_ratelimit())
				printk("ing_filter:  fixed  %s out %s\n",skb->input_dev->name,skb->dev->name);
		}
		/* Re-read qdisc_ingress under the lock before using it. */
		spin_lock(&dev->ingress_lock);
		if ((q = dev->qdisc_ingress) != NULL)
			result = q->enqueue(skb, q);
		spin_unlock(&dev->ingress_lock);

	}

	return result;
}
#endif
1651
/*
 * Deliver one received packet: first to every matching tap on
 * ptype_all, then (after the optional ingress filter, the diverter and
 * the bridge hook) to the protocol handlers hashed in ptype_base.
 * The last matching handler receives the skb without an extra
 * reference; when nobody matches the skb is freed.
 */
int netif_receive_skb(struct sk_buff *skb)
{
	struct packet_type *ptype, *pt_prev;
	int ret = NET_RX_DROP;
	unsigned short type;

	/* if we've gotten here through NAPI, check netpoll */
	if (skb->dev->poll && netpoll_rx(skb))
		return NET_RX_DROP;

	if (!skb->stamp.tv_sec)
		net_timestamp(&skb->stamp);

	/* From here on skb->dev is the master device, if there is one. */
	skb_bond(skb);

	__get_cpu_var(netdev_rx_stat).total++;

	skb->h.raw = skb->nh.raw = skb->data;
	skb->mac_len = skb->nh.raw - skb->mac.raw;

	/* pt_prev lags one handler behind, so the last match can be
	 * called without taking an extra reference on the skb.
	 */
	pt_prev = NULL;

	rcu_read_lock();

#ifdef CONFIG_NET_CLS_ACT
	/* TC_NCLS set: clear it and skip the taps and the ingress filter. */
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (!ptype->dev || ptype->dev == skb->dev) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev);
			pt_prev = ptype;
		}
	}

#ifdef CONFIG_NET_CLS_ACT
	if (pt_prev) {
		ret = deliver_skb(skb, pt_prev);
		pt_prev = NULL; /* noone else should process this after*/
	} else {
		skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
	}

	/* ret now holds a TC_ACT_* verdict rather than a NET_RX_* code. */
	ret = ing_filter(skb);

	/* NOTE(review): on SHOT/STOLEN the TC_ACT_* value is what gets
	 * returned to the caller - confirm that is intended.
	 */
	if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) {
		kfree_skb(skb);
		goto out;
	}

	skb->tc_verd = 0;
ncls:
#endif

	handle_diverter(skb);

	/* Non-zero from handle_bridge(): nothing more to do here. */
	if (handle_bridge(&skb, &pt_prev, &ret))
		goto out;

	type = skb->protocol;
	/* Handlers are hashed on the low four bits of the protocol number. */
	list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) {
		if (ptype->type == type &&
		    (!ptype->dev || ptype->dev == skb->dev)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev);
			pt_prev = ptype;
		}
	}

	if (pt_prev) {
		/* Final handler: called directly, no extra reference taken. */
		ret = pt_prev->func(skb, skb->dev, pt_prev);
	} else {
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

out:
	rcu_read_unlock();
	return ret;
}
1739
/*
 * Poll routine: feed packets from this CPU's input_pkt_queue to
 * netif_receive_skb().  Stops after min(backlog_dev->quota, *budget)
 * packets or once more than one jiffy has passed.  Both
 * backlog_dev->quota and *budget are reduced by the number of packets
 * handled.  Returns -1 when it stopped early, or 0 once the queue is
 * empty and the device has been taken off the poll list.
 */
static int process_backlog(struct net_device *backlog_dev, int *budget)
{
	int work = 0;
	int quota = min(backlog_dev->quota, *budget);
	struct softnet_data *queue = &__get_cpu_var(softnet_data);
	unsigned long start_time = jiffies;

	for (;;) {
		struct sk_buff *skb;
		struct net_device *dev;

		/* Dequeue with interrupts off; they stay off on the
		 * job_done path until the end of the function.
		 */
		local_irq_disable();
		skb = __skb_dequeue(&queue->input_pkt_queue);
		if (!skb)
			goto job_done;
		local_irq_enable();

		/* Remember the device: the skb is consumed below. */
		dev = skb->dev;

		netif_receive_skb(skb);

		/* Drop the reference taken by netif_rx() when it queued the skb. */
		dev_put(dev);

		work++;

		if (work >= quota || jiffies - start_time > 1)
			break;

	}

	backlog_dev->quota -= work;
	*budget -= work;
	return -1;

job_done:
	backlog_dev->quota -= work;
	*budget -= work;

	/* Still running with interrupts disabled here. */
	list_del(&backlog_dev->poll_list);
	smp_mb__before_clear_bit();
	netif_poll_enable(backlog_dev);

	if (queue->throttle)
		queue->throttle = 0;
	local_irq_enable();
	return 0;
}
1787
/*
 * Receive softirq handler: call the poll routine of each device on
 * this CPU's poll list until the list is empty, the budget (initially
 * netdev_max_backlog) is used up, or more than one jiffy has passed.
 * In the latter two cases the softirq is raised again and the
 * time_squeeze counter is bumped.  @h is unused.
 */
static void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *queue = &__get_cpu_var(softnet_data);
	unsigned long start_time = jiffies;
	int budget = netdev_max_backlog;


	/* The poll list is only examined and modified with interrupts off. */
	local_irq_disable();

	while (!list_empty(&queue->poll_list)) {
		struct net_device *dev;

		if (budget <= 0 || jiffies - start_time > 1)
			goto softnet_break;

		/* Interrupts are enabled while the device is being polled. */
		local_irq_enable();

		dev = list_entry(queue->poll_list.next,
				 struct net_device, poll_list);
		netpoll_poll_lock(dev);

		if (dev->quota <= 0 || dev->poll(dev, &budget)) {
			/* Out of quota or poll returned non-zero: move the
			 * device to the tail of the list and top up its quota.
			 */
			netpoll_poll_unlock(dev);
			local_irq_disable();
			list_del(&dev->poll_list);
			list_add_tail(&dev->poll_list, &queue->poll_list);
			if (dev->quota < 0)
				dev->quota += dev->weight;
			else
				dev->quota = dev->weight;
		} else {
			/* poll() returned zero: drop our reference on the device. */
			netpoll_poll_unlock(dev);
			dev_put(dev);
			local_irq_disable();
		}
	}
out:
	local_irq_enable();
	return;

softnet_break:
	/* Reached with interrupts disabled, as __raise_softirq_irqoff() is used. */
	__get_cpu_var(netdev_rx_stat).time_squeeze++;
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	goto out;
}
1833
1834static gifconf_func_t * gifconf_list [NPROTO];
1835
1836/**
1837 *	register_gifconf	-	register a SIOCGIF handler
1838 *	@family: Address family
1839 *	@gifconf: Function handler
1840 *
1841 *	Register protocol dependent address dumping routines. The handler
1842 *	that is passed must not be freed or reused until it has been replaced
1843 *	by another handler.
1844 */
1845int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
1846{
1847	if (family >= NPROTO)
1848		return -EINVAL;
1849	gifconf_list[family] = gifconf;
1850	return 0;
1851}
1852
1853
1854/*
1855 *	Map an interface index to its name (SIOCGIFNAME)
1856 */
1857
1858/*
1859 *	We need this ioctl for efficient implementation of the
1860 *	if_indextoname() function required by the IPv6 API.  Without
1861 *	it, we would have to search all the interfaces to find a
1862 *	match.  --pb
1863 */
1864
1865static int dev_ifname(struct ifreq __user *arg)
1866{
1867	struct net_device *dev;
1868	struct ifreq ifr;
1869
1870	/*
1871	 *	Fetch the caller's info block.
1872	 */
1873
1874	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
1875		return -EFAULT;
1876
1877	read_lock(&dev_base_lock);
1878	dev = __dev_get_by_index(ifr.ifr_ifindex);
1879	if (!dev) {
1880		read_unlock(&dev_base_lock);
1881		return -ENODEV;
1882	}
1883
1884	strcpy(ifr.ifr_name, dev->name);
1885	read_unlock(&dev_base_lock);
1886
1887	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
1888		return -EFAULT;
1889	return 0;
1890}
1891
1892/*
1893 *	Perform a SIOCGIFCONF call. This structure will change
1894 *	size eventually, and there is nothing I can do about it.
1895 *	Thus we will need a 'compatibility mode'.
1896 */
1897
1898static int dev_ifconf(char __user *arg)
1899{
1900	struct ifconf ifc;
1901	struct net_device *dev;
1902	char __user *pos;
1903	int len;
1904	int total;
1905	int i;
1906
1907	/*
1908	 *	Fetch the caller's info block.
1909	 */
1910
1911	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
1912		return -EFAULT;
1913
1914	pos = ifc.ifc_buf;
1915	len = ifc.ifc_len;
1916
1917	/*
1918	 *	Loop over the interfaces, and write an info block for each.
1919	 */
1920
1921	total = 0;
1922	for (dev = dev_base; dev; dev = dev->next) {
1923		for (i = 0; i < NPROTO; i++) {
1924			if (gifconf_list[i]) {
1925				int done;
1926				if (!pos)
1927					done = gifconf_list[i](dev, NULL, 0);
1928				else
1929					done = gifconf_list[i](dev, pos + total,
1930							       len - total);
1931				if (done < 0)
1932					return -EFAULT;
1933				total += done;
1934			}
1935		}
1936  	}
1937
1938	/*
1939	 *	All done.  Write the updated control block back to the caller.
1940	 */
1941	ifc.ifc_len = total;
1942
1943	/*
1944	 * 	Both BSD and Solaris return 0 here, so we do too.
1945	 */
1946	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
1947}
1948
1949#ifdef CONFIG_PROC_FS
1950/*
1951 *	This is invoked by the /proc filesystem handler to display a device
1952 *	in detail.
1953 */
1954static __inline__ struct net_device *dev_get_idx(loff_t pos)
1955{
1956	struct net_device *dev;
1957	loff_t i;
1958
1959	for (i = 0, dev = dev_base; dev && i < pos; ++i, dev = dev->next);
1960
1961	return i == pos ? dev : NULL;
1962}
1963
1964void *dev_seq_start(struct seq_file *seq, loff_t *pos)
1965{
1966	read_lock(&dev_base_lock);
1967	return *pos ? dev_get_idx(*pos - 1) : SEQ_START_TOKEN;
1968}
1969
1970void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1971{
1972	++*pos;
1973	return v == SEQ_START_TOKEN ? dev_base : ((struct net_device *)v)->next;
1974}
1975
1976void dev_seq_stop(struct seq_file *seq, void *v)
1977{
1978	read_unlock(&dev_base_lock);
1979}
1980
1981static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
1982{
1983	if (dev->get_stats) {
1984		struct net_device_stats *stats = dev->get_stats(dev);
1985
1986		seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
1987				"%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
1988			   dev->name, stats->rx_bytes, stats->rx_packets,
1989			   stats->rx_errors,
1990			   stats->rx_dropped + stats->rx_missed_errors,
1991			   stats->rx_fifo_errors,
1992			   stats->rx_length_errors + stats->rx_over_errors +
1993			     stats->rx_crc_errors + stats->rx_frame_errors,
1994			   stats->rx_compressed, stats->multicast,
1995			   stats->tx_bytes, stats->tx_packets,
1996			   stats->tx_errors, stats->tx_dropped,
1997			   stats->tx_fifo_errors, stats->collisions,
1998			   stats->tx_carrier_errors +
1999			     stats->tx_aborted_errors +
2000			     stats->tx_window_errors +
2001			     stats->tx_heartbeat_errors,
2002			   stats->tx_compressed);
2003	} else
2004		seq_printf(seq, "%6s: No statistics available.\n", dev->name);
2005}
2006
2007/*
2008 *	Called from the PROCfs module. This now uses the new arbitrary sized
2009 *	/proc/net interface to create /proc/net/dev
2010 */
2011static int dev_seq_show(struct seq_file *seq, void *v)
2012{
2013	if (v == SEQ_START_TOKEN)
2014		seq_puts(seq, "Inter-|   Receive                            "
2015			      "                    |  Transmit\n"
2016			      " face |bytes    packets errs drop fifo frame "
2017			      "compressed multicast|bytes    packets errs "
2018			      "drop fifo colls carrier compressed\n");
2019	else
2020		dev_seq_printf_stats(seq, v);
2021	return 0;
2022}
2023
2024static struct netif_rx_stats *softnet_get_online(loff_t *pos)
2025{
2026	struct netif_rx_stats *rc = NULL;
2027
2028	while (*pos < NR_CPUS)
2029	       	if (cpu_online(*pos)) {
2030			rc = &per_cpu(netdev_rx_stat, *pos);
2031			break;
2032		} else
2033			++*pos;
2034	return rc;
2035}
2036
2037static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
2038{
2039	return softnet_get_online(pos);
2040}
2041
2042static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2043{
2044	++*pos;
2045	return softnet_get_online(pos);
2046}
2047
/* seq_file stop: nothing to release, the iterator takes no locks. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
2051
2052static int softnet_seq_show(struct seq_file *seq, void *v)
2053{
2054	struct netif_rx_stats *s = v;
2055
2056	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
2057		   s->total, s->dropped, s->time_squeeze, s->throttled,
2058		   s->fastroute_hit, s->fastroute_success, s->fastroute_defer,
2059		   s->fastroute_deferred_out,
2060#if 0
2061		   s->fastroute_latency_reduction
2062#else
2063		   s->cpu_collision
2064#endif
2065		  );
2066	return 0;
2067}
2068
/* seq_file iterator callbacks used for the "dev" proc entry. */
static struct seq_operations dev_seq_ops = {
	.start = dev_seq_start,
	.next  = dev_seq_next,
	.stop  = dev_seq_stop,
	.show  = dev_seq_show,
};

/* open() handler: attach the dev_seq_ops iterator to @file. */
static int dev_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &dev_seq_ops);
}

/* File operations registered for the "dev" entry by dev_proc_init(). */
static struct file_operations dev_seq_fops = {
	.owner	 = THIS_MODULE,
	.open    = dev_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};
2088
/* seq_file iterator callbacks used for the "softnet_stat" proc entry. */
static struct seq_operations softnet_seq_ops = {
	.start = softnet_seq_start,
	.next  = softnet_seq_next,
	.stop  = softnet_seq_stop,
	.show  = softnet_seq_show,
};

/* open() handler: attach the softnet_seq_ops iterator to @file. */
static int softnet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &softnet_seq_ops);
}

/* File operations registered for "softnet_stat" by dev_proc_init(). */
static struct file_operations softnet_seq_fops = {
	.owner	 = THIS_MODULE,
	.open    = softnet_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};
2108
2109#ifdef WIRELESS_EXT
2110extern int wireless_proc_init(void);
2111#else
2112#define wireless_proc_init() 0
2113#endif
2114
2115static int __init dev_proc_init(void)
2116{
2117	int rc = -ENOMEM;
2118
2119	if (!proc_net_fops_create("dev", S_IRUGO, &dev_seq_fops))
2120		goto out;
2121	if (!proc_net_fops_create("softnet_stat", S_IRUGO, &softnet_seq_fops))
2122		goto out_dev;
2123	if (wireless_proc_init())
2124		goto out_softnet;
2125	rc = 0;
2126out:
2127	return rc;
2128out_softnet:
2129	proc_net_remove("softnet_stat");
2130out_dev:
2131	proc_net_remove("dev");
2132	goto out;
2133}
2134#else
2135#define dev_proc_init() 0
2136#endif	/* CONFIG_PROC_FS */
2137
2138
2139/**
2140 *	netdev_set_master	-	set up master/slave pair
2141 *	@slave: slave device
2142 *	@master: new master device
2143 *
2144 *	Changes the master device of the slave. Pass %NULL to break the
2145 *	bonding. The caller must hold the RTNL semaphore. On a failure
2146 *	a negative errno code is returned. On success the reference counts
2147 *	are adjusted, %RTM_NEWLINK is sent to the routing socket and the
2148 *	function returns zero.
2149 */
2150int netdev_set_master(struct net_device *slave, struct net_device *master)
2151{
2152	struct net_device *old = slave->master;
2153
2154	ASSERT_RTNL();
2155
2156	if (master) {
2157		if (old)
2158			return -EBUSY;
2159		dev_hold(master);
2160	}
2161
2162	slave->master = master;
2163	
2164	synchronize_net();
2165
2166	if (old)
2167		dev_put(old);
2168
2169	if (master)
2170		slave->flags |= IFF_SLAVE;
2171	else
2172		slave->flags &= ~IFF_SLAVE;
2173
2174	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
2175	return 0;
2176}
2177
2178/**
2179 *	dev_set_promiscuity	- update promiscuity count on a device
2180 *	@dev: device
2181 *	@inc: modifier
2182 *
 *	Add or remove promiscuity from a device. While the count in the device
2184 *	remains above zero the interface remains promiscuous. Once it hits zero
2185 *	the device reverts back to normal filtering operation. A negative inc
2186 *	value is used to drop promiscuity on the device.
2187 */
2188void dev_set_promiscuity(struct net_device *dev, int inc)
2189{
2190	unsigned short old_flags = dev->flags;
2191
2192	dev->flags |= IFF_PROMISC;
2193	if ((dev->promiscuity += inc) == 0)
2194		dev->flags &= ~IFF_PROMISC;
2195	if (dev->flags ^ old_flags) {
2196		dev_mc_upload(dev);
2197		printk(KERN_INFO "device %s %s promiscuous mode\n",
2198		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
2199		       					       "left");
2200	}
2201}
2202
2203/**
2204 *	dev_set_allmulti	- update allmulti count on a device
2205 *	@dev: device
2206 *	@inc: modifier
2207 *
2208 *	Add or remove reception of all multicast frames to a device. While the
2209 *	count in the device remains above zero the interface remains listening
2210 *	to all interfaces. Once it hits zero the device reverts back to normal
2211 *	filtering operation. A negative @inc value is used to drop the counter
2212 *	when releasing a resource needing all multicasts.
2213 */
2214
2215void dev_set_allmulti(struct net_device *dev, int inc)
2216{
2217	unsigned short old_flags = dev->flags;
2218
2219	dev->flags |= IFF_ALLMULTI;
2220	if ((dev->allmulti += inc) == 0)
2221		dev->flags &= ~IFF_ALLMULTI;
2222	if (dev->flags ^ old_flags)
2223		dev_mc_upload(dev);
2224}
2225
2226unsigned dev_get_flags(const struct net_device *dev)
2227{
2228	unsigned flags;
2229
2230	flags = (dev->flags & ~(IFF_PROMISC |
2231				IFF_ALLMULTI |
2232				IFF_RUNNING)) | 
2233		(dev->gflags & (IFF_PROMISC |
2234				IFF_ALLMULTI));
2235
2236	if (netif_running(dev) && netif_carrier_ok(dev))
2237		flags |= IFF_RUNNING;
2238
2239	return flags;
2240}
2241
2242int dev_change_flags(struct net_device *dev, unsigned flags)
2243{
2244	int ret;
2245	int old_flags = dev->flags;
2246
2247	/*
2248	 *	Set the flags on our device.
2249	 */
2250
2251	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
2252			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
2253			       IFF_AUTOMEDIA)) |
2254		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
2255				    IFF_ALLMULTI));
2256
2257	/*
2258	 *	Load in the correct multicast list now the flags have changed.
2259	 */
2260
2261	dev_mc_upload(dev);
2262
2263	/*
2264	 *	Have we downed the interface. We handle IFF_UP ourselves
2265	 *	according to user attempts to set it, rather than blindly
2266	 *	setting it.
2267	 */
2268
2269	ret = 0;
2270	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
2271		ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
2272
2273		if (!ret)
2274			dev_mc_upload(dev);
2275	}
2276
2277	if (dev->flags & IFF_UP &&
2278	    ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
2279					  IFF_VOLATILE)))
2280		notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
2281
2282	if ((flags ^ dev->gflags) & IFF_PROMISC) {
2283		int inc = (flags & IFF_PROMISC) ? +1 : -1;
2284		dev->gflags ^= IFF_PROMISC;
2285		dev_set_promiscuity(dev, inc);
2286	}
2287
2288	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
2289	   is important. Some (broken) drivers set IFF_PROMISC, when
2290	   IFF_ALLMULTI is requested not asking us and not reporting.
2291	 */
2292	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
2293		int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
2294		dev->gflags ^= IFF_ALLMULTI;
2295		dev_set_allmulti(dev, inc);
2296	}
2297
2298	if (old_flags ^ dev->flags)
2299		rtmsg_ifinfo(RTM_NEWLINK, dev, old_flags ^ dev->flags);
2300
2301	return ret;
2302}
2303
2304int dev_set_mtu(struct net_device *dev, int new_mtu)
2305{
2306	int err;
2307
2308	if (new_mtu == dev->mtu)
2309		return 0;
2310
2311	/*	MTU must be positive.	 */
2312	if (new_mtu < 0)
2313		return -EINVAL;
2314
2315	if (!netif_device_present(dev))
2316		return -ENODEV;
2317
2318	err = 0;
2319	if (dev->change_mtu)
2320		err = dev->change_mtu(dev, new_mtu);
2321	else
2322		dev->mtu = new_mtu;
2323	if (!err && dev->flags & IFF_UP)
2324		notifier_call_chain(&netdev_chain,
2325				    NETDEV_CHANGEMTU, dev);
2326	return err;
2327}
2328
2329int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
2330{
2331	int err;
2332
2333	if (!dev->set_mac_address)
2334		return -EOPNOTSUPP;
2335	if (sa->sa_family != dev->type)
2336		return -EINVAL;
2337	if (!netif_device_present(dev))
2338		return -ENODEV;
2339	err = dev->set_mac_address(dev, sa);
2340	if (!err)
2341		notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev);
2342	return err;
2343}
2344
/*
 *	Perform the SIOCxIFxxx calls.
 *
 *	Looks up the device named in ifr->ifr_name with __dev_get_by_name()
 *	and carries out the single request @cmd against it, taking arguments
 *	from and storing results into @ifr (a kernel copy; the caller does
 *	the user-space copies). dev_ioctl() calls this with either
 *	dev_base_lock read-held or the RTNL lock held.
 *
 *	Returns 0 on success or a negative errno: -ENODEV if the device is
 *	not found or not present, -EOPNOTSUPP if the device lacks the hook
 *	needed for the request, -EINVAL for bad arguments or a command that
 *	is not recognised here.
 */
static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
{
	int err;
	struct net_device *dev = __dev_get_by_name(ifr->ifr_name);

	if (!dev)
		return -ENODEV;

	switch (cmd) {
		case SIOCGIFFLAGS:	/* Get interface flags */
			ifr->ifr_flags = dev_get_flags(dev);
			return 0;

		case SIOCSIFFLAGS:	/* Set interface flags */
			return dev_change_flags(dev, ifr->ifr_flags);

		case SIOCGIFMETRIC:	/* Get the metric on the interface
					   (currently unused) */
			ifr->ifr_metric = 0;
			return 0;

		case SIOCSIFMETRIC:	/* Set the metric on the interface
					   (currently unused) */
			return -EOPNOTSUPP;

		case SIOCGIFMTU:	/* Get the MTU of a device */
			ifr->ifr_mtu = dev->mtu;
			return 0;

		case SIOCSIFMTU:	/* Set the MTU of a device */
			return dev_set_mtu(dev, ifr->ifr_mtu);

		case SIOCGIFHWADDR:	/* Get the hardware address */
			/* Copy at most sizeof(sa_data) bytes; zero-fill when
			 * the device has no hardware address at all. */
			if (!dev->addr_len)
				memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
			else
				memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
				       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
			ifr->ifr_hwaddr.sa_family = dev->type;
			return 0;

		case SIOCSIFHWADDR:	/* Set the hardware address */
			return dev_set_mac_address(dev, &ifr->ifr_hwaddr);

		case SIOCSIFHWBROADCAST:	/* Set the broadcast address */
			if (ifr->ifr_hwaddr.sa_family != dev->type)
				return -EINVAL;
			memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
			notifier_call_chain(&netdev_chain,
					    NETDEV_CHANGEADDR, dev);
			return 0;

		case SIOCGIFMAP:	/* Report the device's resource map */
			ifr->ifr_map.mem_start = dev->mem_start;
			ifr->ifr_map.mem_end   = dev->mem_end;
			ifr->ifr_map.base_addr = dev->base_addr;
			ifr->ifr_map.irq       = dev->irq;
			ifr->ifr_map.dma       = dev->dma;
			ifr->ifr_map.port      = dev->if_port;
			return 0;

		case SIOCSIFMAP:	/* Hand a new resource map to the driver */
			if (dev->set_config) {
				if (!netif_device_present(dev))
					return -ENODEV;
				return dev->set_config(dev, &ifr->ifr_map);
			}
			return -EOPNOTSUPP;

		case SIOCADDMULTI:	/* Add a multicast address */
			if (!dev->set_multicast_list ||
			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
				return -EINVAL;
			if (!netif_device_present(dev))
				return -ENODEV;
			return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
					  dev->addr_len, 1);

		case SIOCDELMULTI:	/* Remove a multicast address */
			if (!dev->set_multicast_list ||
			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
				return -EINVAL;
			if (!netif_device_present(dev))
				return -ENODEV;
			return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
					     dev->addr_len, 1);

		case SIOCGIFINDEX:	/* Get the interface index */
			ifr->ifr_ifindex = dev->ifindex;
			return 0;

		case SIOCGIFTXQLEN:	/* Get the transmit queue length */
			ifr->ifr_qlen = dev->tx_queue_len;
			return 0;

		case SIOCSIFTXQLEN:	/* Set the transmit queue length */
			if (ifr->ifr_qlen < 0)
				return -EINVAL;
			dev->tx_queue_len = ifr->ifr_qlen;
			return 0;

		case SIOCSIFNAME:	/* Rename the interface */
			/* Force termination of the caller-supplied name. */
			ifr->ifr_newname[IFNAMSIZ-1] = '\0';
			return dev_change_name(dev, ifr->ifr_newname);

		/*
		 *	Unknown or private ioctl
		 */

		default:
			/* Only this whitelist is forwarded to the driver's
			 * do_ioctl hook; anything else is -EINVAL. */
			if ((cmd >= SIOCDEVPRIVATE &&
			    cmd <= SIOCDEVPRIVATE + 15) ||
			    cmd == SIOCBONDENSLAVE ||
			    cmd == SIOCBONDRELEASE ||
			    cmd == SIOCBONDSETHWADDR ||
			    cmd == SIOCBONDSLAVEINFOQUERY ||
			    cmd == SIOCBONDINFOQUERY ||
			    cmd == SIOCBONDCHANGEACTIVE ||
			    cmd == SIOCGMIIPHY ||
			    cmd == SIOCGMIIREG ||
			    cmd == SIOCSMIIREG ||
			    cmd == SIOCBRADDIF ||
			    cmd == SIOCBRDELIF ||
			    cmd == SIOCWANDEV) {
				err = -EOPNOTSUPP;
				if (dev->do_ioctl) {
					if (netif_device_present(dev))
						err = dev->do_ioctl(dev, ifr,
								    cmd);
					else
						err = -ENODEV;
				}
			} else
				err = -EINVAL;

	}
	/* Only the default case falls out of the switch with err set. */
	return err;
}
2487
2488/*
2489 *	This function handles all "interface"-type I/O control requests. The actual
2490 *	'doing' part of this is dev_ifsioc above.
2491 */
2492
2493/**
2494 *	dev_ioctl	-	network device ioctl
2495 *	@cmd: command to issue
2496 *	@arg: pointer to a struct ifreq in user space
2497 *
2498 *	Issue ioctl functions to devices. This is normally called by the
2499 *	user space syscall interfaces but can sometimes be useful for
2500 *	other purposes. The return value is the return from the syscall if
2501 *	positive or a negative errno code on error.
2502 */
2503
int dev_ioctl(unsigned int cmd, void __user *arg)
{
	struct ifreq ifr;
	int ret;
	char *colon;

	/* One special case: SIOCGIFCONF takes ifconf argument
	   and requires shared lock, because it sleeps writing
	   to user space.
	 */

	if (cmd == SIOCGIFCONF) {
		rtnl_shlock();
		ret = dev_ifconf((char __user *) arg);
		rtnl_shunlock();
		return ret;
	}
	/* SIOCGIFNAME is handled entirely by dev_ifname(), which is given
	 * the user pointer directly. */
	if (cmd == SIOCGIFNAME)
		return dev_ifname((struct ifreq __user *)arg);

	/* Everything else works on a kernel copy of the ifreq. */
	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
		return -EFAULT;

	/* Never trust user space to have terminated the name. */
	ifr.ifr_name[IFNAMSIZ-1] = 0;

	/* Cut the name at the first ':' for the device lookup. The branches
	 * below that restore it do so before copying the ifreq back. */
	colon = strchr(ifr.ifr_name, ':');
	if (colon)
		*colon = 0;

	/*
	 *	See which interface the caller is talking about.
	 */

	switch (cmd) {
		/*
		 *	These ioctl calls:
		 *	- can be done by all.
		 *	- atomic and do not require locking.
		 *	- return a value
		 */
		case SIOCGIFFLAGS:
		case SIOCGIFMETRIC:
		case SIOCGIFMTU:
		case SIOCGIFHWADDR:
		case SIOCGIFSLAVE:
		case SIOCGIFMAP:
		case SIOCGIFINDEX:
		case SIOCGIFTXQLEN:
			dev_load(ifr.ifr_name);
			/* Read-side lock only: these requests just read
			 * device fields in dev_ifsioc(). */
			read_lock(&dev_base_lock);
			ret = dev_ifsioc(&ifr, cmd);
			read_unlock(&dev_base_lock);
			if (!ret) {
				if (colon)
					*colon = ':';
				if (copy_to_user(arg, &ifr,
						 sizeof(struct ifreq)))
					ret = -EFAULT;
			}
			return ret;

		/* ethtool requests run under the RTNL lock and copy the
		 * ifreq back on success; no capability check is made here. */
		case SIOCETHTOOL:
			dev_load(ifr.ifr_name);
			rtnl_lock();
			ret = dev_ethtool(&ifr);
			rtnl_unlock();
			if (!ret) {
				if (colon)
					*colon = ':';
				if (copy_to_user(arg, &ifr,
						 sizeof(struct ifreq)))
					ret = -EFAULT;
			}
			return ret;

		/*
		 *	These ioctl calls:
		 *	- require superuser power.
		 *	- require strict serialization.
		 *	- return a value
		 */
		case SIOCGMIIPHY:
		case SIOCGMIIREG:
		case SIOCSIFNAME:
			if (!capable(CAP_NET_ADMIN))
				return -EPERM;
			dev_load(ifr.ifr_name);
			rtnl_lock();
			ret = dev_ifsioc(&ifr, cmd);
			rtnl_unlock();
			if (!ret) {
				if (colon)
					*colon = ':';
				if (copy_to_user(arg, &ifr,
						 sizeof(struct ifreq)))
					ret = -EFAULT;
			}
			return ret;

		/*
		 *	These ioctl calls:
		 *	- require superuser power.
		 *	- require strict serialization.
		 *	- do not return a value
		 */
		case SIOCSIFFLAGS:
		case SIOCSIFMETRIC:
		case SIOCSIFMTU:
		case SIOCSIFMAP:
		case SIOCSIFHWADDR:
		case SIOCSIFSLAVE:
		case SIOCADDMULTI:
		case SIOCDELMULTI:
		case SIOCSIFHWBROADCAST:
		case SIOCSIFTXQLEN:
		case SIOCSMIIREG:
		case SIOCBONDENSLAVE:
		case SIOCBONDRELEASE:
		case SIOCBONDSETHWADDR:
		case SIOCBONDSLAVEINFOQUERY:
		case SIOCBONDINFOQUERY:
		case SIOCBONDCHANGEACTIVE:
		case SIOCBRADDIF:
		case SIOCBRDELIF:
			if (!capable(CAP_NET_ADMIN))
				return -EPERM;
			dev_load(ifr.ifr_name);
			rtnl_lock();
			ret = dev_ifsioc(&ifr, cmd);
			rtnl_unlock();
			/* Nothing is copied back to user space here. */
			return ret;

		case SIOCGIFMEM:
			/* Get the per device memory space. We can add this but
			 * currently do not support it */
		case SIOCSIFMEM:
			/* Set the per device memory buffer space.
			 * Not applicable in our case */
		case SIOCSIFLINK:
			return -EINVAL;

		/*
		 *	Unknown or private ioctl.
		 */
		default:
			if (cmd == SIOCWANDEV ||
			    (cmd >= SIOCDEVPRIVATE &&
			     cmd <= SIOCDEVPRIVATE + 15)) {
				dev_load(ifr.ifr_name);
				rtnl_lock();
				ret = dev_ifsioc(&ifr, cmd);
				rtnl_unlock();
				/* NOTE(review): unlike the branches above,
				 * the ':' is not restored before the copy
				 * back, and no capability check is made
				 * here - confirm both are intended. */
				if (!ret && copy_to_user(arg, &ifr,
							 sizeof(struct ifreq)))
					ret = -EFAULT;
				return ret;
			}
#ifdef WIRELESS_EXT
			/* Take care of Wireless Extensions */
			if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
				/* If command is `set a parameter', or
				 * `get the encoding parameters', check if
				 * the user has the right to do it */
				if (IW_IS_SET(cmd) || cmd == SIOCGIWENCODE) {
					if (!capable(CAP_NET_ADMIN))
						return -EPERM;
				}
				dev_load(ifr.ifr_name);
				rtnl_lock();
				/* Follow me in net/core/wireless.c */
				ret = wireless_process_ioctl(&ifr, cmd);
				rtnl_unlock();
				/* For GET commands the ifreq is copied back
				 * whatever ret is, so an error result still
				 * reaches user space with the updated ifreq. */
				if (IW_IS_GET(cmd) &&
				    copy_to_user(arg, &ifr,
					    	 sizeof(struct ifreq)))
					ret = -EFAULT;
				return ret;
			}
#endif	/* WIRELESS_EXT */
			return -EINVAL;
	}
}
2686
2687
/**
 *	dev_new_index	-	allocate an ifindex
 *
 *	Advances a static counter, wrapping back to 1 whenever it is no
 *	longer positive, until it lands on a value for which
 *	__dev_get_by_index() finds no device, and returns that value.
 *	The caller must hold the rtnl semaphore or the dev_base_lock to
 *	be sure the result remains unique.
 */
static int dev_new_index(void)
{
	static int ifindex;

	do {
		ifindex++;
		if (ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(ifindex));

	return ifindex;
}
2705
2706static int dev_boot_phase = 1;
2707
2708/* Delayed registration/unregisteration */
2709static DEFINE_SPINLOCK(net_todo_list_lock);
2710static struct list_head net_todo_list = LIST_HEAD_INIT(net_todo_list);
2711
2712static inline void net_set_todo(struct net_device *dev)
2713{
2714	spin_lock(&net_todo_list_lock);
2715	list_add_tail(&dev->todo_list, &net_todo_list);
2716	spin_unlock(&net_todo_list_lock);
2717}
2718
2719/**
2720 *	register_netdevice	- register a network device
2721 *	@dev: device to register
2722 *
2723 *	Take a completed network device structure and add it to the kernel
2724 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
2725 *	chain. 0 is returned on success. A negative errno code is returned
2726 *	on a failure to set up the device, or if the name is a duplicate.
2727 *
2728 *	Callers must hold the rtnl semaphore. You may want
2729 *	register_netdev() instead of this.
2730 *
2731 *	BUGS:
2732 *	The locking appears insufficient to guarantee two parallel registers
2733 *	will not get the same name.
2734 */
2735
2736int register_netdevice(struct net_device *dev)
2737{
2738	struct hlist_head *head;
2739	struct hlist_node *p;
2740	int ret;
2741
2742	BUG_ON(dev_boot_phase);
2743	ASSERT_RTNL();
2744
2745	/* When net_device's are persistent, this will be fatal. */
2746	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
2747
2748	spin_lock_init(&dev->queue_lock);
2749	spin_lock_init(&dev->xmit_lock);
2750	dev->xmit_lock_owner = -1;
2751#ifdef CONFIG_NET_CLS_ACT
2752	spin_lock_init(&dev->ingress_lock);
2753#endif
2754
2755	ret = alloc_divert_blk(dev);
2756	if (ret)
2757		goto out;
2758
2759	dev->iflink = -1;
2760
2761	/* Init, if this function is available */
2762	if (dev->init) {
2763		ret = dev->init(dev);
2764		if (ret) {
2765			if (ret > 0)
2766				ret = -EIO;
2767			goto out_err;
2768		}
2769	}
2770 
2771	if (!dev_valid_name(dev->name)) {
2772		ret = -EINVAL;
2773		goto out_err;
2774	}
2775
2776	dev->ifindex = dev_new_index();
2777	if (dev->iflink == -1)
2778		dev->iflink = dev->ifindex;
2779
2780	/* Check for existence of name */
2781	head = dev_name_hash(dev->name);
2782	hlist_for_each(p, head) {
2783		struct net_device *d
2784			= hlist_entry(p, struct net_device, name_hlist);
2785		if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
2786			ret = -EEXIST;
2787 			goto out_err;
2788		}
2789 	}
2790
2791	/* Fix illegal SG+CSUM combinations. */
2792	if ((dev->features & NETIF_F_SG) &&
2793	    !(dev->features & (NETIF_F_IP_CSUM |
2794			       NETIF_F_NO_CSUM |
2795			       NETIF_F_HW_CSUM))) {
2796		printk("%s: Dropping NETIF_F_SG since no checksum feature.\n",
2797		       dev->name);
2798		dev->features &= ~NETIF_F_SG;
2799	}
2800
2801	/* TSO requires that SG is present as well. */
2802	if ((dev->features & NETIF_F_TSO) &&
2803	    !(dev->features & NETIF_F_SG)) {
2804		printk("%s: Dropping NETIF_F_TSO since no SG feature.\n",
2805		       dev->name);
2806		dev->features &= ~NETIF_F_TSO;
2807	}
2808
2809	/*
2810	 *	nil rebuild_header routine,
2811	 *	that should be never called and used as just bug trap.
2812	 */
2813
2814	if (!dev->rebuild_header)
2815		dev->rebuild_header = default_rebuild_header;
2816
2817	/*
2818	 *	Default initial state at registry is that the
2819	 *	device is present.
2820	 */
2821
2822	set_bit(__LINK_STATE_PRESENT, &dev->state);
2823
2824	dev->next = NULL;
2825	dev_init_scheduler(dev);
2826	write_lock_bh(&dev_base_lock);
2827	*dev_tail = dev;
2828	dev_tail = &dev->next;
2829	hlist_add_head(&dev->name_hlist, head);
2830	hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex));
2831	dev_hold(dev);
2832	dev->reg_state = NETREG_REGISTERING;
2833	write_unlock_bh(&dev_base_lock);
2834
2835	/* Notify protocols, that a new device appeared. */
2836	notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
2837
2838	/* Finish registration after unlock */
2839	net_set_todo(dev);
2840	ret = 0;
2841
2842out:
2843	return ret;
2844out_err:
2845	free_divert_blk(dev);
2846	goto out;
2847}
2848
2849/**
2850 *	register_netdev	- register a network device
2851 *	@dev: device to register
2852 *
2853 *	Take a completed network device structure and add it to the kernel
2854 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
2855 *	chain. 0 is returned on success. A negative errno code is returned
2856 *	on a failure to set up the device, or if the name is a duplicate.
2857 *
2858 *	This is a wrapper around register_netdev that takes the rtnl semaphore
2859 *	and expands the device name if you passed a format string to
2860 *	alloc_netdev.
2861 */
2862int register_netdev(struct net_device *dev)
2863{
2864	int err;
2865
2866	rtnl_lock();
2867
2868	/*
2869	 * If the name is a format string the caller wants us to do a
2870	 * name allocation.
2871	 */
2872	if (strchr(dev->name, '%')) {
2873		err = dev_alloc_name(dev, dev->name);
2874		if (err < 0)
2875			goto out;
2876	}
2877	
2878	/*
2879	 * Back compatibility hook. Kill this one in 2.5
2880	 */
2881	if (dev->name[0] == 0 || dev->name[0] == ' ') {
2882		err = dev_alloc_name(dev, "eth%d");
2883		if (err < 0)
2884			goto out;
2885	}
2886
2887	err = register_netdevice(dev);
2888out:
2889	rtnl_unlock();
2890	return err;
2891}
2892EXPORT_SYMBOL(register_netdev);
2893
2894/*
2895 * netdev_wait_allrefs - wait until all references are gone.
2896 *
2897 * This is called when unregistering network devices.
2898 *
2899 * Any protocol or device that holds a reference should register
2900 * for netdevice notification, and cleanup and put back the
2901 * reference if they receive an UNREGISTER event.
2902 * We can get stuck here if buggy protocols don't correctly
2903 * call dev_put. 
2904 */
2905static void netdev_wait_allrefs(struct net_device *dev)
2906{
2907	unsigned long rebroadcast_time, warning_time;
2908
2909	rebroadcast_time = warning_time = jiffies;
2910	while (atomic_read(&dev->refcnt) != 0) {
2911		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
2912			rtnl_shlock();
2913
2914			/* Rebroadcast unregister notification */
2915			notifier_call_chain(&netdev_chain,
2916					    NETDEV_UNREGISTER, dev);
2917
2918			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
2919				     &dev->state)) {
2920				/* We must not have linkwatch events
2921				 * pending on unregister. If this
2922				 * happens, we simply run the queue
2923				 * unscheduled, resulting in a noop
2924				 * for this device.
2925				 */
2926				linkwatch_run_queue();
2927			}
2928
2929			rtnl_shunlock();
2930
2931			rebroadcast_time = jiffies;
2932		}
2933
2934		msleep(250);
2935
2936		if (time_after(jiffies, warning_time + 10 * HZ)) {
2937			printk(KERN_EMERG "unregister_netdevice: "
2938			       "waiting for %s to become free. Usage "
2939			       "count = %d\n",
2940			       dev->name, atomic_read(&dev->refcnt));
2941			warning_time = jiffies;
2942		}
2943	}
2944}
2945
2946/* The sequence is:
2947 *
2948 *	rtnl_lock();
2949 *	...
2950 *	register_netdevice(x1);
2951 *	register_netdevice(x2);
2952 *	...
2953 *	unregister_netdevice(y1);
2954 *	unregister_netdevice(y2);
2955 *      ...
2956 *	rtnl_unlock();
2957 *	free_netdev(y1);
2958 *	free_netdev(y2);
2959 *
2960 * We are invoked by rtnl_unlock() after it drops the semaphore.
2961 * This allows us to deal with problems:
2962 * 1) We can create/delete sysfs objects which invoke hotplug
2963 *    without deadlocking with linkwatch via keventd.
2964 * 2) Since we run with the RTNL semaphore not held, we can sleep
2965 *    safely in order to wait for the netdev refcnt to drop to zero.
2966 */
/* Serialises whole runs of netdev_run_todo() against each other. */
static DECLARE_MUTEX(net_todo_run_mutex);
void netdev_run_todo(void)
{
	/* Private list the pending entries are moved onto before processing. */
	struct list_head list = LIST_HEAD_INIT(list);
	int err;


	/* Need to guard against multiple cpu's getting out of order. */
	down(&net_todo_run_mutex);

	/* Not safe to do outside the semaphore.  We must not return
	 * until all unregister events invoked by the local processor
	 * have been completed (either by this todo run, or one on
	 * another cpu).
	 */
	if (list_empty(&net_todo_list))
		goto out;

	/* Snapshot list, allow later requests */
	spin_lock(&net_todo_list_lock);
	list_splice_init(&net_todo_list, &list);
	spin_unlock(&net_todo_list_lock);
		
	while (!list_empty(&list)) {
		struct net_device *dev
			= list_entry(list.next, struct net_device, todo_list);
		list_del(&dev->todo_list);

		switch(dev->reg_state) {
		case NETREG_REGISTERING:
			/* Second half of register_netdevice(). A sysfs
			 * failure is only logged; the device is marked
			 * registered either way. */
			err = netdev_register_sysfs(dev);
			if (err)
				printk(KERN_ERR "%s: failed sysfs registration (%d)\n",
				       dev->name, err);
			dev->reg_state = NETREG_REGISTERED;
			break;

		case NETREG_UNREGISTERING:
			/* Second half of unregister_netdevice(). */
			netdev_unregister_sysfs(dev);
			dev->reg_state = NETREG_UNREGISTERED;

			/* Sleeps until dev->refcnt reads zero. */
			netdev_wait_allrefs(dev);

			/* paranoia */
			BUG_ON(atomic_read(&dev->refcnt));
			BUG_TRAP(!dev->ip_ptr);
			BUG_TRAP(!dev->ip6_ptr);
			BUG_TRAP(!dev->dn_ptr);


			/* It must be the very last action, 
			 * after this 'dev' may point to freed up memory.
			 */
			if (dev->destructor)
				dev->destructor(dev);
			break;

		default:
			/* Queued in a state this function does not handle. */
			printk(KERN_ERR "network todo '%s' but state %d\n",
			       dev->name, dev->reg_state);
			break;
		}
	}

out:
	up(&net_todo_run_mutex);
}
3034
3035/**
3036 *	alloc_netdev - allocate network device
3037 *	@sizeof_priv:	size of private data to allocate space for
3038 *	@name:		device name format string
3039 *	@setup:		callback to initialize device
3040 *
3041 *	Allocates a struct net_device with private data area for driver use
3042 *	and performs basic initialization.
3043 */
3044struct net_device *alloc_netdev(int sizeof_priv, const char *name,
3045		void (*setup)(struct net_device *))
3046{
3047	void *p;
3048	struct net_device *dev;
3049	int alloc_size;
3050
3051	/* ensure 32-byte alignment of both the device and private area */
3052	alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
3053	alloc_size += sizeof_priv + NETDEV_ALIGN_CONST;
3054
3055	p = kmalloc(alloc_size, GFP_KERNEL);
3056	if (!p) {
3057		printk(KERN_ERR "alloc_dev: Unable to allocate device.\n");
3058		return NULL;
3059	}
3060	memset(p, 0, alloc_size);
3061
3062	dev = (struct net_device *)
3063		(((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
3064	dev->padded = (char *)dev - (char *)p;
3065
3066	if (sizeof_priv)
3067		dev->priv = netdev_priv(dev);
3068
3069	setup(dev);
3070	strcpy(dev->name, name);
3071	return dev;
3072}
3073EXPORT_SYMBOL(alloc_netdev);
3074
3075/**
3076 *	free_netdev - free network device
3077 *	@dev: device
3078 *
3079 *	This function does the last stage of destroying an allocated device 
3080 * 	interface. The reference to the device object is released.  
3081 *	If this is the last reference then it will be freed.
3082 */
3083void free_netdev(struct net_device *dev)
3084{
3085#ifdef CONFIG_SYSFS
3086	/*  Compatiablity with error handling in drivers */
3087	if (dev->reg_state == NETREG_UNINITIALIZED) {
3088		kfree((char *)dev - dev->padded);
3089		return;
3090	}
3091
3092	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
3093	dev->reg_state = NETREG_RELEASED;
3094
3095	/* will free via class release */
3096	class_device_put(&dev->class_dev);
3097#else
3098	kfree((char *)dev - dev->padded);
3099#endif
3100}
3101 
/*
 * Synchronize with packet receive processing.
 *
 * Implemented as a synchronize_rcu() call; might_sleep() documents (and,
 * where that check is enabled, verifies) that callers are in a context
 * that is allowed to sleep.
 */
void synchronize_net(void) 
{
	might_sleep();
	synchronize_rcu();
}
3108
3109/**
3110 *	unregister_netdevice - remove device from the kernel
3111 *	@dev: device
3112 *
3113 *	This function shuts down a device interface and removes it
3114 *	from the kernel tables. On success 0 is returned, on a failure
3115 *	a negative errno code is returned.
3116 *
3117 *	Callers must hold the rtnl semaphore.  You may want
3118 *	unregister_netdev() instead of this.
3119 */
3120
int unregister_netdevice(struct net_device *dev)
{
	struct net_device *d, **dp;

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	/* Some devices call without registering for initialization unwind. */
	if (dev->reg_state == NETREG_UNINITIALIZED) {
		printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
				  "was registered\n", dev->name, dev);
		return -ENODEV;
	}

	/* Any other state than fully registered is a caller bug; this
	 * includes a device still waiting on the todo list in
	 * NETREG_REGISTERING state. */
	BUG_ON(dev->reg_state != NETREG_REGISTERED);

	/* If device is running, close it first. */
	if (dev->flags & IFF_UP)
		dev_close(dev);

	/* And unlink it from device chain. */
	for (dp = &dev_base; (d = *dp) != NULL; dp = &d->next) {
		if (d == dev) {
			write_lock_bh(&dev_base_lock);
			hlist_del(&dev->name_hlist);
			hlist_del(&dev->index_hlist);
			/* Removing the last device moves the tail back. */
			if (dev_tail == &dev->next)
				dev_tail = dp;
			*dp = d->next;
			write_unlock_bh(&dev_base_lock);
			break;
		}
	}
	/* The walk ran off the end: the device was not on the chain. */
	if (!d) {
		printk(KERN_ERR "unregister net_device: '%s' not found\n",
		       dev->name);
		return -ENODEV;
	}

	dev->reg_state = NETREG_UNREGISTERING;

	synchronize_net();

	/* Shutdown queueing discipline. */
	dev_shutdown(dev);

	
	/* Notify protocols, that we are about to destroy
	   this device. They should clean all the things.
	*/
	notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
	
	/*
	 *	Flush the multicast chain
	 */
	dev_mc_discard(dev);

	if (dev->uninit)
		dev->uninit(dev);

	/* Notifier chain MUST detach us from master device. */
	BUG_TRAP(!dev->master);

	free_divert_blk(dev);

	/* Finish processing unregister after unlock */
	net_set_todo(dev);

	synchronize_net();

	/* Drop the reference taken by register_netdevice(). */
	dev_put(dev);
	return 0;
}
3194
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables. It returns nothing: the result of
 *	unregister_netdevice() is discarded.
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore.  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}

EXPORT_SYMBOL(unregister_netdev);
3215
#ifdef CONFIG_HOTPLUG_CPU
/*
 *	CPU hotplug callback, registered from net_dev_init() through
 *	hotcpu_notifier(). Only CPU_DEAD is acted upon: the softnet queues
 *	of the CPU given by @ocpu are moved over to the CPU this callback
 *	runs on. Always returns NOTIFY_OK.
 */
static int dev_cpu_callback(struct notifier_block *nfb,
			    unsigned long action,
			    void *ocpu)
{
	struct sk_buff **list_skb;
	struct net_device **list_net;
	struct sk_buff *skb;
	/* @ocpu carries the CPU number of the affected CPU, not a pointer. */
	unsigned int cpu, oldcpu = (unsigned long)ocpu;
	struct softnet_data *sd, *oldsd;

	if (action != CPU_DEAD)
		return NOTIFY_OK;

	/* Interrupts stay off while the two singly linked queues of this
	 * CPU are modified. */
	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Find end of our output_queue. */
	list_net = &sd->output_queue;
	while (*list_net)
		list_net = &(*list_net)->next_sched;
	/* Append output queue from offline CPU. */
	*list_net = oldsd->output_queue;
	oldsd->output_queue = NULL;

	/* Raise the TX softirq on this CPU now that it owns the queues. */
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

	/* Process offline CPU's input_pkt_queue */
	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
		netif_rx(skb);

	return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */
3261
3262
3263/*
3264 *	Initialize the DEV module. At boot time this walks the device list and
3265 *	unhooks any devices that fail to initialise (normally hardware not
3266 *	present) and leaves us with a valid list of present and active devices.
3267 *
3268 */
3269
3270/*
3271 *       This is called single threaded during boot, so no need
3272 *       to take the rtnl semaphore.
3273 */
3274static int __init net_dev_init(void)
3275{
3276	int i, rc = -ENOMEM;
3277
3278	BUG_ON(!dev_boot_phase);
3279
3280	net_random_init();
3281
3282	if (dev_proc_init())
3283		goto out;
3284
3285	if (netdev_sysfs_init())
3286		goto out;
3287
3288	INIT_LIST_HEAD(&ptype_all);
3289	for (i = 0; i < 16; i++) 
3290		INIT_LIST_HEAD(&ptype_base[i]);
3291
3292	for (i = 0; i < ARRAY_SIZE(dev_name_head); i++)
3293		INIT_HLIST_HEAD(&dev_name_head[i]);
3294
3295	for (i = 0; i < ARRAY_SIZE(dev_index_head); i++)
3296		INIT_HLIST_HEAD(&dev_index_head[i]);
3297
3298	/*
3299	 *	Initialise the packet receive queues.
3300	 */
3301
3302	for (i = 0; i < NR_CPUS; i++) {
3303		struct softnet_data *queue;
3304
3305		queue = &per_cpu(softnet_data, i);
3306		skb_queue_head_init(&queue->input_pkt_queue);
3307		queue->throttle = 0;
3308		queue->cng_level = 0;
3309		queue->avg_blog = 10; /* arbitrary non-zero */
3310		queue->completion_queue = NULL;
3311		INIT_LIST_HEAD(&queue->poll_list);
3312		set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
3313		queue->backlog_dev.weight = weight_p;
3314		queue->backlog_dev.poll = process_backlog;
3315		atomic_set(&queue->backlog_dev.refcnt, 1);
3316	}
3317
3318#ifdef OFFLINE_SAMPLE
3319	samp_timer.expires = jiffies + (10 * HZ);
3320	add_timer(&samp_timer);
3321#endif
3322
3323	dev_boot_phase = 0;
3324
3325	open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
3326	open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
3327
3328	hotcpu_notifier(dev_cpu_callback, 0);
3329	dst_init();
3330	dev_mcast_init();
3331	rc = 0;
3332out:
3333	return rc;
3334}
3335
3336subsys_initcall(net_dev_init);
3337
/*
 *	Symbols exported from this file. register_netdev, alloc_netdev and
 *	unregister_netdev are exported next to their definitions above.
 */
EXPORT_SYMBOL(__dev_get_by_index);
EXPORT_SYMBOL(__dev_get_by_name);
EXPORT_SYMBOL(__dev_remove_pack);
EXPORT_SYMBOL(__skb_linearize);
EXPORT_SYMBOL(dev_add_pack);
EXPORT_SYMBOL(dev_alloc_name);
EXPORT_SYMBOL(dev_close);
EXPORT_SYMBOL(dev_get_by_flags);
EXPORT_SYMBOL(dev_get_by_index);
EXPORT_SYMBOL(dev_get_by_name);
EXPORT_SYMBOL(dev_ioctl);
EXPORT_SYMBOL(dev_open);
EXPORT_SYMBOL(dev_queue_xmit);
EXPORT_SYMBOL(dev_remove_pack);
EXPORT_SYMBOL(dev_set_allmulti);
EXPORT_SYMBOL(dev_set_promiscuity);
EXPORT_SYMBOL(dev_change_flags);
EXPORT_SYMBOL(dev_set_mtu);
EXPORT_SYMBOL(dev_set_mac_address);
EXPORT_SYMBOL(free_netdev);
EXPORT_SYMBOL(netdev_boot_setup_check);
EXPORT_SYMBOL(netdev_set_master);
EXPORT_SYMBOL(netdev_state_change);
EXPORT_SYMBOL(netif_receive_skb);
EXPORT_SYMBOL(netif_rx);
EXPORT_SYMBOL(register_gifconf);
EXPORT_SYMBOL(register_netdevice);
EXPORT_SYMBOL(register_netdevice_notifier);
EXPORT_SYMBOL(skb_checksum_help);
EXPORT_SYMBOL(synchronize_net);
EXPORT_SYMBOL(unregister_netdevice);
EXPORT_SYMBOL(unregister_netdevice_notifier);
EXPORT_SYMBOL(net_enable_timestamp);
EXPORT_SYMBOL(net_disable_timestamp);
EXPORT_SYMBOL(dev_get_flags);

/* Bridge hooks are only exported when bridging is built in or modular. */
#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
EXPORT_SYMBOL(br_handle_frame_hook);
EXPORT_SYMBOL(br_fdb_get_hook);
EXPORT_SYMBOL(br_fdb_put_hook);
#endif

#ifdef CONFIG_KMOD
EXPORT_SYMBOL(dev_load);
#endif

EXPORT_PER_CPU_SYMBOL(softnet_data);