net/core/dev.c at v2.6.24-rc3 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / core / dev.c
at v2.6.24-rc3 4504 lines 111 kB view raw
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <asm/system.h>
  77#include <linux/bitops.h>
  78#include <linux/capability.h>
  79#include <linux/cpu.h>
  80#include <linux/types.h>
  81#include <linux/kernel.h>
  82#include <linux/sched.h>
  83#include <linux/mutex.h>
  84#include <linux/string.h>
  85#include <linux/mm.h>
  86#include <linux/socket.h>
  87#include <linux/sockios.h>
  88#include <linux/errno.h>
  89#include <linux/interrupt.h>
  90#include <linux/if_ether.h>
  91#include <linux/netdevice.h>
  92#include <linux/etherdevice.h>
  93#include <linux/notifier.h>
  94#include <linux/skbuff.h>
  95#include <net/net_namespace.h>
  96#include <net/sock.h>
  97#include <linux/rtnetlink.h>
  98#include <linux/proc_fs.h>
  99#include <linux/seq_file.h>
 100#include <linux/stat.h>
 101#include <linux/if_bridge.h>
 102#include <linux/if_macvlan.h>
 103#include <net/dst.h>
 104#include <net/pkt_sched.h>
 105#include <net/checksum.h>
 106#include <linux/highmem.h>
 107#include <linux/init.h>
 108#include <linux/kmod.h>
 109#include <linux/module.h>
 110#include <linux/kallsyms.h>
 111#include <linux/netpoll.h>
 112#include <linux/rcupdate.h>
 113#include <linux/delay.h>
 114#include <net/wext.h>
 115#include <net/iw_handler.h>
 116#include <asm/current.h>
 117#include <linux/audit.h>
 118#include <linux/dmaengine.h>
 119#include <linux/err.h>
 120#include <linux/ctype.h>
 121#include <linux/if_arp.h>
 122
 123#include "net-sysfs.h"
 124
 125/*
 126 *	The list of packet types we will receive (as opposed to discard)
 127 *	and the routines to invoke.
 128 *
 129 *	Why 16. Because with 16 the only overlap we get on a hash of the
 130 *	low nibble of the protocol value is RARP/SNAP/X.25.
 131 *
 132 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 133 *             sure which should go first, but I bet it won't make much
 134 *             difference if we are running VLANs.  The good news is that
 135 *             this protocol won't be in the list unless compiled in, so
 136 *             the average user (w/out VLANs) will not be adversely affected.
 137 *             --BLG
 138 *
 139 *		0800	IP
 140 *		8100    802.1Q VLAN
 141 *		0001	802.3
 142 *		0002	AX.25
 143 *		0004	802.2
 144 *		8035	RARP
 145 *		0005	SNAP
 146 *		0805	X.25
 147 *		0806	ARP
 148 *		8137	IPX
 149 *		0009	Localtalk
 150 *		86DD	IPv6
 151 */
 152
/* Taken (with BHs disabled) by the add/remove helpers that modify the lists below. */
static DEFINE_SPINLOCK(ptype_lock);
static struct list_head ptype_base[16] __read_mostly;	/* 16 way hashed list */
static struct list_head ptype_all __read_mostly;	/* Taps */
 156
#ifdef CONFIG_NET_DMA
/*
 * State kept for the networking DMA client.  Only built when
 * CONFIG_NET_DMA is set.
 */
struct net_dma {
	struct dma_client client;	/* embedded client; holds the event callback */
	spinlock_t lock;
	cpumask_t channel_mask;
	struct dma_chan *channels[NR_CPUS];	/* NR_CPUS slots - presumably one per CPU, TODO confirm */
};

/* Declared ahead of its definition so the initializer below can reference it. */
static enum dma_state_client
netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
	enum dma_state state);

/* File-scope instance; only the client's event callback is set statically. */
static struct net_dma net_dma = {
	.client = {
		.event_callback = netdev_dma_event,
	},
};
#endif
 175
 176/*
 177 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 178 * semaphore.
 179 *
 180 * Pure readers hold dev_base_lock for reading.
 181 *
 182 * Writers must hold the rtnl semaphore while they loop through the
 183 * dev_base_head list, and hold dev_base_lock for writing when they do the
 184 * actual updates.  This allows pure readers to access the list even
 185 * while a writer is preparing to update it.
 186 *
 187 * To put it another way, dev_base_lock is held for writing only to
 188 * protect against pure readers; the rtnl semaphore provides the
 189 * protection against other writers.
 190 *
 191 * See, for example usages, register_netdevice() and
 192 * unregister_netdevice(), which must be called with the rtnl
 193 * semaphore held.
 194 */
DEFINE_RWLOCK(dev_base_lock);

EXPORT_SYMBOL(dev_base_lock);

/*
 * Width, in bits, of the index used by dev_name_hash() and
 * dev_index_hash() to pick a bucket in the per-namespace hash tables.
 */
#define NETDEV_HASHBITS	8
#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
 201
 202static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 203{
 204	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 205	return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
 206}
 207
 208static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 209{
 210	return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
 211}
 212
 213/* Device list insertion */
 214static int list_netdevice(struct net_device *dev)
 215{
 216	struct net *net = dev->nd_net;
 217
 218	ASSERT_RTNL();
 219
 220	write_lock_bh(&dev_base_lock);
 221	list_add_tail(&dev->dev_list, &net->dev_base_head);
 222	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 223	hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
 224	write_unlock_bh(&dev_base_lock);
 225	return 0;
 226}
 227
 228/* Device list removal */
 229static void unlist_netdevice(struct net_device *dev)
 230{
 231	ASSERT_RTNL();
 232
 233	/* Unlink dev from the device chain */
 234	write_lock_bh(&dev_base_lock);
 235	list_del(&dev->dev_list);
 236	hlist_del(&dev->name_hlist);
 237	hlist_del(&dev->index_hlist);
 238	write_unlock_bh(&dev_base_lock);
 239}
 240
/*
 *	Our notifier list.  It is a raw notifier head, i.e. the head itself
 *	provides no locking.
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.  One instance exists per CPU.
 */

DEFINE_PER_CPU(struct softnet_data, softnet_data);
 253
 254#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * register_netdevice() inits dev->_xmit_lock and sets lockdep class
 * according to dev->type
 *
 * netdev_lock_type[], netdev_lock_name[] and netdev_xmit_lock_key[] are
 * parallel tables: the same index selects the device type, the lockdep
 * class name and the lockdep class key.  Keep them in the same order.
 * The final entry (ARPHRD_NONE / "_xmit_NONE") doubles as the fallback
 * for device types that are not listed; see netdev_lock_pos().
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_VOID,
	 ARPHRD_NONE};

/* Lockdep class names, index-matched with netdev_lock_type[] above. */
static const char *netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_VOID",
	 "_xmit_NONE"};

/* One lockdep class key per entry of netdev_lock_type[]. */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 294
 295static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 296{
 297	int i;
 298
 299	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 300		if (netdev_lock_type[i] == dev_type)
 301			return i;
 302	/* the last key is used by default */
 303	return ARRAY_SIZE(netdev_lock_type) - 1;
 304}
 305
 306static inline void netdev_set_lockdep_class(spinlock_t *lock,
 307					    unsigned short dev_type)
 308{
 309	int i;
 310
 311	i = netdev_lock_pos(dev_type);
 312	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 313				   netdev_lock_name[i]);
 314}
 315#else
/* Without CONFIG_DEBUG_LOCK_ALLOC there is nothing to set up: empty stub. */
static inline void netdev_set_lockdep_class(spinlock_t *lock,
					    unsigned short dev_type)
{
}
 320#endif
 321
 322/*******************************************************************************
 323
 324		Protocol management and registration routines
 325
 326*******************************************************************************/
 327
 328/*
 329 *	Add a protocol ID to the list. Now that the input handler is
 330 *	smarter we can dispense with all the messy stuff that used to be
 331 *	here.
 332 *
 *	BEWARE!!! Protocol handlers that mangle input packets
 *	MUST BE last in their hash buckets, and the search for protocol
 *	handlers MUST start from the promiscuous ptype_all chain in net_bh.
 *	This is true now; do not change it.
 *	Explanation: if a handler that mangles the packet were first on
 *	the list, it would not be able to sense that the packet is cloned
 *	and should be copied-on-write, so it would change the packet in
 *	place and subsequent readers would get a broken packet.
 341 *							--ANK (980803)
 342 */
 343
 344/**
 345 *	dev_add_pack - add packet handler
 346 *	@pt: packet type declaration
 347 *
 348 *	Add a protocol handler to the networking stack. The passed &packet_type
 349 *	is linked into kernel lists and may not be freed until it has been
 350 *	removed from the kernel lists.
 351 *
 352 *	This call does not sleep therefore it can not
 353 *	guarantee all CPU's that are in middle of receiving packets
 354 *	will see the new packet type (until the next received packet).
 355 */
 356
 357void dev_add_pack(struct packet_type *pt)
 358{
 359	int hash;
 360
 361	spin_lock_bh(&ptype_lock);
 362	if (pt->type == htons(ETH_P_ALL))
 363		list_add_rcu(&pt->list, &ptype_all);
 364	else {
 365		hash = ntohs(pt->type) & 15;
 366		list_add_rcu(&pt->list, &ptype_base[hash]);
 367	}
 368	spin_unlock_bh(&ptype_lock);
 369}
 370
 371/**
 372 *	__dev_remove_pack	 - remove packet handler
 373 *	@pt: packet type declaration
 374 *
 375 *	Remove a protocol handler that was previously added to the kernel
 376 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 377 *	from the kernel lists and can be freed or reused once this function
 378 *	returns.
 379 *
 380 *      The packet type might still be in use by receivers
 381 *	and must not be freed until after all the CPU's have gone
 382 *	through a quiescent state.
 383 */
 384void __dev_remove_pack(struct packet_type *pt)
 385{
 386	struct list_head *head;
 387	struct packet_type *pt1;
 388
 389	spin_lock_bh(&ptype_lock);
 390
 391	if (pt->type == htons(ETH_P_ALL))
 392		head = &ptype_all;
 393	else
 394		head = &ptype_base[ntohs(pt->type) & 15];
 395
 396	list_for_each_entry(pt1, head, list) {
 397		if (pt == pt1) {
 398			list_del_rcu(&pt->list);
 399			goto out;
 400		}
 401	}
 402
 403	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 404out:
 405	spin_unlock_bh(&ptype_lock);
 406}
 407/**
 408 *	dev_remove_pack	 - remove packet handler
 409 *	@pt: packet type declaration
 410 *
 411 *	Remove a protocol handler that was previously added to the kernel
 412 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 413 *	from the kernel lists and can be freed or reused once this function
 414 *	returns.
 415 *
 416 *	This call sleeps to guarantee that no CPU is looking at the packet
 417 *	type after return.
 418 */
 419void dev_remove_pack(struct packet_type *pt)
 420{
 421	__dev_remove_pack(pt);
 422
 423	synchronize_net();
 424}
 425
 426/******************************************************************************
 427
 428		      Device Boot-time Settings Routines
 429
 430*******************************************************************************/
 431
/*
 * Boot time configuration table.  A slot whose name starts with '\0' or
 * ' ' is treated as unused by the helpers below.
 */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 434
 435/**
 436 *	netdev_boot_setup_add	- add new setup entry
 437 *	@name: name of the device
 438 *	@map: configured settings for the device
 439 *
 440 *	Adds new setup entry to the dev_boot_setup list.  The function
 441 *	returns 0 on error and 1 on success.  This is a generic routine to
 442 *	all netdevices.
 443 */
 444static int netdev_boot_setup_add(char *name, struct ifmap *map)
 445{
 446	struct netdev_boot_setup *s;
 447	int i;
 448
 449	s = dev_boot_setup;
 450	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 451		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 452			memset(s[i].name, 0, sizeof(s[i].name));
 453			strcpy(s[i].name, name);
 454			memcpy(&s[i].map, map, sizeof(s[i].map));
 455			break;
 456		}
 457	}
 458
 459	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 460}
 461
 462/**
 463 *	netdev_boot_setup_check	- check boot time settings
 464 *	@dev: the netdevice
 465 *
 466 * 	Check boot time settings for the device.
 467 *	The found settings are set for the device to be used
 468 *	later in the device probing.
 469 *	Returns 0 if no settings found, 1 if they are.
 470 */
 471int netdev_boot_setup_check(struct net_device *dev)
 472{
 473	struct netdev_boot_setup *s = dev_boot_setup;
 474	int i;
 475
 476	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 477		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 478		    !strncmp(dev->name, s[i].name, strlen(s[i].name))) {
 479			dev->irq 	= s[i].map.irq;
 480			dev->base_addr 	= s[i].map.base_addr;
 481			dev->mem_start 	= s[i].map.mem_start;
 482			dev->mem_end 	= s[i].map.mem_end;
 483			return 1;
 484		}
 485	}
 486	return 0;
 487}
 488
 489
 490/**
 491 *	netdev_boot_base	- get address from boot time settings
 492 *	@prefix: prefix for network device
 493 *	@unit: id for network device
 494 *
 495 * 	Check boot time settings for the base address of device.
 496 *	The found settings are set for the device to be used
 497 *	later in the device probing.
 498 *	Returns 0 if no settings found.
 499 */
 500unsigned long netdev_boot_base(const char *prefix, int unit)
 501{
 502	const struct netdev_boot_setup *s = dev_boot_setup;
 503	char name[IFNAMSIZ];
 504	int i;
 505
 506	sprintf(name, "%s%d", prefix, unit);
 507
 508	/*
 509	 * If device already registered then return base of 1
 510	 * to indicate not to probe for this interface
 511	 */
 512	if (__dev_get_by_name(&init_net, name))
 513		return 1;
 514
 515	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 516		if (!strcmp(name, s[i].name))
 517			return s[i].map.base_addr;
 518	return 0;
 519}
 520
 521/*
 522 * Saves at boot time configured settings for any netdevice.
 523 */
 524int __init netdev_boot_setup(char *str)
 525{
 526	int ints[5];
 527	struct ifmap map;
 528
 529	str = get_options(str, ARRAY_SIZE(ints), ints);
 530	if (!str || !*str)
 531		return 0;
 532
 533	/* Save settings */
 534	memset(&map, 0, sizeof(map));
 535	if (ints[0] > 0)
 536		map.irq = ints[1];
 537	if (ints[0] > 1)
 538		map.base_addr = ints[2];
 539	if (ints[0] > 2)
 540		map.mem_start = ints[3];
 541	if (ints[0] > 3)
 542		map.mem_end = ints[4];
 543
 544	/* Add new entry to the list */
 545	return netdev_boot_setup_add(str, &map);
 546}
 547
 548__setup("netdev=", netdev_boot_setup);
 549
 550/*******************************************************************************
 551
 552			    Device Interface Subroutines
 553
 554*******************************************************************************/
 555
 556/**
 557 *	__dev_get_by_name	- find a device by its name
 558 *	@net: the applicable net namespace
 559 *	@name: name to find
 560 *
 561 *	Find an interface by name. Must be called under RTNL semaphore
 562 *	or @dev_base_lock. If the name is found a pointer to the device
 563 *	is returned. If the name is not found then %NULL is returned. The
 564 *	reference counters are not incremented so the caller must be
 565 *	careful with locks.
 566 */
 567
 568struct net_device *__dev_get_by_name(struct net *net, const char *name)
 569{
 570	struct hlist_node *p;
 571
 572	hlist_for_each(p, dev_name_hash(net, name)) {
 573		struct net_device *dev
 574			= hlist_entry(p, struct net_device, name_hlist);
 575		if (!strncmp(dev->name, name, IFNAMSIZ))
 576			return dev;
 577	}
 578	return NULL;
 579}
 580
 581/**
 582 *	dev_get_by_name		- find a device by its name
 583 *	@net: the applicable net namespace
 584 *	@name: name to find
 585 *
 586 *	Find an interface by name. This can be called from any
 587 *	context and does its own locking. The returned handle has
 588 *	the usage count incremented and the caller must use dev_put() to
 589 *	release it when it is no longer needed. %NULL is returned if no
 590 *	matching device is found.
 591 */
 592
 593struct net_device *dev_get_by_name(struct net *net, const char *name)
 594{
 595	struct net_device *dev;
 596
 597	read_lock(&dev_base_lock);
 598	dev = __dev_get_by_name(net, name);
 599	if (dev)
 600		dev_hold(dev);
 601	read_unlock(&dev_base_lock);
 602	return dev;
 603}
 604
 605/**
 606 *	__dev_get_by_index - find a device by its ifindex
 607 *	@net: the applicable net namespace
 608 *	@ifindex: index of device
 609 *
 610 *	Search for an interface by index. Returns %NULL if the device
 611 *	is not found or a pointer to the device. The device has not
 612 *	had its reference counter increased so the caller must be careful
 613 *	about locking. The caller must hold either the RTNL semaphore
 614 *	or @dev_base_lock.
 615 */
 616
 617struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 618{
 619	struct hlist_node *p;
 620
 621	hlist_for_each(p, dev_index_hash(net, ifindex)) {
 622		struct net_device *dev
 623			= hlist_entry(p, struct net_device, index_hlist);
 624		if (dev->ifindex == ifindex)
 625			return dev;
 626	}
 627	return NULL;
 628}
 629
 630
 631/**
 632 *	dev_get_by_index - find a device by its ifindex
 633 *	@net: the applicable net namespace
 634 *	@ifindex: index of device
 635 *
 636 *	Search for an interface by index. Returns NULL if the device
 637 *	is not found or a pointer to the device. The device returned has
 638 *	had a reference added and the pointer is safe until the user calls
 639 *	dev_put to indicate they have finished with it.
 640 */
 641
 642struct net_device *dev_get_by_index(struct net *net, int ifindex)
 643{
 644	struct net_device *dev;
 645
 646	read_lock(&dev_base_lock);
 647	dev = __dev_get_by_index(net, ifindex);
 648	if (dev)
 649		dev_hold(dev);
 650	read_unlock(&dev_base_lock);
 651	return dev;
 652}
 653
 654/**
 655 *	dev_getbyhwaddr - find a device by its hardware address
 656 *	@net: the applicable net namespace
 657 *	@type: media type of device
 658 *	@ha: hardware address
 659 *
 660 *	Search for an interface by MAC address. Returns NULL if the device
 661 *	is not found or a pointer to the device. The caller must hold the
 662 *	rtnl semaphore. The returned device has not had its ref count increased
 663 *	and the caller must therefore be careful about locking
 664 *
 665 *	BUGS:
 666 *	If the API was consistent this would be __dev_get_by_hwaddr
 667 */
 668
 669struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
 670{
 671	struct net_device *dev;
 672
 673	ASSERT_RTNL();
 674
 675	for_each_netdev(&init_net, dev)
 676		if (dev->type == type &&
 677		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 678			return dev;
 679
 680	return NULL;
 681}
 682
 683EXPORT_SYMBOL(dev_getbyhwaddr);
 684
 685struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 686{
 687	struct net_device *dev;
 688
 689	ASSERT_RTNL();
 690	for_each_netdev(net, dev)
 691		if (dev->type == type)
 692			return dev;
 693
 694	return NULL;
 695}
 696
 697EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 698
 699struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 700{
 701	struct net_device *dev;
 702
 703	rtnl_lock();
 704	dev = __dev_getfirstbyhwtype(net, type);
 705	if (dev)
 706		dev_hold(dev);
 707	rtnl_unlock();
 708	return dev;
 709}
 710
 711EXPORT_SYMBOL(dev_getfirstbyhwtype);
 712
 713/**
 714 *	dev_get_by_flags - find any device with given flags
 715 *	@net: the applicable net namespace
 716 *	@if_flags: IFF_* values
 717 *	@mask: bitmask of bits in if_flags to check
 718 *
 719 *	Search for any interface with the given flags. Returns NULL if a device
 720 *	is not found or a pointer to the device. The device returned has
 721 *	had a reference added and the pointer is safe until the user calls
 722 *	dev_put to indicate they have finished with it.
 723 */
 724
 725struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask)
 726{
 727	struct net_device *dev, *ret;
 728
 729	ret = NULL;
 730	read_lock(&dev_base_lock);
 731	for_each_netdev(net, dev) {
 732		if (((dev->flags ^ if_flags) & mask) == 0) {
 733			dev_hold(dev);
 734			ret = dev;
 735			break;
 736		}
 737	}
 738	read_unlock(&dev_base_lock);
 739	return ret;
 740}
 741
 742/**
 743 *	dev_valid_name - check if name is okay for network device
 744 *	@name: name string
 745 *
 746 *	Network device names need to be valid file names to
 747 *	to allow sysfs to work.  We also disallow any kind of
 748 *	whitespace.
 749 */
 750int dev_valid_name(const char *name)
 751{
 752	if (*name == '\0')
 753		return 0;
 754	if (strlen(name) >= IFNAMSIZ)
 755		return 0;
 756	if (!strcmp(name, ".") || !strcmp(name, ".."))
 757		return 0;
 758
 759	while (*name) {
 760		if (*name == '/' || isspace(*name))
 761			return 0;
 762		name++;
 763	}
 764	return 1;
 765}
 766
 767/**
 768 *	__dev_alloc_name - allocate a name for a device
 769 *	@net: network namespace to allocate the device name in
 770 *	@name: name format string
 771 *	@buf:  scratch buffer and result name string
 772 *
 773 *	Passed a format string - eg "lt%d" it will try and find a suitable
 774 *	id. It scans list of devices to build up a free map, then chooses
 775 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 776 *	while allocating the name and adding the device in order to avoid
 777 *	duplicates.
 778 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 779 *	Returns the number of the unit assigned or a negative errno code.
 780 */
 781
 782static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 783{
 784	int i = 0;
 785	const char *p;
 786	const int max_netdevices = 8*PAGE_SIZE;
 787	unsigned long *inuse;
 788	struct net_device *d;
 789
 790	p = strnchr(name, IFNAMSIZ-1, '%');
 791	if (p) {
 792		/*
 793		 * Verify the string as this thing may have come from
 794		 * the user.  There must be either one "%d" and no other "%"
 795		 * characters.
 796		 */
 797		if (p[1] != 'd' || strchr(p + 2, '%'))
 798			return -EINVAL;
 799
 800		/* Use one page as a bit array of possible slots */
 801		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 802		if (!inuse)
 803			return -ENOMEM;
 804
 805		for_each_netdev(net, d) {
 806			if (!sscanf(d->name, name, &i))
 807				continue;
 808			if (i < 0 || i >= max_netdevices)
 809				continue;
 810
 811			/*  avoid cases where sscanf is not exact inverse of printf */
 812			snprintf(buf, IFNAMSIZ, name, i);
 813			if (!strncmp(buf, d->name, IFNAMSIZ))
 814				set_bit(i, inuse);
 815		}
 816
 817		i = find_first_zero_bit(inuse, max_netdevices);
 818		free_page((unsigned long) inuse);
 819	}
 820
 821	snprintf(buf, IFNAMSIZ, name, i);
 822	if (!__dev_get_by_name(net, buf))
 823		return i;
 824
 825	/* It is possible to run out of possible slots
 826	 * when the name is long and there isn't enough space left
 827	 * for the digits, or if all bits are used.
 828	 */
 829	return -ENFILE;
 830}
 831
 832/**
 833 *	dev_alloc_name - allocate a name for a device
 834 *	@dev: device
 835 *	@name: name format string
 836 *
 837 *	Passed a format string - eg "lt%d" it will try and find a suitable
 838 *	id. It scans list of devices to build up a free map, then chooses
 839 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 840 *	while allocating the name and adding the device in order to avoid
 841 *	duplicates.
 842 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 843 *	Returns the number of the unit assigned or a negative errno code.
 844 */
 845
 846int dev_alloc_name(struct net_device *dev, const char *name)
 847{
 848	char buf[IFNAMSIZ];
 849	struct net *net;
 850	int ret;
 851
 852	BUG_ON(!dev->nd_net);
 853	net = dev->nd_net;
 854	ret = __dev_alloc_name(net, name, buf);
 855	if (ret >= 0)
 856		strlcpy(dev->name, buf, IFNAMSIZ);
 857	return ret;
 858}
 859
 860
 861/**
 862 *	dev_change_name - change name of a device
 863 *	@dev: device
 864 *	@newname: name (or format string) must be at least IFNAMSIZ
 865 *
 866 *	Change name of a device, can pass format strings "eth%d".
 867 *	for wildcarding.
 868 */
 869int dev_change_name(struct net_device *dev, char *newname)
 870{
 871	char oldname[IFNAMSIZ];
 872	int err = 0;
 873	int ret;
 874	struct net *net;
 875
 876	ASSERT_RTNL();
 877	BUG_ON(!dev->nd_net);
 878
 879	net = dev->nd_net;
 880	if (dev->flags & IFF_UP)
 881		return -EBUSY;
 882
 883	if (!dev_valid_name(newname))
 884		return -EINVAL;
 885
 886	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 887		return 0;
 888
 889	memcpy(oldname, dev->name, IFNAMSIZ);
 890
 891	if (strchr(newname, '%')) {
 892		err = dev_alloc_name(dev, newname);
 893		if (err < 0)
 894			return err;
 895		strcpy(newname, dev->name);
 896	}
 897	else if (__dev_get_by_name(net, newname))
 898		return -EEXIST;
 899	else
 900		strlcpy(dev->name, newname, IFNAMSIZ);
 901
 902rollback:
 903	device_rename(&dev->dev, dev->name);
 904
 905	write_lock_bh(&dev_base_lock);
 906	hlist_del(&dev->name_hlist);
 907	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 908	write_unlock_bh(&dev_base_lock);
 909
 910	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
 911	ret = notifier_to_errno(ret);
 912
 913	if (ret) {
 914		if (err) {
 915			printk(KERN_ERR
 916			       "%s: name change rollback failed: %d.\n",
 917			       dev->name, ret);
 918		} else {
 919			err = ret;
 920			memcpy(dev->name, oldname, IFNAMSIZ);
 921			goto rollback;
 922		}
 923	}
 924
 925	return err;
 926}
 927
 928/**
 929 *	netdev_features_change - device changes features
 930 *	@dev: device to cause notification
 931 *
 932 *	Called to indicate a device has changed features.
 933 */
 934void netdev_features_change(struct net_device *dev)
 935{
 936	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
 937}
 938EXPORT_SYMBOL(netdev_features_change);
 939
 940/**
 941 *	netdev_state_change - device changes state
 942 *	@dev: device to cause notification
 943 *
 944 *	Called to indicate a device has changed state. This function calls
 945 *	the notifier chains for netdev_chain and sends a NEWLINK message
 946 *	to the routing socket.
 947 */
 948void netdev_state_change(struct net_device *dev)
 949{
 950	if (dev->flags & IFF_UP) {
 951		call_netdevice_notifiers(NETDEV_CHANGE, dev);
 952		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
 953	}
 954}
 955
 956/**
 957 *	dev_load 	- load a network module
 958 *	@net: the applicable net namespace
 959 *	@name: name of interface
 960 *
 961 *	If a network interface is not present and the process has suitable
 962 *	privileges this function loads the module. If module loading is not
 963 *	available in this kernel then it becomes a nop.
 964 */
 965
 966void dev_load(struct net *net, const char *name)
 967{
 968	struct net_device *dev;
 969
 970	read_lock(&dev_base_lock);
 971	dev = __dev_get_by_name(net, name);
 972	read_unlock(&dev_base_lock);
 973
 974	if (!dev && capable(CAP_SYS_MODULE))
 975		request_module("%s", name);
 976}
 977
 978/**
 979 *	dev_open	- prepare an interface for use.
 980 *	@dev:	device to open
 981 *
 982 *	Takes a device from down to up state. The device's private open
 983 *	function is invoked and then the multicast lists are loaded. Finally
 984 *	the device is moved into the up state and a %NETDEV_UP message is
 985 *	sent to the netdev notifier chain.
 986 *
 987 *	Calling this function on an active interface is a nop. On a failure
 988 *	a negative errno code is returned.
 989 */
 990int dev_open(struct net_device *dev)
 991{
 992	int ret = 0;
 993
 994	/*
 995	 *	Is it already up?
 996	 */
 997
 998	if (dev->flags & IFF_UP)
 999		return 0;
1000
1001	/*
1002	 *	Is it even present?
1003	 */
1004	if (!netif_device_present(dev))
1005		return -ENODEV;
1006
1007	/*
1008	 *	Call device private open method
1009	 */
1010	set_bit(__LINK_STATE_START, &dev->state);
1011
1012	if (dev->validate_addr)
1013		ret = dev->validate_addr(dev);
1014
1015	if (!ret && dev->open)
1016		ret = dev->open(dev);
1017
1018	/*
1019	 *	If it went open OK then:
1020	 */
1021
1022	if (ret)
1023		clear_bit(__LINK_STATE_START, &dev->state);
1024	else {
1025		/*
1026		 *	Set the flags.
1027		 */
1028		dev->flags |= IFF_UP;
1029
1030		/*
1031		 *	Initialize multicasting status
1032		 */
1033		dev_set_rx_mode(dev);
1034
1035		/*
1036		 *	Wakeup transmit queue engine
1037		 */
1038		dev_activate(dev);
1039
1040		/*
1041		 *	... and announce new interface.
1042		 */
1043		call_netdevice_notifiers(NETDEV_UP, dev);
1044	}
1045
1046	return ret;
1047}
1048
1049/**
1050 *	dev_close - shutdown an interface.
1051 *	@dev: device to shutdown
1052 *
1053 *	This function moves an active device into down state. A
1054 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1055 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1056 *	chain.
1057 */
1058int dev_close(struct net_device *dev)
1059{
1060	might_sleep();
1061
1062	if (!(dev->flags & IFF_UP))
1063		return 0;
1064
1065	/*
1066	 *	Tell people we are going down, so that they can
1067	 *	prepare to death, when device is still operating.
1068	 */
1069	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1070
1071	dev_deactivate(dev);
1072
1073	clear_bit(__LINK_STATE_START, &dev->state);
1074
1075	/* Synchronize to scheduled poll. We cannot touch poll list,
1076	 * it can be even on different cpu. So just clear netif_running().
1077	 *
1078	 * dev->stop() will invoke napi_disable() on all of it's
1079	 * napi_struct instances on this device.
1080	 */
1081	smp_mb__after_clear_bit(); /* Commit netif_running(). */
1082
1083	/*
1084	 *	Call the device specific close. This cannot fail.
1085	 *	Only if device is UP
1086	 *
1087	 *	We allow it to be called even after a DETACH hot-plug
1088	 *	event.
1089	 */
1090	if (dev->stop)
1091		dev->stop(dev);
1092
1093	/*
1094	 *	Device is now down.
1095	 */
1096
1097	dev->flags &= ~IFF_UP;
1098
1099	/*
1100	 * Tell people we are down
1101	 */
1102	call_netdevice_notifiers(NETDEV_DOWN, dev);
1103
1104	return 0;
1105}
1106
1107
/*
 * Starts out set. While non-zero, register_netdevice_notifier() adds the
 * notifier to the chain but skips replaying REGISTER/UP events to it.
 * NOTE(review): presumably cleared once network device init has run;
 * confirm where it is reset.
 */
static int dev_boot_phase = 1;
1109
1110/*
1111 *	Device change register/unregister. These are not inline or static
1112 *	as we export them to the world.
1113 */
1114
1115/**
1116 *	register_netdevice_notifier - register a network notifier block
1117 *	@nb: notifier
1118 *
1119 *	Register a notifier to be called when network device events occur.
1120 *	The notifier passed is linked into the kernel structures and must
1121 *	not be reused until it has been unregistered. A negative errno code
1122 *	is returned on a failure.
1123 *
1124 * 	When registered all registration and up events are replayed
1125 *	to the new notifier to allow device to have a race free
1126 *	view of the network device list.
1127 */
1128
1129int register_netdevice_notifier(struct notifier_block *nb)
1130{
1131	struct net_device *dev;
1132	struct net_device *last;
1133	struct net *net;
1134	int err;
1135
1136	rtnl_lock();
1137	err = raw_notifier_chain_register(&netdev_chain, nb);
1138	if (err)
1139		goto unlock;
1140	if (dev_boot_phase)
1141		goto unlock;
1142	for_each_net(net) {
1143		for_each_netdev(net, dev) {
1144			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1145			err = notifier_to_errno(err);
1146			if (err)
1147				goto rollback;
1148
1149			if (!(dev->flags & IFF_UP))
1150				continue;
1151
1152			nb->notifier_call(nb, NETDEV_UP, dev);
1153		}
1154	}
1155
1156unlock:
1157	rtnl_unlock();
1158	return err;
1159
1160rollback:
1161	last = dev;
1162	for_each_net(net) {
1163		for_each_netdev(net, dev) {
1164			if (dev == last)
1165				break;
1166
1167			if (dev->flags & IFF_UP) {
1168				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1169				nb->notifier_call(nb, NETDEV_DOWN, dev);
1170			}
1171			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1172		}
1173	}
1174
1175	raw_notifier_chain_unregister(&netdev_chain, nb);
1176	goto unlock;
1177}
1178
1179/**
1180 *	unregister_netdevice_notifier - unregister a network notifier block
1181 *	@nb: notifier
1182 *
1183 *	Unregister a notifier previously registered by
1184 *	register_netdevice_notifier(). The notifier is unlinked into the
1185 *	kernel structures and may then be reused. A negative errno code
1186 *	is returned on a failure.
1187 */
1188
1189int unregister_netdevice_notifier(struct notifier_block *nb)
1190{
1191	int err;
1192
1193	rtnl_lock();
1194	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1195	rtnl_unlock();
1196	return err;
1197}
1198
1199/**
1200 *	call_netdevice_notifiers - call all network notifier blocks
1201 *      @val: value passed unmodified to notifier function
1202 *      @dev: net_device pointer passed unmodified to notifier function
1203 *
1204 *	Call all network notifier blocks.  Parameters and return value
1205 *	are as for raw_notifier_call_chain().
1206 */
1207
1208int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1209{
1210	return raw_notifier_call_chain(&netdev_chain, val, dev);
1211}
1212
/*
 * When > 0 there are consumers of rx skb time stamps.
 * Raised by net_enable_timestamp(), lowered by net_disable_timestamp(),
 * and consulted by net_timestamp() to decide whether to stamp a buffer.
 */
static atomic_t netstamp_needed = ATOMIC_INIT(0);
1215
1216void net_enable_timestamp(void)
1217{
1218	atomic_inc(&netstamp_needed);
1219}
1220
1221void net_disable_timestamp(void)
1222{
1223	atomic_dec(&netstamp_needed);
1224}
1225
1226static inline void net_timestamp(struct sk_buff *skb)
1227{
1228	if (atomic_read(&netstamp_needed))
1229		__net_timestamp(skb);
1230	else
1231		skb->tstamp.tv64 = 0;
1232}
1233
1234/*
1235 *	Support routine. Sends outgoing frames to any network
1236 *	taps currently in use.
1237 */
1238
1239static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1240{
1241	struct packet_type *ptype;
1242
1243	net_timestamp(skb);
1244
1245	rcu_read_lock();
1246	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1247		/* Never send packets back to the socket
1248		 * they originated from - MvS (miquels@drinkel.ow.org)
1249		 */
1250		if ((ptype->dev == dev || !ptype->dev) &&
1251		    (ptype->af_packet_priv == NULL ||
1252		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1253			struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1254			if (!skb2)
1255				break;
1256
1257			/* skb->nh should be correctly
1258			   set by sender, so that the second statement is
1259			   just protection against buggy protocols.
1260			 */
1261			skb_reset_mac_header(skb2);
1262
1263			if (skb_network_header(skb2) < skb2->data ||
1264			    skb2->network_header > skb2->tail) {
1265				if (net_ratelimit())
1266					printk(KERN_CRIT "protocol %04x is "
1267					       "buggy, dev %s\n",
1268					       skb2->protocol, dev->name);
1269				skb_reset_network_header(skb2);
1270			}
1271
1272			skb2->transport_header = skb2->network_header;
1273			skb2->pkt_type = PACKET_OUTGOING;
1274			ptype->func(skb2, skb->dev, ptype, skb->dev);
1275		}
1276	}
1277	rcu_read_unlock();
1278}
1279
1280
1281void __netif_schedule(struct net_device *dev)
1282{
1283	if (!test_and_set_bit(__LINK_STATE_SCHED, &dev->state)) {
1284		unsigned long flags;
1285		struct softnet_data *sd;
1286
1287		local_irq_save(flags);
1288		sd = &__get_cpu_var(softnet_data);
1289		dev->next_sched = sd->output_queue;
1290		sd->output_queue = dev;
1291		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1292		local_irq_restore(flags);
1293	}
1294}
1295EXPORT_SYMBOL(__netif_schedule);
1296
1297void dev_kfree_skb_irq(struct sk_buff *skb)
1298{
1299	if (atomic_dec_and_test(&skb->users)) {
1300		struct softnet_data *sd;
1301		unsigned long flags;
1302
1303		local_irq_save(flags);
1304		sd = &__get_cpu_var(softnet_data);
1305		skb->next = sd->completion_queue;
1306		sd->completion_queue = skb;
1307		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1308		local_irq_restore(flags);
1309	}
1310}
1311EXPORT_SYMBOL(dev_kfree_skb_irq);
1312
/*
 * Free @skb from any context: directly via dev_kfree_skb() when neither
 * in hard irq context nor with irqs disabled, deferred via
 * dev_kfree_skb_irq() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (!in_irq() && !irqs_disabled()) {
		dev_kfree_skb(skb);
		return;
	}

	dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1321
1322
1323/**
1324 * netif_device_detach - mark device as removed
1325 * @dev: network device
1326 *
1327 * Mark device as removed from system and therefore no longer available.
1328 */
1329void netif_device_detach(struct net_device *dev)
1330{
1331	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1332	    netif_running(dev)) {
1333		netif_stop_queue(dev);
1334	}
1335}
1336EXPORT_SYMBOL(netif_device_detach);
1337
1338/**
1339 * netif_device_attach - mark device as attached
1340 * @dev: network device
1341 *
1342 * Mark device as attached from system and restart if needed.
1343 */
1344void netif_device_attach(struct net_device *dev)
1345{
1346	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1347	    netif_running(dev)) {
1348		netif_wake_queue(dev);
1349		__netdev_watchdog_up(dev);
1350	}
1351}
1352EXPORT_SYMBOL(netif_device_attach);
1353
1354
1355/*
1356 * Invalidate hardware checksum when packet is to be mangled, and
1357 * complete checksum manually on outgoing path.
1358 */
1359int skb_checksum_help(struct sk_buff *skb)
1360{
1361	__wsum csum;
1362	int ret = 0, offset;
1363
1364	if (skb->ip_summed == CHECKSUM_COMPLETE)
1365		goto out_set_summed;
1366
1367	if (unlikely(skb_shinfo(skb)->gso_size)) {
1368		/* Let GSO fix up the checksum. */
1369		goto out_set_summed;
1370	}
1371
1372	offset = skb->csum_start - skb_headroom(skb);
1373	BUG_ON(offset >= skb_headlen(skb));
1374	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1375
1376	offset += skb->csum_offset;
1377	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1378
1379	if (skb_cloned(skb) &&
1380	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1381		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1382		if (ret)
1383			goto out;
1384	}
1385
1386	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1387out_set_summed:
1388	skb->ip_summed = CHECKSUM_NONE;
1389out:
1390	return ret;
1391}
1392
/**
 *	skb_gso_segment - Perform segmentation on skb.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *
 *	This function segments the given skb and returns a list of segments.
 *
 *	It may return NULL if the skb requires no segmentation.  This is
 *	only possible when GSO is used for verifying header integrity.
 *
 *	On failure an ERR_PTR() value is returned; -EPROTONOSUPPORT is the
 *	result when no handler with a gso_segment method matches
 *	skb->protocol.
 */
struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
{
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_type *ptype;
	__be16 type = skb->protocol;
	int err;

	BUG_ON(skb_shinfo(skb)->frag_list);

	/* Treat the current data pointer as the MAC header and step over it. */
	skb_reset_mac_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;
	__skb_pull(skb, skb->mac_len);

	/*
	 * Anything but CHECKSUM_PARTIAL triggers a warning; in that case a
	 * shared header is replaced by a private copy before going on.
	 */
	if (WARN_ON(skb->ip_summed != CHECKSUM_PARTIAL)) {
		if (skb_header_cloned(skb) &&
		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
			return ERR_PTR(err);
	}

	rcu_read_lock();
	/* Only the first matching device-independent handler is used. */
	list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & 15], list) {
		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
				err = ptype->gso_send_check(skb);
				segs = ERR_PTR(err);
				/*
				 * Stop on error, or when the output path can
				 * take the skb as is (segs is then NULL).
				 */
				if (err || skb_gso_ok(skb, features))
					break;
				/* Move data back to the network header. */
				__skb_push(skb, (skb->data -
						 skb_network_header(skb)));
			}
			segs = ptype->gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	/* Restore skb->data to the MAC header for the caller. */
	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}

EXPORT_SYMBOL(skb_gso_segment);
1445
/*
 * Report a hardware receive checksum failure: a rate limited error
 * message naming the device (or "<unknown>" without one) plus a stack
 * dump.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	const char *name;

	if (!net_ratelimit())
		return;

	name = dev ? dev->name : "<unknown>";
	printk(KERN_ERR "%s: hw csum failure.\n", name);
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1458
/*
 * Tell whether @skb has a page fragment in high memory that @dev cannot
 * DMA from (device lacks NETIF_F_HIGHDMA). Always 0 without
 * CONFIG_HIGHMEM.
 *
 * This check could go away as soon as we know that:
 * 1. an IOMMU is present and allows mapping all of memory, or
 * 2. no high memory really exists on this machine.
 */

static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		struct skb_shared_info *shinfo = skb_shinfo(skb);
		int idx = 0;

		while (idx < shinfo->nr_frags) {
			if (PageHighMem(shinfo->frags[idx].page))
				return 1;
			idx++;
		}
	}

#endif
	return 0;
}
1479
/*
 * State kept in skb->cb while an skb carries a software GSO segment list:
 * the destructor the skb had before dev_gso_segment() installed
 * dev_gso_skb_destructor() in its place.
 */
struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

/* Access skb->cb as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1485
1486static void dev_gso_skb_destructor(struct sk_buff *skb)
1487{
1488	struct dev_gso_cb *cb;
1489
1490	do {
1491		struct sk_buff *nskb = skb->next;
1492
1493		skb->next = nskb->next;
1494		nskb->next = NULL;
1495		kfree_skb(nskb);
1496	} while (skb->next);
1497
1498	cb = DEV_GSO_CB(skb);
1499	if (cb->destructor)
1500		cb->destructor(skb);
1501}
1502
1503/**
1504 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1505 *	@skb: buffer to segment
1506 *
1507 *	This function segments the given skb and stores the list of segments
1508 *	in skb->next.
1509 */
1510static int dev_gso_segment(struct sk_buff *skb)
1511{
1512	struct net_device *dev = skb->dev;
1513	struct sk_buff *segs;
1514	int features = dev->features & ~(illegal_highdma(dev, skb) ?
1515					 NETIF_F_SG : 0);
1516
1517	segs = skb_gso_segment(skb, features);
1518
1519	/* Verifying header integrity only. */
1520	if (!segs)
1521		return 0;
1522
1523	if (unlikely(IS_ERR(segs)))
1524		return PTR_ERR(segs);
1525
1526	skb->next = segs;
1527	DEV_GSO_CB(skb)->destructor = skb->destructor;
1528	skb->destructor = dev_gso_skb_destructor;
1529
1530	return 0;
1531}
1532
/*
 *	Hand a buffer to the driver's hard_start_xmit() method.
 *
 *	An skb without a ->next chain is first shown to the taps on
 *	ptype_all (if any), segmented in software when netif_needs_gso()
 *	asks for it, and otherwise passed straight to the driver.
 *	An skb carrying a segment list in ->next has its segments
 *	submitted one at a time.
 *
 *	Returns the driver's return value, NETDEV_TX_BUSY when the queue
 *	got stopped while segments were still pending, or 0 after the skb
 *	has been freed here.
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	if (likely(!skb->next)) {
		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(skb, dev);

		if (netif_needs_gso(dev, skb)) {
			/* Segmentation failed: drop the packet, report 0. */
			if (unlikely(dev_gso_segment(skb)))
				goto out_kfree_skb;
			/* A segment list was attached: send it piecewise. */
			if (skb->next)
				goto gso;
		}

		return dev->hard_start_xmit(skb, dev);
	}

gso:
	do {
		/* Detach the first pending segment from the list. */
		struct sk_buff *nskb = skb->next;
		int rc;

		skb->next = nskb->next;
		nskb->next = NULL;
		rc = dev->hard_start_xmit(nskb, dev);
		if (unlikely(rc)) {
			/* Driver refused it: put the segment back in front. */
			nskb->next = skb->next;
			skb->next = nskb;
			return rc;
		}
		/* Queue stopped with segments left over: resume later. */
		if (unlikely((netif_queue_stopped(dev) ||
			     netif_subqueue_stopped(dev, skb)) &&
			     skb->next))
			return NETDEV_TX_BUSY;
	} while (skb->next);

	/* All segments are out: give the skb its own destructor back. */
	skb->destructor = DEV_GSO_CB(skb)->destructor;

out_kfree_skb:
	kfree_skb(skb);
	return 0;
}
1574
/**
 *	dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 */

int dev_queue_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct Qdisc *q;
	/* Result for the early exits below (linearize/checksum failure). */
	int rc = -ENOMEM;

	/* GSO will handle the following emulations directly. */
	if (netif_needs_gso(dev, skb))
		goto gso;

	/* Flatten a frag_list the device cannot handle itself. */
	if (skb_shinfo(skb)->frag_list &&
	    !(dev->features & NETIF_F_FRAGLIST) &&
	    __skb_linearize(skb))
		goto out_kfree_skb;

	/* Fragmented skb is linearized if device does not support SG,
	 * or if at least one of fragments is in highmem and device
	 * does not support DMA from it.
	 */
	if (skb_shinfo(skb)->nr_frags &&
	    (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
	    __skb_linearize(skb))
		goto out_kfree_skb;

	/* If packet is not checksummed and device does not support
	 * checksumming for this protocol, complete checksumming here.
	 */
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		skb_set_transport_header(skb, skb->csum_start -
					      skb_headroom(skb));

		if (!(dev->features & NETIF_F_GEN_CSUM) &&
		    !((dev->features & NETIF_F_IP_CSUM) &&
		      skb->protocol == htons(ETH_P_IP)) &&
		    !((dev->features & NETIF_F_IPV6_CSUM) &&
		      skb->protocol == htons(ETH_P_IPV6)))
			if (skb_checksum_help(skb))
				goto out_kfree_skb;
	}

gso:
	spin_lock_prefetch(&dev->queue_lock);

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	/* Updates of qdisc are serialized by queue_lock.
	 * The struct Qdisc which is pointed to by qdisc is now a
	 * rcu structure - it may be accessed without acquiring
	 * a lock (but the structure may be stale.) The freeing of the
	 * qdisc will be deferred until it's known that there are no
	 * more references to it.
	 *
	 * If the qdisc has an enqueue function, we still need to
	 * hold the queue_lock before calling it, since queue_lock
	 * also serializes access to the device queue.
	 */

	q = rcu_dereference(dev->qdisc);
#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
#endif
	if (q->enqueue) {
		/* Grab device queue */
		spin_lock(&dev->queue_lock);
		/* Re-read under the lock; the earlier pointer may be stale. */
		q = dev->qdisc;
		if (q->enqueue) {
			/* reset queue_mapping to zero */
			skb_set_queue_mapping(skb, 0);
			rc = q->enqueue(skb, q);
			qdisc_run(dev);
			spin_unlock(&dev->queue_lock);

			/* NET_XMIT_BYPASS is reported to callers as success. */
			rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
			goto out;
		}
		spin_unlock(&dev->queue_lock);
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all the sorts of tunnels...

	   Really, it is unlikely that netif_tx_lock protection is necessary
	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	   counters.)
	   However, it is possible, that they rely on protection
	   made by us here.

	   Check this and shot the lock. It is not prone from deadlocks.
	   Either shot noqueue qdisc, it is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		/* Owner == this cpu means we re-entered our own transmit path. */
		if (dev->xmit_lock_owner != cpu) {

			HARD_TX_LOCK(dev, cpu);

			if (!netif_queue_stopped(dev) &&
			    !netif_subqueue_stopped(dev, skb)) {
				rc = 0;
				/* Zero from the driver path: done, skb is gone. */
				if (!dev_hard_start_xmit(skb, dev)) {
					HARD_TX_UNLOCK(dev);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev);
			if (net_ratelimit())
				printk(KERN_CRIT "Virtual device %s asks to "
				       "queue packet!\n", dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately */
			if (net_ratelimit())
				printk(KERN_CRIT "Dead loop on virtual device "
				       "%s, fix it urgently!\n", dev->name);
		}
	}

	/* Device down, queue stopped, driver refusal or recursion: drop. */
	rc = -ENETDOWN;
	rcu_read_unlock_bh();

out_kfree_skb:
	kfree_skb(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
1732
1733
1734/*=======================================================================
1735			Receiver routines
1736  =======================================================================*/
1737
/* netif_rx() drops packets once the per-cpu input queue is longer than this. */
int netdev_max_backlog __read_mostly = 1000;
/* Initial work budget of one net_rx_action() run. */
int netdev_budget __read_mostly = 300;
/* Copied into napi->weight by process_backlog(). */
int weight_p __read_mostly = 64;            /* old backlog weight */

/* Per-cpu receive counters; the rx paths bump .total and .dropped. */
DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1743
1744
1745/**
1746 *	netif_rx	-	post buffer to the network code
1747 *	@skb: buffer to post
1748 *
1749 *	This function receives a packet from a device driver and queues it for
1750 *	the upper (protocol) levels to process.  It always succeeds. The buffer
1751 *	may be dropped during processing for congestion control or by the
1752 *	protocol layers.
1753 *
1754 *	return values:
1755 *	NET_RX_SUCCESS	(no congestion)
1756 *	NET_RX_DROP     (packet was dropped)
1757 *
1758 */
1759
1760int netif_rx(struct sk_buff *skb)
1761{
1762	struct softnet_data *queue;
1763	unsigned long flags;
1764
1765	/* if netpoll wants it, pretend we never saw it */
1766	if (netpoll_rx(skb))
1767		return NET_RX_DROP;
1768
1769	if (!skb->tstamp.tv64)
1770		net_timestamp(skb);
1771
1772	/*
1773	 * The code is rearranged so that the path is the most
1774	 * short when CPU is congested, but is still operating.
1775	 */
1776	local_irq_save(flags);
1777	queue = &__get_cpu_var(softnet_data);
1778
1779	__get_cpu_var(netdev_rx_stat).total++;
1780	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1781		if (queue->input_pkt_queue.qlen) {
1782enqueue:
1783			dev_hold(skb->dev);
1784			__skb_queue_tail(&queue->input_pkt_queue, skb);
1785			local_irq_restore(flags);
1786			return NET_RX_SUCCESS;
1787		}
1788
1789		napi_schedule(&queue->backlog);
1790		goto enqueue;
1791	}
1792
1793	__get_cpu_var(netdev_rx_stat).dropped++;
1794	local_irq_restore(flags);
1795
1796	kfree_skb(skb);
1797	return NET_RX_DROP;
1798}
1799
/*
 * netif_rx() wrapper that runs pending softirqs right away.
 * Preemption is disabled around the call; the return value is that of
 * netif_rx().
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int rc;

	preempt_disable();

	rc = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return rc;
}

EXPORT_SYMBOL(netif_rx_ni);
1814
1815static inline struct net_device *skb_bond(struct sk_buff *skb)
1816{
1817	struct net_device *dev = skb->dev;
1818
1819	if (dev->master) {
1820		if (skb_bond_should_drop(skb)) {
1821			kfree_skb(skb);
1822			return NULL;
1823		}
1824		skb->dev = dev->master;
1825	}
1826
1827	return dev;
1828}
1829
1830
1831static void net_tx_action(struct softirq_action *h)
1832{
1833	struct softnet_data *sd = &__get_cpu_var(softnet_data);
1834
1835	if (sd->completion_queue) {
1836		struct sk_buff *clist;
1837
1838		local_irq_disable();
1839		clist = sd->completion_queue;
1840		sd->completion_queue = NULL;
1841		local_irq_enable();
1842
1843		while (clist) {
1844			struct sk_buff *skb = clist;
1845			clist = clist->next;
1846
1847			BUG_TRAP(!atomic_read(&skb->users));
1848			__kfree_skb(skb);
1849		}
1850	}
1851
1852	if (sd->output_queue) {
1853		struct net_device *head;
1854
1855		local_irq_disable();
1856		head = sd->output_queue;
1857		sd->output_queue = NULL;
1858		local_irq_enable();
1859
1860		while (head) {
1861			struct net_device *dev = head;
1862			head = head->next_sched;
1863
1864			smp_mb__before_clear_bit();
1865			clear_bit(__LINK_STATE_SCHED, &dev->state);
1866
1867			if (spin_trylock(&dev->queue_lock)) {
1868				qdisc_run(dev);
1869				spin_unlock(&dev->queue_lock);
1870			} else {
1871				netif_schedule(dev);
1872			}
1873		}
1874	}
1875}
1876
1877static inline int deliver_skb(struct sk_buff *skb,
1878			      struct packet_type *pt_prev,
1879			      struct net_device *orig_dev)
1880{
1881	atomic_inc(&skb->users);
1882	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1883}
1884
#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
/* Bridge forwarding database hooks; they are defined here for ATM */
struct net_bridge;
struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
						unsigned char *addr);
void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly;

/*
 * Bridging hook, called when the bridge module is loaded.
 *  returns NULL if packet was consumed.
 */
struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
					struct sk_buff *skb) __read_mostly;

/*
 * Give the bridge a chance to take a received packet.
 * Loopback packets and packets from devices that are not a bridge port
 * are returned unchanged. Otherwise a pending handler in *pt_prev gets
 * its delivery first, and the result of the bridge hook is returned.
 */
static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
					    struct packet_type **pt_prev, int *ret,
					    struct net_device *orig_dev)
{
	struct net_bridge_port *port;

	if (skb->pkt_type == PACKET_LOOPBACK)
		return skb;

	port = rcu_dereference(skb->dev->br_port);
	if (port == NULL)
		return skb;

	/* Flush the handler that is still waiting for this packet. */
	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	return br_handle_frame_hook(port, skb);
}
#else
#define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
#endif
1918
#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
/* Receive hook for macvlan; exported to GPL modules. */
struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);

/*
 * Give macvlan a chance to take a received packet.
 * Packets from devices without a macvlan port are returned unchanged.
 * Otherwise a pending handler in *pt_prev gets its delivery first, and
 * the result of the macvlan hook is returned.
 */
static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
					     struct packet_type **pt_prev,
					     int *ret,
					     struct net_device *orig_dev)
{
	if (!skb->dev->macvlan_port)
		return skb;

	/* Flush the handler that is still waiting for this packet. */
	if (*pt_prev != NULL) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	return macvlan_handle_frame_hook(skb);
}
#else
#define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
#endif
1940
1941#ifdef CONFIG_NET_CLS_ACT
1942/* TODO: Maybe we should just force sch_ingress to be compiled in
1943 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
1944 * a compare and 2 stores extra right now if we dont have it on
1945 * but have CONFIG_NET_CLS_ACT
1946 * NOTE: This doesnt stop any functionality; if you dont have
1947 * the ingress scheduler, you just cant add policies on ingress.
1948 *
1949 */
1950static int ing_filter(struct sk_buff *skb)
1951{
1952	struct Qdisc *q;
1953	struct net_device *dev = skb->dev;
1954	int result = TC_ACT_OK;
1955	u32 ttl = G_TC_RTTL(skb->tc_verd);
1956
1957	if (MAX_RED_LOOP < ttl++) {
1958		printk(KERN_WARNING
1959		       "Redir loop detected Dropping packet (%d->%d)\n",
1960		       skb->iif, dev->ifindex);
1961		return TC_ACT_SHOT;
1962	}
1963
1964	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
1965	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
1966
1967	spin_lock(&dev->ingress_lock);
1968	if ((q = dev->qdisc_ingress) != NULL)
1969		result = q->enqueue(skb, q);
1970	spin_unlock(&dev->ingress_lock);
1971
1972	return result;
1973}
1974
1975static inline struct sk_buff *handle_ing(struct sk_buff *skb,
1976					 struct packet_type **pt_prev,
1977					 int *ret, struct net_device *orig_dev)
1978{
1979	if (!skb->dev->qdisc_ingress)
1980		goto out;
1981
1982	if (*pt_prev) {
1983		*ret = deliver_skb(skb, *pt_prev, orig_dev);
1984		*pt_prev = NULL;
1985	} else {
1986		/* Huh? Why does turning on AF_PACKET affect this? */
1987		skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
1988	}
1989
1990	switch (ing_filter(skb)) {
1991	case TC_ACT_SHOT:
1992	case TC_ACT_STOLEN:
1993		kfree_skb(skb);
1994		return NULL;
1995	}
1996
1997out:
1998	skb->tc_verd = 0;
1999	return skb;
2000}
2001#endif
2002
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	netif_receive_skb() is the main receive data processing function.
 *	It always succeeds. The buffer may be dropped during processing
 *	for congestion control or by the protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	/*
	 * pt_prev is the last matching handler found so far. Delivery to it
	 * is postponed until the next match, so that the final handler can
	 * be given the skb without taking an extra reference.
	 */
	struct packet_type *ptype, *pt_prev;
	struct net_device *orig_dev;
	int ret = NET_RX_DROP;
	__be16 type;

	/* if we've gotten here through NAPI, check netpoll */
	if (netpoll_receive_skb(skb))
		return NET_RX_DROP;

	if (!skb->tstamp.tv64)
		net_timestamp(skb);

	/* Remember the receiving interface unless it is already recorded. */
	if (!skb->iif)
		skb->iif = skb->dev->ifindex;

	/* May switch skb->dev to the master; NULL means the skb was freed. */
	orig_dev = skb_bond(skb);

	if (!orig_dev)
		return NET_RX_DROP;

	__get_cpu_var(netdev_rx_stat).total++;

	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;

	pt_prev = NULL;

	rcu_read_lock();

#ifdef CONFIG_NET_CLS_ACT
	/* TC_NCLS set: skip the taps and ingress classification once. */
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	/* Taps: handlers registered for every protocol. */
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (!ptype->dev || ptype->dev == skb->dev) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

#ifdef CONFIG_NET_CLS_ACT
	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
ncls:
#endif

	/* Each hook returns NULL when it took the packet away from us. */
	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;

	/* Protocol specific handlers, hashed on the low bits of the type. */
	type = skb->protocol;
	list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) {
		if (ptype->type == type &&
		    (!ptype->dev || ptype->dev == skb->dev)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	if (pt_prev) {
		/* Last handler: called directly, without an extra reference. */
		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

out:
	rcu_read_unlock();
	return ret;
}
2103
2104static int process_backlog(struct napi_struct *napi, int quota)
2105{
2106	int work = 0;
2107	struct softnet_data *queue = &__get_cpu_var(softnet_data);
2108	unsigned long start_time = jiffies;
2109
2110	napi->weight = weight_p;
2111	do {
2112		struct sk_buff *skb;
2113		struct net_device *dev;
2114
2115		local_irq_disable();
2116		skb = __skb_dequeue(&queue->input_pkt_queue);
2117		if (!skb) {
2118			__napi_complete(napi);
2119			local_irq_enable();
2120			break;
2121		}
2122
2123		local_irq_enable();
2124
2125		dev = skb->dev;
2126
2127		netif_receive_skb(skb);
2128
2129		dev_put(dev);
2130	} while (++work < quota && jiffies == start_time);
2131
2132	return work;
2133}
2134
2135/**
2136 * __napi_schedule - schedule for receive
2137 * @n: entry to schedule
2138 *
2139 * The entry's receive function will be scheduled to run
2140 */
2141void fastcall __napi_schedule(struct napi_struct *n)
2142{
2143	unsigned long flags;
2144
2145	local_irq_save(flags);
2146	list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2147	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2148	local_irq_restore(flags);
2149}
2150EXPORT_SYMBOL(__napi_schedule);
2151
2152
2153static void net_rx_action(struct softirq_action *h)
2154{
2155	struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
2156	unsigned long start_time = jiffies;
2157	int budget = netdev_budget;
2158	void *have;
2159
2160	local_irq_disable();
2161
2162	while (!list_empty(list)) {
2163		struct napi_struct *n;
2164		int work, weight;
2165
2166		/* If softirq window is exhuasted then punt.
2167		 *
2168		 * Note that this is a slight policy change from the
2169		 * previous NAPI code, which would allow up to 2
2170		 * jiffies to pass before breaking out.  The test
2171		 * used to be "jiffies - start_time > 1".
2172		 */
2173		if (unlikely(budget <= 0 || jiffies != start_time))
2174			goto softnet_break;
2175
2176		local_irq_enable();
2177
2178		/* Even though interrupts have been re-enabled, this
2179		 * access is safe because interrupts can only add new
2180		 * entries to the tail of this list, and only ->poll()
2181		 * calls can remove this head entry from the list.
2182		 */
2183		n = list_entry(list->next, struct napi_struct, poll_list);
2184
2185		have = netpoll_poll_lock(n);
2186
2187		weight = n->weight;
2188
2189		/* This NAPI_STATE_SCHED test is for avoiding a race
2190		 * with netpoll's poll_napi().  Only the entity which
2191		 * obtains the lock and sees NAPI_STATE_SCHED set will
2192		 * actually make the ->poll() call.  Therefore we avoid
2193		 * accidently calling ->poll() when NAPI is not scheduled.
2194		 */
2195		work = 0;
2196		if (test_bit(NAPI_STATE_SCHED, &n->state))
2197			work = n->poll(n, weight);
2198
2199		WARN_ON_ONCE(work > weight);
2200
2201		budget -= work;
2202
2203		local_irq_disable();
2204
2205		/* Drivers must not modify the NAPI state if they
2206		 * consume the entire weight.  In such cases this code
2207		 * still "owns" the NAPI instance and therefore can
2208		 * move the instance around on the list at-will.
2209		 */
2210		if (unlikely(work == weight))
2211			list_move_tail(&n->poll_list, list);
2212
2213		netpoll_poll_unlock(have);
2214	}
2215out:
2216	local_irq_enable();
2217
2218#ifdef CONFIG_NET_DMA
2219	/*
2220	 * There may not be any more sk_buffs coming right now, so push
2221	 * any pending DMA copies to hardware
2222	 */
2223	if (!cpus_empty(net_dma.channel_mask)) {
2224		int chan_idx;
2225		for_each_cpu_mask(chan_idx, net_dma.channel_mask) {
2226			struct dma_chan *chan = net_dma.channels[chan_idx];
2227			if (chan)
2228				dma_async_memcpy_issue_pending(chan);
2229		}
2230	}
2231#endif
2232
2233	return;
2234
2235softnet_break:
2236	__get_cpu_var(netdev_rx_stat).time_squeeze++;
2237	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2238	goto out;
2239}
2240
2241static gifconf_func_t * gifconf_list [NPROTO];
2242
2243/**
2244 *	register_gifconf	-	register a SIOCGIF handler
2245 *	@family: Address family
2246 *	@gifconf: Function handler
2247 *
2248 *	Register protocol dependent address dumping routines. The handler
2249 *	that is passed must not be freed or reused until it has been replaced
2250 *	by another handler.
2251 */
2252int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
2253{
2254	if (family >= NPROTO)
2255		return -EINVAL;
2256	gifconf_list[family] = gifconf;
2257	return 0;
2258}
2259
2260
2261/*
2262 *	Map an interface index to its name (SIOCGIFNAME)
2263 */
2264
2265/*
2266 *	We need this ioctl for efficient implementation of the
2267 *	if_indextoname() function required by the IPv6 API.  Without
2268 *	it, we would have to search all the interfaces to find a
2269 *	match.  --pb
2270 */
2271
2272static int dev_ifname(struct net *net, struct ifreq __user *arg)
2273{
2274	struct net_device *dev;
2275	struct ifreq ifr;
2276
2277	/*
2278	 *	Fetch the caller's info block.
2279	 */
2280
2281	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2282		return -EFAULT;
2283
2284	read_lock(&dev_base_lock);
2285	dev = __dev_get_by_index(net, ifr.ifr_ifindex);
2286	if (!dev) {
2287		read_unlock(&dev_base_lock);
2288		return -ENODEV;
2289	}
2290
2291	strcpy(ifr.ifr_name, dev->name);
2292	read_unlock(&dev_base_lock);
2293
2294	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2295		return -EFAULT;
2296	return 0;
2297}
2298
2299/*
2300 *	Perform a SIOCGIFCONF call. This structure will change
2301 *	size eventually, and there is nothing I can do about it.
2302 *	Thus we will need a 'compatibility mode'.
2303 */
2304
2305static int dev_ifconf(struct net *net, char __user *arg)
2306{
2307	struct ifconf ifc;
2308	struct net_device *dev;
2309	char __user *pos;
2310	int len;
2311	int total;
2312	int i;
2313
2314	/*
2315	 *	Fetch the caller's info block.
2316	 */
2317
2318	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2319		return -EFAULT;
2320
2321	pos = ifc.ifc_buf;
2322	len = ifc.ifc_len;
2323
2324	/*
2325	 *	Loop over the interfaces, and write an info block for each.
2326	 */
2327
2328	total = 0;
2329	for_each_netdev(net, dev) {
2330		for (i = 0; i < NPROTO; i++) {
2331			if (gifconf_list[i]) {
2332				int done;
2333				if (!pos)
2334					done = gifconf_list[i](dev, NULL, 0);
2335				else
2336					done = gifconf_list[i](dev, pos + total,
2337							       len - total);
2338				if (done < 0)
2339					return -EFAULT;
2340				total += done;
2341			}
2342		}
2343	}
2344
2345	/*
2346	 *	All done.  Write the updated control block back to the caller.
2347	 */
2348	ifc.ifc_len = total;
2349
2350	/*
2351	 * 	Both BSD and Solaris return 0 here, so we do too.
2352	 */
2353	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
2354}
2355
2356#ifdef CONFIG_PROC_FS
2357/*
2358 *	This is invoked by the /proc filesystem handler to display a device
2359 *	in detail.
2360 */
2361void *dev_seq_start(struct seq_file *seq, loff_t *pos)
2362{
2363	struct net *net = seq->private;
2364	loff_t off;
2365	struct net_device *dev;
2366
2367	read_lock(&dev_base_lock);
2368	if (!*pos)
2369		return SEQ_START_TOKEN;
2370
2371	off = 1;
2372	for_each_netdev(net, dev)
2373		if (off++ == *pos)
2374			return dev;
2375
2376	return NULL;
2377}
2378
2379void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2380{
2381	struct net *net = seq->private;
2382	++*pos;
2383	return v == SEQ_START_TOKEN ?
2384		first_net_device(net) : next_net_device((struct net_device *)v);
2385}
2386
2387void dev_seq_stop(struct seq_file *seq, void *v)
2388{
2389	read_unlock(&dev_base_lock);
2390}
2391
2392static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
2393{
2394	struct net_device_stats *stats = dev->get_stats(dev);
2395
2396	seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
2397		   "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
2398		   dev->name, stats->rx_bytes, stats->rx_packets,
2399		   stats->rx_errors,
2400		   stats->rx_dropped + stats->rx_missed_errors,
2401		   stats->rx_fifo_errors,
2402		   stats->rx_length_errors + stats->rx_over_errors +
2403		    stats->rx_crc_errors + stats->rx_frame_errors,
2404		   stats->rx_compressed, stats->multicast,
2405		   stats->tx_bytes, stats->tx_packets,
2406		   stats->tx_errors, stats->tx_dropped,
2407		   stats->tx_fifo_errors, stats->collisions,
2408		   stats->tx_carrier_errors +
2409		    stats->tx_aborted_errors +
2410		    stats->tx_window_errors +
2411		    stats->tx_heartbeat_errors,
2412		   stats->tx_compressed);
2413}
2414
2415/*
2416 *	Called from the PROCfs module. This now uses the new arbitrary sized
2417 *	/proc/net interface to create /proc/net/dev
2418 */
2419static int dev_seq_show(struct seq_file *seq, void *v)
2420{
2421	if (v == SEQ_START_TOKEN)
2422		seq_puts(seq, "Inter-|   Receive                            "
2423			      "                    |  Transmit\n"
2424			      " face |bytes    packets errs drop fifo frame "
2425			      "compressed multicast|bytes    packets errs "
2426			      "drop fifo colls carrier compressed\n");
2427	else
2428		dev_seq_printf_stats(seq, v);
2429	return 0;
2430}
2431
2432static struct netif_rx_stats *softnet_get_online(loff_t *pos)
2433{
2434	struct netif_rx_stats *rc = NULL;
2435
2436	while (*pos < NR_CPUS)
2437		if (cpu_online(*pos)) {
2438			rc = &per_cpu(netdev_rx_stat, *pos);
2439			break;
2440		} else
2441			++*pos;
2442	return rc;
2443}
2444
2445static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
2446{
2447	return softnet_get_online(pos);
2448}
2449
2450static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2451{
2452	++*pos;
2453	return softnet_get_online(pos);
2454}
2455
/* seq_file ->stop for softnet_stat: ->start takes no lock, nothing to do. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
2459
2460static int softnet_seq_show(struct seq_file *seq, void *v)
2461{
2462	struct netif_rx_stats *s = v;
2463
2464	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
2465		   s->total, s->dropped, s->time_squeeze, 0,
2466		   0, 0, 0, 0, /* was fastroute */
2467		   s->cpu_collision );
2468	return 0;
2469}
2470
2471static const struct seq_operations dev_seq_ops = {
2472	.start = dev_seq_start,
2473	.next  = dev_seq_next,
2474	.stop  = dev_seq_stop,
2475	.show  = dev_seq_show,
2476};
2477
2478static int dev_seq_open(struct inode *inode, struct file *file)
2479{
2480	struct seq_file *seq;
2481	int res;
2482	res =  seq_open(file, &dev_seq_ops);
2483	if (!res) {
2484		seq = file->private_data;
2485		seq->private = get_proc_net(inode);
2486		if (!seq->private) {
2487			seq_release(inode, file);
2488			res = -ENXIO;
2489		}
2490	}
2491	return res;
2492}
2493
2494static int dev_seq_release(struct inode *inode, struct file *file)
2495{
2496	struct seq_file *seq = file->private_data;
2497	struct net *net = seq->private;
2498	put_net(net);
2499	return seq_release(inode, file);
2500}
2501
2502static const struct file_operations dev_seq_fops = {
2503	.owner	 = THIS_MODULE,
2504	.open    = dev_seq_open,
2505	.read    = seq_read,
2506	.llseek  = seq_lseek,
2507	.release = dev_seq_release,
2508};
2509
2510static const struct seq_operations softnet_seq_ops = {
2511	.start = softnet_seq_start,
2512	.next  = softnet_seq_next,
2513	.stop  = softnet_seq_stop,
2514	.show  = softnet_seq_show,
2515};
2516
2517static int softnet_seq_open(struct inode *inode, struct file *file)
2518{
2519	return seq_open(file, &softnet_seq_ops);
2520}
2521
2522static const struct file_operations softnet_seq_fops = {
2523	.owner	 = THIS_MODULE,
2524	.open    = softnet_seq_open,
2525	.read    = seq_read,
2526	.llseek  = seq_lseek,
2527	.release = seq_release,
2528};
2529
2530static void *ptype_get_idx(loff_t pos)
2531{
2532	struct packet_type *pt = NULL;
2533	loff_t i = 0;
2534	int t;
2535
2536	list_for_each_entry_rcu(pt, &ptype_all, list) {
2537		if (i == pos)
2538			return pt;
2539		++i;
2540	}
2541
2542	for (t = 0; t < 16; t++) {
2543		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
2544			if (i == pos)
2545				return pt;
2546			++i;
2547		}
2548	}
2549	return NULL;
2550}
2551
2552static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
2553{
2554	rcu_read_lock();
2555	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
2556}
2557
2558static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2559{
2560	struct packet_type *pt;
2561	struct list_head *nxt;
2562	int hash;
2563
2564	++*pos;
2565	if (v == SEQ_START_TOKEN)
2566		return ptype_get_idx(0);
2567
2568	pt = v;
2569	nxt = pt->list.next;
2570	if (pt->type == htons(ETH_P_ALL)) {
2571		if (nxt != &ptype_all)
2572			goto found;
2573		hash = 0;
2574		nxt = ptype_base[0].next;
2575	} else
2576		hash = ntohs(pt->type) & 15;
2577
2578	while (nxt == &ptype_base[hash]) {
2579		if (++hash >= 16)
2580			return NULL;
2581		nxt = ptype_base[hash].next;
2582	}
2583found:
2584	return list_entry(nxt, struct packet_type, list);
2585}
2586
/* seq_file ->stop for the ptype file: leave the RCU read-side section. */
static void ptype_seq_stop(struct seq_file *seq, void *v)
{
	rcu_read_unlock();
}
2591
/*
 * Print the handler address @sym.  With CONFIG_KALLSYMS and a successful
 * lookup the output is "symbol+0xoff", wrapped as ":module:symbol+0xoff"
 * when a module name is reported; otherwise the raw pointer is printed
 * as "[ptr]".
 */
static void ptype_seq_decode(struct seq_file *seq, void *sym)
{
#ifdef CONFIG_KALLSYMS
	unsigned long offset = 0, symsize;
	const char *symname;
	char *modname;
	char namebuf[128];

	symname = kallsyms_lookup((unsigned long)sym, &symsize, &offset,
				  &modname, namebuf);

	if (symname) {
		/* No module name: both the name and the ':' marks vanish. */
		const char *sep = modname ? ":" : "";
		const char *mod = modname ? modname : "";

		seq_printf(seq, "%s%s%s%s+0x%lx", sep, mod, sep,
			   symname, offset);
		return;
	}
#endif

	seq_printf(seq, "[%p]", sym);
}
2616
2617static int ptype_seq_show(struct seq_file *seq, void *v)
2618{
2619	struct packet_type *pt = v;
2620
2621	if (v == SEQ_START_TOKEN)
2622		seq_puts(seq, "Type Device      Function\n");
2623	else {
2624		if (pt->type == htons(ETH_P_ALL))
2625			seq_puts(seq, "ALL ");
2626		else
2627			seq_printf(seq, "%04x", ntohs(pt->type));
2628
2629		seq_printf(seq, " %-8s ",
2630			   pt->dev ? pt->dev->name : "");
2631		ptype_seq_decode(seq,  pt->func);
2632		seq_putc(seq, '\n');
2633	}
2634
2635	return 0;
2636}
2637
2638static const struct seq_operations ptype_seq_ops = {
2639	.start = ptype_seq_start,
2640	.next  = ptype_seq_next,
2641	.stop  = ptype_seq_stop,
2642	.show  = ptype_seq_show,
2643};
2644
2645static int ptype_seq_open(struct inode *inode, struct file *file)
2646{
2647	return seq_open(file, &ptype_seq_ops);
2648}
2649
2650static const struct file_operations ptype_seq_fops = {
2651	.owner	 = THIS_MODULE,
2652	.open    = ptype_seq_open,
2653	.read    = seq_read,
2654	.llseek  = seq_lseek,
2655	.release = seq_release,
2656};
2657
2658
2659static int __net_init dev_proc_net_init(struct net *net)
2660{
2661	int rc = -ENOMEM;
2662
2663	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
2664		goto out;
2665	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
2666		goto out_dev;
2667	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
2668		goto out_softnet;
2669
2670	if (wext_proc_init(net))
2671		goto out_ptype;
2672	rc = 0;
2673out:
2674	return rc;
2675out_ptype:
2676	proc_net_remove(net, "ptype");
2677out_softnet:
2678	proc_net_remove(net, "softnet_stat");
2679out_dev:
2680	proc_net_remove(net, "dev");
2681	goto out;
2682}
2683
2684static void __net_exit dev_proc_net_exit(struct net *net)
2685{
2686	wext_proc_exit(net);
2687
2688	proc_net_remove(net, "ptype");
2689	proc_net_remove(net, "softnet_stat");
2690	proc_net_remove(net, "dev");
2691}
2692
2693static struct pernet_operations __net_initdata dev_proc_ops = {
2694	.init = dev_proc_net_init,
2695	.exit = dev_proc_net_exit,
2696};
2697
2698static int __init dev_proc_init(void)
2699{
2700	return register_pernet_subsys(&dev_proc_ops);
2701}
2702#else
2703#define dev_proc_init() 0
2704#endif	/* CONFIG_PROC_FS */
2705
2706
2707/**
2708 *	netdev_set_master	-	set up master/slave pair
2709 *	@slave: slave device
2710 *	@master: new master device
2711 *
2712 *	Changes the master device of the slave. Pass %NULL to break the
2713 *	bonding. The caller must hold the RTNL semaphore. On a failure
2714 *	a negative errno code is returned. On success the reference counts
2715 *	are adjusted, %RTM_NEWLINK is sent to the routing socket and the
2716 *	function returns zero.
2717 */
2718int netdev_set_master(struct net_device *slave, struct net_device *master)
2719{
2720	struct net_device *old = slave->master;
2721
2722	ASSERT_RTNL();
2723
2724	if (master) {
2725		if (old)
2726			return -EBUSY;
2727		dev_hold(master);
2728	}
2729
2730	slave->master = master;
2731
2732	synchronize_net();
2733
2734	if (old)
2735		dev_put(old);
2736
2737	if (master)
2738		slave->flags |= IFF_SLAVE;
2739	else
2740		slave->flags &= ~IFF_SLAVE;
2741
2742	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
2743	return 0;
2744}
2745
2746static void __dev_set_promiscuity(struct net_device *dev, int inc)
2747{
2748	unsigned short old_flags = dev->flags;
2749
2750	ASSERT_RTNL();
2751
2752	if ((dev->promiscuity += inc) == 0)
2753		dev->flags &= ~IFF_PROMISC;
2754	else
2755		dev->flags |= IFF_PROMISC;
2756	if (dev->flags != old_flags) {
2757		printk(KERN_INFO "device %s %s promiscuous mode\n",
2758		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
2759							       "left");
2760		audit_log(current->audit_context, GFP_ATOMIC,
2761			AUDIT_ANOM_PROMISCUOUS,
2762			"dev=%s prom=%d old_prom=%d auid=%u",
2763			dev->name, (dev->flags & IFF_PROMISC),
2764			(old_flags & IFF_PROMISC),
2765			audit_get_loginuid(current->audit_context));
2766
2767		if (dev->change_rx_flags)
2768			dev->change_rx_flags(dev, IFF_PROMISC);
2769	}
2770}
2771
2772/**
2773 *	dev_set_promiscuity	- update promiscuity count on a device
2774 *	@dev: device
2775 *	@inc: modifier
2776 *
2777 *	Add or remove promiscuity from a device. While the count in the device
2778 *	remains above zero the interface remains promiscuous. Once it hits zero
2779 *	the device reverts back to normal filtering operation. A negative inc
2780 *	value is used to drop promiscuity on the device.
2781 */
2782void dev_set_promiscuity(struct net_device *dev, int inc)
2783{
2784	unsigned short old_flags = dev->flags;
2785
2786	__dev_set_promiscuity(dev, inc);
2787	if (dev->flags != old_flags)
2788		dev_set_rx_mode(dev);
2789}
2790
2791/**
2792 *	dev_set_allmulti	- update allmulti count on a device
2793 *	@dev: device
2794 *	@inc: modifier
2795 *
2796 *	Add or remove reception of all multicast frames to a device. While the
2797 *	count in the device remains above zero the interface remains listening
2798 *	to all interfaces. Once it hits zero the device reverts back to normal
2799 *	filtering operation. A negative @inc value is used to drop the counter
2800 *	when releasing a resource needing all multicasts.
2801 */
2802
2803void dev_set_allmulti(struct net_device *dev, int inc)
2804{
2805	unsigned short old_flags = dev->flags;
2806
2807	ASSERT_RTNL();
2808
2809	dev->flags |= IFF_ALLMULTI;
2810	if ((dev->allmulti += inc) == 0)
2811		dev->flags &= ~IFF_ALLMULTI;
2812	if (dev->flags ^ old_flags) {
2813		if (dev->change_rx_flags)
2814			dev->change_rx_flags(dev, IFF_ALLMULTI);
2815		dev_set_rx_mode(dev);
2816	}
2817}
2818
2819/*
2820 *	Upload unicast and multicast address lists to device and
2821 *	configure RX filtering. When the device doesn't support unicast
2822 *	filtering it is put in promiscous mode while unicast addresses
2823 *	are present.
2824 */
2825void __dev_set_rx_mode(struct net_device *dev)
2826{
2827	/* dev_open will call this function so the list will stay sane. */
2828	if (!(dev->flags&IFF_UP))
2829		return;
2830
2831	if (!netif_device_present(dev))
2832		return;
2833
2834	if (dev->set_rx_mode)
2835		dev->set_rx_mode(dev);
2836	else {
2837		/* Unicast addresses changes may only happen under the rtnl,
2838		 * therefore calling __dev_set_promiscuity here is safe.
2839		 */
2840		if (dev->uc_count > 0 && !dev->uc_promisc) {
2841			__dev_set_promiscuity(dev, 1);
2842			dev->uc_promisc = 1;
2843		} else if (dev->uc_count == 0 && dev->uc_promisc) {
2844			__dev_set_promiscuity(dev, -1);
2845			dev->uc_promisc = 0;
2846		}
2847
2848		if (dev->set_multicast_list)
2849			dev->set_multicast_list(dev);
2850	}
2851}
2852
/* Apply the RX mode of @dev while holding its TX lock (BH disabled). */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_tx_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_tx_unlock_bh(dev);
}
2859
2860int __dev_addr_delete(struct dev_addr_list **list, int *count,
2861		      void *addr, int alen, int glbl)
2862{
2863	struct dev_addr_list *da;
2864
2865	for (; (da = *list) != NULL; list = &da->next) {
2866		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
2867		    alen == da->da_addrlen) {
2868			if (glbl) {
2869				int old_glbl = da->da_gusers;
2870				da->da_gusers = 0;
2871				if (old_glbl == 0)
2872					break;
2873			}
2874			if (--da->da_users)
2875				return 0;
2876
2877			*list = da->next;
2878			kfree(da);
2879			(*count)--;
2880			return 0;
2881		}
2882	}
2883	return -ENOENT;
2884}
2885
2886int __dev_addr_add(struct dev_addr_list **list, int *count,
2887		   void *addr, int alen, int glbl)
2888{
2889	struct dev_addr_list *da;
2890
2891	for (da = *list; da != NULL; da = da->next) {
2892		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
2893		    da->da_addrlen == alen) {
2894			if (glbl) {
2895				int old_glbl = da->da_gusers;
2896				da->da_gusers = 1;
2897				if (old_glbl)
2898					return 0;
2899			}
2900			da->da_users++;
2901			return 0;
2902		}
2903	}
2904
2905	da = kmalloc(sizeof(*da), GFP_ATOMIC);
2906	if (da == NULL)
2907		return -ENOMEM;
2908	memcpy(da->da_addr, addr, alen);
2909	da->da_addrlen = alen;
2910	da->da_users = 1;
2911	da->da_gusers = glbl ? 1 : 0;
2912	da->next = *list;
2913	*list = da;
2914	(*count)++;
2915	return 0;
2916}
2917
2918/**
2919 *	dev_unicast_delete	- Release secondary unicast address.
2920 *	@dev: device
2921 *	@addr: address to delete
2922 *	@alen: length of @addr
2923 *
2924 *	Release reference to a secondary unicast address and remove it
2925 *	from the device if the reference count drops to zero.
2926 *
2927 * 	The caller must hold the rtnl_mutex.
2928 */
2929int dev_unicast_delete(struct net_device *dev, void *addr, int alen)
2930{
2931	int err;
2932
2933	ASSERT_RTNL();
2934
2935	netif_tx_lock_bh(dev);
2936	err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0);
2937	if (!err)
2938		__dev_set_rx_mode(dev);
2939	netif_tx_unlock_bh(dev);
2940	return err;
2941}
2942EXPORT_SYMBOL(dev_unicast_delete);
2943
2944/**
2945 *	dev_unicast_add		- add a secondary unicast address
2946 *	@dev: device
2947 *	@addr: address to delete
2948 *	@alen: length of @addr
2949 *
2950 *	Add a secondary unicast address to the device or increase
2951 *	the reference count if it already exists.
2952 *
2953 *	The caller must hold the rtnl_mutex.
2954 */
2955int dev_unicast_add(struct net_device *dev, void *addr, int alen)
2956{
2957	int err;
2958
2959	ASSERT_RTNL();
2960
2961	netif_tx_lock_bh(dev);
2962	err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0);
2963	if (!err)
2964		__dev_set_rx_mode(dev);
2965	netif_tx_unlock_bh(dev);
2966	return err;
2967}
2968EXPORT_SYMBOL(dev_unicast_add);
2969
2970static void __dev_addr_discard(struct dev_addr_list **list)
2971{
2972	struct dev_addr_list *tmp;
2973
2974	while (*list != NULL) {
2975		tmp = *list;
2976		*list = tmp->next;
2977		if (tmp->da_users > tmp->da_gusers)
2978			printk("__dev_addr_discard: address leakage! "
2979			       "da_users=%d\n", tmp->da_users);
2980		kfree(tmp);
2981	}
2982}
2983
2984static void dev_addr_discard(struct net_device *dev)
2985{
2986	netif_tx_lock_bh(dev);
2987
2988	__dev_addr_discard(&dev->uc_list);
2989	dev->uc_count = 0;
2990
2991	__dev_addr_discard(&dev->mc_list);
2992	dev->mc_count = 0;
2993
2994	netif_tx_unlock_bh(dev);
2995}
2996
2997unsigned dev_get_flags(const struct net_device *dev)
2998{
2999	unsigned flags;
3000
3001	flags = (dev->flags & ~(IFF_PROMISC |
3002				IFF_ALLMULTI |
3003				IFF_RUNNING |
3004				IFF_LOWER_UP |
3005				IFF_DORMANT)) |
3006		(dev->gflags & (IFF_PROMISC |
3007				IFF_ALLMULTI));
3008
3009	if (netif_running(dev)) {
3010		if (netif_oper_up(dev))
3011			flags |= IFF_RUNNING;
3012		if (netif_carrier_ok(dev))
3013			flags |= IFF_LOWER_UP;
3014		if (netif_dormant(dev))
3015			flags |= IFF_DORMANT;
3016	}
3017
3018	return flags;
3019}
3020
3021int dev_change_flags(struct net_device *dev, unsigned flags)
3022{
3023	int ret, changes;
3024	int old_flags = dev->flags;
3025
3026	ASSERT_RTNL();
3027
3028	/*
3029	 *	Set the flags on our device.
3030	 */
3031
3032	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
3033			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
3034			       IFF_AUTOMEDIA)) |
3035		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
3036				    IFF_ALLMULTI));
3037
3038	/*
3039	 *	Load in the correct multicast list now the flags have changed.
3040	 */
3041
3042	if (dev->change_rx_flags && (dev->flags ^ flags) & IFF_MULTICAST)
3043		dev->change_rx_flags(dev, IFF_MULTICAST);
3044
3045	dev_set_rx_mode(dev);
3046
3047	/*
3048	 *	Have we downed the interface. We handle IFF_UP ourselves
3049	 *	according to user attempts to set it, rather than blindly
3050	 *	setting it.
3051	 */
3052
3053	ret = 0;
3054	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
3055		ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
3056
3057		if (!ret)
3058			dev_set_rx_mode(dev);
3059	}
3060
3061	if (dev->flags & IFF_UP &&
3062	    ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
3063					  IFF_VOLATILE)))
3064		call_netdevice_notifiers(NETDEV_CHANGE, dev);
3065
3066	if ((flags ^ dev->gflags) & IFF_PROMISC) {
3067		int inc = (flags & IFF_PROMISC) ? +1 : -1;
3068		dev->gflags ^= IFF_PROMISC;
3069		dev_set_promiscuity(dev, inc);
3070	}
3071
3072	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
3073	   is important. Some (broken) drivers set IFF_PROMISC, when
3074	   IFF_ALLMULTI is requested not asking us and not reporting.
3075	 */
3076	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
3077		int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
3078		dev->gflags ^= IFF_ALLMULTI;
3079		dev_set_allmulti(dev, inc);
3080	}
3081
3082	/* Exclude state transition flags, already notified */
3083	changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
3084	if (changes)
3085		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
3086
3087	return ret;
3088}
3089
3090int dev_set_mtu(struct net_device *dev, int new_mtu)
3091{
3092	int err;
3093
3094	if (new_mtu == dev->mtu)
3095		return 0;
3096
3097	/*	MTU must be positive.	 */
3098	if (new_mtu < 0)
3099		return -EINVAL;
3100
3101	if (!netif_device_present(dev))
3102		return -ENODEV;
3103
3104	err = 0;
3105	if (dev->change_mtu)
3106		err = dev->change_mtu(dev, new_mtu);
3107	else
3108		dev->mtu = new_mtu;
3109	if (!err && dev->flags & IFF_UP)
3110		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
3111	return err;
3112}
3113
3114int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
3115{
3116	int err;
3117
3118	if (!dev->set_mac_address)
3119		return -EOPNOTSUPP;
3120	if (sa->sa_family != dev->type)
3121		return -EINVAL;
3122	if (!netif_device_present(dev))
3123		return -ENODEV;
3124	err = dev->set_mac_address(dev, sa);
3125	if (!err)
3126		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3127	return err;
3128}
3129
3130/*
3131 *	Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
3132 */
3133static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
3134{
3135	int err;
3136	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3137
3138	if (!dev)
3139		return -ENODEV;
3140
3141	switch (cmd) {
3142		case SIOCGIFFLAGS:	/* Get interface flags */
3143			ifr->ifr_flags = dev_get_flags(dev);
3144			return 0;
3145
3146		case SIOCGIFMETRIC:	/* Get the metric on the interface
3147					   (currently unused) */
3148			ifr->ifr_metric = 0;
3149			return 0;
3150
3151		case SIOCGIFMTU:	/* Get the MTU of a device */
3152			ifr->ifr_mtu = dev->mtu;
3153			return 0;
3154
3155		case SIOCGIFHWADDR:
3156			if (!dev->addr_len)
3157				memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
3158			else
3159				memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
3160				       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3161			ifr->ifr_hwaddr.sa_family = dev->type;
3162			return 0;
3163
3164		case SIOCGIFSLAVE:
3165			err = -EINVAL;
3166			break;
3167
3168		case SIOCGIFMAP:
3169			ifr->ifr_map.mem_start = dev->mem_start;
3170			ifr->ifr_map.mem_end   = dev->mem_end;
3171			ifr->ifr_map.base_addr = dev->base_addr;
3172			ifr->ifr_map.irq       = dev->irq;
3173			ifr->ifr_map.dma       = dev->dma;
3174			ifr->ifr_map.port      = dev->if_port;
3175			return 0;
3176
3177		case SIOCGIFINDEX:
3178			ifr->ifr_ifindex = dev->ifindex;
3179			return 0;
3180
3181		case SIOCGIFTXQLEN:
3182			ifr->ifr_qlen = dev->tx_queue_len;
3183			return 0;
3184
3185		default:
3186			/* dev_ioctl() should ensure this case
3187			 * is never reached
3188			 */
3189			WARN_ON(1);
3190			err = -EINVAL;
3191			break;
3192
3193	}
3194	return err;
3195}
3196
3197/*
3198 *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
3199 */
3200static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
3201{
3202	int err;
3203	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3204
3205	if (!dev)
3206		return -ENODEV;
3207
3208	switch (cmd) {
3209		case SIOCSIFFLAGS:	/* Set interface flags */
3210			return dev_change_flags(dev, ifr->ifr_flags);
3211
3212		case SIOCSIFMETRIC:	/* Set the metric on the interface
3213					   (currently unused) */
3214			return -EOPNOTSUPP;
3215
3216		case SIOCSIFMTU:	/* Set the MTU of a device */
3217			return dev_set_mtu(dev, ifr->ifr_mtu);
3218
3219		case SIOCSIFHWADDR:
3220			return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
3221
3222		case SIOCSIFHWBROADCAST:
3223			if (ifr->ifr_hwaddr.sa_family != dev->type)
3224				return -EINVAL;
3225			memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
3226			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3227			call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3228			return 0;
3229
3230		case SIOCSIFMAP:
3231			if (dev->set_config) {
3232				if (!netif_device_present(dev))
3233					return -ENODEV;
3234				return dev->set_config(dev, &ifr->ifr_map);
3235			}
3236			return -EOPNOTSUPP;
3237
3238		case SIOCADDMULTI:
3239			if (!dev->set_multicast_list ||
3240			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3241				return -EINVAL;
3242			if (!netif_device_present(dev))
3243				return -ENODEV;
3244			return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
3245					  dev->addr_len, 1);
3246
3247		case SIOCDELMULTI:
3248			if (!dev->set_multicast_list ||
3249			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3250				return -EINVAL;
3251			if (!netif_device_present(dev))
3252				return -ENODEV;
3253			return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
3254					     dev->addr_len, 1);
3255
3256		case SIOCSIFTXQLEN:
3257			if (ifr->ifr_qlen < 0)
3258				return -EINVAL;
3259			dev->tx_queue_len = ifr->ifr_qlen;
3260			return 0;
3261
3262		case SIOCSIFNAME:
3263			ifr->ifr_newname[IFNAMSIZ-1] = '\0';
3264			return dev_change_name(dev, ifr->ifr_newname);
3265
3266		/*
3267		 *	Unknown or private ioctl
3268		 */
3269
3270		default:
3271			if ((cmd >= SIOCDEVPRIVATE &&
3272			    cmd <= SIOCDEVPRIVATE + 15) ||
3273			    cmd == SIOCBONDENSLAVE ||
3274			    cmd == SIOCBONDRELEASE ||
3275			    cmd == SIOCBONDSETHWADDR ||
3276			    cmd == SIOCBONDSLAVEINFOQUERY ||
3277			    cmd == SIOCBONDINFOQUERY ||
3278			    cmd == SIOCBONDCHANGEACTIVE ||
3279			    cmd == SIOCGMIIPHY ||
3280			    cmd == SIOCGMIIREG ||
3281			    cmd == SIOCSMIIREG ||
3282			    cmd == SIOCBRADDIF ||
3283			    cmd == SIOCBRDELIF ||
3284			    cmd == SIOCWANDEV) {
3285				err = -EOPNOTSUPP;
3286				if (dev->do_ioctl) {
3287					if (netif_device_present(dev))
3288						err = dev->do_ioctl(dev, ifr,
3289								    cmd);
3290					else
3291						err = -ENODEV;
3292				}
3293			} else
3294				err = -EINVAL;
3295
3296	}
3297	return err;
3298}
3299
3300/*
3301 *	This function handles all "interface"-type I/O control requests. The actual
3302 *	'doing' part of this is dev_ifsioc above.
3303 */
3304
3305/**
3306 *	dev_ioctl	-	network device ioctl
3307 *	@net: the applicable net namespace
3308 *	@cmd: command to issue
3309 *	@arg: pointer to a struct ifreq in user space
3310 *
3311 *	Issue ioctl functions to devices. This is normally called by the
3312 *	user space syscall interfaces but can sometimes be useful for
3313 *	other purposes. The return value is the return from the syscall if
3314 *	positive or a negative errno code on error.
3315 */
3316
3317int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3318{
3319	struct ifreq ifr;
3320	int ret;
3321	char *colon;
3322
3323	/* One special case: SIOCGIFCONF takes ifconf argument
3324	   and requires shared lock, because it sleeps writing
3325	   to user space.
3326	 */
3327
3328	if (cmd == SIOCGIFCONF) {
3329		rtnl_lock();
3330		ret = dev_ifconf(net, (char __user *) arg);
3331		rtnl_unlock();
3332		return ret;
3333	}
3334	if (cmd == SIOCGIFNAME)
3335		return dev_ifname(net, (struct ifreq __user *)arg);
3336
3337	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3338		return -EFAULT;
3339
3340	ifr.ifr_name[IFNAMSIZ-1] = 0;
3341
3342	colon = strchr(ifr.ifr_name, ':');
3343	if (colon)
3344		*colon = 0;
3345
3346	/*
3347	 *	See which interface the caller is talking about.
3348	 */
3349
3350	switch (cmd) {
3351		/*
3352		 *	These ioctl calls:
3353		 *	- can be done by all.
3354		 *	- atomic and do not require locking.
3355		 *	- return a value
3356		 */
3357		case SIOCGIFFLAGS:
3358		case SIOCGIFMETRIC:
3359		case SIOCGIFMTU:
3360		case SIOCGIFHWADDR:
3361		case SIOCGIFSLAVE:
3362		case SIOCGIFMAP:
3363		case SIOCGIFINDEX:
3364		case SIOCGIFTXQLEN:
3365			dev_load(net, ifr.ifr_name);
3366			read_lock(&dev_base_lock);
3367			ret = dev_ifsioc_locked(net, &ifr, cmd);
3368			read_unlock(&dev_base_lock);
3369			if (!ret) {
3370				if (colon)
3371					*colon = ':';
3372				if (copy_to_user(arg, &ifr,
3373						 sizeof(struct ifreq)))
3374					ret = -EFAULT;
3375			}
3376			return ret;
3377
3378		case SIOCETHTOOL:
3379			dev_load(net, ifr.ifr_name);
3380			rtnl_lock();
3381			ret = dev_ethtool(net, &ifr);
3382			rtnl_unlock();
3383			if (!ret) {
3384				if (colon)
3385					*colon = ':';
3386				if (copy_to_user(arg, &ifr,
3387						 sizeof(struct ifreq)))
3388					ret = -EFAULT;
3389			}
3390			return ret;
3391
3392		/*
3393		 *	These ioctl calls:
3394		 *	- require superuser power.
3395		 *	- require strict serialization.
3396		 *	- return a value
3397		 */
3398		case SIOCGMIIPHY:
3399		case SIOCGMIIREG:
3400		case SIOCSIFNAME:
3401			if (!capable(CAP_NET_ADMIN))
3402				return -EPERM;
3403			dev_load(net, ifr.ifr_name);
3404			rtnl_lock();
3405			ret = dev_ifsioc(net, &ifr, cmd);
3406			rtnl_unlock();
3407			if (!ret) {
3408				if (colon)
3409					*colon = ':';
3410				if (copy_to_user(arg, &ifr,
3411						 sizeof(struct ifreq)))
3412					ret = -EFAULT;
3413			}
3414			return ret;
3415
3416		/*
3417		 *	These ioctl calls:
3418		 *	- require superuser power.
3419		 *	- require strict serialization.
3420		 *	- do not return a value
3421		 */
3422		case SIOCSIFFLAGS:
3423		case SIOCSIFMETRIC:
3424		case SIOCSIFMTU:
3425		case SIOCSIFMAP:
3426		case SIOCSIFHWADDR:
3427		case SIOCSIFSLAVE:
3428		case SIOCADDMULTI:
3429		case SIOCDELMULTI:
3430		case SIOCSIFHWBROADCAST:
3431		case SIOCSIFTXQLEN:
3432		case SIOCSMIIREG:
3433		case SIOCBONDENSLAVE:
3434		case SIOCBONDRELEASE:
3435		case SIOCBONDSETHWADDR:
3436		case SIOCBONDCHANGEACTIVE:
3437		case SIOCBRADDIF:
3438		case SIOCBRDELIF:
3439			if (!capable(CAP_NET_ADMIN))
3440				return -EPERM;
3441			/* fall through */
3442		case SIOCBONDSLAVEINFOQUERY:
3443		case SIOCBONDINFOQUERY:
3444			dev_load(net, ifr.ifr_name);
3445			rtnl_lock();
3446			ret = dev_ifsioc(net, &ifr, cmd);
3447			rtnl_unlock();
3448			return ret;
3449
3450		case SIOCGIFMEM:
3451			/* Get the per device memory space. We can add this but
3452			 * currently do not support it */
3453		case SIOCSIFMEM:
3454			/* Set the per device memory buffer space.
3455			 * Not applicable in our case */
3456		case SIOCSIFLINK:
3457			return -EINVAL;
3458
3459		/*
3460		 *	Unknown or private ioctl.
3461		 */
3462		default:
3463			if (cmd == SIOCWANDEV ||
3464			    (cmd >= SIOCDEVPRIVATE &&
3465			     cmd <= SIOCDEVPRIVATE + 15)) {
3466				dev_load(net, ifr.ifr_name);
3467				rtnl_lock();
3468				ret = dev_ifsioc(net, &ifr, cmd);
3469				rtnl_unlock();
3470				if (!ret && copy_to_user(arg, &ifr,
3471							 sizeof(struct ifreq)))
3472					ret = -EFAULT;
3473				return ret;
3474			}
3475			/* Take care of Wireless Extensions */
3476			if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
3477				return wext_handle_ioctl(net, &ifr, cmd, arg);
3478			return -EINVAL;
3479	}
3480}
3481
3482
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Advance a file-wide counter until it lands on a positive value that
 *	no device in @net currently uses, and return that value. The counter
 *	restarts at 1 once it stops being positive. The caller must hold the
 *	rtnl semaphore or the dev_base_lock so the value stays unique.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	do {
		if (++ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
3501
/* Delayed registration/unregistration: devices are queued on net_todo_list
 * by net_set_todo() and finished off by netdev_run_todo().
 * net_todo_list_lock guards the list itself.
 */
static DEFINE_SPINLOCK(net_todo_list_lock);
static struct list_head net_todo_list = LIST_HEAD_INIT(net_todo_list);
3505
/* Append @dev to the tail of net_todo_list, under net_todo_list_lock. */
static void net_set_todo(struct net_device *dev)
{
	spin_lock(&net_todo_list_lock);
	list_add_tail(&dev->todo_list, &net_todo_list);
	spin_unlock(&net_todo_list_lock);
}
3512
/*
 *	Undo a successful registration of @dev: close it, unlink it, notify
 *	the protocols and tear down its kobject. Must not be called during
 *	the boot phase and requires the rtnl semaphore (both are asserted).
 *	A device that never got registered only triggers a warning. On
 *	return the device is left in NETREG_UNREGISTERING state.
 */
static void rollback_registered(struct net_device *dev)
{
	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	/* Some devices call without registering for initialization unwind. */
	if (dev->reg_state == NETREG_UNINITIALIZED) {
		printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
				  "was registered\n", dev->name, dev);

		WARN_ON(1);
		return;
	}

	/* Anything other than a fully registered device is a bug here. */
	BUG_ON(dev->reg_state != NETREG_REGISTERED);

	/* If device is running, close it first. */
	dev_close(dev);

	/* And unlink it from device chain. */
	unlist_netdevice(dev);

	dev->reg_state = NETREG_UNREGISTERING;

	/* Wait (via synchronize_net) before tearing down further. */
	synchronize_net();

	/* Shutdown queueing discipline. */
	dev_shutdown(dev);


	/* Notify protocols, that we are about to destroy
	   this device. They should clean all the things.
	*/
	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

	/*
	 *	Flush the unicast and multicast chains
	 */
	dev_addr_discard(dev);

	/* Give the driver its optional uninit callback. */
	if (dev->uninit)
		dev->uninit(dev);

	/* Notifier chain MUST detach us from master device. */
	BUG_TRAP(!dev->master);

	/* Remove entries from kobject tree */
	netdev_unregister_kobject(dev);

	synchronize_net();

	/* Presumably pairs with the dev_hold() done in register_netdevice(). */
	dev_put(dev);
}
3566
3567/**
3568 *	register_netdevice	- register a network device
3569 *	@dev: device to register
3570 *
3571 *	Take a completed network device structure and add it to the kernel
3572 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
3573 *	chain. 0 is returned on success. A negative errno code is returned
3574 *	on a failure to set up the device, or if the name is a duplicate.
3575 *
3576 *	Callers must hold the rtnl semaphore. You may want
3577 *	register_netdev() instead of this.
3578 *
3579 *	BUGS:
3580 *	The locking appears insufficient to guarantee two parallel registers
3581 *	will not get the same name.
3582 */
3583
3584int register_netdevice(struct net_device *dev)
3585{
3586	struct hlist_head *head;
3587	struct hlist_node *p;
3588	int ret;
3589	struct net *net;
3590
3591	BUG_ON(dev_boot_phase);
3592	ASSERT_RTNL();
3593
3594	might_sleep();
3595
3596	/* When net_device's are persistent, this will be fatal. */
3597	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
3598	BUG_ON(!dev->nd_net);
3599	net = dev->nd_net;
3600
3601	spin_lock_init(&dev->queue_lock);
3602	spin_lock_init(&dev->_xmit_lock);
3603	netdev_set_lockdep_class(&dev->_xmit_lock, dev->type);
3604	dev->xmit_lock_owner = -1;
3605	spin_lock_init(&dev->ingress_lock);
3606
3607	dev->iflink = -1;
3608
3609	/* Init, if this function is available */
3610	if (dev->init) {
3611		ret = dev->init(dev);
3612		if (ret) {
3613			if (ret > 0)
3614				ret = -EIO;
3615			goto out;
3616		}
3617	}
3618
3619	if (!dev_valid_name(dev->name)) {
3620		ret = -EINVAL;
3621		goto err_uninit;
3622	}
3623
3624	dev->ifindex = dev_new_index(net);
3625	if (dev->iflink == -1)
3626		dev->iflink = dev->ifindex;
3627
3628	/* Check for existence of name */
3629	head = dev_name_hash(net, dev->name);
3630	hlist_for_each(p, head) {
3631		struct net_device *d
3632			= hlist_entry(p, struct net_device, name_hlist);
3633		if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
3634			ret = -EEXIST;
3635			goto err_uninit;
3636		}
3637	}
3638
3639	/* Fix illegal checksum combinations */
3640	if ((dev->features & NETIF_F_HW_CSUM) &&
3641	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
3642		printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
3643		       dev->name);
3644		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
3645	}
3646
3647	if ((dev->features & NETIF_F_NO_CSUM) &&
3648	    (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
3649		printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
3650		       dev->name);
3651		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
3652	}
3653
3654
3655	/* Fix illegal SG+CSUM combinations. */
3656	if ((dev->features & NETIF_F_SG) &&
3657	    !(dev->features & NETIF_F_ALL_CSUM)) {
3658		printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no checksum feature.\n",
3659		       dev->name);
3660		dev->features &= ~NETIF_F_SG;
3661	}
3662
3663	/* TSO requires that SG is present as well. */
3664	if ((dev->features & NETIF_F_TSO) &&
3665	    !(dev->features & NETIF_F_SG)) {
3666		printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no SG feature.\n",
3667		       dev->name);
3668		dev->features &= ~NETIF_F_TSO;
3669	}
3670	if (dev->features & NETIF_F_UFO) {
3671		if (!(dev->features & NETIF_F_HW_CSUM)) {
3672			printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
3673					"NETIF_F_HW_CSUM feature.\n",
3674							dev->name);
3675			dev->features &= ~NETIF_F_UFO;
3676		}
3677		if (!(dev->features & NETIF_F_SG)) {
3678			printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
3679					"NETIF_F_SG feature.\n",
3680					dev->name);
3681			dev->features &= ~NETIF_F_UFO;
3682		}
3683	}
3684
3685	ret = netdev_register_kobject(dev);
3686	if (ret)
3687		goto err_uninit;
3688	dev->reg_state = NETREG_REGISTERED;
3689
3690	/*
3691	 *	Default initial state at registry is that the
3692	 *	device is present.
3693	 */
3694
3695	set_bit(__LINK_STATE_PRESENT, &dev->state);
3696
3697	dev_init_scheduler(dev);
3698	dev_hold(dev);
3699	list_netdevice(dev);
3700
3701	/* Notify protocols, that a new device appeared. */
3702	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
3703	ret = notifier_to_errno(ret);
3704	if (ret) {
3705		rollback_registered(dev);
3706		dev->reg_state = NETREG_UNREGISTERED;
3707	}
3708
3709out:
3710	return ret;
3711
3712err_uninit:
3713	if (dev->uninit)
3714		dev->uninit(dev);
3715	goto out;
3716}
3717
3718/**
3719 *	register_netdev	- register a network device
3720 *	@dev: device to register
3721 *
3722 *	Take a completed network device structure and add it to the kernel
3723 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
3724 *	chain. 0 is returned on success. A negative errno code is returned
3725 *	on a failure to set up the device, or if the name is a duplicate.
3726 *
3727 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
3728 *	and expands the device name if you passed a format string to
3729 *	alloc_netdev.
3730 */
3731int register_netdev(struct net_device *dev)
3732{
3733	int err;
3734
3735	rtnl_lock();
3736
3737	/*
3738	 * If the name is a format string the caller wants us to do a
3739	 * name allocation.
3740	 */
3741	if (strchr(dev->name, '%')) {
3742		err = dev_alloc_name(dev, dev->name);
3743		if (err < 0)
3744			goto out;
3745	}
3746
3747	err = register_netdevice(dev);
3748out:
3749	rtnl_unlock();
3750	return err;
3751}
3752EXPORT_SYMBOL(register_netdev);
3753
3754/*
3755 * netdev_wait_allrefs - wait until all references are gone.
3756 *
3757 * This is called when unregistering network devices.
3758 *
3759 * Any protocol or device that holds a reference should register
3760 * for netdevice notification, and cleanup and put back the
3761 * reference if they receive an UNREGISTER event.
3762 * We can get stuck here if buggy protocols don't correctly
3763 * call dev_put.
3764 */
3765static void netdev_wait_allrefs(struct net_device *dev)
3766{
3767	unsigned long rebroadcast_time, warning_time;
3768
3769	rebroadcast_time = warning_time = jiffies;
3770	while (atomic_read(&dev->refcnt) != 0) {
3771		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
3772			rtnl_lock();
3773
3774			/* Rebroadcast unregister notification */
3775			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
3776
3777			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
3778				     &dev->state)) {
3779				/* We must not have linkwatch events
3780				 * pending on unregister. If this
3781				 * happens, we simply run the queue
3782				 * unscheduled, resulting in a noop
3783				 * for this device.
3784				 */
3785				linkwatch_run_queue();
3786			}
3787
3788			__rtnl_unlock();
3789
3790			rebroadcast_time = jiffies;
3791		}
3792
3793		msleep(250);
3794
3795		if (time_after(jiffies, warning_time + 10 * HZ)) {
3796			printk(KERN_EMERG "unregister_netdevice: "
3797			       "waiting for %s to become free. Usage "
3798			       "count = %d\n",
3799			       dev->name, atomic_read(&dev->refcnt));
3800			warning_time = jiffies;
3801		}
3802	}
3803}
3804
/* The sequence is:
 *
 *	rtnl_lock();
 *	...
 *	register_netdevice(x1);
 *	register_netdevice(x2);
 *	...
 *	unregister_netdevice(y1);
 *	unregister_netdevice(y2);
 *      ...
 *	rtnl_unlock();
 *	free_netdev(y1);
 *	free_netdev(y2);
 *
 * We are invoked by rtnl_unlock() after it drops the semaphore.
 * This allows us to deal with problems:
 * 1) We can delete sysfs objects which invoke hotplug
 *    without deadlocking with linkwatch via keventd.
 * 2) Since we run with the RTNL semaphore not held, we can sleep
 *    safely in order to wait for the netdev refcnt to drop to zero.
 *
 * Each device taken off the todo list is moved to NETREG_UNREGISTERED,
 * waited on until its refcnt reaches zero, handed to its optional
 * destructor and finally has its kobject reference dropped.
 */
static DEFINE_MUTEX(net_todo_run_mutex);
void netdev_run_todo(void)
{
	struct list_head list;

	/* Need to guard against multiple cpu's getting out of order. */
	mutex_lock(&net_todo_run_mutex);

	/* Not safe to do outside the semaphore.  We must not return
	 * until all unregister events invoked by the local processor
	 * have been completed (either by this todo run, or one on
	 * another cpu).
	 */
	if (list_empty(&net_todo_list))
		goto out;

	/* Snapshot list, allow later requests */
	spin_lock(&net_todo_list_lock);
	list_replace_init(&net_todo_list, &list);
	spin_unlock(&net_todo_list_lock);

	/* Work through the private snapshot one device at a time. */
	while (!list_empty(&list)) {
		struct net_device *dev
			= list_entry(list.next, struct net_device, todo_list);
		list_del(&dev->todo_list);

		/* Only devices in NETREG_UNREGISTERING state are expected. */
		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
			printk(KERN_ERR "network todo '%s' but state %d\n",
			       dev->name, dev->reg_state);
			dump_stack();
			continue;
		}

		dev->reg_state = NETREG_UNREGISTERED;

		/* May sleep until every reference has been dropped. */
		netdev_wait_allrefs(dev);

		/* paranoia */
		BUG_ON(atomic_read(&dev->refcnt));
		BUG_TRAP(!dev->ip_ptr);
		BUG_TRAP(!dev->ip6_ptr);
		BUG_TRAP(!dev->dn_ptr);

		if (dev->destructor)
			dev->destructor(dev);

		/* Free network device */
		kobject_put(&dev->dev.kobj);
	}

out:
	mutex_unlock(&net_todo_run_mutex);
}
3879
/* get_stats handler installed by alloc_netdev_mq(): returns the statistics
 * block embedded in @dev itself.
 */
static struct net_device_stats *internal_stats(struct net_device *dev)
{
	return &dev->stats;
}
3884
3885/**
3886 *	alloc_netdev_mq - allocate network device
3887 *	@sizeof_priv:	size of private data to allocate space for
3888 *	@name:		device name format string
3889 *	@setup:		callback to initialize device
3890 *	@queue_count:	the number of subqueues to allocate
3891 *
3892 *	Allocates a struct net_device with private data area for driver use
3893 *	and performs basic initialization.  Also allocates subquue structs
3894 *	for each queue on the device at the end of the netdevice.
3895 */
3896struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
3897		void (*setup)(struct net_device *), unsigned int queue_count)
3898{
3899	void *p;
3900	struct net_device *dev;
3901	int alloc_size;
3902
3903	BUG_ON(strlen(name) >= sizeof(dev->name));
3904
3905	/* ensure 32-byte alignment of both the device and private area */
3906	alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST +
3907		     (sizeof(struct net_device_subqueue) * (queue_count - 1))) &
3908		     ~NETDEV_ALIGN_CONST;
3909	alloc_size += sizeof_priv + NETDEV_ALIGN_CONST;
3910
3911	p = kzalloc(alloc_size, GFP_KERNEL);
3912	if (!p) {
3913		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
3914		return NULL;
3915	}
3916
3917	dev = (struct net_device *)
3918		(((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
3919	dev->padded = (char *)dev - (char *)p;
3920	dev->nd_net = &init_net;
3921
3922	if (sizeof_priv) {
3923		dev->priv = ((char *)dev +
3924			     ((sizeof(struct net_device) +
3925			       (sizeof(struct net_device_subqueue) *
3926				(queue_count - 1)) + NETDEV_ALIGN_CONST)
3927			      & ~NETDEV_ALIGN_CONST));
3928	}
3929
3930	dev->egress_subqueue_count = queue_count;
3931
3932	dev->get_stats = internal_stats;
3933	netpoll_netdev_init(dev);
3934	setup(dev);
3935	strcpy(dev->name, name);
3936	return dev;
3937}
3938EXPORT_SYMBOL(alloc_netdev_mq);
3939
3940/**
3941 *	free_netdev - free network device
3942 *	@dev: device
3943 *
3944 *	This function does the last stage of destroying an allocated device
3945 * 	interface. The reference to the device object is released.
3946 *	If this is the last reference then it will be freed.
3947 */
3948void free_netdev(struct net_device *dev)
3949{
3950	/*  Compatibility with error handling in drivers */
3951	if (dev->reg_state == NETREG_UNINITIALIZED) {
3952		kfree((char *)dev - dev->padded);
3953		return;
3954	}
3955
3956	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
3957	dev->reg_state = NETREG_RELEASED;
3958
3959	/* will free via device release */
3960	put_device(&dev->dev);
3961}
3962
/* Synchronize with packet receive processing. May sleep: this is a
 * might_sleep() annotation followed by synchronize_rcu().
 */
void synchronize_net(void)
{
	might_sleep();
	synchronize_rcu();
}
3969
/**
 *	unregister_netdevice - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables. It does not return a value; the
 *	remaining teardown is deferred by putting the device on the
 *	todo list.
 *
 *	Callers must hold the rtnl semaphore.  You may want
 *	unregister_netdev() instead of this.
 */

void unregister_netdevice(struct net_device *dev)
{
	rollback_registered(dev);
	/* Finish processing unregister after unlock */
	net_set_todo(dev);
}
3988
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables. It does not return a value.
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore.  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}

EXPORT_SYMBOL(unregister_netdev);
4009
/**
 *	dev_change_net_namespace - move device to different network namespace
 *	@dev: device
 *	@net: network namespace
 *	@pat: If not NULL name pattern to try if the current device name
 *	      is already taken in the destination network namespace.
 *
 *	This function shuts down a device interface and moves it
 *	to a new network namespace. On success 0 is returned, on
 *	a failure a negative errno code is returned: -EINVAL for a
 *	namespace-local or unregistered device, -EEXIST when no usable
 *	name can be found in the destination namespace.
 *
 *	Callers must hold the rtnl semaphore.
 */

int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
	char buf[IFNAMSIZ];
	const char *destname;
	int err;

	ASSERT_RTNL();

	/* Don't allow namespace local devices to be moved. */
	err = -EINVAL;
	if (dev->features & NETIF_F_NETNS_LOCAL)
		goto out;

	/* Ensure the device has been registered */
	err = -EINVAL;
	if (dev->reg_state != NETREG_REGISTERED)
		goto out;

	/* Get out if there is nothing to do */
	err = 0;
	if (dev->nd_net == net)
		goto out;

	/* Pick the destination device name, and ensure
	 * we can use it in the destination network namespace.
	 */
	err = -EEXIST;
	destname = dev->name;
	if (__dev_get_by_name(net, destname)) {
		/* We get here if we can't use the current device name */
		if (!pat)
			goto out;
		if (!dev_valid_name(pat))
			goto out;
		/* A '%' in @pat asks for a name to be allocated into buf. */
		if (strchr(pat, '%')) {
			if (__dev_alloc_name(net, pat, buf) < 0)
				goto out;
			destname = buf;
		} else
			destname = pat;
		/* The replacement name must be free as well. */
		if (__dev_get_by_name(net, destname))
			goto out;
	}

	/*
	 * And now a mini version of register_netdevice unregister_netdevice.
	 */

	/* If device is running close it first. */
	dev_close(dev);

	/* And unlink it from device chain */
	err = -ENODEV;
	unlist_netdevice(dev);

	synchronize_net();

	/* Shutdown queueing discipline. */
	dev_shutdown(dev);

	/* Notify protocols, that we are about to destroy
	   this device. They should clean all the things.
	*/
	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

	/*
	 *	Flush the unicast and multicast chains
	 */
	dev_addr_discard(dev);

	/* Actually switch the network namespace */
	dev->nd_net = net;

	/* Assign the new device name */
	if (destname != dev->name)
		strcpy(dev->name, destname);

	/* If there is an ifindex conflict assign a new one */
	if (__dev_get_by_index(net, dev->ifindex)) {
		/* Keep iflink following ifindex if the two were equal before. */
		int iflink = (dev->iflink == dev->ifindex);
		dev->ifindex = dev_new_index(net);
		if (iflink)
			dev->iflink = dev->ifindex;
	}

	/* Fixup kobjects */
	/* A rename failure is only warned about; the move carries on. */
	err = device_rename(&dev->dev, dev->name);
	WARN_ON(err);

	/* Add the device back in the hashes */
	list_netdevice(dev);

	/* Notify protocols, that a new device appeared. */
	call_netdevice_notifiers(NETDEV_REGISTER, dev);

	synchronize_net();
	err = 0;
out:
	return err;
}
4124
4125static int dev_cpu_callback(struct notifier_block *nfb,
4126			    unsigned long action,
4127			    void *ocpu)
4128{
4129	struct sk_buff **list_skb;
4130	struct net_device **list_net;
4131	struct sk_buff *skb;
4132	unsigned int cpu, oldcpu = (unsigned long)ocpu;
4133	struct softnet_data *sd, *oldsd;
4134
4135	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
4136		return NOTIFY_OK;
4137
4138	local_irq_disable();
4139	cpu = smp_processor_id();
4140	sd = &per_cpu(softnet_data, cpu);
4141	oldsd = &per_cpu(softnet_data, oldcpu);
4142
4143	/* Find end of our completion_queue. */
4144	list_skb = &sd->completion_queue;
4145	while (*list_skb)
4146		list_skb = &(*list_skb)->next;
4147	/* Append completion queue from offline CPU. */
4148	*list_skb = oldsd->completion_queue;
4149	oldsd->completion_queue = NULL;
4150
4151	/* Find end of our output_queue. */
4152	list_net = &sd->output_queue;
4153	while (*list_net)
4154		list_net = &(*list_net)->next_sched;
4155	/* Append output queue from offline CPU. */
4156	*list_net = oldsd->output_queue;
4157	oldsd->output_queue = NULL;
4158
4159	raise_softirq_irqoff(NET_TX_SOFTIRQ);
4160	local_irq_enable();
4161
4162	/* Process offline CPU's input_pkt_queue */
4163	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
4164		netif_rx(skb);
4165
4166	return NOTIFY_OK;
4167}
4168
4169#ifdef CONFIG_NET_DMA
4170/**
4171 * net_dma_rebalance - try to maintain one DMA channel per CPU
4172 * @net_dma: DMA client and associated data (lock, channels, channel_mask)
4173 *
4174 * This is called when the number of channels allocated to the net_dma client
4175 * changes.  The net_dma client tries to have one DMA channel per CPU.
4176 */
4177
4178static void net_dma_rebalance(struct net_dma *net_dma)
4179{
4180	unsigned int cpu, i, n, chan_idx;
4181	struct dma_chan *chan;
4182
4183	if (cpus_empty(net_dma->channel_mask)) {
4184		for_each_online_cpu(cpu)
4185			rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma, NULL);
4186		return;
4187	}
4188
4189	i = 0;
4190	cpu = first_cpu(cpu_online_map);
4191
4192	for_each_cpu_mask(chan_idx, net_dma->channel_mask) {
4193		chan = net_dma->channels[chan_idx];
4194
4195		n = ((num_online_cpus() / cpus_weight(net_dma->channel_mask))
4196		   + (i < (num_online_cpus() %
4197			cpus_weight(net_dma->channel_mask)) ? 1 : 0));
4198
4199		while(n) {
4200			per_cpu(softnet_data, cpu).net_dma = chan;
4201			cpu = next_cpu(cpu, cpu_online_map);
4202			n--;
4203		}
4204		i++;
4205	}
4206}
4207
4208/**
4209 * netdev_dma_event - event callback for the net_dma_client
4210 * @client: should always be net_dma_client
4211 * @chan: DMA channel for the event
4212 * @state: DMA state to be handled
4213 */
4214static enum dma_state_client
4215netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
4216	enum dma_state state)
4217{
4218	int i, found = 0, pos = -1;
4219	struct net_dma *net_dma =
4220		container_of(client, struct net_dma, client);
4221	enum dma_state_client ack = DMA_DUP; /* default: take no action */
4222
4223	spin_lock(&net_dma->lock);
4224	switch (state) {
4225	case DMA_RESOURCE_AVAILABLE:
4226		for (i = 0; i < NR_CPUS; i++)
4227			if (net_dma->channels[i] == chan) {
4228				found = 1;
4229				break;
4230			} else if (net_dma->channels[i] == NULL && pos < 0)
4231				pos = i;
4232
4233		if (!found && pos >= 0) {
4234			ack = DMA_ACK;
4235			net_dma->channels[pos] = chan;
4236			cpu_set(pos, net_dma->channel_mask);
4237			net_dma_rebalance(net_dma);
4238		}
4239		break;
4240	case DMA_RESOURCE_REMOVED:
4241		for (i = 0; i < NR_CPUS; i++)
4242			if (net_dma->channels[i] == chan) {
4243				found = 1;
4244				pos = i;
4245				break;
4246			}
4247
4248		if (found) {
4249			ack = DMA_ACK;
4250			cpu_clear(pos, net_dma->channel_mask);
4251			net_dma->channels[i] = NULL;
4252			net_dma_rebalance(net_dma);
4253		}
4254		break;
4255	default:
4256		break;
4257	}
4258	spin_unlock(&net_dma->lock);
4259
4260	return ack;
4261}
4262
/**
 * netdev_dma_register - register the networking subsystem as a DMA client
 *
 * Initialises the net_dma lock, sets the DMA_MEMCPY capability on the
 * client, registers it and requests channels. Always returns 0.
 */
static int __init netdev_dma_register(void)
{
	spin_lock_init(&net_dma.lock);
	dma_cap_set(DMA_MEMCPY, net_dma.client.cap_mask);
	dma_async_client_register(&net_dma.client);
	dma_async_client_chan_request(&net_dma.client);
	return 0;
}
4274
4275#else
/* Stub used when CONFIG_NET_DMA is not set: nothing to register. */
static int __init netdev_dma_register(void) { return -ENODEV; }
4277#endif /* CONFIG_NET_DMA */
4278
4279/**
4280 *	netdev_compute_feature - compute conjunction of two feature sets
4281 *	@all: first feature set
4282 *	@one: second feature set
4283 *
4284 *	Computes a new feature set after adding a device with feature set
4285 *	@one to the master device with current feature set @all.  Returns
4286 *	the new feature set.
4287 */
4288int netdev_compute_features(unsigned long all, unsigned long one)
4289{
4290	/* if device needs checksumming, downgrade to hw checksumming */
4291	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
4292		all ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM;
4293
4294	/* if device can't do all checksum, downgrade to ipv4/ipv6 */
4295	if (all & NETIF_F_HW_CSUM && !(one & NETIF_F_HW_CSUM))
4296		all ^= NETIF_F_HW_CSUM
4297			| NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
4298
4299	if (one & NETIF_F_GSO)
4300		one |= NETIF_F_GSO_SOFTWARE;
4301	one |= NETIF_F_GSO;
4302
4303	/* If even one device supports robust GSO, enable it for all. */
4304	if (one & NETIF_F_GSO_ROBUST)
4305		all |= NETIF_F_GSO_ROBUST;
4306
4307	all &= one | NETIF_F_LLTX;
4308
4309	if (!(all & NETIF_F_ALL_CSUM))
4310		all &= ~NETIF_F_SG;
4311	if (!(all & NETIF_F_SG))
4312		all &= ~NETIF_F_GSO_MASK;
4313
4314	return all;
4315}
4316EXPORT_SYMBOL(netdev_compute_features);
4317
4318static struct hlist_head *netdev_create_hash(void)
4319{
4320	int i;
4321	struct hlist_head *hash;
4322
4323	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
4324	if (hash != NULL)
4325		for (i = 0; i < NETDEV_HASHENTRIES; i++)
4326			INIT_HLIST_HEAD(&hash[i]);
4327
4328	return hash;
4329}
4330
4331/* Initialize per network namespace state */
4332static int __net_init netdev_init(struct net *net)
4333{
4334	INIT_LIST_HEAD(&net->dev_base_head);
4335
4336	net->dev_name_head = netdev_create_hash();
4337	if (net->dev_name_head == NULL)
4338		goto err_name;
4339
4340	net->dev_index_head = netdev_create_hash();
4341	if (net->dev_index_head == NULL)
4342		goto err_idx;
4343
4344	return 0;
4345
4346err_idx:
4347	kfree(net->dev_name_head);
4348err_name:
4349	return -ENOMEM;
4350}
4351
/* Release the per network namespace name and index hash tables. */
static void __net_exit netdev_exit(struct net *net)
{
	kfree(net->dev_name_head);
	kfree(net->dev_index_head);
}
4357
/* Per-namespace hooks; registered from net_dev_init() as a pernet subsys. */
static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};
4362
/*
 * Namespace exit hook: under the rtnl lock, move every device of @net
 * that may be moved back to init_net; a device that cannot be moved is
 * unregistered instead.
 */
static void __net_exit default_device_exit(struct net *net)
{
	struct net_device *dev, *next;
	/*
	 * Push all migratable network devices back to the
	 * initial network namespace
	 */
	rtnl_lock();
	for_each_netdev_safe(net, dev, next) {
		int err;

		/* Ignore unmoveable devices (i.e. loopback) */
		if (dev->features & NETIF_F_NETNS_LOCAL)
			continue;

		/* Push remaining network devices to init_net */
		/* "dev%d" is the fallback name pattern on a name clash. */
		err = dev_change_net_namespace(dev, &init_net, "dev%d");
		if (err) {
			printk(KERN_WARNING "%s: failed to move %s to init_net: %d\n",
				__func__, dev->name, err);
			unregister_netdevice(dev);
		}
	}
	rtnl_unlock();
}
4388
/* Exit-only hooks; registered from net_dev_init() as a pernet device. */
static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
};
4392
/*
 *	Initialize the DEV module. At boot time this walks the device list and
 *	unhooks any devices that fail to initialise (normally hardware not
 *	present) and leaves us with a valid list of present and active devices.
 *
 */

/*
 *       This is called single threaded during boot, so no need
 *       to take the rtnl semaphore.
 *
 *       Returns 0 on success, or -ENOMEM if one of the early
 *       initialisation steps fails.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	/* Must run exactly once, while still in the boot phase. */
	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	/* Start with empty packet type lists. */
	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < 16; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	/*
	 *	Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct softnet_data *queue;

		queue = &per_cpu(softnet_data, i);
		skb_queue_head_init(&queue->input_pkt_queue);
		queue->completion_queue = NULL;
		INIT_LIST_HEAD(&queue->poll_list);

		queue->backlog.poll = process_backlog;
		queue->backlog.weight = weight_p;
	}

	/* Return value deliberately not checked here. */
	netdev_dma_register();

	/* From here on register_netdevice() and friends may be used. */
	dev_boot_phase = 0;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);

	hotcpu_notifier(dev_cpu_callback, 0);
	dst_init();
	dev_mcast_init();
	rc = 0;
out:
	return rc;
}

subsys_initcall(net_dev_init);
4458
/* Symbols of this file made available to modules. */
EXPORT_SYMBOL(__dev_get_by_index);
EXPORT_SYMBOL(__dev_get_by_name);
EXPORT_SYMBOL(__dev_remove_pack);
EXPORT_SYMBOL(dev_valid_name);
EXPORT_SYMBOL(dev_add_pack);
EXPORT_SYMBOL(dev_alloc_name);
EXPORT_SYMBOL(dev_close);
EXPORT_SYMBOL(dev_get_by_flags);
EXPORT_SYMBOL(dev_get_by_index);
EXPORT_SYMBOL(dev_get_by_name);
EXPORT_SYMBOL(dev_open);
EXPORT_SYMBOL(dev_queue_xmit);
EXPORT_SYMBOL(dev_remove_pack);
EXPORT_SYMBOL(dev_set_allmulti);
EXPORT_SYMBOL(dev_set_promiscuity);
EXPORT_SYMBOL(dev_change_flags);
EXPORT_SYMBOL(dev_set_mtu);
EXPORT_SYMBOL(dev_set_mac_address);
EXPORT_SYMBOL(free_netdev);
EXPORT_SYMBOL(netdev_boot_setup_check);
EXPORT_SYMBOL(netdev_set_master);
EXPORT_SYMBOL(netdev_state_change);
EXPORT_SYMBOL(netif_receive_skb);
EXPORT_SYMBOL(netif_rx);
EXPORT_SYMBOL(register_gifconf);
EXPORT_SYMBOL(register_netdevice);
EXPORT_SYMBOL(register_netdevice_notifier);
EXPORT_SYMBOL(skb_checksum_help);
EXPORT_SYMBOL(synchronize_net);
EXPORT_SYMBOL(unregister_netdevice);
EXPORT_SYMBOL(unregister_netdevice_notifier);
EXPORT_SYMBOL(net_enable_timestamp);
EXPORT_SYMBOL(net_disable_timestamp);
EXPORT_SYMBOL(dev_get_flags);

/* Bridge hooks are only exported when bridging is built in or modular. */
#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
EXPORT_SYMBOL(br_handle_frame_hook);
EXPORT_SYMBOL(br_fdb_get_hook);
EXPORT_SYMBOL(br_fdb_put_hook);
#endif

/* dev_load is only exported when CONFIG_KMOD is set. */
#ifdef CONFIG_KMOD
EXPORT_SYMBOL(dev_load);
#endif

EXPORT_PER_CPU_SYMBOL(softnet_data);