net/core/dev.c at v2.6.27-rc4 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / core / dev.c
at v2.6.27-rc4 4889 lines 121 kB view raw
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <asm/system.h>
  77#include <linux/bitops.h>
  78#include <linux/capability.h>
  79#include <linux/cpu.h>
  80#include <linux/types.h>
  81#include <linux/kernel.h>
  82#include <linux/sched.h>
  83#include <linux/mutex.h>
  84#include <linux/string.h>
  85#include <linux/mm.h>
  86#include <linux/socket.h>
  87#include <linux/sockios.h>
  88#include <linux/errno.h>
  89#include <linux/interrupt.h>
  90#include <linux/if_ether.h>
  91#include <linux/netdevice.h>
  92#include <linux/etherdevice.h>
  93#include <linux/ethtool.h>
  94#include <linux/notifier.h>
  95#include <linux/skbuff.h>
  96#include <net/net_namespace.h>
  97#include <net/sock.h>
  98#include <linux/rtnetlink.h>
  99#include <linux/proc_fs.h>
 100#include <linux/seq_file.h>
 101#include <linux/stat.h>
 102#include <linux/if_bridge.h>
 103#include <linux/if_macvlan.h>
 104#include <net/dst.h>
 105#include <net/pkt_sched.h>
 106#include <net/checksum.h>
 107#include <linux/highmem.h>
 108#include <linux/init.h>
 109#include <linux/kmod.h>
 110#include <linux/module.h>
 111#include <linux/kallsyms.h>
 112#include <linux/netpoll.h>
 113#include <linux/rcupdate.h>
 114#include <linux/delay.h>
 115#include <net/wext.h>
 116#include <net/iw_handler.h>
 117#include <asm/current.h>
 118#include <linux/audit.h>
 119#include <linux/dmaengine.h>
 120#include <linux/err.h>
 121#include <linux/ctype.h>
 122#include <linux/if_arp.h>
 123#include <linux/if_vlan.h>
 124#include <linux/ip.h>
 125#include <linux/ipv6.h>
 126#include <linux/in.h>
 127#include <linux/jhash.h>
 128#include <linux/random.h>
 129
 130#include "net-sysfs.h"
 131
 132/*
 133 *	The list of packet types we will receive (as opposed to discard)
 134 *	and the routines to invoke.
 135 *
 136 *	Why 16. Because with 16 the only overlap we get on a hash of the
 137 *	low nibble of the protocol value is RARP/SNAP/X.25.
 138 *
 139 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 140 *             sure which should go first, but I bet it won't make much
 141 *             difference if we are running VLANs.  The good news is that
 142 *             this protocol won't be in the list unless compiled in, so
 143 *             the average user (w/out VLANs) will not be adversely affected.
 144 *             --BLG
 145 *
 146 *		0800	IP
 147 *		8100    802.1Q VLAN
 148 *		0001	802.3
 149 *		0002	AX.25
 150 *		0004	802.2
 151 *		8035	RARP
 152 *		0005	SNAP
 153 *		0805	X.25
 154 *		0806	ARP
 155 *		8137	IPX
 156 *		0009	Localtalk
 157 *		86DD	IPv6
 158 */
 159
/* Number of buckets in the protocol handler hash table. */
#define PTYPE_HASH_SIZE	(16)
/* Mask applied to the host-order protocol number to select a bucket. */
#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)

/* Held by dev_add_pack()/__dev_remove_pack() while they edit the lists below. */
static DEFINE_SPINLOCK(ptype_lock);
/* Handlers for one specific protocol, indexed by ntohs(type) & PTYPE_HASH_MASK. */
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
/* Handlers registered for ETH_P_ALL. */
static struct list_head ptype_all __read_mostly;	/* Taps */
 166
#ifdef CONFIG_NET_DMA
/* Networking-core DMA client state; only built when CONFIG_NET_DMA is set. */
struct net_dma {
	struct dma_client client;	/* event_callback is wired up in net_dma below */
	spinlock_t lock;
	cpumask_t channel_mask;
	struct dma_chan **channels;
};

/* Forward declaration so the initializer below can reference the callback. */
static enum dma_state_client
netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
	enum dma_state state);

/* The single file-local instance; all other members start out zeroed. */
static struct net_dma net_dma = {
	.client = {
		.event_callback = netdev_dma_event,
	},
};
#endif
 185
 186/*
 187 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 188 * semaphore.
 189 *
 190 * Pure readers hold dev_base_lock for reading.
 191 *
 192 * Writers must hold the rtnl semaphore while they loop through the
 193 * dev_base_head list, and hold dev_base_lock for writing when they do the
 194 * actual updates.  This allows pure readers to access the list even
 195 * while a writer is preparing to update it.
 196 *
 197 * To put it another way, dev_base_lock is held for writing only to
 198 * protect against pure readers; the rtnl semaphore provides the
 199 * protection against other writers.
 200 *
 201 * See, for example usages, register_netdevice() and
 202 * unregister_netdevice(), which must be called with the rtnl
 203 * semaphore held.
 204 */
/* Reader/writer lock for the device lists; see the locking rules above. */
DEFINE_RWLOCK(dev_base_lock);

EXPORT_SYMBOL(dev_base_lock);

/*
 * The name and ifindex hash helpers below reduce their key with a mask of
 * (1 << NETDEV_HASHBITS) - 1, i.e. they address NETDEV_HASHENTRIES buckets.
 */
#define NETDEV_HASHBITS	8
#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
 211
 212static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 213{
 214	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 215	return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
 216}
 217
 218static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 219{
 220	return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
 221}
 222
 223/* Device list insertion */
 224static int list_netdevice(struct net_device *dev)
 225{
 226	struct net *net = dev_net(dev);
 227
 228	ASSERT_RTNL();
 229
 230	write_lock_bh(&dev_base_lock);
 231	list_add_tail(&dev->dev_list, &net->dev_base_head);
 232	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 233	hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
 234	write_unlock_bh(&dev_base_lock);
 235	return 0;
 236}
 237
 238/* Device list removal */
 239static void unlist_netdevice(struct net_device *dev)
 240{
 241	ASSERT_RTNL();
 242
 243	/* Unlink dev from the device chain */
 244	write_lock_bh(&dev_base_lock);
 245	list_del(&dev->dev_list);
 246	hlist_del(&dev->name_hlist);
 247	hlist_del(&dev->index_hlist);
 248	write_unlock_bh(&dev_base_lock);
 249}
 250
/*
 *	Our notifier list (a raw notifier head, private to this file).
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler. One instance exists per CPU.
 */

DEFINE_PER_CPU(struct softnet_data, softnet_data);
 263
 264#ifdef CONFIG_LOCKDEP
 265/*
 266 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 267 * according to dev->type
 268 */
 269static const unsigned short netdev_lock_type[] =
 270	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 271	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 272	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 273	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 274	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 275	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 276	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 277	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 278	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 279	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 280	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 281	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 282	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 283	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_VOID,
 284	 ARPHRD_NONE};
 285
 286static const char *netdev_lock_name[] =
 287	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 288	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 289	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 290	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 291	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 292	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 293	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 294	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 295	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 296	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 297	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 298	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 299	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 300	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_VOID",
 301	 "_xmit_NONE"};
 302
 303static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 304static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 305
 306static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 307{
 308	int i;
 309
 310	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 311		if (netdev_lock_type[i] == dev_type)
 312			return i;
 313	/* the last key is used by default */
 314	return ARRAY_SIZE(netdev_lock_type) - 1;
 315}
 316
 317static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 318						 unsigned short dev_type)
 319{
 320	int i;
 321
 322	i = netdev_lock_pos(dev_type);
 323	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 324				   netdev_lock_name[i]);
 325}
 326
 327static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 328{
 329	int i;
 330
 331	i = netdev_lock_pos(dev->type);
 332	lockdep_set_class_and_name(&dev->addr_list_lock,
 333				   &netdev_addr_lock_key[i],
 334				   netdev_lock_name[i]);
 335}
 336#else
/* Without CONFIG_LOCKDEP there are no lock classes to assign: empty stubs. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
 344#endif
 345
 346/*******************************************************************************
 347
 348		Protocol management and registration routines
 349
 350*******************************************************************************/
 351
 352/*
 353 *	Add a protocol ID to the list. Now that the input handler is
 354 *	smarter we can dispense with all the messy stuff that used to be
 355 *	here.
 356 *
 357 *	BEWARE!!! Protocol handlers, mangling input packets,
 358 *	MUST BE last in hash buckets and checking protocol handlers
 359 *	MUST start from promiscuous ptype_all chain in net_bh.
 360 *	It is true now, do not change it.
 361 *	Explanation follows: if protocol handler, mangling packet, will
 362 *	be the first on list, it is not able to sense, that packet
 363 *	is cloned and should be copied-on-write, so that it will
 364 *	change it and subsequent readers will get broken packet.
 365 *							--ANK (980803)
 366 */
 367
 368/**
 369 *	dev_add_pack - add packet handler
 370 *	@pt: packet type declaration
 371 *
 372 *	Add a protocol handler to the networking stack. The passed &packet_type
 373 *	is linked into kernel lists and may not be freed until it has been
 374 *	removed from the kernel lists.
 375 *
 376 *	This call does not sleep therefore it can not
 377 *	guarantee all CPU's that are in middle of receiving packets
 378 *	will see the new packet type (until the next received packet).
 379 */
 380
 381void dev_add_pack(struct packet_type *pt)
 382{
 383	int hash;
 384
 385	spin_lock_bh(&ptype_lock);
 386	if (pt->type == htons(ETH_P_ALL))
 387		list_add_rcu(&pt->list, &ptype_all);
 388	else {
 389		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
 390		list_add_rcu(&pt->list, &ptype_base[hash]);
 391	}
 392	spin_unlock_bh(&ptype_lock);
 393}
 394
 395/**
 396 *	__dev_remove_pack	 - remove packet handler
 397 *	@pt: packet type declaration
 398 *
 399 *	Remove a protocol handler that was previously added to the kernel
 400 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 401 *	from the kernel lists and can be freed or reused once this function
 402 *	returns.
 403 *
 404 *      The packet type might still be in use by receivers
 405 *	and must not be freed until after all the CPU's have gone
 406 *	through a quiescent state.
 407 */
 408void __dev_remove_pack(struct packet_type *pt)
 409{
 410	struct list_head *head;
 411	struct packet_type *pt1;
 412
 413	spin_lock_bh(&ptype_lock);
 414
 415	if (pt->type == htons(ETH_P_ALL))
 416		head = &ptype_all;
 417	else
 418		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 419
 420	list_for_each_entry(pt1, head, list) {
 421		if (pt == pt1) {
 422			list_del_rcu(&pt->list);
 423			goto out;
 424		}
 425	}
 426
 427	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 428out:
 429	spin_unlock_bh(&ptype_lock);
 430}
 431/**
 432 *	dev_remove_pack	 - remove packet handler
 433 *	@pt: packet type declaration
 434 *
 435 *	Remove a protocol handler that was previously added to the kernel
 436 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 437 *	from the kernel lists and can be freed or reused once this function
 438 *	returns.
 439 *
 440 *	This call sleeps to guarantee that no CPU is looking at the packet
 441 *	type after return.
 442 */
 443void dev_remove_pack(struct packet_type *pt)
 444{
 445	__dev_remove_pack(pt);
 446
 447	synchronize_net();
 448}
 449
 450/******************************************************************************
 451
 452		      Device Boot-time Settings Routines
 453
 454*******************************************************************************/
 455
/*
 * Boot time configuration table.  A slot whose name starts with '\0' or
 * ' ' is treated as unused by the routines below.
 */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 458
 459/**
 460 *	netdev_boot_setup_add	- add new setup entry
 461 *	@name: name of the device
 462 *	@map: configured settings for the device
 463 *
 464 *	Adds new setup entry to the dev_boot_setup list.  The function
 465 *	returns 0 on error and 1 on success.  This is a generic routine to
 466 *	all netdevices.
 467 */
 468static int netdev_boot_setup_add(char *name, struct ifmap *map)
 469{
 470	struct netdev_boot_setup *s;
 471	int i;
 472
 473	s = dev_boot_setup;
 474	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 475		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 476			memset(s[i].name, 0, sizeof(s[i].name));
 477			strlcpy(s[i].name, name, IFNAMSIZ);
 478			memcpy(&s[i].map, map, sizeof(s[i].map));
 479			break;
 480		}
 481	}
 482
 483	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 484}
 485
 486/**
 487 *	netdev_boot_setup_check	- check boot time settings
 488 *	@dev: the netdevice
 489 *
 490 * 	Check boot time settings for the device.
 491 *	The found settings are set for the device to be used
 492 *	later in the device probing.
 493 *	Returns 0 if no settings found, 1 if they are.
 494 */
 495int netdev_boot_setup_check(struct net_device *dev)
 496{
 497	struct netdev_boot_setup *s = dev_boot_setup;
 498	int i;
 499
 500	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 501		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 502		    !strcmp(dev->name, s[i].name)) {
 503			dev->irq 	= s[i].map.irq;
 504			dev->base_addr 	= s[i].map.base_addr;
 505			dev->mem_start 	= s[i].map.mem_start;
 506			dev->mem_end 	= s[i].map.mem_end;
 507			return 1;
 508		}
 509	}
 510	return 0;
 511}
 512
 513
 514/**
 515 *	netdev_boot_base	- get address from boot time settings
 516 *	@prefix: prefix for network device
 517 *	@unit: id for network device
 518 *
 519 * 	Check boot time settings for the base address of device.
 520 *	The found settings are set for the device to be used
 521 *	later in the device probing.
 522 *	Returns 0 if no settings found.
 523 */
 524unsigned long netdev_boot_base(const char *prefix, int unit)
 525{
 526	const struct netdev_boot_setup *s = dev_boot_setup;
 527	char name[IFNAMSIZ];
 528	int i;
 529
 530	sprintf(name, "%s%d", prefix, unit);
 531
 532	/*
 533	 * If device already registered then return base of 1
 534	 * to indicate not to probe for this interface
 535	 */
 536	if (__dev_get_by_name(&init_net, name))
 537		return 1;
 538
 539	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 540		if (!strcmp(name, s[i].name))
 541			return s[i].map.base_addr;
 542	return 0;
 543}
 544
 545/*
 546 * Saves at boot time configured settings for any netdevice.
 547 */
 548int __init netdev_boot_setup(char *str)
 549{
 550	int ints[5];
 551	struct ifmap map;
 552
 553	str = get_options(str, ARRAY_SIZE(ints), ints);
 554	if (!str || !*str)
 555		return 0;
 556
 557	/* Save settings */
 558	memset(&map, 0, sizeof(map));
 559	if (ints[0] > 0)
 560		map.irq = ints[1];
 561	if (ints[0] > 1)
 562		map.base_addr = ints[2];
 563	if (ints[0] > 2)
 564		map.mem_start = ints[3];
 565	if (ints[0] > 3)
 566		map.mem_end = ints[4];
 567
 568	/* Add new entry to the list */
 569	return netdev_boot_setup_add(str, &map);
 570}
 571
 572__setup("netdev=", netdev_boot_setup);
 573
 574/*******************************************************************************
 575
 576			    Device Interface Subroutines
 577
 578*******************************************************************************/
 579
 580/**
 581 *	__dev_get_by_name	- find a device by its name
 582 *	@net: the applicable net namespace
 583 *	@name: name to find
 584 *
 585 *	Find an interface by name. Must be called under RTNL semaphore
 586 *	or @dev_base_lock. If the name is found a pointer to the device
 587 *	is returned. If the name is not found then %NULL is returned. The
 588 *	reference counters are not incremented so the caller must be
 589 *	careful with locks.
 590 */
 591
 592struct net_device *__dev_get_by_name(struct net *net, const char *name)
 593{
 594	struct hlist_node *p;
 595
 596	hlist_for_each(p, dev_name_hash(net, name)) {
 597		struct net_device *dev
 598			= hlist_entry(p, struct net_device, name_hlist);
 599		if (!strncmp(dev->name, name, IFNAMSIZ))
 600			return dev;
 601	}
 602	return NULL;
 603}
 604
 605/**
 606 *	dev_get_by_name		- find a device by its name
 607 *	@net: the applicable net namespace
 608 *	@name: name to find
 609 *
 610 *	Find an interface by name. This can be called from any
 611 *	context and does its own locking. The returned handle has
 612 *	the usage count incremented and the caller must use dev_put() to
 613 *	release it when it is no longer needed. %NULL is returned if no
 614 *	matching device is found.
 615 */
 616
 617struct net_device *dev_get_by_name(struct net *net, const char *name)
 618{
 619	struct net_device *dev;
 620
 621	read_lock(&dev_base_lock);
 622	dev = __dev_get_by_name(net, name);
 623	if (dev)
 624		dev_hold(dev);
 625	read_unlock(&dev_base_lock);
 626	return dev;
 627}
 628
 629/**
 630 *	__dev_get_by_index - find a device by its ifindex
 631 *	@net: the applicable net namespace
 632 *	@ifindex: index of device
 633 *
 634 *	Search for an interface by index. Returns %NULL if the device
 635 *	is not found or a pointer to the device. The device has not
 636 *	had its reference counter increased so the caller must be careful
 637 *	about locking. The caller must hold either the RTNL semaphore
 638 *	or @dev_base_lock.
 639 */
 640
 641struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 642{
 643	struct hlist_node *p;
 644
 645	hlist_for_each(p, dev_index_hash(net, ifindex)) {
 646		struct net_device *dev
 647			= hlist_entry(p, struct net_device, index_hlist);
 648		if (dev->ifindex == ifindex)
 649			return dev;
 650	}
 651	return NULL;
 652}
 653
 654
 655/**
 656 *	dev_get_by_index - find a device by its ifindex
 657 *	@net: the applicable net namespace
 658 *	@ifindex: index of device
 659 *
 660 *	Search for an interface by index. Returns NULL if the device
 661 *	is not found or a pointer to the device. The device returned has
 662 *	had a reference added and the pointer is safe until the user calls
 663 *	dev_put to indicate they have finished with it.
 664 */
 665
 666struct net_device *dev_get_by_index(struct net *net, int ifindex)
 667{
 668	struct net_device *dev;
 669
 670	read_lock(&dev_base_lock);
 671	dev = __dev_get_by_index(net, ifindex);
 672	if (dev)
 673		dev_hold(dev);
 674	read_unlock(&dev_base_lock);
 675	return dev;
 676}
 677
 678/**
 679 *	dev_getbyhwaddr - find a device by its hardware address
 680 *	@net: the applicable net namespace
 681 *	@type: media type of device
 682 *	@ha: hardware address
 683 *
 684 *	Search for an interface by MAC address. Returns NULL if the device
 685 *	is not found or a pointer to the device. The caller must hold the
 686 *	rtnl semaphore. The returned device has not had its ref count increased
 687 *	and the caller must therefore be careful about locking
 688 *
 689 *	BUGS:
 690 *	If the API was consistent this would be __dev_get_by_hwaddr
 691 */
 692
 693struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
 694{
 695	struct net_device *dev;
 696
 697	ASSERT_RTNL();
 698
 699	for_each_netdev(net, dev)
 700		if (dev->type == type &&
 701		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 702			return dev;
 703
 704	return NULL;
 705}
 706
 707EXPORT_SYMBOL(dev_getbyhwaddr);
 708
 709struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 710{
 711	struct net_device *dev;
 712
 713	ASSERT_RTNL();
 714	for_each_netdev(net, dev)
 715		if (dev->type == type)
 716			return dev;
 717
 718	return NULL;
 719}
 720
 721EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 722
 723struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 724{
 725	struct net_device *dev;
 726
 727	rtnl_lock();
 728	dev = __dev_getfirstbyhwtype(net, type);
 729	if (dev)
 730		dev_hold(dev);
 731	rtnl_unlock();
 732	return dev;
 733}
 734
 735EXPORT_SYMBOL(dev_getfirstbyhwtype);
 736
 737/**
 738 *	dev_get_by_flags - find any device with given flags
 739 *	@net: the applicable net namespace
 740 *	@if_flags: IFF_* values
 741 *	@mask: bitmask of bits in if_flags to check
 742 *
 743 *	Search for any interface with the given flags. Returns NULL if a device
 744 *	is not found or a pointer to the device. The device returned has
 745 *	had a reference added and the pointer is safe until the user calls
 746 *	dev_put to indicate they have finished with it.
 747 */
 748
 749struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask)
 750{
 751	struct net_device *dev, *ret;
 752
 753	ret = NULL;
 754	read_lock(&dev_base_lock);
 755	for_each_netdev(net, dev) {
 756		if (((dev->flags ^ if_flags) & mask) == 0) {
 757			dev_hold(dev);
 758			ret = dev;
 759			break;
 760		}
 761	}
 762	read_unlock(&dev_base_lock);
 763	return ret;
 764}
 765
 766/**
 767 *	dev_valid_name - check if name is okay for network device
 768 *	@name: name string
 769 *
 770 *	Network device names need to be valid file names to
 771 *	to allow sysfs to work.  We also disallow any kind of
 772 *	whitespace.
 773 */
 774int dev_valid_name(const char *name)
 775{
 776	if (*name == '\0')
 777		return 0;
 778	if (strlen(name) >= IFNAMSIZ)
 779		return 0;
 780	if (!strcmp(name, ".") || !strcmp(name, ".."))
 781		return 0;
 782
 783	while (*name) {
 784		if (*name == '/' || isspace(*name))
 785			return 0;
 786		name++;
 787	}
 788	return 1;
 789}
 790
 791/**
 792 *	__dev_alloc_name - allocate a name for a device
 793 *	@net: network namespace to allocate the device name in
 794 *	@name: name format string
 795 *	@buf:  scratch buffer and result name string
 796 *
 797 *	Passed a format string - eg "lt%d" it will try and find a suitable
 798 *	id. It scans list of devices to build up a free map, then chooses
 799 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 800 *	while allocating the name and adding the device in order to avoid
 801 *	duplicates.
 802 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 803 *	Returns the number of the unit assigned or a negative errno code.
 804 */
 805
 806static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 807{
 808	int i = 0;
 809	const char *p;
 810	const int max_netdevices = 8*PAGE_SIZE;
 811	unsigned long *inuse;
 812	struct net_device *d;
 813
 814	p = strnchr(name, IFNAMSIZ-1, '%');
 815	if (p) {
 816		/*
 817		 * Verify the string as this thing may have come from
 818		 * the user.  There must be either one "%d" and no other "%"
 819		 * characters.
 820		 */
 821		if (p[1] != 'd' || strchr(p + 2, '%'))
 822			return -EINVAL;
 823
 824		/* Use one page as a bit array of possible slots */
 825		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 826		if (!inuse)
 827			return -ENOMEM;
 828
 829		for_each_netdev(net, d) {
 830			if (!sscanf(d->name, name, &i))
 831				continue;
 832			if (i < 0 || i >= max_netdevices)
 833				continue;
 834
 835			/*  avoid cases where sscanf is not exact inverse of printf */
 836			snprintf(buf, IFNAMSIZ, name, i);
 837			if (!strncmp(buf, d->name, IFNAMSIZ))
 838				set_bit(i, inuse);
 839		}
 840
 841		i = find_first_zero_bit(inuse, max_netdevices);
 842		free_page((unsigned long) inuse);
 843	}
 844
 845	snprintf(buf, IFNAMSIZ, name, i);
 846	if (!__dev_get_by_name(net, buf))
 847		return i;
 848
 849	/* It is possible to run out of possible slots
 850	 * when the name is long and there isn't enough space left
 851	 * for the digits, or if all bits are used.
 852	 */
 853	return -ENFILE;
 854}
 855
 856/**
 857 *	dev_alloc_name - allocate a name for a device
 858 *	@dev: device
 859 *	@name: name format string
 860 *
 861 *	Passed a format string - eg "lt%d" it will try and find a suitable
 862 *	id. It scans list of devices to build up a free map, then chooses
 863 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 864 *	while allocating the name and adding the device in order to avoid
 865 *	duplicates.
 866 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 867 *	Returns the number of the unit assigned or a negative errno code.
 868 */
 869
 870int dev_alloc_name(struct net_device *dev, const char *name)
 871{
 872	char buf[IFNAMSIZ];
 873	struct net *net;
 874	int ret;
 875
 876	BUG_ON(!dev_net(dev));
 877	net = dev_net(dev);
 878	ret = __dev_alloc_name(net, name, buf);
 879	if (ret >= 0)
 880		strlcpy(dev->name, buf, IFNAMSIZ);
 881	return ret;
 882}
 883
 884
 885/**
 886 *	dev_change_name - change name of a device
 887 *	@dev: device
 888 *	@newname: name (or format string) must be at least IFNAMSIZ
 889 *
 890 *	Change name of a device, can pass format strings "eth%d".
 891 *	for wildcarding.
 892 */
 893int dev_change_name(struct net_device *dev, char *newname)
 894{
 895	char oldname[IFNAMSIZ];
 896	int err = 0;
 897	int ret;
 898	struct net *net;
 899
 900	ASSERT_RTNL();
 901	BUG_ON(!dev_net(dev));
 902
 903	net = dev_net(dev);
 904	if (dev->flags & IFF_UP)
 905		return -EBUSY;
 906
 907	if (!dev_valid_name(newname))
 908		return -EINVAL;
 909
 910	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 911		return 0;
 912
 913	memcpy(oldname, dev->name, IFNAMSIZ);
 914
 915	if (strchr(newname, '%')) {
 916		err = dev_alloc_name(dev, newname);
 917		if (err < 0)
 918			return err;
 919		strcpy(newname, dev->name);
 920	}
 921	else if (__dev_get_by_name(net, newname))
 922		return -EEXIST;
 923	else
 924		strlcpy(dev->name, newname, IFNAMSIZ);
 925
 926rollback:
 927	err = device_rename(&dev->dev, dev->name);
 928	if (err) {
 929		memcpy(dev->name, oldname, IFNAMSIZ);
 930		return err;
 931	}
 932
 933	write_lock_bh(&dev_base_lock);
 934	hlist_del(&dev->name_hlist);
 935	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 936	write_unlock_bh(&dev_base_lock);
 937
 938	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
 939	ret = notifier_to_errno(ret);
 940
 941	if (ret) {
 942		if (err) {
 943			printk(KERN_ERR
 944			       "%s: name change rollback failed: %d.\n",
 945			       dev->name, ret);
 946		} else {
 947			err = ret;
 948			memcpy(dev->name, oldname, IFNAMSIZ);
 949			goto rollback;
 950		}
 951	}
 952
 953	return err;
 954}
 955
/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features. Raises a
 *	%NETDEV_FEAT_CHANGE event for @dev through
 *	call_netdevice_notifiers().
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);
 967
 968/**
 969 *	netdev_state_change - device changes state
 970 *	@dev: device to cause notification
 971 *
 972 *	Called to indicate a device has changed state. This function calls
 973 *	the notifier chains for netdev_chain and sends a NEWLINK message
 974 *	to the routing socket.
 975 */
 976void netdev_state_change(struct net_device *dev)
 977{
 978	if (dev->flags & IFF_UP) {
 979		call_netdevice_notifiers(NETDEV_CHANGE, dev);
 980		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
 981	}
 982}
 983
/**
 *	netdev_bonding_change - raise a bonding failover notification
 *	@dev: device to cause notification
 *
 *	Raises a %NETDEV_BONDING_FAILOVER event for @dev through
 *	call_netdevice_notifiers().
 */
void netdev_bonding_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);
 989
 990/**
 991 *	dev_load 	- load a network module
 992 *	@net: the applicable net namespace
 993 *	@name: name of interface
 994 *
 995 *	If a network interface is not present and the process has suitable
 996 *	privileges this function loads the module. If module loading is not
 997 *	available in this kernel then it becomes a nop.
 998 */
 999
1000void dev_load(struct net *net, const char *name)
1001{
1002	struct net_device *dev;
1003
1004	read_lock(&dev_base_lock);
1005	dev = __dev_get_by_name(net, name);
1006	read_unlock(&dev_base_lock);
1007
1008	if (!dev && capable(CAP_SYS_MODULE))
1009		request_module("%s", name);
1010}
1011
1012/**
1013 *	dev_open	- prepare an interface for use.
1014 *	@dev:	device to open
1015 *
1016 *	Takes a device from down to up state. The device's private open
1017 *	function is invoked and then the multicast lists are loaded. Finally
1018 *	the device is moved into the up state and a %NETDEV_UP message is
1019 *	sent to the netdev notifier chain.
1020 *
1021 *	Calling this function on an active interface is a nop. On a failure
1022 *	a negative errno code is returned.
1023 */
1024int dev_open(struct net_device *dev)
1025{
1026	int ret = 0;
1027
1028	ASSERT_RTNL();
1029
1030	/*
1031	 *	Is it already up?
1032	 */
1033
1034	if (dev->flags & IFF_UP)
1035		return 0;
1036
1037	/*
1038	 *	Is it even present?
1039	 */
1040	if (!netif_device_present(dev))
1041		return -ENODEV;
1042
1043	/*
1044	 *	Call device private open method
1045	 */
1046	set_bit(__LINK_STATE_START, &dev->state);
1047
1048	if (dev->validate_addr)
1049		ret = dev->validate_addr(dev);
1050
1051	if (!ret && dev->open)
1052		ret = dev->open(dev);
1053
1054	/*
1055	 *	If it went open OK then:
1056	 */
1057
1058	if (ret)
1059		clear_bit(__LINK_STATE_START, &dev->state);
1060	else {
1061		/*
1062		 *	Set the flags.
1063		 */
1064		dev->flags |= IFF_UP;
1065
1066		/*
1067		 *	Initialize multicasting status
1068		 */
1069		dev_set_rx_mode(dev);
1070
1071		/*
1072		 *	Wakeup transmit queue engine
1073		 */
1074		dev_activate(dev);
1075
1076		/*
1077		 *	... and announce new interface.
1078		 */
1079		call_netdevice_notifiers(NETDEV_UP, dev);
1080	}
1081
1082	return ret;
1083}
1084
1085/**
1086 *	dev_close - shutdown an interface.
1087 *	@dev: device to shutdown
1088 *
1089 *	This function moves an active device into down state. A
1090 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1091 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1092 *	chain.
1093 */
int dev_close(struct net_device *dev)
{
	ASSERT_RTNL();

	might_sleep();

	/* Closing an interface that is not up is a no-op. */
	if (!(dev->flags & IFF_UP))
		return 0;

	/*
	 *	Tell people we are going down, so that they can
	 *	prepare for it while the device is still operating.
	 */
	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

	clear_bit(__LINK_STATE_START, &dev->state);

	/* Synchronize to scheduled poll. We cannot touch poll list,
	 * it can be even on different cpu. So just clear netif_running().
	 *
	 * dev->stop() will invoke napi_disable() on all of its
	 * napi_struct instances on this device.
	 */
	smp_mb__after_clear_bit(); /* Commit netif_running(). */

	dev_deactivate(dev);

	/*
	 *	Call the device specific close. This cannot fail.
	 *	Only if device is UP
	 *
	 *	We allow it to be called even after a DETACH hot-plug
	 *	event.
	 */
	if (dev->stop)
		dev->stop(dev);

	/*
	 *	Device is now down.
	 */

	dev->flags &= ~IFF_UP;

	/*
	 * Tell people we are down
	 */
	call_netdevice_notifiers(NETDEV_DOWN, dev);

	/* Always reports success; notifier results are not checked here. */
	return 0;
}
1144
1145
1146/**
1147 *	dev_disable_lro - disable Large Receive Offload on a device
1148 *	@dev: device
1149 *
1150 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1151 *	called under RTNL.  This is needed if received packets may be
1152 *	forwarded to another interface.
1153 */
1154void dev_disable_lro(struct net_device *dev)
1155{
1156	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1157	    dev->ethtool_ops->set_flags) {
1158		u32 flags = dev->ethtool_ops->get_flags(dev);
1159		if (flags & ETH_FLAG_LRO) {
1160			flags &= ~ETH_FLAG_LRO;
1161			dev->ethtool_ops->set_flags(dev, flags);
1162		}
1163	}
1164	WARN_ON(dev->features & NETIF_F_LRO);
1165}
1166EXPORT_SYMBOL(dev_disable_lro);
1167
1168
/*
 * Starts out as 1.  While it is non-zero, register_netdevice_notifier()
 * adds the notifier to the chain but does not replay REGISTER/UP events.
 * NOTE(review): presumably cleared once net_dev_init() has run - the
 * place where it is reset is outside this part of the file; confirm.
 */
static int dev_boot_phase = 1;
1170
1171/*
1172 *	Device change register/unregister. These are not inline or static
1173 *	as we export them to the world.
1174 */
1175
1176/**
1177 *	register_netdevice_notifier - register a network notifier block
1178 *	@nb: notifier
1179 *
1180 *	Register a notifier to be called when network device events occur.
1181 *	The notifier passed is linked into the kernel structures and must
1182 *	not be reused until it has been unregistered. A negative errno code
1183 *	is returned on a failure.
1184 *
1185 * 	When registered all registration and up events are replayed
1186 *	to the new notifier to allow device to have a race free
1187 *	view of the network device list.
1188 */
1189
1190int register_netdevice_notifier(struct notifier_block *nb)
1191{
1192	struct net_device *dev;
1193	struct net_device *last;
1194	struct net *net;
1195	int err;
1196
1197	rtnl_lock();
1198	err = raw_notifier_chain_register(&netdev_chain, nb);
1199	if (err)
1200		goto unlock;
1201	if (dev_boot_phase)
1202		goto unlock;
1203	for_each_net(net) {
1204		for_each_netdev(net, dev) {
1205			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1206			err = notifier_to_errno(err);
1207			if (err)
1208				goto rollback;
1209
1210			if (!(dev->flags & IFF_UP))
1211				continue;
1212
1213			nb->notifier_call(nb, NETDEV_UP, dev);
1214		}
1215	}
1216
1217unlock:
1218	rtnl_unlock();
1219	return err;
1220
1221rollback:
1222	last = dev;
1223	for_each_net(net) {
1224		for_each_netdev(net, dev) {
1225			if (dev == last)
1226				break;
1227
1228			if (dev->flags & IFF_UP) {
1229				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1230				nb->notifier_call(nb, NETDEV_DOWN, dev);
1231			}
1232			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1233		}
1234	}
1235
1236	raw_notifier_chain_unregister(&netdev_chain, nb);
1237	goto unlock;
1238}
1239
1240/**
1241 *	unregister_netdevice_notifier - unregister a network notifier block
1242 *	@nb: notifier
1243 *
1244 *	Unregister a notifier previously registered by
1245 *	register_netdevice_notifier(). The notifier is unlinked into the
1246 *	kernel structures and may then be reused. A negative errno code
1247 *	is returned on a failure.
1248 */
1249
1250int unregister_netdevice_notifier(struct notifier_block *nb)
1251{
1252	int err;
1253
1254	rtnl_lock();
1255	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1256	rtnl_unlock();
1257	return err;
1258}
1259
1260/**
1261 *	call_netdevice_notifiers - call all network notifier blocks
1262 *      @val: value passed unmodified to notifier function
1263 *      @dev: net_device pointer passed unmodified to notifier function
1264 *
1265 *	Call all network notifier blocks.  Parameters and return value
1266 *	are as for raw_notifier_call_chain().
1267 */
1268
1269int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1270{
1271	return raw_notifier_call_chain(&netdev_chain, val, dev);
1272}
1273
/*
 * When > 0 there are consumers of rx skb time stamps.
 * Incremented by net_enable_timestamp(), decremented by
 * net_disable_timestamp(), and read by net_timestamp().
 */
static atomic_t netstamp_needed = ATOMIC_INIT(0);
1276
/* Register one more user of skb time stamps (bumps netstamp_needed). */
void net_enable_timestamp(void)
{
	atomic_inc(&netstamp_needed);
}
1281
/* Drop one user of skb time stamps (decrements netstamp_needed). */
void net_disable_timestamp(void)
{
	atomic_dec(&netstamp_needed);
}
1286
1287static inline void net_timestamp(struct sk_buff *skb)
1288{
1289	if (atomic_read(&netstamp_needed))
1290		__net_timestamp(skb);
1291	else
1292		skb->tstamp.tv64 = 0;
1293}
1294
1295/*
1296 *	Support routine. Sends outgoing frames to any network
1297 *	taps currently in use.
1298 */
1299
1300static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1301{
1302	struct packet_type *ptype;
1303
1304	net_timestamp(skb);
1305
1306	rcu_read_lock();
1307	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1308		/* Never send packets back to the socket
1309		 * they originated from - MvS (miquels@drinkel.ow.org)
1310		 */
1311		if ((ptype->dev == dev || !ptype->dev) &&
1312		    (ptype->af_packet_priv == NULL ||
1313		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1314			struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1315			if (!skb2)
1316				break;
1317
1318			/* skb->nh should be correctly
1319			   set by sender, so that the second statement is
1320			   just protection against buggy protocols.
1321			 */
1322			skb_reset_mac_header(skb2);
1323
1324			if (skb_network_header(skb2) < skb2->data ||
1325			    skb2->network_header > skb2->tail) {
1326				if (net_ratelimit())
1327					printk(KERN_CRIT "protocol %04x is "
1328					       "buggy, dev %s\n",
1329					       skb2->protocol, dev->name);
1330				skb_reset_network_header(skb2);
1331			}
1332
1333			skb2->transport_header = skb2->network_header;
1334			skb2->pkt_type = PACKET_OUTGOING;
1335			ptype->func(skb2, skb->dev, ptype, skb->dev);
1336		}
1337	}
1338	rcu_read_unlock();
1339}
1340
1341
1342static inline void __netif_reschedule(struct Qdisc *q)
1343{
1344	struct softnet_data *sd;
1345	unsigned long flags;
1346
1347	local_irq_save(flags);
1348	sd = &__get_cpu_var(softnet_data);
1349	q->next_sched = sd->output_queue;
1350	sd->output_queue = q;
1351	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1352	local_irq_restore(flags);
1353}
1354
1355void __netif_schedule(struct Qdisc *q)
1356{
1357	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1358		__netif_reschedule(q);
1359}
1360EXPORT_SYMBOL(__netif_schedule);
1361
1362void dev_kfree_skb_irq(struct sk_buff *skb)
1363{
1364	if (atomic_dec_and_test(&skb->users)) {
1365		struct softnet_data *sd;
1366		unsigned long flags;
1367
1368		local_irq_save(flags);
1369		sd = &__get_cpu_var(softnet_data);
1370		skb->next = sd->completion_queue;
1371		sd->completion_queue = skb;
1372		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1373		local_irq_restore(flags);
1374	}
1375}
1376EXPORT_SYMBOL(dev_kfree_skb_irq);
1377
/*
 * Free @skb from any context: use the deferred variant when running in
 * hard-irq context or with interrupts disabled, the direct one otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (in_irq() || irqs_disabled()) {
		dev_kfree_skb_irq(skb);
		return;
	}

	dev_kfree_skb(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1386
1387
1388/**
1389 * netif_device_detach - mark device as removed
1390 * @dev: network device
1391 *
1392 * Mark device as removed from system and therefore no longer available.
1393 */
1394void netif_device_detach(struct net_device *dev)
1395{
1396	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1397	    netif_running(dev)) {
1398		netif_stop_queue(dev);
1399	}
1400}
1401EXPORT_SYMBOL(netif_device_detach);
1402
1403/**
1404 * netif_device_attach - mark device as attached
1405 * @dev: network device
1406 *
1407 * Mark device as attached from system and restart if needed.
1408 */
1409void netif_device_attach(struct net_device *dev)
1410{
1411	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1412	    netif_running(dev)) {
1413		netif_wake_queue(dev);
1414		__netdev_watchdog_up(dev);
1415	}
1416}
1417EXPORT_SYMBOL(netif_device_attach);
1418
1419static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1420{
1421	return ((features & NETIF_F_GEN_CSUM) ||
1422		((features & NETIF_F_IP_CSUM) &&
1423		 protocol == htons(ETH_P_IP)) ||
1424		((features & NETIF_F_IPV6_CSUM) &&
1425		 protocol == htons(ETH_P_IPV6)));
1426}
1427
1428static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1429{
1430	if (can_checksum_protocol(dev->features, skb->protocol))
1431		return true;
1432
1433	if (skb->protocol == htons(ETH_P_8021Q)) {
1434		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1435		if (can_checksum_protocol(dev->features & dev->vlan_features,
1436					  veh->h_vlan_encapsulated_proto))
1437			return true;
1438	}
1439
1440	return false;
1441}
1442
1443/*
1444 * Invalidate hardware checksum when packet is to be mangled, and
1445 * complete checksum manually on outgoing path.
1446 */
/*
 * Returns 0 on success, or the error from pskb_expand_head() when the
 * header could not be made writable (ip_summed is left untouched then).
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	/* Nothing to compute here; only ip_summed is reset below. */
	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	if (unlikely(skb_shinfo(skb)->gso_size)) {
		/* Let GSO fix up the checksum. */
		goto out_set_summed;
	}

	/* Offset of the checksummed region relative to skb->data. */
	offset = skb->csum_start - skb_headroom(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	/* Advance to where the folded checksum has to be stored. */
	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	/* Un-share the header if the store would hit non-writable clone data. */
	if (skb_cloned(skb) &&
	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}
1480
1481/**
1482 *	skb_gso_segment - Perform segmentation on skb.
1483 *	@skb: buffer to segment
1484 *	@features: features for the output path (see dev->features)
1485 *
1486 *	This function segments the given skb and returns a list of segments.
1487 *
1488 *	It may return NULL if the skb requires no segmentation.  This is
1489 *	only possible when GSO is used for verifying header integrity.
1490 */
struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
{
	/* Result if no matching protocol handler is found below. */
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_type *ptype;
	__be16 type = skb->protocol;
	int err;

	BUG_ON(skb_shinfo(skb)->frag_list);

	/* Record the MAC header length and pull skb->data past it. */
	skb_reset_mac_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;
	__skb_pull(skb, skb->mac_len);

	/*
	 * Warn when the skb is not CHECKSUM_PARTIAL; in that case a cloned
	 * header is un-shared first via pskb_expand_head().
	 */
	if (WARN_ON(skb->ip_summed != CHECKSUM_PARTIAL)) {
		if (skb_header_cloned(skb) &&
		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
			return ERR_PTR(err);
	}

	rcu_read_lock();
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		/* Only device-independent handlers that can segment qualify. */
		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
				err = ptype->gso_send_check(skb);
				segs = ERR_PTR(err);
				/*
				 * Stop on error, or (with err == 0, so segs
				 * is NULL) when no segmentation is needed.
				 */
				if (err || skb_gso_ok(skb, features))
					break;
				/* Move skb->data back to the network header. */
				__skb_push(skb, (skb->data -
						 skb_network_header(skb)));
			}
			segs = ptype->gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	/* Restore skb->data to the start of the MAC header. */
	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}

EXPORT_SYMBOL(skb_gso_segment);
1534
1535/* Take action when hardware reception checksum errors are detected. */
1536#ifdef CONFIG_BUG
1537void netdev_rx_csum_fault(struct net_device *dev)
1538{
1539	if (net_ratelimit()) {
1540		printk(KERN_ERR "%s: hw csum failure.\n",
1541			dev ? dev->name : "<unknown>");
1542		dump_stack();
1543	}
1544}
1545EXPORT_SYMBOL(netdev_rx_csum_fault);
1546#endif
1547
1548/* Actually, we should eliminate this check as soon as we know, that:
1549 * 1. IOMMU is present and allows to map all the memory.
1550 * 2. No high memory really exists on this machine.
1551 */
1552
/*
 * Returns 1 when @skb has a page fragment in high memory and @dev does not
 * advertise NETIF_F_HIGHDMA, 0 otherwise.  Always 0 without CONFIG_HIGHMEM.
 */
static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		int frag;

		for (frag = 0; frag < skb_shinfo(skb)->nr_frags; frag++) {
			if (PageHighMem(skb_shinfo(skb)->frags[frag].page))
				return 1;
		}
	}

#endif
	return 0;
}
1568
/*
 * Private data kept in skb->cb while a software-segmented skb is on its
 * way out: dev_gso_segment() saves the skb's original destructor here and
 * dev_gso_skb_destructor() / dev_hard_start_xmit() pick it up again.
 */
struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

/* View the control buffer of @skb as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1574
1575static void dev_gso_skb_destructor(struct sk_buff *skb)
1576{
1577	struct dev_gso_cb *cb;
1578
1579	do {
1580		struct sk_buff *nskb = skb->next;
1581
1582		skb->next = nskb->next;
1583		nskb->next = NULL;
1584		kfree_skb(nskb);
1585	} while (skb->next);
1586
1587	cb = DEV_GSO_CB(skb);
1588	if (cb->destructor)
1589		cb->destructor(skb);
1590}
1591
1592/**
1593 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1594 *	@skb: buffer to segment
1595 *
1596 *	This function segments the given skb and stores the list of segments
1597 *	in skb->next.
1598 */
1599static int dev_gso_segment(struct sk_buff *skb)
1600{
1601	struct net_device *dev = skb->dev;
1602	struct sk_buff *segs;
1603	int features = dev->features & ~(illegal_highdma(dev, skb) ?
1604					 NETIF_F_SG : 0);
1605
1606	segs = skb_gso_segment(skb, features);
1607
1608	/* Verifying header integrity only. */
1609	if (!segs)
1610		return 0;
1611
1612	if (IS_ERR(segs))
1613		return PTR_ERR(segs);
1614
1615	skb->next = segs;
1616	DEV_GSO_CB(skb)->destructor = skb->destructor;
1617	skb->destructor = dev_gso_skb_destructor;
1618
1619	return 0;
1620}
1621
/*
 * Hand @skb to the driver of @dev.  An skb without a segment list is sent
 * directly (after feeding the taps and, if needed, software GSO); an skb
 * with segments chained on skb->next has them sent one by one.  Returns
 * the driver's return code, NETDEV_TX_BUSY when @txq stopped with
 * segments still pending, or 0 once the skb has been freed here.
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
			struct netdev_queue *txq)
{
	if (likely(!skb->next)) {
		/* Give the packet taps a copy first. */
		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(skb, dev);

		if (netif_needs_gso(dev, skb)) {
			if (unlikely(dev_gso_segment(skb)))
				goto out_kfree_skb;
			/* Segments were produced: send them individually. */
			if (skb->next)
				goto gso;
		}

		return dev->hard_start_xmit(skb, dev);
	}

gso:
	do {
		/* Detach the first pending segment from the list. */
		struct sk_buff *nskb = skb->next;
		int rc;

		skb->next = nskb->next;
		nskb->next = NULL;
		rc = dev->hard_start_xmit(nskb, dev);
		if (unlikely(rc)) {
			/* Driver refused it: put the segment back in front. */
			nskb->next = skb->next;
			skb->next = nskb;
			return rc;
		}
		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
			return NETDEV_TX_BUSY;
	} while (skb->next);

	/* All segments are out: restore the original destructor. */
	skb->destructor = DEV_GSO_CB(skb)->destructor;

out_kfree_skb:
	kfree_skb(skb);
	return 0;
}
1662
/* Random seed for simple_tx_hash(), filled in lazily on its first call. */
static u32 simple_tx_hashrnd;
/* Set to 1 once simple_tx_hashrnd has been filled in. */
static int simple_tx_hashrnd_initialized = 0;
1665
1666static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb)
1667{
1668	u32 addr1, addr2, ports;
1669	u32 hash, ihl;
1670	u8 ip_proto;
1671
1672	if (unlikely(!simple_tx_hashrnd_initialized)) {
1673		get_random_bytes(&simple_tx_hashrnd, 4);
1674		simple_tx_hashrnd_initialized = 1;
1675	}
1676
1677	switch (skb->protocol) {
1678	case __constant_htons(ETH_P_IP):
1679		ip_proto = ip_hdr(skb)->protocol;
1680		addr1 = ip_hdr(skb)->saddr;
1681		addr2 = ip_hdr(skb)->daddr;
1682		ihl = ip_hdr(skb)->ihl;
1683		break;
1684	case __constant_htons(ETH_P_IPV6):
1685		ip_proto = ipv6_hdr(skb)->nexthdr;
1686		addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
1687		addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
1688		ihl = (40 >> 2);
1689		break;
1690	default:
1691		return 0;
1692	}
1693
1694
1695	switch (ip_proto) {
1696	case IPPROTO_TCP:
1697	case IPPROTO_UDP:
1698	case IPPROTO_DCCP:
1699	case IPPROTO_ESP:
1700	case IPPROTO_AH:
1701	case IPPROTO_SCTP:
1702	case IPPROTO_UDPLITE:
1703		ports = *((u32 *) (skb_network_header(skb) + (ihl * 4)));
1704		break;
1705
1706	default:
1707		ports = 0;
1708		break;
1709	}
1710
1711	hash = jhash_3words(addr1, addr2, ports, simple_tx_hashrnd);
1712
1713	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
1714}
1715
1716static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1717					struct sk_buff *skb)
1718{
1719	u16 queue_index = 0;
1720
1721	if (dev->select_queue)
1722		queue_index = dev->select_queue(dev, skb);
1723	else if (dev->real_num_tx_queues > 1)
1724		queue_index = simple_tx_hash(dev, skb);
1725
1726	skb_set_queue_mapping(skb, queue_index);
1727	return netdev_get_tx_queue(dev, queue_index);
1728}
1729
1730/**
1731 *	dev_queue_xmit - transmit a buffer
1732 *	@skb: buffer to transmit
1733 *
1734 *	Queue a buffer for transmission to a network device. The caller must
1735 *	have set the device and priority and built the buffer before calling
1736 *	this function. The function can be called from an interrupt.
1737 *
1738 *	A negative errno code is returned on a failure. A success does not
1739 *	guarantee the frame will be transmitted as it may be dropped due
1740 *	to congestion or traffic shaping.
1741 *
1742 * -----------------------------------------------------------------------------------
1743 *      I notice this method can also return errors from the queue disciplines,
1744 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
1745 *      be positive.
1746 *
1747 *      Regardless of the return value, the skb is consumed, so it is currently
1748 *      difficult to retry a send to this method.  (You can bump the ref count
1749 *      before sending to hold a reference for retry if you are careful.)
1750 *
1751 *      When calling this method, interrupts MUST be enabled.  This is because
1752 *      the BH enable code must have IRQs enabled so that it will not deadlock.
1753 *          --BLG
1754 */
int dev_queue_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	/* Result for the linearize/checksum failures that jump to out_kfree_skb. */
	int rc = -ENOMEM;

	/* GSO will handle the following emulations directly. */
	if (netif_needs_gso(dev, skb))
		goto gso;

	/* Linearize a frag_list skb if the device cannot take one. */
	if (skb_shinfo(skb)->frag_list &&
	    !(dev->features & NETIF_F_FRAGLIST) &&
	    __skb_linearize(skb))
		goto out_kfree_skb;

	/* Fragmented skb is linearized if device does not support SG,
	 * or if at least one of fragments is in highmem and device
	 * does not support DMA from it.
	 */
	if (skb_shinfo(skb)->nr_frags &&
	    (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
	    __skb_linearize(skb))
		goto out_kfree_skb;

	/* If packet is not checksummed and device does not support
	 * checksumming for this protocol, complete checksumming here.
	 */
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		skb_set_transport_header(skb, skb->csum_start -
					      skb_headroom(skb));
		if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
			goto out_kfree_skb;
	}

gso:
	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	txq = dev_pick_tx(dev, skb);
	q = rcu_dereference(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
#endif
	/* The qdisc has an enqueue method: queue the skb under its lock. */
	if (q->enqueue) {
		spinlock_t *root_lock = qdisc_lock(q);

		spin_lock(root_lock);

		if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
			/* Qdisc is being torn down: drop the packet. */
			kfree_skb(skb);
			rc = NET_XMIT_DROP;
		} else {
			rc = qdisc_enqueue_root(skb, q);
			qdisc_run(q);
		}
		spin_unlock(root_lock);

		goto out;
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all the sorts of tunnels...

	   Really, it is unlikely that netif_tx_lock protection is necessary
	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	   counters.)
	   However, it is possible, that they rely on protection
	   made by us here.

	   Check this and take the lock. It is not prone to deadlocks.
	   Alternatively, drop the noqueue qdisc; that is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		/* Same owner would mean we re-entered on this CPU. */
		if (txq->xmit_lock_owner != cpu) {

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_tx_queue_stopped(txq)) {
				rc = 0;
				if (!dev_hard_start_xmit(skb, dev, txq)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			if (net_ratelimit())
				printk(KERN_CRIT "Virtual device %s asks to "
				       "queue packet!\n", dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately */
			if (net_ratelimit())
				printk(KERN_CRIT "Dead loop on virtual device "
				       "%s, fix it urgently!\n", dev->name);
		}
	}

	/* Device down, transmit failed or recursion: drop with -ENETDOWN. */
	rc = -ENETDOWN;
	rcu_read_unlock_bh();

out_kfree_skb:
	kfree_skb(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
1868
1869
1870/*=======================================================================
1871			Receiver routines
1872  =======================================================================*/
1873
/*
 * netif_rx() drops a packet once the per-CPU input_pkt_queue already
 * holds more than this many buffers.
 */
int netdev_max_backlog __read_mostly = 1000;
/* NOTE(review): not used in this part of the file; presumably the
 * per-softirq receive budget - confirm against net_rx_action(). */
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */

/*
 * Per-CPU receive statistics: netif_rx() and netif_receive_skb() bump
 * .total, and netif_rx() bumps .dropped when it discards a packet.
 */
DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1879
1880
1881/**
1882 *	netif_rx	-	post buffer to the network code
1883 *	@skb: buffer to post
1884 *
1885 *	This function receives a packet from a device driver and queues it for
1886 *	the upper (protocol) levels to process.  It always succeeds. The buffer
1887 *	may be dropped during processing for congestion control or by the
1888 *	protocol layers.
1889 *
1890 *	return values:
1891 *	NET_RX_SUCCESS	(no congestion)
1892 *	NET_RX_DROP     (packet was dropped)
1893 *
1894 */
1895
1896int netif_rx(struct sk_buff *skb)
1897{
1898	struct softnet_data *queue;
1899	unsigned long flags;
1900
1901	/* if netpoll wants it, pretend we never saw it */
1902	if (netpoll_rx(skb))
1903		return NET_RX_DROP;
1904
1905	if (!skb->tstamp.tv64)
1906		net_timestamp(skb);
1907
1908	/*
1909	 * The code is rearranged so that the path is the most
1910	 * short when CPU is congested, but is still operating.
1911	 */
1912	local_irq_save(flags);
1913	queue = &__get_cpu_var(softnet_data);
1914
1915	__get_cpu_var(netdev_rx_stat).total++;
1916	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1917		if (queue->input_pkt_queue.qlen) {
1918enqueue:
1919			__skb_queue_tail(&queue->input_pkt_queue, skb);
1920			local_irq_restore(flags);
1921			return NET_RX_SUCCESS;
1922		}
1923
1924		napi_schedule(&queue->backlog);
1925		goto enqueue;
1926	}
1927
1928	__get_cpu_var(netdev_rx_stat).dropped++;
1929	local_irq_restore(flags);
1930
1931	kfree_skb(skb);
1932	return NET_RX_DROP;
1933}
1934
/*
 * Post @skb with netif_rx() while preemption is disabled and run any
 * pending softirqs right away.  Returns what netif_rx() returned.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	preempt_disable();

	ret = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}

EXPORT_SYMBOL(netif_rx_ni);
1949
/*
 * TX softirq handler: frees the skbs parked on this CPU's completion
 * queue and runs (or reschedules) the qdiscs on its output queue.
 */
static void net_tx_action(struct softirq_action *h)
{
	struct softnet_data *sd = &__get_cpu_var(softnet_data);

	if (sd->completion_queue) {
		struct sk_buff *clist;

		/* Detach the whole list with interrupts off. */
		local_irq_disable();
		clist = sd->completion_queue;
		sd->completion_queue = NULL;
		local_irq_enable();

		while (clist) {
			struct sk_buff *skb = clist;
			clist = clist->next;

			/* Queued skbs are expected to have no users left. */
			WARN_ON(atomic_read(&skb->users));
			__kfree_skb(skb);
		}
	}

	if (sd->output_queue) {
		struct Qdisc *head;

		/* Detach the whole list with interrupts off. */
		local_irq_disable();
		head = sd->output_queue;
		sd->output_queue = NULL;
		local_irq_enable();

		while (head) {
			struct Qdisc *q = head;
			spinlock_t *root_lock;

			head = head->next_sched;

			root_lock = qdisc_lock(q);
			if (spin_trylock(root_lock)) {
				/* Got the lock: clear SCHED and run the qdisc. */
				smp_mb__before_clear_bit();
				clear_bit(__QDISC_STATE_SCHED,
					  &q->state);
				qdisc_run(q);
				spin_unlock(root_lock);
			} else {
				/* Lock is busy: retry later unless deactivated. */
				if (!test_bit(__QDISC_STATE_DEACTIVATED,
					      &q->state))
					__netif_reschedule(q);
			}
		}
	}
}
2000
2001static inline int deliver_skb(struct sk_buff *skb,
2002			      struct packet_type *pt_prev,
2003			      struct net_device *orig_dev)
2004{
2005	atomic_inc(&skb->users);
2006	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2007}
2008
2009#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
/* These hooks are defined here for ATM */
struct net_bridge;
struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
						unsigned char *addr);
void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly;

/*
 * Bridging hook, called by handle_bridge() for packets arriving on a
 * device that has a bridge port attached.
 * Returns NULL if the packet was consumed.
 */
struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
					struct sk_buff *skb) __read_mostly;
2022static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
2023					    struct packet_type **pt_prev, int *ret,
2024					    struct net_device *orig_dev)
2025{
2026	struct net_bridge_port *port;
2027
2028	if (skb->pkt_type == PACKET_LOOPBACK ||
2029	    (port = rcu_dereference(skb->dev->br_port)) == NULL)
2030		return skb;
2031
2032	if (*pt_prev) {
2033		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2034		*pt_prev = NULL;
2035	}
2036
2037	return br_handle_frame_hook(port, skb);
2038}
2039#else
2040#define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
2041#endif
2042
2043#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
/*
 * macvlan receive hook, called by handle_macvlan() for packets arriving
 * on a device that has a macvlan port attached.
 */
struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
2046
2047static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
2048					     struct packet_type **pt_prev,
2049					     int *ret,
2050					     struct net_device *orig_dev)
2051{
2052	if (skb->dev->macvlan_port == NULL)
2053		return skb;
2054
2055	if (*pt_prev) {
2056		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2057		*pt_prev = NULL;
2058	}
2059	return macvlan_handle_frame_hook(skb);
2060}
2061#else
2062#define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
2063#endif
2064
2065#ifdef CONFIG_NET_CLS_ACT
2066/* TODO: Maybe we should just force sch_ingress to be compiled in
2067 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2068 * a compare and 2 stores extra right now if we dont have it on
2069 * but have CONFIG_NET_CLS_ACT
2070 * NOTE: This doesnt stop any functionality; if you dont have
2071 * the ingress scheduler, you just cant add policies on ingress.
2072 *
2073 */
2074static int ing_filter(struct sk_buff *skb)
2075{
2076	struct net_device *dev = skb->dev;
2077	u32 ttl = G_TC_RTTL(skb->tc_verd);
2078	struct netdev_queue *rxq;
2079	int result = TC_ACT_OK;
2080	struct Qdisc *q;
2081
2082	if (MAX_RED_LOOP < ttl++) {
2083		printk(KERN_WARNING
2084		       "Redir loop detected Dropping packet (%d->%d)\n",
2085		       skb->iif, dev->ifindex);
2086		return TC_ACT_SHOT;
2087	}
2088
2089	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2090	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2091
2092	rxq = &dev->rx_queue;
2093
2094	q = rxq->qdisc;
2095	if (q != &noop_qdisc) {
2096		spin_lock(qdisc_lock(q));
2097		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2098			result = qdisc_enqueue_root(skb, q);
2099		spin_unlock(qdisc_lock(q));
2100	}
2101
2102	return result;
2103}
2104
2105static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2106					 struct packet_type **pt_prev,
2107					 int *ret, struct net_device *orig_dev)
2108{
2109	if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2110		goto out;
2111
2112	if (*pt_prev) {
2113		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2114		*pt_prev = NULL;
2115	} else {
2116		/* Huh? Why does turning on AF_PACKET affect this? */
2117		skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2118	}
2119
2120	switch (ing_filter(skb)) {
2121	case TC_ACT_SHOT:
2122	case TC_ACT_STOLEN:
2123		kfree_skb(skb);
2124		return NULL;
2125	}
2126
2127out:
2128	skb->tc_verd = 0;
2129	return skb;
2130}
2131#endif
2132
2133/*
2134 * 	netif_nit_deliver - deliver received packets to network taps
2135 * 	@skb: buffer
2136 *
2137 * 	This function is used to deliver incoming packets to network
2138 * 	taps. It should be used when the normal netif_receive_skb path
2139 * 	is bypassed, for example because of VLAN acceleration.
2140 */
2141void netif_nit_deliver(struct sk_buff *skb)
2142{
2143	struct packet_type *ptype;
2144
2145	if (list_empty(&ptype_all))
2146		return;
2147
2148	skb_reset_network_header(skb);
2149	skb_reset_transport_header(skb);
2150	skb->mac_len = skb->network_header - skb->mac_header;
2151
2152	rcu_read_lock();
2153	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2154		if (!ptype->dev || ptype->dev == skb->dev)
2155			deliver_skb(skb, ptype, skb->dev);
2156	}
2157	rcu_read_unlock();
2158}
2159
2160/**
2161 *	netif_receive_skb - process receive buffer from network
2162 *	@skb: buffer to process
2163 *
2164 *	netif_receive_skb() is the main receive data processing function.
2165 *	It always succeeds. The buffer may be dropped during processing
2166 *	for congestion control or by the protocol layers.
2167 *
2168 *	This function may only be called from softirq context and interrupts
2169 *	should be enabled.
2170 *
2171 *	Return values (usually ignored):
2172 *	NET_RX_SUCCESS: no congestion
2173 *	NET_RX_DROP: packet was dropped
2174 */
int netif_receive_skb(struct sk_buff *skb)
{
	struct packet_type *ptype, *pt_prev;
	struct net_device *orig_dev;
	struct net_device *null_or_orig;
	int ret = NET_RX_DROP;
	__be16 type;

	/* if we've gotten here through NAPI, check netpoll */
	if (netpoll_receive_skb(skb))
		return NET_RX_DROP;

	/* Timestamp the packet only if no timestamp is recorded yet. */
	if (!skb->tstamp.tv64)
		net_timestamp(skb);

	/* Remember the receiving interface index if it is not set yet. */
	if (!skb->iif)
		skb->iif = skb->dev->ifindex;

	/*
	 * Handlers match on ptype->dev being null_or_orig, skb->dev or
	 * orig_dev.  With null_or_orig == NULL, handlers without a bound
	 * device match; when it is set to orig_dev they no longer do.
	 */
	null_or_orig = NULL;
	orig_dev = skb->dev;
	if (orig_dev->master) {
		if (skb_bond_should_drop(skb))
			null_or_orig = orig_dev; /* deliver only exact match */
		else
			skb->dev = orig_dev->master;
	}

	__get_cpu_var(netdev_rx_stat).total++;

	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;

	/*
	 * pt_prev holds the most recent matching handler.  Each new match
	 * first delivers to the previous one via deliver_skb(); the last
	 * match is invoked directly through ->func() at the end.
	 */
	pt_prev = NULL;

	rcu_read_lock();

	/* Don't receive packets in an exiting network namespace */
	if (!net_alive(dev_net(skb->dev)))
		goto out;

#ifdef CONFIG_NET_CLS_ACT
	/* TC_NCLS set: clear it and skip the ptype_all pass and handle_ing(). */
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	/* First pass: handlers registered on ptype_all. */
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
		    ptype->dev == orig_dev) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

#ifdef CONFIG_NET_CLS_ACT
	/* A NULL return means the skb was consumed; nothing left to do. */
	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
ncls:
#endif

	/* Bridge and macvlan hooks; each may consume the skb (NULL return). */
	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;

	/* Second pass: handlers hashed by protocol type in ptype_base. */
	type = skb->protocol;
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type &&
		    (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
		     ptype->dev == orig_dev)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	/* Hand the skb to the final handler, or free it if nobody matched. */
	if (pt_prev) {
		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

out:
	rcu_read_unlock();
	return ret;
}
2272
2273/* Network device is going away, flush any packets still pending  */
2274static void flush_backlog(void *arg)
2275{
2276	struct net_device *dev = arg;
2277	struct softnet_data *queue = &__get_cpu_var(softnet_data);
2278	struct sk_buff *skb, *tmp;
2279
2280	skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
2281		if (skb->dev == dev) {
2282			__skb_unlink(skb, &queue->input_pkt_queue);
2283			kfree_skb(skb);
2284		}
2285}
2286
2287static int process_backlog(struct napi_struct *napi, int quota)
2288{
2289	int work = 0;
2290	struct softnet_data *queue = &__get_cpu_var(softnet_data);
2291	unsigned long start_time = jiffies;
2292
2293	napi->weight = weight_p;
2294	do {
2295		struct sk_buff *skb;
2296
2297		local_irq_disable();
2298		skb = __skb_dequeue(&queue->input_pkt_queue);
2299		if (!skb) {
2300			__napi_complete(napi);
2301			local_irq_enable();
2302			break;
2303		}
2304		local_irq_enable();
2305
2306		netif_receive_skb(skb);
2307	} while (++work < quota && jiffies == start_time);
2308
2309	return work;
2310}
2311
2312/**
2313 * __napi_schedule - schedule for receive
2314 * @n: entry to schedule
2315 *
2316 * The entry's receive function will be scheduled to run
2317 */
2318void __napi_schedule(struct napi_struct *n)
2319{
2320	unsigned long flags;
2321
2322	local_irq_save(flags);
2323	list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2324	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2325	local_irq_restore(flags);
2326}
2327EXPORT_SYMBOL(__napi_schedule);
2328
2329
/*
 * Receive softirq handler: polls the NAPI instances queued on this CPU's
 * poll_list until the list is empty, the budget (netdev_budget) is spent,
 * or the jiffy changes.
 */
static void net_rx_action(struct softirq_action *h)
{
	struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
	unsigned long start_time = jiffies;
	int budget = netdev_budget;
	void *have;

	local_irq_disable();

	while (!list_empty(list)) {
		struct napi_struct *n;
		int work, weight;

		/* If softirq window is exhausted then punt.
		 *
		 * Note that this is a slight policy change from the
		 * previous NAPI code, which would allow up to 2
		 * jiffies to pass before breaking out.  The test
		 * used to be "jiffies - start_time > 1".
		 */
		if (unlikely(budget <= 0 || jiffies != start_time))
			goto softnet_break;

		local_irq_enable();

		/* Even though interrupts have been re-enabled, this
		 * access is safe because interrupts can only add new
		 * entries to the tail of this list, and only ->poll()
		 * calls can remove this head entry from the list.
		 */
		n = list_entry(list->next, struct napi_struct, poll_list);

		have = netpoll_poll_lock(n);

		weight = n->weight;

		/* This NAPI_STATE_SCHED test is for avoiding a race
		 * with netpoll's poll_napi().  Only the entity which
		 * obtains the lock and sees NAPI_STATE_SCHED set will
		 * actually make the ->poll() call.  Therefore we avoid
		 * accidentally calling ->poll() when NAPI is not scheduled.
		 */
		work = 0;
		if (test_bit(NAPI_STATE_SCHED, &n->state))
			work = n->poll(n, weight);

		/* A poll routine must never report more work than its weight. */
		WARN_ON_ONCE(work > weight);

		budget -= work;

		local_irq_disable();

		/* Drivers must not modify the NAPI state if they
		 * consume the entire weight.  In such cases this code
		 * still "owns" the NAPI instance and therefore can
		 * move the instance around on the list at-will.
		 */
		if (unlikely(work == weight)) {
			if (unlikely(napi_disable_pending(n)))
				__napi_complete(n);
			else
				list_move_tail(&n->poll_list, list);
		}

		netpoll_poll_unlock(have);
	}
out:
	local_irq_enable();

#ifdef CONFIG_NET_DMA
	/*
	 * There may not be any more sk_buffs coming right now, so push
	 * any pending DMA copies to hardware
	 */
	if (!cpus_empty(net_dma.channel_mask)) {
		int chan_idx;
		for_each_cpu_mask_nr(chan_idx, net_dma.channel_mask) {
			struct dma_chan *chan = net_dma.channels[chan_idx];
			if (chan)
				dma_async_memcpy_issue_pending(chan);
		}
	}
#endif

	return;

softnet_break:
	/*
	 * Reached with interrupts still disabled: count the squeeze and
	 * re-raise the softirq so the remaining entries get polled later.
	 */
	__get_cpu_var(netdev_rx_stat).time_squeeze++;
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	goto out;
}
2421
/* SIOCGIFCONF handlers indexed by address family; see register_gifconf(). */
static gifconf_func_t * gifconf_list [NPROTO];
2423
2424/**
2425 *	register_gifconf	-	register a SIOCGIF handler
2426 *	@family: Address family
2427 *	@gifconf: Function handler
2428 *
2429 *	Register protocol dependent address dumping routines. The handler
2430 *	that is passed must not be freed or reused until it has been replaced
2431 *	by another handler.
2432 */
2433int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
2434{
2435	if (family >= NPROTO)
2436		return -EINVAL;
2437	gifconf_list[family] = gifconf;
2438	return 0;
2439}
2440
2441
2442/*
2443 *	Map an interface index to its name (SIOCGIFNAME)
2444 */
2445
2446/*
2447 *	We need this ioctl for efficient implementation of the
2448 *	if_indextoname() function required by the IPv6 API.  Without
2449 *	it, we would have to search all the interfaces to find a
2450 *	match.  --pb
2451 */
2452
2453static int dev_ifname(struct net *net, struct ifreq __user *arg)
2454{
2455	struct net_device *dev;
2456	struct ifreq ifr;
2457
2458	/*
2459	 *	Fetch the caller's info block.
2460	 */
2461
2462	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2463		return -EFAULT;
2464
2465	read_lock(&dev_base_lock);
2466	dev = __dev_get_by_index(net, ifr.ifr_ifindex);
2467	if (!dev) {
2468		read_unlock(&dev_base_lock);
2469		return -ENODEV;
2470	}
2471
2472	strcpy(ifr.ifr_name, dev->name);
2473	read_unlock(&dev_base_lock);
2474
2475	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2476		return -EFAULT;
2477	return 0;
2478}
2479
2480/*
2481 *	Perform a SIOCGIFCONF call. This structure will change
2482 *	size eventually, and there is nothing I can do about it.
2483 *	Thus we will need a 'compatibility mode'.
2484 */
2485
2486static int dev_ifconf(struct net *net, char __user *arg)
2487{
2488	struct ifconf ifc;
2489	struct net_device *dev;
2490	char __user *pos;
2491	int len;
2492	int total;
2493	int i;
2494
2495	/*
2496	 *	Fetch the caller's info block.
2497	 */
2498
2499	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2500		return -EFAULT;
2501
2502	pos = ifc.ifc_buf;
2503	len = ifc.ifc_len;
2504
2505	/*
2506	 *	Loop over the interfaces, and write an info block for each.
2507	 */
2508
2509	total = 0;
2510	for_each_netdev(net, dev) {
2511		for (i = 0; i < NPROTO; i++) {
2512			if (gifconf_list[i]) {
2513				int done;
2514				if (!pos)
2515					done = gifconf_list[i](dev, NULL, 0);
2516				else
2517					done = gifconf_list[i](dev, pos + total,
2518							       len - total);
2519				if (done < 0)
2520					return -EFAULT;
2521				total += done;
2522			}
2523		}
2524	}
2525
2526	/*
2527	 *	All done.  Write the updated control block back to the caller.
2528	 */
2529	ifc.ifc_len = total;
2530
2531	/*
2532	 * 	Both BSD and Solaris return 0 here, so we do too.
2533	 */
2534	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
2535}
2536
2537#ifdef CONFIG_PROC_FS
2538/*
2539 *	This is invoked by the /proc filesystem handler to display a device
2540 *	in detail.
2541 */
2542void *dev_seq_start(struct seq_file *seq, loff_t *pos)
2543	__acquires(dev_base_lock)
2544{
2545	struct net *net = seq_file_net(seq);
2546	loff_t off;
2547	struct net_device *dev;
2548
2549	read_lock(&dev_base_lock);
2550	if (!*pos)
2551		return SEQ_START_TOKEN;
2552
2553	off = 1;
2554	for_each_netdev(net, dev)
2555		if (off++ == *pos)
2556			return dev;
2557
2558	return NULL;
2559}
2560
2561void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2562{
2563	struct net *net = seq_file_net(seq);
2564	++*pos;
2565	return v == SEQ_START_TOKEN ?
2566		first_net_device(net) : next_net_device((struct net_device *)v);
2567}
2568
/* End of a /proc/net/dev pass: drop the read lock taken by dev_seq_start(). */
void dev_seq_stop(struct seq_file *seq, void *v)
	__releases(dev_base_lock)
{
	read_unlock(&dev_base_lock);
}
2574
2575static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
2576{
2577	struct net_device_stats *stats = dev->get_stats(dev);
2578
2579	seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
2580		   "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
2581		   dev->name, stats->rx_bytes, stats->rx_packets,
2582		   stats->rx_errors,
2583		   stats->rx_dropped + stats->rx_missed_errors,
2584		   stats->rx_fifo_errors,
2585		   stats->rx_length_errors + stats->rx_over_errors +
2586		    stats->rx_crc_errors + stats->rx_frame_errors,
2587		   stats->rx_compressed, stats->multicast,
2588		   stats->tx_bytes, stats->tx_packets,
2589		   stats->tx_errors, stats->tx_dropped,
2590		   stats->tx_fifo_errors, stats->collisions,
2591		   stats->tx_carrier_errors +
2592		    stats->tx_aborted_errors +
2593		    stats->tx_window_errors +
2594		    stats->tx_heartbeat_errors,
2595		   stats->tx_compressed);
2596}
2597
2598/*
2599 *	Called from the PROCfs module. This now uses the new arbitrary sized
2600 *	/proc/net interface to create /proc/net/dev
2601 */
2602static int dev_seq_show(struct seq_file *seq, void *v)
2603{
2604	if (v == SEQ_START_TOKEN)
2605		seq_puts(seq, "Inter-|   Receive                            "
2606			      "                    |  Transmit\n"
2607			      " face |bytes    packets errs drop fifo frame "
2608			      "compressed multicast|bytes    packets errs "
2609			      "drop fifo colls carrier compressed\n");
2610	else
2611		dev_seq_printf_stats(seq, v);
2612	return 0;
2613}
2614
2615static struct netif_rx_stats *softnet_get_online(loff_t *pos)
2616{
2617	struct netif_rx_stats *rc = NULL;
2618
2619	while (*pos < nr_cpu_ids)
2620		if (cpu_online(*pos)) {
2621			rc = &per_cpu(netdev_rx_stat, *pos);
2622			break;
2623		} else
2624			++*pos;
2625	return rc;
2626}
2627
/* seq_file start: return the stats of the first online CPU at or after *pos. */
static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
{
	return softnet_get_online(pos);
}
2632
/* seq_file next: step past the current CPU and find the next online one. */
static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return softnet_get_online(pos);
}
2638
/* seq_file stop: nothing to undo, start/next take no lock. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
2642
2643static int softnet_seq_show(struct seq_file *seq, void *v)
2644{
2645	struct netif_rx_stats *s = v;
2646
2647	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
2648		   s->total, s->dropped, s->time_squeeze, 0,
2649		   0, 0, 0, 0, /* was fastroute */
2650		   s->cpu_collision );
2651	return 0;
2652}
2653
/* seq_file iterator for the device table (opened by dev_seq_open()). */
static const struct seq_operations dev_seq_ops = {
	.start = dev_seq_start,
	.next  = dev_seq_next,
	.stop  = dev_seq_stop,
	.show  = dev_seq_show,
};
2660
/* Open handler: start a namespace-aware seq_file session over dev_seq_ops. */
static int dev_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &dev_seq_ops,
			    sizeof(struct seq_net_private));
}
2666
/* File operations for the "dev" proc entry created in dev_proc_net_init(). */
static const struct file_operations dev_seq_fops = {
	.owner	 = THIS_MODULE,
	.open    = dev_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net,
};
2674
/* seq_file iterator over the per-CPU receive statistics. */
static const struct seq_operations softnet_seq_ops = {
	.start = softnet_seq_start,
	.next  = softnet_seq_next,
	.stop  = softnet_seq_stop,
	.show  = softnet_seq_show,
};
2681
/* Open handler: plain seq_open() over softnet_seq_ops. */
static int softnet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &softnet_seq_ops);
}
2686
/* File operations for the "softnet_stat" proc entry. */
static const struct file_operations softnet_seq_fops = {
	.owner	 = THIS_MODULE,
	.open    = softnet_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};
2694
2695static void *ptype_get_idx(loff_t pos)
2696{
2697	struct packet_type *pt = NULL;
2698	loff_t i = 0;
2699	int t;
2700
2701	list_for_each_entry_rcu(pt, &ptype_all, list) {
2702		if (i == pos)
2703			return pt;
2704		++i;
2705	}
2706
2707	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
2708		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
2709			if (i == pos)
2710				return pt;
2711			++i;
2712		}
2713	}
2714	return NULL;
2715}
2716
2717static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
2718	__acquires(RCU)
2719{
2720	rcu_read_lock();
2721	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
2722}
2723
/*
 * seq_file next: advance from entry @v to the following packet handler.
 * Walks the rest of ptype_all first, then the ptype_base[] hash chains in
 * bucket order; returns NULL after the last bucket.
 */
static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct packet_type *pt;
	struct list_head *nxt;
	int hash;

	++*pos;
	/* After the header token comes the very first handler. */
	if (v == SEQ_START_TOKEN)
		return ptype_get_idx(0);

	pt = v;
	nxt = pt->list.next;
	if (pt->type == htons(ETH_P_ALL)) {
		/* ETH_P_ALL entries are treated as members of ptype_all. */
		if (nxt != &ptype_all)
			goto found;
		/* End of ptype_all reached: continue with hash bucket 0. */
		hash = 0;
		nxt = ptype_base[0].next;
	} else
		hash = ntohs(pt->type) & PTYPE_HASH_MASK;

	/* nxt pointing back at the list head means this bucket is exhausted. */
	while (nxt == &ptype_base[hash]) {
		if (++hash >= PTYPE_HASH_SIZE)
			return NULL;
		nxt = ptype_base[hash].next;
	}
found:
	return list_entry(nxt, struct packet_type, list);
}
2752
/* seq_file stop: leave the RCU read section entered by ptype_seq_start(). */
static void ptype_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
2758
static void ptype_seq_decode(struct seq_file *seq, void *sym)
{
#ifdef CONFIG_KALLSYMS
	char namebuf[128];
	char *modname;
	unsigned long offset = 0, symsize;
	const char *symname = kallsyms_lookup((unsigned long)sym, &symsize,
					      &offset, &modname, namebuf);

	/* Known symbol: print it, wrapped as ":module:" when it has a module. */
	if (symname) {
		char *delim;

		if (modname) {
			delim = ":";
		} else {
			delim = "";
			modname = delim;
		}

		seq_printf(seq, "%s%s%s%s+0x%lx", delim, modname, delim,
			   symname, offset);
		return;
	}
#endif

	/* No symbol information available: fall back to the raw pointer. */
	seq_printf(seq, "[%p]", sym);
}
2783
2784static int ptype_seq_show(struct seq_file *seq, void *v)
2785{
2786	struct packet_type *pt = v;
2787
2788	if (v == SEQ_START_TOKEN)
2789		seq_puts(seq, "Type Device      Function\n");
2790	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
2791		if (pt->type == htons(ETH_P_ALL))
2792			seq_puts(seq, "ALL ");
2793		else
2794			seq_printf(seq, "%04x", ntohs(pt->type));
2795
2796		seq_printf(seq, " %-8s ",
2797			   pt->dev ? pt->dev->name : "");
2798		ptype_seq_decode(seq,  pt->func);
2799		seq_putc(seq, '\n');
2800	}
2801
2802	return 0;
2803}
2804
/* seq_file iterator over the registered packet handlers. */
static const struct seq_operations ptype_seq_ops = {
	.start = ptype_seq_start,
	.next  = ptype_seq_next,
	.stop  = ptype_seq_stop,
	.show  = ptype_seq_show,
};
2811
/* Open handler: start a namespace-aware seq_file session over ptype_seq_ops. */
static int ptype_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &ptype_seq_ops,
			sizeof(struct seq_net_private));
}
2817
/* File operations for the "ptype" proc entry. */
static const struct file_operations ptype_seq_fops = {
	.owner	 = THIS_MODULE,
	.open    = ptype_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net,
};
2825
2826
2827static int __net_init dev_proc_net_init(struct net *net)
2828{
2829	int rc = -ENOMEM;
2830
2831	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
2832		goto out;
2833	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
2834		goto out_dev;
2835	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
2836		goto out_softnet;
2837
2838	if (wext_proc_init(net))
2839		goto out_ptype;
2840	rc = 0;
2841out:
2842	return rc;
2843out_ptype:
2844	proc_net_remove(net, "ptype");
2845out_softnet:
2846	proc_net_remove(net, "softnet_stat");
2847out_dev:
2848	proc_net_remove(net, "dev");
2849	goto out;
2850}
2851
/*
 * Per-namespace teardown: undo dev_proc_net_init() in reverse order,
 * wext entries first, then "ptype", "softnet_stat" and "dev".
 */
static void __net_exit dev_proc_net_exit(struct net *net)
{
	wext_proc_exit(net);

	proc_net_remove(net, "ptype");
	proc_net_remove(net, "softnet_stat");
	proc_net_remove(net, "dev");
}
2860
/* Per-namespace init/exit hooks, registered by dev_proc_init(). */
static struct pernet_operations __net_initdata dev_proc_ops = {
	.init = dev_proc_net_init,
	.exit = dev_proc_net_exit,
};
2865
/* Register dev_proc_ops; returns whatever register_pernet_subsys() returns. */
static int __init dev_proc_init(void)
{
	return register_pernet_subsys(&dev_proc_ops);
}
2870#else
2871#define dev_proc_init() 0
2872#endif	/* CONFIG_PROC_FS */
2873
2874
2875/**
2876 *	netdev_set_master	-	set up master/slave pair
2877 *	@slave: slave device
2878 *	@master: new master device
2879 *
2880 *	Changes the master device of the slave. Pass %NULL to break the
2881 *	bonding. The caller must hold the RTNL semaphore. On a failure
2882 *	a negative errno code is returned. On success the reference counts
2883 *	are adjusted, %RTM_NEWLINK is sent to the routing socket and the
2884 *	function returns zero.
2885 */
2886int netdev_set_master(struct net_device *slave, struct net_device *master)
2887{
2888	struct net_device *old = slave->master;
2889
2890	ASSERT_RTNL();
2891
2892	if (master) {
2893		if (old)
2894			return -EBUSY;
2895		dev_hold(master);
2896	}
2897
2898	slave->master = master;
2899
2900	synchronize_net();
2901
2902	if (old)
2903		dev_put(old);
2904
2905	if (master)
2906		slave->flags |= IFF_SLAVE;
2907	else
2908		slave->flags &= ~IFF_SLAVE;
2909
2910	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
2911	return 0;
2912}
2913
2914static int __dev_set_promiscuity(struct net_device *dev, int inc)
2915{
2916	unsigned short old_flags = dev->flags;
2917
2918	ASSERT_RTNL();
2919
2920	dev->flags |= IFF_PROMISC;
2921	dev->promiscuity += inc;
2922	if (dev->promiscuity == 0) {
2923		/*
2924		 * Avoid overflow.
2925		 * If inc causes overflow, untouch promisc and return error.
2926		 */
2927		if (inc < 0)
2928			dev->flags &= ~IFF_PROMISC;
2929		else {
2930			dev->promiscuity -= inc;
2931			printk(KERN_WARNING "%s: promiscuity touches roof, "
2932				"set promiscuity failed, promiscuity feature "
2933				"of device might be broken.\n", dev->name);
2934			return -EOVERFLOW;
2935		}
2936	}
2937	if (dev->flags != old_flags) {
2938		printk(KERN_INFO "device %s %s promiscuous mode\n",
2939		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
2940							       "left");
2941		if (audit_enabled)
2942			audit_log(current->audit_context, GFP_ATOMIC,
2943				AUDIT_ANOM_PROMISCUOUS,
2944				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
2945				dev->name, (dev->flags & IFF_PROMISC),
2946				(old_flags & IFF_PROMISC),
2947				audit_get_loginuid(current),
2948				current->uid, current->gid,
2949				audit_get_sessionid(current));
2950
2951		if (dev->change_rx_flags)
2952			dev->change_rx_flags(dev, IFF_PROMISC);
2953	}
2954	return 0;
2955}
2956
2957/**
2958 *	dev_set_promiscuity	- update promiscuity count on a device
2959 *	@dev: device
2960 *	@inc: modifier
2961 *
2962 *	Add or remove promiscuity from a device. While the count in the device
2963 *	remains above zero the interface remains promiscuous. Once it hits zero
2964 *	the device reverts back to normal filtering operation. A negative inc
2965 *	value is used to drop promiscuity on the device.
2966 *	Return 0 if successful or a negative errno code on error.
2967 */
2968int dev_set_promiscuity(struct net_device *dev, int inc)
2969{
2970	unsigned short old_flags = dev->flags;
2971	int err;
2972
2973	err = __dev_set_promiscuity(dev, inc);
2974	if (err < 0)
2975		return err;
2976	if (dev->flags != old_flags)
2977		dev_set_rx_mode(dev);
2978	return err;
2979}
2980
2981/**
2982 *	dev_set_allmulti	- update allmulti count on a device
2983 *	@dev: device
2984 *	@inc: modifier
2985 *
2986 *	Add or remove reception of all multicast frames to a device. While the
2987 *	count in the device remains above zero the interface remains listening
2988 *	to all interfaces. Once it hits zero the device reverts back to normal
2989 *	filtering operation. A negative @inc value is used to drop the counter
2990 *	when releasing a resource needing all multicasts.
2991 *	Return 0 if successful or a negative errno code on error.
2992 */
2993
2994int dev_set_allmulti(struct net_device *dev, int inc)
2995{
2996	unsigned short old_flags = dev->flags;
2997
2998	ASSERT_RTNL();
2999
3000	dev->flags |= IFF_ALLMULTI;
3001	dev->allmulti += inc;
3002	if (dev->allmulti == 0) {
3003		/*
3004		 * Avoid overflow.
3005		 * If inc causes overflow, untouch allmulti and return error.
3006		 */
3007		if (inc < 0)
3008			dev->flags &= ~IFF_ALLMULTI;
3009		else {
3010			dev->allmulti -= inc;
3011			printk(KERN_WARNING "%s: allmulti touches roof, "
3012				"set allmulti failed, allmulti feature of "
3013				"device might be broken.\n", dev->name);
3014			return -EOVERFLOW;
3015		}
3016	}
3017	if (dev->flags ^ old_flags) {
3018		if (dev->change_rx_flags)
3019			dev->change_rx_flags(dev, IFF_ALLMULTI);
3020		dev_set_rx_mode(dev);
3021	}
3022	return 0;
3023}
3024
3025/*
3026 *	Upload unicast and multicast address lists to device and
3027 *	configure RX filtering. When the device doesn't support unicast
3028 *	filtering it is put in promiscuous mode while unicast addresses
3029 *	are present.
3030 */
3031void __dev_set_rx_mode(struct net_device *dev)
3032{
3033	/* dev_open will call this function so the list will stay sane. */
3034	if (!(dev->flags&IFF_UP))
3035		return;
3036
3037	if (!netif_device_present(dev))
3038		return;
3039
3040	if (dev->set_rx_mode)
3041		dev->set_rx_mode(dev);
3042	else {
3043		/* Unicast addresses changes may only happen under the rtnl,
3044		 * therefore calling __dev_set_promiscuity here is safe.
3045		 */
3046		if (dev->uc_count > 0 && !dev->uc_promisc) {
3047			__dev_set_promiscuity(dev, 1);
3048			dev->uc_promisc = 1;
3049		} else if (dev->uc_count == 0 && dev->uc_promisc) {
3050			__dev_set_promiscuity(dev, -1);
3051			dev->uc_promisc = 0;
3052		}
3053
3054		if (dev->set_multicast_list)
3055			dev->set_multicast_list(dev);
3056	}
3057}
3058
/* Locked wrapper: run __dev_set_rx_mode() under the device's address lock. */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
3065
3066int __dev_addr_delete(struct dev_addr_list **list, int *count,
3067		      void *addr, int alen, int glbl)
3068{
3069	struct dev_addr_list *da;
3070
3071	for (; (da = *list) != NULL; list = &da->next) {
3072		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3073		    alen == da->da_addrlen) {
3074			if (glbl) {
3075				int old_glbl = da->da_gusers;
3076				da->da_gusers = 0;
3077				if (old_glbl == 0)
3078					break;
3079			}
3080			if (--da->da_users)
3081				return 0;
3082
3083			*list = da->next;
3084			kfree(da);
3085			(*count)--;
3086			return 0;
3087		}
3088	}
3089	return -ENOENT;
3090}
3091
3092int __dev_addr_add(struct dev_addr_list **list, int *count,
3093		   void *addr, int alen, int glbl)
3094{
3095	struct dev_addr_list *da;
3096
3097	for (da = *list; da != NULL; da = da->next) {
3098		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3099		    da->da_addrlen == alen) {
3100			if (glbl) {
3101				int old_glbl = da->da_gusers;
3102				da->da_gusers = 1;
3103				if (old_glbl)
3104					return 0;
3105			}
3106			da->da_users++;
3107			return 0;
3108		}
3109	}
3110
3111	da = kzalloc(sizeof(*da), GFP_ATOMIC);
3112	if (da == NULL)
3113		return -ENOMEM;
3114	memcpy(da->da_addr, addr, alen);
3115	da->da_addrlen = alen;
3116	da->da_users = 1;
3117	da->da_gusers = glbl ? 1 : 0;
3118	da->next = *list;
3119	*list = da;
3120	(*count)++;
3121	return 0;
3122}
3123
3124/**
3125 *	dev_unicast_delete	- Release secondary unicast address.
3126 *	@dev: device
3127 *	@addr: address to delete
3128 *	@alen: length of @addr
3129 *
3130 *	Release reference to a secondary unicast address and remove it
3131 *	from the device if the reference count drops to zero.
3132 *
3133 * 	The caller must hold the rtnl_mutex.
3134 */
3135int dev_unicast_delete(struct net_device *dev, void *addr, int alen)
3136{
3137	int err;
3138
3139	ASSERT_RTNL();
3140
3141	netif_addr_lock_bh(dev);
3142	err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3143	if (!err)
3144		__dev_set_rx_mode(dev);
3145	netif_addr_unlock_bh(dev);
3146	return err;
3147}
3148EXPORT_SYMBOL(dev_unicast_delete);
3149
3150/**
3151 *	dev_unicast_add		- add a secondary unicast address
3152 *	@dev: device
3153 *	@addr: address to add
3154 *	@alen: length of @addr
3155 *
3156 *	Add a secondary unicast address to the device or increase
3157 *	the reference count if it already exists.
3158 *
3159 *	The caller must hold the rtnl_mutex.
3160 */
3161int dev_unicast_add(struct net_device *dev, void *addr, int alen)
3162{
3163	int err;
3164
3165	ASSERT_RTNL();
3166
3167	netif_addr_lock_bh(dev);
3168	err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3169	if (!err)
3170		__dev_set_rx_mode(dev);
3171	netif_addr_unlock_bh(dev);
3172	return err;
3173}
3174EXPORT_SYMBOL(dev_unicast_add);
3175
3176int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
3177		    struct dev_addr_list **from, int *from_count)
3178{
3179	struct dev_addr_list *da, *next;
3180	int err = 0;
3181
3182	da = *from;
3183	while (da != NULL) {
3184		next = da->next;
3185		if (!da->da_synced) {
3186			err = __dev_addr_add(to, to_count,
3187					     da->da_addr, da->da_addrlen, 0);
3188			if (err < 0)
3189				break;
3190			da->da_synced = 1;
3191			da->da_users++;
3192		} else if (da->da_users == 1) {
3193			__dev_addr_delete(to, to_count,
3194					  da->da_addr, da->da_addrlen, 0);
3195			__dev_addr_delete(from, from_count,
3196					  da->da_addr, da->da_addrlen, 0);
3197		}
3198		da = next;
3199	}
3200	return err;
3201}
3202
3203void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
3204		       struct dev_addr_list **from, int *from_count)
3205{
3206	struct dev_addr_list *da, *next;
3207
3208	da = *from;
3209	while (da != NULL) {
3210		next = da->next;
3211		if (da->da_synced) {
3212			__dev_addr_delete(to, to_count,
3213					  da->da_addr, da->da_addrlen, 0);
3214			da->da_synced = 0;
3215			__dev_addr_delete(from, from_count,
3216					  da->da_addr, da->da_addrlen, 0);
3217		}
3218		da = next;
3219	}
3220}
3221
3222/**
3223 *	dev_unicast_sync - Synchronize device's unicast list to another device
3224 *	@to: destination device
3225 *	@from: source device
3226 *
3227 *	Add newly added addresses to the destination device and release
 *	addresses that have no users left. The source device must be
 *	locked by netif_addr_lock_bh.
3230 *
3231 *	This function is intended to be called from the dev->set_rx_mode
3232 *	function of layered software devices.
3233 */
3234int dev_unicast_sync(struct net_device *to, struct net_device *from)
3235{
3236	int err = 0;
3237
3238	netif_addr_lock_bh(to);
3239	err = __dev_addr_sync(&to->uc_list, &to->uc_count,
3240			      &from->uc_list, &from->uc_count);
3241	if (!err)
3242		__dev_set_rx_mode(to);
3243	netif_addr_unlock_bh(to);
3244	return err;
3245}
3246EXPORT_SYMBOL(dev_unicast_sync);
3247
3248/**
3249 *	dev_unicast_unsync - Remove synchronized addresses from the destination device
3250 *	@to: destination device
3251 *	@from: source device
3252 *
3253 *	Remove all addresses that were added to the destination device by
3254 *	dev_unicast_sync(). This function is intended to be called from the
3255 *	dev->stop function of layered software devices.
3256 */
void dev_unicast_unsync(struct net_device *to, struct net_device *from)
{
	/*
	 * Both address locks are held for the unsync: the source's is taken
	 * first (with bottom halves disabled), the destination's nests
	 * inside it.  They are released in the opposite order.
	 */
	netif_addr_lock_bh(from);
	netif_addr_lock(to);

	__dev_addr_unsync(&to->uc_list, &to->uc_count,
			  &from->uc_list, &from->uc_count);
	/* Reprogram the destination's rx filter with the reduced list. */
	__dev_set_rx_mode(to);

	netif_addr_unlock(to);
	netif_addr_unlock_bh(from);
}
EXPORT_SYMBOL(dev_unicast_unsync);
3270
3271static void __dev_addr_discard(struct dev_addr_list **list)
3272{
3273	struct dev_addr_list *tmp;
3274
3275	while (*list != NULL) {
3276		tmp = *list;
3277		*list = tmp->next;
3278		if (tmp->da_users > tmp->da_gusers)
3279			printk("__dev_addr_discard: address leakage! "
3280			       "da_users=%d\n", tmp->da_users);
3281		kfree(tmp);
3282	}
3283}
3284
/*
 * Free every entry on the device's unicast and multicast address lists
 * and reset both counters, all under the device's address lock.
 */
static void dev_addr_discard(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_addr_discard(&dev->uc_list);
	dev->uc_count = 0;

	__dev_addr_discard(&dev->mc_list);
	dev->mc_count = 0;

	netif_addr_unlock_bh(dev);
}
3297
3298unsigned dev_get_flags(const struct net_device *dev)
3299{
3300	unsigned flags;
3301
3302	flags = (dev->flags & ~(IFF_PROMISC |
3303				IFF_ALLMULTI |
3304				IFF_RUNNING |
3305				IFF_LOWER_UP |
3306				IFF_DORMANT)) |
3307		(dev->gflags & (IFF_PROMISC |
3308				IFF_ALLMULTI));
3309
3310	if (netif_running(dev)) {
3311		if (netif_oper_up(dev))
3312			flags |= IFF_RUNNING;
3313		if (netif_carrier_ok(dev))
3314			flags |= IFF_LOWER_UP;
3315		if (netif_dormant(dev))
3316			flags |= IFF_DORMANT;
3317	}
3318
3319	return flags;
3320}
3321
3322int dev_change_flags(struct net_device *dev, unsigned flags)
3323{
3324	int ret, changes;
3325	int old_flags = dev->flags;
3326
3327	ASSERT_RTNL();
3328
3329	/*
3330	 *	Set the flags on our device.
3331	 */
3332
3333	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
3334			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
3335			       IFF_AUTOMEDIA)) |
3336		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
3337				    IFF_ALLMULTI));
3338
3339	/*
3340	 *	Load in the correct multicast list now the flags have changed.
3341	 */
3342
3343	if (dev->change_rx_flags && (old_flags ^ flags) & IFF_MULTICAST)
3344		dev->change_rx_flags(dev, IFF_MULTICAST);
3345
3346	dev_set_rx_mode(dev);
3347
3348	/*
3349	 *	Have we downed the interface. We handle IFF_UP ourselves
3350	 *	according to user attempts to set it, rather than blindly
3351	 *	setting it.
3352	 */
3353
3354	ret = 0;
3355	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
3356		ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
3357
3358		if (!ret)
3359			dev_set_rx_mode(dev);
3360	}
3361
3362	if (dev->flags & IFF_UP &&
3363	    ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
3364					  IFF_VOLATILE)))
3365		call_netdevice_notifiers(NETDEV_CHANGE, dev);
3366
3367	if ((flags ^ dev->gflags) & IFF_PROMISC) {
3368		int inc = (flags & IFF_PROMISC) ? +1 : -1;
3369		dev->gflags ^= IFF_PROMISC;
3370		dev_set_promiscuity(dev, inc);
3371	}
3372
3373	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
3374	   is important. Some (broken) drivers set IFF_PROMISC, when
3375	   IFF_ALLMULTI is requested not asking us and not reporting.
3376	 */
3377	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
3378		int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
3379		dev->gflags ^= IFF_ALLMULTI;
3380		dev_set_allmulti(dev, inc);
3381	}
3382
3383	/* Exclude state transition flags, already notified */
3384	changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
3385	if (changes)
3386		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
3387
3388	return ret;
3389}
3390
3391int dev_set_mtu(struct net_device *dev, int new_mtu)
3392{
3393	int err;
3394
3395	if (new_mtu == dev->mtu)
3396		return 0;
3397
3398	/*	MTU must be positive.	 */
3399	if (new_mtu < 0)
3400		return -EINVAL;
3401
3402	if (!netif_device_present(dev))
3403		return -ENODEV;
3404
3405	err = 0;
3406	if (dev->change_mtu)
3407		err = dev->change_mtu(dev, new_mtu);
3408	else
3409		dev->mtu = new_mtu;
3410	if (!err && dev->flags & IFF_UP)
3411		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
3412	return err;
3413}
3414
3415int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
3416{
3417	int err;
3418
3419	if (!dev->set_mac_address)
3420		return -EOPNOTSUPP;
3421	if (sa->sa_family != dev->type)
3422		return -EINVAL;
3423	if (!netif_device_present(dev))
3424		return -ENODEV;
3425	err = dev->set_mac_address(dev, sa);
3426	if (!err)
3427		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3428	return err;
3429}
3430
3431/*
3432 *	Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
3433 */
3434static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
3435{
3436	int err;
3437	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3438
3439	if (!dev)
3440		return -ENODEV;
3441
3442	switch (cmd) {
3443		case SIOCGIFFLAGS:	/* Get interface flags */
3444			ifr->ifr_flags = dev_get_flags(dev);
3445			return 0;
3446
3447		case SIOCGIFMETRIC:	/* Get the metric on the interface
3448					   (currently unused) */
3449			ifr->ifr_metric = 0;
3450			return 0;
3451
3452		case SIOCGIFMTU:	/* Get the MTU of a device */
3453			ifr->ifr_mtu = dev->mtu;
3454			return 0;
3455
3456		case SIOCGIFHWADDR:
3457			if (!dev->addr_len)
3458				memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
3459			else
3460				memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
3461				       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3462			ifr->ifr_hwaddr.sa_family = dev->type;
3463			return 0;
3464
3465		case SIOCGIFSLAVE:
3466			err = -EINVAL;
3467			break;
3468
3469		case SIOCGIFMAP:
3470			ifr->ifr_map.mem_start = dev->mem_start;
3471			ifr->ifr_map.mem_end   = dev->mem_end;
3472			ifr->ifr_map.base_addr = dev->base_addr;
3473			ifr->ifr_map.irq       = dev->irq;
3474			ifr->ifr_map.dma       = dev->dma;
3475			ifr->ifr_map.port      = dev->if_port;
3476			return 0;
3477
3478		case SIOCGIFINDEX:
3479			ifr->ifr_ifindex = dev->ifindex;
3480			return 0;
3481
3482		case SIOCGIFTXQLEN:
3483			ifr->ifr_qlen = dev->tx_queue_len;
3484			return 0;
3485
3486		default:
3487			/* dev_ioctl() should ensure this case
3488			 * is never reached
3489			 */
3490			WARN_ON(1);
3491			err = -EINVAL;
3492			break;
3493
3494	}
3495	return err;
3496}
3497
3498/*
3499 *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
3500 */
3501static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
3502{
3503	int err;
3504	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3505
3506	if (!dev)
3507		return -ENODEV;
3508
3509	switch (cmd) {
3510		case SIOCSIFFLAGS:	/* Set interface flags */
3511			return dev_change_flags(dev, ifr->ifr_flags);
3512
3513		case SIOCSIFMETRIC:	/* Set the metric on the interface
3514					   (currently unused) */
3515			return -EOPNOTSUPP;
3516
3517		case SIOCSIFMTU:	/* Set the MTU of a device */
3518			return dev_set_mtu(dev, ifr->ifr_mtu);
3519
3520		case SIOCSIFHWADDR:
3521			return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
3522
3523		case SIOCSIFHWBROADCAST:
3524			if (ifr->ifr_hwaddr.sa_family != dev->type)
3525				return -EINVAL;
3526			memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
3527			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3528			call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3529			return 0;
3530
3531		case SIOCSIFMAP:
3532			if (dev->set_config) {
3533				if (!netif_device_present(dev))
3534					return -ENODEV;
3535				return dev->set_config(dev, &ifr->ifr_map);
3536			}
3537			return -EOPNOTSUPP;
3538
3539		case SIOCADDMULTI:
3540			if ((!dev->set_multicast_list && !dev->set_rx_mode) ||
3541			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3542				return -EINVAL;
3543			if (!netif_device_present(dev))
3544				return -ENODEV;
3545			return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
3546					  dev->addr_len, 1);
3547
3548		case SIOCDELMULTI:
3549			if ((!dev->set_multicast_list && !dev->set_rx_mode) ||
3550			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3551				return -EINVAL;
3552			if (!netif_device_present(dev))
3553				return -ENODEV;
3554			return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
3555					     dev->addr_len, 1);
3556
3557		case SIOCSIFTXQLEN:
3558			if (ifr->ifr_qlen < 0)
3559				return -EINVAL;
3560			dev->tx_queue_len = ifr->ifr_qlen;
3561			return 0;
3562
3563		case SIOCSIFNAME:
3564			ifr->ifr_newname[IFNAMSIZ-1] = '\0';
3565			return dev_change_name(dev, ifr->ifr_newname);
3566
3567		/*
3568		 *	Unknown or private ioctl
3569		 */
3570
3571		default:
3572			if ((cmd >= SIOCDEVPRIVATE &&
3573			    cmd <= SIOCDEVPRIVATE + 15) ||
3574			    cmd == SIOCBONDENSLAVE ||
3575			    cmd == SIOCBONDRELEASE ||
3576			    cmd == SIOCBONDSETHWADDR ||
3577			    cmd == SIOCBONDSLAVEINFOQUERY ||
3578			    cmd == SIOCBONDINFOQUERY ||
3579			    cmd == SIOCBONDCHANGEACTIVE ||
3580			    cmd == SIOCGMIIPHY ||
3581			    cmd == SIOCGMIIREG ||
3582			    cmd == SIOCSMIIREG ||
3583			    cmd == SIOCBRADDIF ||
3584			    cmd == SIOCBRDELIF ||
3585			    cmd == SIOCWANDEV) {
3586				err = -EOPNOTSUPP;
3587				if (dev->do_ioctl) {
3588					if (netif_device_present(dev))
3589						err = dev->do_ioctl(dev, ifr,
3590								    cmd);
3591					else
3592						err = -ENODEV;
3593				}
3594			} else
3595				err = -EINVAL;
3596
3597	}
3598	return err;
3599}
3600
3601/*
3602 *	This function handles all "interface"-type I/O control requests. The actual
3603 *	'doing' part of this is dev_ifsioc above.
3604 */
3605
3606/**
3607 *	dev_ioctl	-	network device ioctl
3608 *	@net: the applicable net namespace
3609 *	@cmd: command to issue
3610 *	@arg: pointer to a struct ifreq in user space
3611 *
3612 *	Issue ioctl functions to devices. This is normally called by the
3613 *	user space syscall interfaces but can sometimes be useful for
3614 *	other purposes. The return value is the return from the syscall if
3615 *	positive or a negative errno code on error.
3616 */
3617
3618int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3619{
3620	struct ifreq ifr;
3621	int ret;
3622	char *colon;
3623
3624	/* One special case: SIOCGIFCONF takes ifconf argument
3625	   and requires shared lock, because it sleeps writing
3626	   to user space.
3627	 */
3628
3629	if (cmd == SIOCGIFCONF) {
3630		rtnl_lock();
3631		ret = dev_ifconf(net, (char __user *) arg);
3632		rtnl_unlock();
3633		return ret;
3634	}
3635	if (cmd == SIOCGIFNAME)
3636		return dev_ifname(net, (struct ifreq __user *)arg);
3637
3638	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3639		return -EFAULT;
3640
3641	ifr.ifr_name[IFNAMSIZ-1] = 0;
3642
3643	colon = strchr(ifr.ifr_name, ':');
3644	if (colon)
3645		*colon = 0;
3646
3647	/*
3648	 *	See which interface the caller is talking about.
3649	 */
3650
3651	switch (cmd) {
3652		/*
3653		 *	These ioctl calls:
3654		 *	- can be done by all.
3655		 *	- atomic and do not require locking.
3656		 *	- return a value
3657		 */
3658		case SIOCGIFFLAGS:
3659		case SIOCGIFMETRIC:
3660		case SIOCGIFMTU:
3661		case SIOCGIFHWADDR:
3662		case SIOCGIFSLAVE:
3663		case SIOCGIFMAP:
3664		case SIOCGIFINDEX:
3665		case SIOCGIFTXQLEN:
3666			dev_load(net, ifr.ifr_name);
3667			read_lock(&dev_base_lock);
3668			ret = dev_ifsioc_locked(net, &ifr, cmd);
3669			read_unlock(&dev_base_lock);
3670			if (!ret) {
3671				if (colon)
3672					*colon = ':';
3673				if (copy_to_user(arg, &ifr,
3674						 sizeof(struct ifreq)))
3675					ret = -EFAULT;
3676			}
3677			return ret;
3678
3679		case SIOCETHTOOL:
3680			dev_load(net, ifr.ifr_name);
3681			rtnl_lock();
3682			ret = dev_ethtool(net, &ifr);
3683			rtnl_unlock();
3684			if (!ret) {
3685				if (colon)
3686					*colon = ':';
3687				if (copy_to_user(arg, &ifr,
3688						 sizeof(struct ifreq)))
3689					ret = -EFAULT;
3690			}
3691			return ret;
3692
3693		/*
3694		 *	These ioctl calls:
3695		 *	- require superuser power.
3696		 *	- require strict serialization.
3697		 *	- return a value
3698		 */
3699		case SIOCGMIIPHY:
3700		case SIOCGMIIREG:
3701		case SIOCSIFNAME:
3702			if (!capable(CAP_NET_ADMIN))
3703				return -EPERM;
3704			dev_load(net, ifr.ifr_name);
3705			rtnl_lock();
3706			ret = dev_ifsioc(net, &ifr, cmd);
3707			rtnl_unlock();
3708			if (!ret) {
3709				if (colon)
3710					*colon = ':';
3711				if (copy_to_user(arg, &ifr,
3712						 sizeof(struct ifreq)))
3713					ret = -EFAULT;
3714			}
3715			return ret;
3716
3717		/*
3718		 *	These ioctl calls:
3719		 *	- require superuser power.
3720		 *	- require strict serialization.
3721		 *	- do not return a value
3722		 */
3723		case SIOCSIFFLAGS:
3724		case SIOCSIFMETRIC:
3725		case SIOCSIFMTU:
3726		case SIOCSIFMAP:
3727		case SIOCSIFHWADDR:
3728		case SIOCSIFSLAVE:
3729		case SIOCADDMULTI:
3730		case SIOCDELMULTI:
3731		case SIOCSIFHWBROADCAST:
3732		case SIOCSIFTXQLEN:
3733		case SIOCSMIIREG:
3734		case SIOCBONDENSLAVE:
3735		case SIOCBONDRELEASE:
3736		case SIOCBONDSETHWADDR:
3737		case SIOCBONDCHANGEACTIVE:
3738		case SIOCBRADDIF:
3739		case SIOCBRDELIF:
3740			if (!capable(CAP_NET_ADMIN))
3741				return -EPERM;
3742			/* fall through */
3743		case SIOCBONDSLAVEINFOQUERY:
3744		case SIOCBONDINFOQUERY:
3745			dev_load(net, ifr.ifr_name);
3746			rtnl_lock();
3747			ret = dev_ifsioc(net, &ifr, cmd);
3748			rtnl_unlock();
3749			return ret;
3750
3751		case SIOCGIFMEM:
3752			/* Get the per device memory space. We can add this but
3753			 * currently do not support it */
3754		case SIOCSIFMEM:
3755			/* Set the per device memory buffer space.
3756			 * Not applicable in our case */
3757		case SIOCSIFLINK:
3758			return -EINVAL;
3759
3760		/*
3761		 *	Unknown or private ioctl.
3762		 */
3763		default:
3764			if (cmd == SIOCWANDEV ||
3765			    (cmd >= SIOCDEVPRIVATE &&
3766			     cmd <= SIOCDEVPRIVATE + 15)) {
3767				dev_load(net, ifr.ifr_name);
3768				rtnl_lock();
3769				ret = dev_ifsioc(net, &ifr, cmd);
3770				rtnl_unlock();
3771				if (!ret && copy_to_user(arg, &ifr,
3772							 sizeof(struct ifreq)))
3773					ret = -EFAULT;
3774				return ret;
3775			}
3776			/* Take care of Wireless Extensions */
3777			if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
3778				return wext_handle_ioctl(net, &ifr, cmd, arg);
3779			return -EINVAL;
3780	}
3781}
3782
3783
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns a suitable unique value for a new device interface
 *	number.  The caller must hold the rtnl semaphore or the
 *	dev_base_lock to be sure it remains unique.
 *
 *	Candidates come from a counter shared by all namespaces; a
 *	candidate is accepted once __dev_get_by_index() finds no device
 *	with that index in @net.  The result is always greater than zero.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	for (;;) {
		/*
		 * Increment in unsigned arithmetic: "++ifindex" at INT_MAX
		 * would be signed overflow, which is undefined, so the
		 * wrap test below could not be relied upon.
		 */
		ifindex = (int)((unsigned int)ifindex + 1U);
		if (ifindex <= 0)
			ifindex = 1;
		if (!__dev_get_by_index(net, ifindex))
			return ifindex;
	}
}
3802
3803/* Delayed registration/unregisteration */
3804static DEFINE_SPINLOCK(net_todo_list_lock);
3805static LIST_HEAD(net_todo_list);
3806
3807static void net_set_todo(struct net_device *dev)
3808{
3809	spin_lock(&net_todo_list_lock);
3810	list_add_tail(&dev->todo_list, &net_todo_list);
3811	spin_unlock(&net_todo_list_lock);
3812}
3813
3814static void rollback_registered(struct net_device *dev)
3815{
3816	BUG_ON(dev_boot_phase);
3817	ASSERT_RTNL();
3818
3819	/* Some devices call without registering for initialization unwind. */
3820	if (dev->reg_state == NETREG_UNINITIALIZED) {
3821		printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
3822				  "was registered\n", dev->name, dev);
3823
3824		WARN_ON(1);
3825		return;
3826	}
3827
3828	BUG_ON(dev->reg_state != NETREG_REGISTERED);
3829
3830	/* If device is running, close it first. */
3831	dev_close(dev);
3832
3833	/* And unlink it from device chain. */
3834	unlist_netdevice(dev);
3835
3836	dev->reg_state = NETREG_UNREGISTERING;
3837
3838	synchronize_net();
3839
3840	/* Shutdown queueing discipline. */
3841	dev_shutdown(dev);
3842
3843
3844	/* Notify protocols, that we are about to destroy
3845	   this device. They should clean all the things.
3846	*/
3847	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
3848
3849	/*
3850	 *	Flush the unicast and multicast chains
3851	 */
3852	dev_addr_discard(dev);
3853
3854	if (dev->uninit)
3855		dev->uninit(dev);
3856
3857	/* Notifier chain MUST detach us from master device. */
3858	WARN_ON(dev->master);
3859
3860	/* Remove entries from kobject tree */
3861	netdev_unregister_kobject(dev);
3862
3863	synchronize_net();
3864
3865	dev_put(dev);
3866}
3867
3868static void __netdev_init_queue_locks_one(struct net_device *dev,
3869					  struct netdev_queue *dev_queue,
3870					  void *_unused)
3871{
3872	spin_lock_init(&dev_queue->_xmit_lock);
3873	netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
3874	dev_queue->xmit_lock_owner = -1;
3875}
3876
3877static void netdev_init_queue_locks(struct net_device *dev)
3878{
3879	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
3880	__netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
3881}
3882
3883/**
3884 *	register_netdevice	- register a network device
3885 *	@dev: device to register
3886 *
3887 *	Take a completed network device structure and add it to the kernel
3888 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
3889 *	chain. 0 is returned on success. A negative errno code is returned
3890 *	on a failure to set up the device, or if the name is a duplicate.
3891 *
3892 *	Callers must hold the rtnl semaphore. You may want
3893 *	register_netdev() instead of this.
3894 *
3895 *	BUGS:
3896 *	The locking appears insufficient to guarantee two parallel registers
3897 *	will not get the same name.
3898 */
3899
3900int register_netdevice(struct net_device *dev)
3901{
3902	struct hlist_head *head;
3903	struct hlist_node *p;
3904	int ret;
3905	struct net *net;
3906
3907	BUG_ON(dev_boot_phase);
3908	ASSERT_RTNL();
3909
3910	might_sleep();
3911
3912	/* When net_device's are persistent, this will be fatal. */
3913	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
3914	BUG_ON(!dev_net(dev));
3915	net = dev_net(dev);
3916
3917	spin_lock_init(&dev->addr_list_lock);
3918	netdev_set_addr_lockdep_class(dev);
3919	netdev_init_queue_locks(dev);
3920
3921	dev->iflink = -1;
3922
3923	/* Init, if this function is available */
3924	if (dev->init) {
3925		ret = dev->init(dev);
3926		if (ret) {
3927			if (ret > 0)
3928				ret = -EIO;
3929			goto out;
3930		}
3931	}
3932
3933	if (!dev_valid_name(dev->name)) {
3934		ret = -EINVAL;
3935		goto err_uninit;
3936	}
3937
3938	dev->ifindex = dev_new_index(net);
3939	if (dev->iflink == -1)
3940		dev->iflink = dev->ifindex;
3941
3942	/* Check for existence of name */
3943	head = dev_name_hash(net, dev->name);
3944	hlist_for_each(p, head) {
3945		struct net_device *d
3946			= hlist_entry(p, struct net_device, name_hlist);
3947		if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
3948			ret = -EEXIST;
3949			goto err_uninit;
3950		}
3951	}
3952
3953	/* Fix illegal checksum combinations */
3954	if ((dev->features & NETIF_F_HW_CSUM) &&
3955	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
3956		printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
3957		       dev->name);
3958		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
3959	}
3960
3961	if ((dev->features & NETIF_F_NO_CSUM) &&
3962	    (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
3963		printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
3964		       dev->name);
3965		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
3966	}
3967
3968
3969	/* Fix illegal SG+CSUM combinations. */
3970	if ((dev->features & NETIF_F_SG) &&
3971	    !(dev->features & NETIF_F_ALL_CSUM)) {
3972		printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no checksum feature.\n",
3973		       dev->name);
3974		dev->features &= ~NETIF_F_SG;
3975	}
3976
3977	/* TSO requires that SG is present as well. */
3978	if ((dev->features & NETIF_F_TSO) &&
3979	    !(dev->features & NETIF_F_SG)) {
3980		printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no SG feature.\n",
3981		       dev->name);
3982		dev->features &= ~NETIF_F_TSO;
3983	}
3984	if (dev->features & NETIF_F_UFO) {
3985		if (!(dev->features & NETIF_F_HW_CSUM)) {
3986			printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
3987					"NETIF_F_HW_CSUM feature.\n",
3988							dev->name);
3989			dev->features &= ~NETIF_F_UFO;
3990		}
3991		if (!(dev->features & NETIF_F_SG)) {
3992			printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
3993					"NETIF_F_SG feature.\n",
3994					dev->name);
3995			dev->features &= ~NETIF_F_UFO;
3996		}
3997	}
3998
3999	/* Enable software GSO if SG is supported. */
4000	if (dev->features & NETIF_F_SG)
4001		dev->features |= NETIF_F_GSO;
4002
4003	netdev_initialize_kobject(dev);
4004	ret = netdev_register_kobject(dev);
4005	if (ret)
4006		goto err_uninit;
4007	dev->reg_state = NETREG_REGISTERED;
4008
4009	/*
4010	 *	Default initial state at registry is that the
4011	 *	device is present.
4012	 */
4013
4014	set_bit(__LINK_STATE_PRESENT, &dev->state);
4015
4016	dev_init_scheduler(dev);
4017	dev_hold(dev);
4018	list_netdevice(dev);
4019
4020	/* Notify protocols, that a new device appeared. */
4021	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
4022	ret = notifier_to_errno(ret);
4023	if (ret) {
4024		rollback_registered(dev);
4025		dev->reg_state = NETREG_UNREGISTERED;
4026	}
4027
4028out:
4029	return ret;
4030
4031err_uninit:
4032	if (dev->uninit)
4033		dev->uninit(dev);
4034	goto out;
4035}
4036
4037/**
4038 *	register_netdev	- register a network device
4039 *	@dev: device to register
4040 *
4041 *	Take a completed network device structure and add it to the kernel
4042 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4043 *	chain. 0 is returned on success. A negative errno code is returned
4044 *	on a failure to set up the device, or if the name is a duplicate.
4045 *
4046 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
4047 *	and expands the device name if you passed a format string to
4048 *	alloc_netdev.
4049 */
4050int register_netdev(struct net_device *dev)
4051{
4052	int err;
4053
4054	rtnl_lock();
4055
4056	/*
4057	 * If the name is a format string the caller wants us to do a
4058	 * name allocation.
4059	 */
4060	if (strchr(dev->name, '%')) {
4061		err = dev_alloc_name(dev, dev->name);
4062		if (err < 0)
4063			goto out;
4064	}
4065
4066	err = register_netdevice(dev);
4067out:
4068	rtnl_unlock();
4069	return err;
4070}
4071EXPORT_SYMBOL(register_netdev);
4072
4073/*
4074 * netdev_wait_allrefs - wait until all references are gone.
4075 *
4076 * This is called when unregistering network devices.
4077 *
4078 * Any protocol or device that holds a reference should register
4079 * for netdevice notification, and cleanup and put back the
4080 * reference if they receive an UNREGISTER event.
4081 * We can get stuck here if buggy protocols don't correctly
4082 * call dev_put.
4083 */
4084static void netdev_wait_allrefs(struct net_device *dev)
4085{
4086	unsigned long rebroadcast_time, warning_time;
4087
4088	rebroadcast_time = warning_time = jiffies;
4089	while (atomic_read(&dev->refcnt) != 0) {
4090		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
4091			rtnl_lock();
4092
4093			/* Rebroadcast unregister notification */
4094			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4095
4096			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
4097				     &dev->state)) {
4098				/* We must not have linkwatch events
4099				 * pending on unregister. If this
4100				 * happens, we simply run the queue
4101				 * unscheduled, resulting in a noop
4102				 * for this device.
4103				 */
4104				linkwatch_run_queue();
4105			}
4106
4107			__rtnl_unlock();
4108
4109			rebroadcast_time = jiffies;
4110		}
4111
4112		msleep(250);
4113
4114		if (time_after(jiffies, warning_time + 10 * HZ)) {
4115			printk(KERN_EMERG "unregister_netdevice: "
4116			       "waiting for %s to become free. Usage "
4117			       "count = %d\n",
4118			       dev->name, atomic_read(&dev->refcnt));
4119			warning_time = jiffies;
4120		}
4121	}
4122}
4123
4124/* The sequence is:
4125 *
4126 *	rtnl_lock();
4127 *	...
4128 *	register_netdevice(x1);
4129 *	register_netdevice(x2);
4130 *	...
4131 *	unregister_netdevice(y1);
4132 *	unregister_netdevice(y2);
4133 *      ...
4134 *	rtnl_unlock();
4135 *	free_netdev(y1);
4136 *	free_netdev(y2);
4137 *
4138 * We are invoked by rtnl_unlock() after it drops the semaphore.
4139 * This allows us to deal with problems:
4140 * 1) We can delete sysfs objects which invoke hotplug
4141 *    without deadlocking with linkwatch via keventd.
4142 * 2) Since we run with the RTNL semaphore not held, we can sleep
4143 *    safely in order to wait for the netdev refcnt to drop to zero.
4144 */
4145static DEFINE_MUTEX(net_todo_run_mutex);
4146void netdev_run_todo(void)
4147{
4148	struct list_head list;
4149
4150	/* Need to guard against multiple cpu's getting out of order. */
4151	mutex_lock(&net_todo_run_mutex);
4152
4153	/* Not safe to do outside the semaphore.  We must not return
4154	 * until all unregister events invoked by the local processor
4155	 * have been completed (either by this todo run, or one on
4156	 * another cpu).
4157	 */
4158	if (list_empty(&net_todo_list))
4159		goto out;
4160
4161	/* Snapshot list, allow later requests */
4162	spin_lock(&net_todo_list_lock);
4163	list_replace_init(&net_todo_list, &list);
4164	spin_unlock(&net_todo_list_lock);
4165
4166	while (!list_empty(&list)) {
4167		struct net_device *dev
4168			= list_entry(list.next, struct net_device, todo_list);
4169		list_del(&dev->todo_list);
4170
4171		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
4172			printk(KERN_ERR "network todo '%s' but state %d\n",
4173			       dev->name, dev->reg_state);
4174			dump_stack();
4175			continue;
4176		}
4177
4178		dev->reg_state = NETREG_UNREGISTERED;
4179
4180		on_each_cpu(flush_backlog, dev, 1);
4181
4182		netdev_wait_allrefs(dev);
4183
4184		/* paranoia */
4185		BUG_ON(atomic_read(&dev->refcnt));
4186		WARN_ON(dev->ip_ptr);
4187		WARN_ON(dev->ip6_ptr);
4188		WARN_ON(dev->dn_ptr);
4189
4190		if (dev->destructor)
4191			dev->destructor(dev);
4192
4193		/* Free network device */
4194		kobject_put(&dev->dev.kobj);
4195	}
4196
4197out:
4198	mutex_unlock(&net_todo_run_mutex);
4199}
4200
4201static struct net_device_stats *internal_stats(struct net_device *dev)
4202{
4203	return &dev->stats;
4204}
4205
4206static void netdev_init_one_queue(struct net_device *dev,
4207				  struct netdev_queue *queue,
4208				  void *_unused)
4209{
4210	queue->dev = dev;
4211}
4212
4213static void netdev_init_queues(struct net_device *dev)
4214{
4215	netdev_init_one_queue(dev, &dev->rx_queue, NULL);
4216	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
4217	spin_lock_init(&dev->tx_global_lock);
4218}
4219
4220/**
4221 *	alloc_netdev_mq - allocate network device
4222 *	@sizeof_priv:	size of private data to allocate space for
4223 *	@name:		device name format string
4224 *	@setup:		callback to initialize device
4225 *	@queue_count:	the number of subqueues to allocate
4226 *
4227 *	Allocates a struct net_device with private data area for driver use
4228 *	and performs basic initialization.  Also allocates subquue structs
4229 *	for each queue on the device at the end of the netdevice.
4230 */
4231struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
4232		void (*setup)(struct net_device *), unsigned int queue_count)
4233{
4234	struct netdev_queue *tx;
4235	struct net_device *dev;
4236	size_t alloc_size;
4237	void *p;
4238
4239	BUG_ON(strlen(name) >= sizeof(dev->name));
4240
4241	alloc_size = sizeof(struct net_device);
4242	if (sizeof_priv) {
4243		/* ensure 32-byte alignment of private area */
4244		alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
4245		alloc_size += sizeof_priv;
4246	}
4247	/* ensure 32-byte alignment of whole construct */
4248	alloc_size += NETDEV_ALIGN_CONST;
4249
4250	p = kzalloc(alloc_size, GFP_KERNEL);
4251	if (!p) {
4252		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
4253		return NULL;
4254	}
4255
4256	tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
4257	if (!tx) {
4258		printk(KERN_ERR "alloc_netdev: Unable to allocate "
4259		       "tx qdiscs.\n");
4260		kfree(p);
4261		return NULL;
4262	}
4263
4264	dev = (struct net_device *)
4265		(((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
4266	dev->padded = (char *)dev - (char *)p;
4267	dev_net_set(dev, &init_net);
4268
4269	dev->_tx = tx;
4270	dev->num_tx_queues = queue_count;
4271	dev->real_num_tx_queues = queue_count;
4272
4273	if (sizeof_priv) {
4274		dev->priv = ((char *)dev +
4275			     ((sizeof(struct net_device) + NETDEV_ALIGN_CONST)
4276			      & ~NETDEV_ALIGN_CONST));
4277	}
4278
4279	dev->gso_max_size = GSO_MAX_SIZE;
4280
4281	netdev_init_queues(dev);
4282
4283	dev->get_stats = internal_stats;
4284	netpoll_netdev_init(dev);
4285	setup(dev);
4286	strcpy(dev->name, name);
4287	return dev;
4288}
4289EXPORT_SYMBOL(alloc_netdev_mq);
4290
4291/**
4292 *	free_netdev - free network device
4293 *	@dev: device
4294 *
4295 *	This function does the last stage of destroying an allocated device
4296 * 	interface. The reference to the device object is released.
4297 *	If this is the last reference then it will be freed.
4298 */
4299void free_netdev(struct net_device *dev)
4300{
4301	release_net(dev_net(dev));
4302
4303	kfree(dev->_tx);
4304
4305	/*  Compatibility with error handling in drivers */
4306	if (dev->reg_state == NETREG_UNINITIALIZED) {
4307		kfree((char *)dev - dev->padded);
4308		return;
4309	}
4310
4311	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
4312	dev->reg_state = NETREG_RELEASED;
4313
4314	/* will free via device release */
4315	put_device(&dev->dev);
4316}
4317
/*
 * Synchronize with packet receive processing: implemented as
 * synchronize_rcu(), so the caller may sleep.
 */
void synchronize_net(void)
{
	might_sleep();

	synchronize_rcu();
}
4324
/**
 *	unregister_netdevice - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.  The device is then put on the todo list,
 *	so the remainder of the unregistration runs from netdev_run_todo().
 *
 *	Callers must hold the rtnl semaphore.  You may want
 *	unregister_netdev() instead of this.
 */

void unregister_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	rollback_registered(dev);

	/* Finish processing unregister after unlock */
	net_set_todo(dev);
}
4344
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	It merely wraps unregister_netdevice() in rtnl_lock() /
 *	rtnl_unlock().  In general this is the function you want, not
 *	unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}

EXPORT_SYMBOL(unregister_netdev);
4364
4365/**
4366 *	dev_change_net_namespace - move device to different nethost namespace
4367 *	@dev: device
4368 *	@net: network namespace
4369 *	@pat: If not NULL name pattern to try if the current device name
4370 *	      is already taken in the destination network namespace.
4371 *
4372 *	This function shuts down a device interface and moves it
4373 *	to a new network namespace. On success 0 is returned, on
4374 *	a failure a netagive errno code is returned.
4375 *
4376 *	Callers must hold the rtnl semaphore.
4377 */
4378
4379int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
4380{
4381	char buf[IFNAMSIZ];
4382	const char *destname;
4383	int err;
4384
4385	ASSERT_RTNL();
4386
4387	/* Don't allow namespace local devices to be moved. */
4388	err = -EINVAL;
4389	if (dev->features & NETIF_F_NETNS_LOCAL)
4390		goto out;
4391
4392	/* Ensure the device has been registrered */
4393	err = -EINVAL;
4394	if (dev->reg_state != NETREG_REGISTERED)
4395		goto out;
4396
4397	/* Get out if there is nothing todo */
4398	err = 0;
4399	if (net_eq(dev_net(dev), net))
4400		goto out;
4401
4402	/* Pick the destination device name, and ensure
4403	 * we can use it in the destination network namespace.
4404	 */
4405	err = -EEXIST;
4406	destname = dev->name;
4407	if (__dev_get_by_name(net, destname)) {
4408		/* We get here if we can't use the current device name */
4409		if (!pat)
4410			goto out;
4411		if (!dev_valid_name(pat))
4412			goto out;
4413		if (strchr(pat, '%')) {
4414			if (__dev_alloc_name(net, pat, buf) < 0)
4415				goto out;
4416			destname = buf;
4417		} else
4418			destname = pat;
4419		if (__dev_get_by_name(net, destname))
4420			goto out;
4421	}
4422
4423	/*
4424	 * And now a mini version of register_netdevice unregister_netdevice.
4425	 */
4426
4427	/* If device is running close it first. */
4428	dev_close(dev);
4429
4430	/* And unlink it from device chain */
4431	err = -ENODEV;
4432	unlist_netdevice(dev);
4433
4434	synchronize_net();
4435
4436	/* Shutdown queueing discipline. */
4437	dev_shutdown(dev);
4438
4439	/* Notify protocols, that we are about to destroy
4440	   this device. They should clean all the things.
4441	*/
4442	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4443
4444	/*
4445	 *	Flush the unicast and multicast chains
4446	 */
4447	dev_addr_discard(dev);
4448
4449	/* Actually switch the network namespace */
4450	dev_net_set(dev, net);
4451
4452	/* Assign the new device name */
4453	if (destname != dev->name)
4454		strcpy(dev->name, destname);
4455
4456	/* If there is an ifindex conflict assign a new one */
4457	if (__dev_get_by_index(net, dev->ifindex)) {
4458		int iflink = (dev->iflink == dev->ifindex);
4459		dev->ifindex = dev_new_index(net);
4460		if (iflink)
4461			dev->iflink = dev->ifindex;
4462	}
4463
4464	/* Fixup kobjects */
4465	netdev_unregister_kobject(dev);
4466	err = netdev_register_kobject(dev);
4467	WARN_ON(err);
4468
4469	/* Add the device back in the hashes */
4470	list_netdevice(dev);
4471
4472	/* Notify protocols, that a new device appeared. */
4473	call_netdevice_notifiers(NETDEV_REGISTER, dev);
4474
4475	synchronize_net();
4476	err = 0;
4477out:
4478	return err;
4479}
4480
4481static int dev_cpu_callback(struct notifier_block *nfb,
4482			    unsigned long action,
4483			    void *ocpu)
4484{
4485	struct sk_buff **list_skb;
4486	struct Qdisc **list_net;
4487	struct sk_buff *skb;
4488	unsigned int cpu, oldcpu = (unsigned long)ocpu;
4489	struct softnet_data *sd, *oldsd;
4490
4491	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
4492		return NOTIFY_OK;
4493
4494	local_irq_disable();
4495	cpu = smp_processor_id();
4496	sd = &per_cpu(softnet_data, cpu);
4497	oldsd = &per_cpu(softnet_data, oldcpu);
4498
4499	/* Find end of our completion_queue. */
4500	list_skb = &sd->completion_queue;
4501	while (*list_skb)
4502		list_skb = &(*list_skb)->next;
4503	/* Append completion queue from offline CPU. */
4504	*list_skb = oldsd->completion_queue;
4505	oldsd->completion_queue = NULL;
4506
4507	/* Find end of our output_queue. */
4508	list_net = &sd->output_queue;
4509	while (*list_net)
4510		list_net = &(*list_net)->next_sched;
4511	/* Append output queue from offline CPU. */
4512	*list_net = oldsd->output_queue;
4513	oldsd->output_queue = NULL;
4514
4515	raise_softirq_irqoff(NET_TX_SOFTIRQ);
4516	local_irq_enable();
4517
4518	/* Process offline CPU's input_pkt_queue */
4519	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
4520		netif_rx(skb);
4521
4522	return NOTIFY_OK;
4523}
4524
4525#ifdef CONFIG_NET_DMA
4526/**
4527 * net_dma_rebalance - try to maintain one DMA channel per CPU
4528 * @net_dma: DMA client and associated data (lock, channels, channel_mask)
4529 *
4530 * This is called when the number of channels allocated to the net_dma client
4531 * changes.  The net_dma client tries to have one DMA channel per CPU.
4532 */
4533
4534static void net_dma_rebalance(struct net_dma *net_dma)
4535{
4536	unsigned int cpu, i, n, chan_idx;
4537	struct dma_chan *chan;
4538
4539	if (cpus_empty(net_dma->channel_mask)) {
4540		for_each_online_cpu(cpu)
4541			rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma, NULL);
4542		return;
4543	}
4544
4545	i = 0;
4546	cpu = first_cpu(cpu_online_map);
4547
4548	for_each_cpu_mask_nr(chan_idx, net_dma->channel_mask) {
4549		chan = net_dma->channels[chan_idx];
4550
4551		n = ((num_online_cpus() / cpus_weight(net_dma->channel_mask))
4552		   + (i < (num_online_cpus() %
4553			cpus_weight(net_dma->channel_mask)) ? 1 : 0));
4554
4555		while(n) {
4556			per_cpu(softnet_data, cpu).net_dma = chan;
4557			cpu = next_cpu(cpu, cpu_online_map);
4558			n--;
4559		}
4560		i++;
4561	}
4562}
4563
4564/**
4565 * netdev_dma_event - event callback for the net_dma_client
4566 * @client: should always be net_dma_client
4567 * @chan: DMA channel for the event
4568 * @state: DMA state to be handled
4569 */
4570static enum dma_state_client
4571netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
4572	enum dma_state state)
4573{
4574	int i, found = 0, pos = -1;
4575	struct net_dma *net_dma =
4576		container_of(client, struct net_dma, client);
4577	enum dma_state_client ack = DMA_DUP; /* default: take no action */
4578
4579	spin_lock(&net_dma->lock);
4580	switch (state) {
4581	case DMA_RESOURCE_AVAILABLE:
4582		for (i = 0; i < nr_cpu_ids; i++)
4583			if (net_dma->channels[i] == chan) {
4584				found = 1;
4585				break;
4586			} else if (net_dma->channels[i] == NULL && pos < 0)
4587				pos = i;
4588
4589		if (!found && pos >= 0) {
4590			ack = DMA_ACK;
4591			net_dma->channels[pos] = chan;
4592			cpu_set(pos, net_dma->channel_mask);
4593			net_dma_rebalance(net_dma);
4594		}
4595		break;
4596	case DMA_RESOURCE_REMOVED:
4597		for (i = 0; i < nr_cpu_ids; i++)
4598			if (net_dma->channels[i] == chan) {
4599				found = 1;
4600				pos = i;
4601				break;
4602			}
4603
4604		if (found) {
4605			ack = DMA_ACK;
4606			cpu_clear(pos, net_dma->channel_mask);
4607			net_dma->channels[i] = NULL;
4608			net_dma_rebalance(net_dma);
4609		}
4610		break;
4611	default:
4612		break;
4613	}
4614	spin_unlock(&net_dma->lock);
4615
4616	return ack;
4617}
4618
4619/**
4620 * netdev_dma_regiser - register the networking subsystem as a DMA client
4621 */
4622static int __init netdev_dma_register(void)
4623{
4624	net_dma.channels = kzalloc(nr_cpu_ids * sizeof(struct net_dma),
4625								GFP_KERNEL);
4626	if (unlikely(!net_dma.channels)) {
4627		printk(KERN_NOTICE
4628				"netdev_dma: no memory for net_dma.channels\n");
4629		return -ENOMEM;
4630	}
4631	spin_lock_init(&net_dma.lock);
4632	dma_cap_set(DMA_MEMCPY, net_dma.client.cap_mask);
4633	dma_async_client_register(&net_dma.client);
4634	dma_async_client_chan_request(&net_dma.client);
4635	return 0;
4636}
4637
4638#else
4639static int __init netdev_dma_register(void) { return -ENODEV; }
4640#endif /* CONFIG_NET_DMA */
4641
4642/**
4643 *	netdev_compute_feature - compute conjunction of two feature sets
4644 *	@all: first feature set
4645 *	@one: second feature set
4646 *
4647 *	Computes a new feature set after adding a device with feature set
4648 *	@one to the master device with current feature set @all.  Returns
4649 *	the new feature set.
4650 */
4651int netdev_compute_features(unsigned long all, unsigned long one)
4652{
4653	/* if device needs checksumming, downgrade to hw checksumming */
4654	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
4655		all ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM;
4656
4657	/* if device can't do all checksum, downgrade to ipv4/ipv6 */
4658	if (all & NETIF_F_HW_CSUM && !(one & NETIF_F_HW_CSUM))
4659		all ^= NETIF_F_HW_CSUM
4660			| NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
4661
4662	if (one & NETIF_F_GSO)
4663		one |= NETIF_F_GSO_SOFTWARE;
4664	one |= NETIF_F_GSO;
4665
4666	/* If even one device supports robust GSO, enable it for all. */
4667	if (one & NETIF_F_GSO_ROBUST)
4668		all |= NETIF_F_GSO_ROBUST;
4669
4670	all &= one | NETIF_F_LLTX;
4671
4672	if (!(all & NETIF_F_ALL_CSUM))
4673		all &= ~NETIF_F_SG;
4674	if (!(all & NETIF_F_SG))
4675		all &= ~NETIF_F_GSO_MASK;
4676
4677	return all;
4678}
4679EXPORT_SYMBOL(netdev_compute_features);
4680
4681static struct hlist_head *netdev_create_hash(void)
4682{
4683	int i;
4684	struct hlist_head *hash;
4685
4686	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
4687	if (hash != NULL)
4688		for (i = 0; i < NETDEV_HASHENTRIES; i++)
4689			INIT_HLIST_HEAD(&hash[i]);
4690
4691	return hash;
4692}
4693
4694/* Initialize per network namespace state */
4695static int __net_init netdev_init(struct net *net)
4696{
4697	INIT_LIST_HEAD(&net->dev_base_head);
4698
4699	net->dev_name_head = netdev_create_hash();
4700	if (net->dev_name_head == NULL)
4701		goto err_name;
4702
4703	net->dev_index_head = netdev_create_hash();
4704	if (net->dev_index_head == NULL)
4705		goto err_idx;
4706
4707	return 0;
4708
4709err_idx:
4710	kfree(net->dev_name_head);
4711err_name:
4712	return -ENOMEM;
4713}
4714
4715char *netdev_drivername(struct net_device *dev, char *buffer, int len)
4716{
4717	struct device_driver *driver;
4718	struct device *parent;
4719
4720	if (len <= 0 || !buffer)
4721		return buffer;
4722	buffer[0] = 0;
4723
4724	parent = dev->dev.parent;
4725
4726	if (!parent)
4727		return buffer;
4728
4729	driver = parent->driver;
4730	if (driver && driver->name)
4731		strlcpy(buffer, driver->name, len);
4732	return buffer;
4733}
4734
4735static void __net_exit netdev_exit(struct net *net)
4736{
4737	kfree(net->dev_name_head);
4738	kfree(net->dev_index_head);
4739}
4740
/*
 * Per network namespace hooks that set up and tear down the device
 * list and hash tables; registered from net_dev_init().
 */
static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};
4745
4746static void __net_exit default_device_exit(struct net *net)
4747{
4748	struct net_device *dev, *next;
4749	/*
4750	 * Push all migratable of the network devices back to the
4751	 * initial network namespace
4752	 */
4753	rtnl_lock();
4754	for_each_netdev_safe(net, dev, next) {
4755		int err;
4756		char fb_name[IFNAMSIZ];
4757
4758		/* Ignore unmoveable devices (i.e. loopback) */
4759		if (dev->features & NETIF_F_NETNS_LOCAL)
4760			continue;
4761
4762		/* Push remaing network devices to init_net */
4763		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
4764		err = dev_change_net_namespace(dev, &init_net, fb_name);
4765		if (err) {
4766			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
4767				__func__, dev->name, err);
4768			BUG();
4769		}
4770	}
4771	rtnl_unlock();
4772}
4773
/*
 * Per network namespace exit hook that returns migratable devices to
 * init_net; it has no init hook.  Registered from net_dev_init().
 */
static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
};
4777
4778/*
4779 *	Initialize the DEV module. At boot time this walks the device list and
4780 *	unhooks any devices that fail to initialise (normally hardware not
4781 *	present) and leaves us with a valid list of present and active devices.
4782 *
4783 */
4784
4785/*
4786 *       This is called single threaded during boot, so no need
4787 *       to take the rtnl semaphore.
4788 */
4789static int __init net_dev_init(void)
4790{
4791	int i, rc = -ENOMEM;
4792
4793	BUG_ON(!dev_boot_phase);
4794
4795	if (dev_proc_init())
4796		goto out;
4797
4798	if (netdev_kobject_init())
4799		goto out;
4800
4801	INIT_LIST_HEAD(&ptype_all);
4802	for (i = 0; i < PTYPE_HASH_SIZE; i++)
4803		INIT_LIST_HEAD(&ptype_base[i]);
4804
4805	if (register_pernet_subsys(&netdev_net_ops))
4806		goto out;
4807
4808	if (register_pernet_device(&default_device_ops))
4809		goto out;
4810
4811	/*
4812	 *	Initialise the packet receive queues.
4813	 */
4814
4815	for_each_possible_cpu(i) {
4816		struct softnet_data *queue;
4817
4818		queue = &per_cpu(softnet_data, i);
4819		skb_queue_head_init(&queue->input_pkt_queue);
4820		queue->completion_queue = NULL;
4821		INIT_LIST_HEAD(&queue->poll_list);
4822
4823		queue->backlog.poll = process_backlog;
4824		queue->backlog.weight = weight_p;
4825	}
4826
4827	netdev_dma_register();
4828
4829	dev_boot_phase = 0;
4830
4831	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
4832	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
4833
4834	hotcpu_notifier(dev_cpu_callback, 0);
4835	dst_init();
4836	dev_mcast_init();
4837	rc = 0;
4838out:
4839	return rc;
4840}
4841
4842subsys_initcall(net_dev_init);
4843
/* Symbols from this file made available to modules. */
EXPORT_SYMBOL(__dev_get_by_index);
EXPORT_SYMBOL(__dev_get_by_name);
EXPORT_SYMBOL(__dev_remove_pack);
EXPORT_SYMBOL(dev_valid_name);
EXPORT_SYMBOL(dev_add_pack);
EXPORT_SYMBOL(dev_alloc_name);
EXPORT_SYMBOL(dev_close);
EXPORT_SYMBOL(dev_get_by_flags);
EXPORT_SYMBOL(dev_get_by_index);
EXPORT_SYMBOL(dev_get_by_name);
EXPORT_SYMBOL(dev_open);
EXPORT_SYMBOL(dev_queue_xmit);
EXPORT_SYMBOL(dev_remove_pack);
EXPORT_SYMBOL(dev_set_allmulti);
EXPORT_SYMBOL(dev_set_promiscuity);
EXPORT_SYMBOL(dev_change_flags);
EXPORT_SYMBOL(dev_set_mtu);
EXPORT_SYMBOL(dev_set_mac_address);
EXPORT_SYMBOL(free_netdev);
EXPORT_SYMBOL(netdev_boot_setup_check);
EXPORT_SYMBOL(netdev_set_master);
EXPORT_SYMBOL(netdev_state_change);
EXPORT_SYMBOL(netif_receive_skb);
EXPORT_SYMBOL(netif_rx);
EXPORT_SYMBOL(register_gifconf);
EXPORT_SYMBOL(register_netdevice);
EXPORT_SYMBOL(register_netdevice_notifier);
EXPORT_SYMBOL(skb_checksum_help);
EXPORT_SYMBOL(synchronize_net);
EXPORT_SYMBOL(unregister_netdevice);
EXPORT_SYMBOL(unregister_netdevice_notifier);
EXPORT_SYMBOL(net_enable_timestamp);
EXPORT_SYMBOL(net_disable_timestamp);
EXPORT_SYMBOL(dev_get_flags);

/* Bridge hooks are exported only when bridging is built in or modular. */
#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
EXPORT_SYMBOL(br_handle_frame_hook);
EXPORT_SYMBOL(br_fdb_get_hook);
EXPORT_SYMBOL(br_fdb_put_hook);
#endif

/* dev_load is exported only with CONFIG_KMOD. */
#ifdef CONFIG_KMOD
EXPORT_SYMBOL(dev_load);
#endif

/* The per-CPU softnet_data is exported as a per-CPU symbol. */
EXPORT_PER_CPU_SYMBOL(softnet_data);