net/core/dev.c at v2.6.30-rc4 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / core / dev.c
at v2.6.30-rc4 5335 lines 132 kB view raw
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <asm/system.h>
  77#include <linux/bitops.h>
  78#include <linux/capability.h>
  79#include <linux/cpu.h>
  80#include <linux/types.h>
  81#include <linux/kernel.h>
  82#include <linux/sched.h>
  83#include <linux/mutex.h>
  84#include <linux/string.h>
  85#include <linux/mm.h>
  86#include <linux/socket.h>
  87#include <linux/sockios.h>
  88#include <linux/errno.h>
  89#include <linux/interrupt.h>
  90#include <linux/if_ether.h>
  91#include <linux/netdevice.h>
  92#include <linux/etherdevice.h>
  93#include <linux/ethtool.h>
  94#include <linux/notifier.h>
  95#include <linux/skbuff.h>
  96#include <net/net_namespace.h>
  97#include <net/sock.h>
  98#include <linux/rtnetlink.h>
  99#include <linux/proc_fs.h>
 100#include <linux/seq_file.h>
 101#include <linux/stat.h>
 102#include <linux/if_bridge.h>
 103#include <linux/if_macvlan.h>
 104#include <net/dst.h>
 105#include <net/pkt_sched.h>
 106#include <net/checksum.h>
 107#include <linux/highmem.h>
 108#include <linux/init.h>
 109#include <linux/kmod.h>
 110#include <linux/module.h>
 111#include <linux/netpoll.h>
 112#include <linux/rcupdate.h>
 113#include <linux/delay.h>
 114#include <net/wext.h>
 115#include <net/iw_handler.h>
 116#include <asm/current.h>
 117#include <linux/audit.h>
 118#include <linux/dmaengine.h>
 119#include <linux/err.h>
 120#include <linux/ctype.h>
 121#include <linux/if_arp.h>
 122#include <linux/if_vlan.h>
 123#include <linux/ip.h>
 124#include <net/ip.h>
 125#include <linux/ipv6.h>
 126#include <linux/in.h>
 127#include <linux/jhash.h>
 128#include <linux/random.h>
 129
 130#include "net-sysfs.h"
 131
 132/* Instead of increasing this, you should create a hash table. */
 133#define MAX_GRO_SKBS 8
 134
 135/* This should be increased if a protocol with a bigger head is added. */
 136#define GRO_MAX_HEAD (MAX_HEADER + 128)
 137
 138/*
 139 *	The list of packet types we will receive (as opposed to discard)
 140 *	and the routines to invoke.
 141 *
 142 *	Why 16. Because with 16 the only overlap we get on a hash of the
 143 *	low nibble of the protocol value is RARP/SNAP/X.25.
 144 *
 145 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 146 *             sure which should go first, but I bet it won't make much
 147 *             difference if we are running VLANs.  The good news is that
 148 *             this protocol won't be in the list unless compiled in, so
 149 *             the average user (w/out VLANs) will not be adversely affected.
 150 *             --BLG
 151 *
 152 *		0800	IP
 153 *		8100    802.1Q VLAN
 154 *		0001	802.3
 155 *		0002	AX.25
 156 *		0004	802.2
 157 *		8035	RARP
 158 *		0005	SNAP
 159 *		0805	X.25
 160 *		0806	ARP
 161 *		8137	IPX
 162 *		0009	Localtalk
 163 *		86DD	IPv6
 164 */
 165
 166#define PTYPE_HASH_SIZE	(16)
 167#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
 168
 169static DEFINE_SPINLOCK(ptype_lock);
 170static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 171static struct list_head ptype_all __read_mostly;	/* Taps */
 172
 173/*
 174 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 175 * semaphore.
 176 *
 177 * Pure readers hold dev_base_lock for reading.
 178 *
 179 * Writers must hold the rtnl semaphore while they loop through the
 180 * dev_base_head list, and hold dev_base_lock for writing when they do the
 181 * actual updates.  This allows pure readers to access the list even
 182 * while a writer is preparing to update it.
 183 *
 184 * To put it another way, dev_base_lock is held for writing only to
 185 * protect against pure readers; the rtnl semaphore provides the
 186 * protection against other writers.
 187 *
 188 * See, for example usages, register_netdevice() and
 189 * unregister_netdevice(), which must be called with the rtnl
 190 * semaphore held.
 191 */
 192DEFINE_RWLOCK(dev_base_lock);
 193
 194EXPORT_SYMBOL(dev_base_lock);
 195
 196#define NETDEV_HASHBITS	8
 197#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
 198
 199static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 200{
 201	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 202	return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
 203}
 204
 205static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 206{
 207	return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
 208}
 209
 210/* Device list insertion */
 211static int list_netdevice(struct net_device *dev)
 212{
 213	struct net *net = dev_net(dev);
 214
 215	ASSERT_RTNL();
 216
 217	write_lock_bh(&dev_base_lock);
 218	list_add_tail(&dev->dev_list, &net->dev_base_head);
 219	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 220	hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
 221	write_unlock_bh(&dev_base_lock);
 222	return 0;
 223}
 224
 225/* Device list removal */
 226static void unlist_netdevice(struct net_device *dev)
 227{
 228	ASSERT_RTNL();
 229
 230	/* Unlink dev from the device chain */
 231	write_lock_bh(&dev_base_lock);
 232	list_del(&dev->dev_list);
 233	hlist_del(&dev->name_hlist);
 234	hlist_del(&dev->index_hlist);
 235	write_unlock_bh(&dev_base_lock);
 236}
 237
 238/*
 239 *	Our notifier list
 240 */
 241
 242static RAW_NOTIFIER_HEAD(netdev_chain);
 243
 244/*
 245 *	Device drivers call our routines to queue packets here. We empty the
 246 *	queue in the local softnet handler.
 247 */
 248
 249DEFINE_PER_CPU(struct softnet_data, softnet_data);
 250
 251#ifdef CONFIG_LOCKDEP
 252/*
 253 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 254 * according to dev->type
 255 */
 256static const unsigned short netdev_lock_type[] =
 257	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 258	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 259	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 260	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 261	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 262	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 263	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 264	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 265	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 266	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 267	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 268	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 269	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 270	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 271	 ARPHRD_PHONET_PIPE, ARPHRD_VOID, ARPHRD_NONE};
 272
 273static const char *netdev_lock_name[] =
 274	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 275	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 276	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 277	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 278	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 279	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 280	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 281	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 282	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 283	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 284	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 285	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 286	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 287	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 288	 "_xmit_PHONET_PIPE", "_xmit_VOID", "_xmit_NONE"};
 289
 290static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 291static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 292
 293static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 294{
 295	int i;
 296
 297	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 298		if (netdev_lock_type[i] == dev_type)
 299			return i;
 300	/* the last key is used by default */
 301	return ARRAY_SIZE(netdev_lock_type) - 1;
 302}
 303
 304static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 305						 unsigned short dev_type)
 306{
 307	int i;
 308
 309	i = netdev_lock_pos(dev_type);
 310	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 311				   netdev_lock_name[i]);
 312}
 313
 314static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 315{
 316	int i;
 317
 318	i = netdev_lock_pos(dev->type);
 319	lockdep_set_class_and_name(&dev->addr_list_lock,
 320				   &netdev_addr_lock_key[i],
 321				   netdev_lock_name[i]);
 322}
 323#else
 324static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 325						 unsigned short dev_type)
 326{
 327}
 328static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 329{
 330}
 331#endif
 332
 333/*******************************************************************************
 334
 335		Protocol management and registration routines
 336
 337*******************************************************************************/
 338
 339/*
 340 *	Add a protocol ID to the list. Now that the input handler is
 341 *	smarter we can dispense with all the messy stuff that used to be
 342 *	here.
 343 *
 344 *	BEWARE!!! Protocol handlers, mangling input packets,
 345 *	MUST BE last in hash buckets and checking protocol handlers
 346 *	MUST start from promiscuous ptype_all chain in net_bh.
 347 *	It is true now, do not change it.
 348 *	Explanation follows: if protocol handler, mangling packet, will
 349 *	be the first on list, it is not able to sense, that packet
 350 *	is cloned and should be copied-on-write, so that it will
 351 *	change it and subsequent readers will get broken packet.
 352 *							--ANK (980803)
 353 */
 354
 355/**
 356 *	dev_add_pack - add packet handler
 357 *	@pt: packet type declaration
 358 *
 359 *	Add a protocol handler to the networking stack. The passed &packet_type
 360 *	is linked into kernel lists and may not be freed until it has been
 361 *	removed from the kernel lists.
 362 *
 363 *	This call does not sleep therefore it can not
 364 *	guarantee all CPU's that are in middle of receiving packets
 365 *	will see the new packet type (until the next received packet).
 366 */
 367
 368void dev_add_pack(struct packet_type *pt)
 369{
 370	int hash;
 371
 372	spin_lock_bh(&ptype_lock);
 373	if (pt->type == htons(ETH_P_ALL))
 374		list_add_rcu(&pt->list, &ptype_all);
 375	else {
 376		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
 377		list_add_rcu(&pt->list, &ptype_base[hash]);
 378	}
 379	spin_unlock_bh(&ptype_lock);
 380}
 381
 382/**
 383 *	__dev_remove_pack	 - remove packet handler
 384 *	@pt: packet type declaration
 385 *
 386 *	Remove a protocol handler that was previously added to the kernel
 387 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 388 *	from the kernel lists and can be freed or reused once this function
 389 *	returns.
 390 *
 391 *      The packet type might still be in use by receivers
 392 *	and must not be freed until after all the CPU's have gone
 393 *	through a quiescent state.
 394 */
 395void __dev_remove_pack(struct packet_type *pt)
 396{
 397	struct list_head *head;
 398	struct packet_type *pt1;
 399
 400	spin_lock_bh(&ptype_lock);
 401
 402	if (pt->type == htons(ETH_P_ALL))
 403		head = &ptype_all;
 404	else
 405		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 406
 407	list_for_each_entry(pt1, head, list) {
 408		if (pt == pt1) {
 409			list_del_rcu(&pt->list);
 410			goto out;
 411		}
 412	}
 413
 414	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 415out:
 416	spin_unlock_bh(&ptype_lock);
 417}
 418/**
 419 *	dev_remove_pack	 - remove packet handler
 420 *	@pt: packet type declaration
 421 *
 422 *	Remove a protocol handler that was previously added to the kernel
 423 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 424 *	from the kernel lists and can be freed or reused once this function
 425 *	returns.
 426 *
 427 *	This call sleeps to guarantee that no CPU is looking at the packet
 428 *	type after return.
 429 */
 430void dev_remove_pack(struct packet_type *pt)
 431{
 432	__dev_remove_pack(pt);
 433
 434	synchronize_net();
 435}
 436
 437/******************************************************************************
 438
 439		      Device Boot-time Settings Routines
 440
 441*******************************************************************************/
 442
 443/* Boot time configuration table */
 444static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 445
 446/**
 447 *	netdev_boot_setup_add	- add new setup entry
 448 *	@name: name of the device
 449 *	@map: configured settings for the device
 450 *
 451 *	Adds new setup entry to the dev_boot_setup list.  The function
 452 *	returns 0 on error and 1 on success.  This is a generic routine to
 453 *	all netdevices.
 454 */
 455static int netdev_boot_setup_add(char *name, struct ifmap *map)
 456{
 457	struct netdev_boot_setup *s;
 458	int i;
 459
 460	s = dev_boot_setup;
 461	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 462		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 463			memset(s[i].name, 0, sizeof(s[i].name));
 464			strlcpy(s[i].name, name, IFNAMSIZ);
 465			memcpy(&s[i].map, map, sizeof(s[i].map));
 466			break;
 467		}
 468	}
 469
 470	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 471}
 472
 473/**
 474 *	netdev_boot_setup_check	- check boot time settings
 475 *	@dev: the netdevice
 476 *
 477 * 	Check boot time settings for the device.
 478 *	The found settings are set for the device to be used
 479 *	later in the device probing.
 480 *	Returns 0 if no settings found, 1 if they are.
 481 */
 482int netdev_boot_setup_check(struct net_device *dev)
 483{
 484	struct netdev_boot_setup *s = dev_boot_setup;
 485	int i;
 486
 487	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 488		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 489		    !strcmp(dev->name, s[i].name)) {
 490			dev->irq 	= s[i].map.irq;
 491			dev->base_addr 	= s[i].map.base_addr;
 492			dev->mem_start 	= s[i].map.mem_start;
 493			dev->mem_end 	= s[i].map.mem_end;
 494			return 1;
 495		}
 496	}
 497	return 0;
 498}
 499
 500
 501/**
 502 *	netdev_boot_base	- get address from boot time settings
 503 *	@prefix: prefix for network device
 504 *	@unit: id for network device
 505 *
 506 * 	Check boot time settings for the base address of device.
 507 *	The found settings are set for the device to be used
 508 *	later in the device probing.
 509 *	Returns 0 if no settings found.
 510 */
 511unsigned long netdev_boot_base(const char *prefix, int unit)
 512{
 513	const struct netdev_boot_setup *s = dev_boot_setup;
 514	char name[IFNAMSIZ];
 515	int i;
 516
 517	sprintf(name, "%s%d", prefix, unit);
 518
 519	/*
 520	 * If device already registered then return base of 1
 521	 * to indicate not to probe for this interface
 522	 */
 523	if (__dev_get_by_name(&init_net, name))
 524		return 1;
 525
 526	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 527		if (!strcmp(name, s[i].name))
 528			return s[i].map.base_addr;
 529	return 0;
 530}
 531
 532/*
 533 * Saves at boot time configured settings for any netdevice.
 534 */
 535int __init netdev_boot_setup(char *str)
 536{
 537	int ints[5];
 538	struct ifmap map;
 539
 540	str = get_options(str, ARRAY_SIZE(ints), ints);
 541	if (!str || !*str)
 542		return 0;
 543
 544	/* Save settings */
 545	memset(&map, 0, sizeof(map));
 546	if (ints[0] > 0)
 547		map.irq = ints[1];
 548	if (ints[0] > 1)
 549		map.base_addr = ints[2];
 550	if (ints[0] > 2)
 551		map.mem_start = ints[3];
 552	if (ints[0] > 3)
 553		map.mem_end = ints[4];
 554
 555	/* Add new entry to the list */
 556	return netdev_boot_setup_add(str, &map);
 557}
 558
 559__setup("netdev=", netdev_boot_setup);
 560
 561/*******************************************************************************
 562
 563			    Device Interface Subroutines
 564
 565*******************************************************************************/
 566
 567/**
 568 *	__dev_get_by_name	- find a device by its name
 569 *	@net: the applicable net namespace
 570 *	@name: name to find
 571 *
 572 *	Find an interface by name. Must be called under RTNL semaphore
 573 *	or @dev_base_lock. If the name is found a pointer to the device
 574 *	is returned. If the name is not found then %NULL is returned. The
 575 *	reference counters are not incremented so the caller must be
 576 *	careful with locks.
 577 */
 578
 579struct net_device *__dev_get_by_name(struct net *net, const char *name)
 580{
 581	struct hlist_node *p;
 582
 583	hlist_for_each(p, dev_name_hash(net, name)) {
 584		struct net_device *dev
 585			= hlist_entry(p, struct net_device, name_hlist);
 586		if (!strncmp(dev->name, name, IFNAMSIZ))
 587			return dev;
 588	}
 589	return NULL;
 590}
 591
 592/**
 593 *	dev_get_by_name		- find a device by its name
 594 *	@net: the applicable net namespace
 595 *	@name: name to find
 596 *
 597 *	Find an interface by name. This can be called from any
 598 *	context and does its own locking. The returned handle has
 599 *	the usage count incremented and the caller must use dev_put() to
 600 *	release it when it is no longer needed. %NULL is returned if no
 601 *	matching device is found.
 602 */
 603
 604struct net_device *dev_get_by_name(struct net *net, const char *name)
 605{
 606	struct net_device *dev;
 607
 608	read_lock(&dev_base_lock);
 609	dev = __dev_get_by_name(net, name);
 610	if (dev)
 611		dev_hold(dev);
 612	read_unlock(&dev_base_lock);
 613	return dev;
 614}
 615
 616/**
 617 *	__dev_get_by_index - find a device by its ifindex
 618 *	@net: the applicable net namespace
 619 *	@ifindex: index of device
 620 *
 621 *	Search for an interface by index. Returns %NULL if the device
 622 *	is not found or a pointer to the device. The device has not
 623 *	had its reference counter increased so the caller must be careful
 624 *	about locking. The caller must hold either the RTNL semaphore
 625 *	or @dev_base_lock.
 626 */
 627
 628struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 629{
 630	struct hlist_node *p;
 631
 632	hlist_for_each(p, dev_index_hash(net, ifindex)) {
 633		struct net_device *dev
 634			= hlist_entry(p, struct net_device, index_hlist);
 635		if (dev->ifindex == ifindex)
 636			return dev;
 637	}
 638	return NULL;
 639}
 640
 641
 642/**
 643 *	dev_get_by_index - find a device by its ifindex
 644 *	@net: the applicable net namespace
 645 *	@ifindex: index of device
 646 *
 647 *	Search for an interface by index. Returns NULL if the device
 648 *	is not found or a pointer to the device. The device returned has
 649 *	had a reference added and the pointer is safe until the user calls
 650 *	dev_put to indicate they have finished with it.
 651 */
 652
 653struct net_device *dev_get_by_index(struct net *net, int ifindex)
 654{
 655	struct net_device *dev;
 656
 657	read_lock(&dev_base_lock);
 658	dev = __dev_get_by_index(net, ifindex);
 659	if (dev)
 660		dev_hold(dev);
 661	read_unlock(&dev_base_lock);
 662	return dev;
 663}
 664
 665/**
 666 *	dev_getbyhwaddr - find a device by its hardware address
 667 *	@net: the applicable net namespace
 668 *	@type: media type of device
 669 *	@ha: hardware address
 670 *
 671 *	Search for an interface by MAC address. Returns NULL if the device
 672 *	is not found or a pointer to the device. The caller must hold the
 673 *	rtnl semaphore. The returned device has not had its ref count increased
 674 *	and the caller must therefore be careful about locking
 675 *
 676 *	BUGS:
 677 *	If the API was consistent this would be __dev_get_by_hwaddr
 678 */
 679
 680struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
 681{
 682	struct net_device *dev;
 683
 684	ASSERT_RTNL();
 685
 686	for_each_netdev(net, dev)
 687		if (dev->type == type &&
 688		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 689			return dev;
 690
 691	return NULL;
 692}
 693
 694EXPORT_SYMBOL(dev_getbyhwaddr);
 695
 696struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 697{
 698	struct net_device *dev;
 699
 700	ASSERT_RTNL();
 701	for_each_netdev(net, dev)
 702		if (dev->type == type)
 703			return dev;
 704
 705	return NULL;
 706}
 707
 708EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 709
 710struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 711{
 712	struct net_device *dev;
 713
 714	rtnl_lock();
 715	dev = __dev_getfirstbyhwtype(net, type);
 716	if (dev)
 717		dev_hold(dev);
 718	rtnl_unlock();
 719	return dev;
 720}
 721
 722EXPORT_SYMBOL(dev_getfirstbyhwtype);
 723
 724/**
 725 *	dev_get_by_flags - find any device with given flags
 726 *	@net: the applicable net namespace
 727 *	@if_flags: IFF_* values
 728 *	@mask: bitmask of bits in if_flags to check
 729 *
 730 *	Search for any interface with the given flags. Returns NULL if a device
 731 *	is not found or a pointer to the device. The device returned has
 732 *	had a reference added and the pointer is safe until the user calls
 733 *	dev_put to indicate they have finished with it.
 734 */
 735
 736struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask)
 737{
 738	struct net_device *dev, *ret;
 739
 740	ret = NULL;
 741	read_lock(&dev_base_lock);
 742	for_each_netdev(net, dev) {
 743		if (((dev->flags ^ if_flags) & mask) == 0) {
 744			dev_hold(dev);
 745			ret = dev;
 746			break;
 747		}
 748	}
 749	read_unlock(&dev_base_lock);
 750	return ret;
 751}
 752
 753/**
 754 *	dev_valid_name - check if name is okay for network device
 755 *	@name: name string
 756 *
 757 *	Network device names need to be valid file names to
 758 *	to allow sysfs to work.  We also disallow any kind of
 759 *	whitespace.
 760 */
 761int dev_valid_name(const char *name)
 762{
 763	if (*name == '\0')
 764		return 0;
 765	if (strlen(name) >= IFNAMSIZ)
 766		return 0;
 767	if (!strcmp(name, ".") || !strcmp(name, ".."))
 768		return 0;
 769
 770	while (*name) {
 771		if (*name == '/' || isspace(*name))
 772			return 0;
 773		name++;
 774	}
 775	return 1;
 776}
 777
 778/**
 779 *	__dev_alloc_name - allocate a name for a device
 780 *	@net: network namespace to allocate the device name in
 781 *	@name: name format string
 782 *	@buf:  scratch buffer and result name string
 783 *
 784 *	Passed a format string - eg "lt%d" it will try and find a suitable
 785 *	id. It scans list of devices to build up a free map, then chooses
 786 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 787 *	while allocating the name and adding the device in order to avoid
 788 *	duplicates.
 789 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 790 *	Returns the number of the unit assigned or a negative errno code.
 791 */
 792
 793static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 794{
 795	int i = 0;
 796	const char *p;
 797	const int max_netdevices = 8*PAGE_SIZE;
 798	unsigned long *inuse;
 799	struct net_device *d;
 800
 801	p = strnchr(name, IFNAMSIZ-1, '%');
 802	if (p) {
 803		/*
 804		 * Verify the string as this thing may have come from
 805		 * the user.  There must be either one "%d" and no other "%"
 806		 * characters.
 807		 */
 808		if (p[1] != 'd' || strchr(p + 2, '%'))
 809			return -EINVAL;
 810
 811		/* Use one page as a bit array of possible slots */
 812		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 813		if (!inuse)
 814			return -ENOMEM;
 815
 816		for_each_netdev(net, d) {
 817			if (!sscanf(d->name, name, &i))
 818				continue;
 819			if (i < 0 || i >= max_netdevices)
 820				continue;
 821
 822			/*  avoid cases where sscanf is not exact inverse of printf */
 823			snprintf(buf, IFNAMSIZ, name, i);
 824			if (!strncmp(buf, d->name, IFNAMSIZ))
 825				set_bit(i, inuse);
 826		}
 827
 828		i = find_first_zero_bit(inuse, max_netdevices);
 829		free_page((unsigned long) inuse);
 830	}
 831
 832	snprintf(buf, IFNAMSIZ, name, i);
 833	if (!__dev_get_by_name(net, buf))
 834		return i;
 835
 836	/* It is possible to run out of possible slots
 837	 * when the name is long and there isn't enough space left
 838	 * for the digits, or if all bits are used.
 839	 */
 840	return -ENFILE;
 841}
 842
 843/**
 844 *	dev_alloc_name - allocate a name for a device
 845 *	@dev: device
 846 *	@name: name format string
 847 *
 848 *	Passed a format string - eg "lt%d" it will try and find a suitable
 849 *	id. It scans list of devices to build up a free map, then chooses
 850 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 851 *	while allocating the name and adding the device in order to avoid
 852 *	duplicates.
 853 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 854 *	Returns the number of the unit assigned or a negative errno code.
 855 */
 856
 857int dev_alloc_name(struct net_device *dev, const char *name)
 858{
 859	char buf[IFNAMSIZ];
 860	struct net *net;
 861	int ret;
 862
 863	BUG_ON(!dev_net(dev));
 864	net = dev_net(dev);
 865	ret = __dev_alloc_name(net, name, buf);
 866	if (ret >= 0)
 867		strlcpy(dev->name, buf, IFNAMSIZ);
 868	return ret;
 869}
 870
 871
 872/**
 873 *	dev_change_name - change name of a device
 874 *	@dev: device
 875 *	@newname: name (or format string) must be at least IFNAMSIZ
 876 *
 877 *	Change name of a device, can pass format strings "eth%d".
 878 *	for wildcarding.
 879 */
 880int dev_change_name(struct net_device *dev, const char *newname)
 881{
 882	char oldname[IFNAMSIZ];
 883	int err = 0;
 884	int ret;
 885	struct net *net;
 886
 887	ASSERT_RTNL();
 888	BUG_ON(!dev_net(dev));
 889
 890	net = dev_net(dev);
 891	if (dev->flags & IFF_UP)
 892		return -EBUSY;
 893
 894	if (!dev_valid_name(newname))
 895		return -EINVAL;
 896
 897	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 898		return 0;
 899
 900	memcpy(oldname, dev->name, IFNAMSIZ);
 901
 902	if (strchr(newname, '%')) {
 903		err = dev_alloc_name(dev, newname);
 904		if (err < 0)
 905			return err;
 906	}
 907	else if (__dev_get_by_name(net, newname))
 908		return -EEXIST;
 909	else
 910		strlcpy(dev->name, newname, IFNAMSIZ);
 911
 912rollback:
 913	/* For now only devices in the initial network namespace
 914	 * are in sysfs.
 915	 */
 916	if (net == &init_net) {
 917		ret = device_rename(&dev->dev, dev->name);
 918		if (ret) {
 919			memcpy(dev->name, oldname, IFNAMSIZ);
 920			return ret;
 921		}
 922	}
 923
 924	write_lock_bh(&dev_base_lock);
 925	hlist_del(&dev->name_hlist);
 926	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 927	write_unlock_bh(&dev_base_lock);
 928
 929	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
 930	ret = notifier_to_errno(ret);
 931
 932	if (ret) {
 933		if (err) {
 934			printk(KERN_ERR
 935			       "%s: name change rollback failed: %d.\n",
 936			       dev->name, ret);
 937		} else {
 938			err = ret;
 939			memcpy(dev->name, oldname, IFNAMSIZ);
 940			goto rollback;
 941		}
 942	}
 943
 944	return err;
 945}
 946
 947/**
 948 *	dev_set_alias - change ifalias of a device
 949 *	@dev: device
 950 *	@alias: name up to IFALIASZ
 951 *	@len: limit of bytes to copy from info
 952 *
 953 *	Set ifalias for a device,
 954 */
 955int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
 956{
 957	ASSERT_RTNL();
 958
 959	if (len >= IFALIASZ)
 960		return -EINVAL;
 961
 962	if (!len) {
 963		if (dev->ifalias) {
 964			kfree(dev->ifalias);
 965			dev->ifalias = NULL;
 966		}
 967		return 0;
 968	}
 969
 970	dev->ifalias = krealloc(dev->ifalias, len+1, GFP_KERNEL);
 971	if (!dev->ifalias)
 972		return -ENOMEM;
 973
 974	strlcpy(dev->ifalias, alias, len+1);
 975	return len;
 976}
 977
 978
 979/**
 980 *	netdev_features_change - device changes features
 981 *	@dev: device to cause notification
 982 *
 983 *	Called to indicate a device has changed features.
 984 */
 985void netdev_features_change(struct net_device *dev)
 986{
 987	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
 988}
 989EXPORT_SYMBOL(netdev_features_change);
 990
 991/**
 992 *	netdev_state_change - device changes state
 993 *	@dev: device to cause notification
 994 *
 995 *	Called to indicate a device has changed state. This function calls
 996 *	the notifier chains for netdev_chain and sends a NEWLINK message
 997 *	to the routing socket.
 998 */
 999void netdev_state_change(struct net_device *dev)
1000{
1001	if (dev->flags & IFF_UP) {
1002		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1003		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1004	}
1005}
1006
/**
 *	netdev_bonding_change - announce a bonding failover
 *	@dev: device to cause notification
 *
 *	Runs the netdev notifier chain with a %NETDEV_BONDING_FAILOVER event
 *	for @dev; the value returned by the chain is ignored.
 */
void netdev_bonding_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);
1012
1013/**
1014 *	dev_load 	- load a network module
1015 *	@net: the applicable net namespace
1016 *	@name: name of interface
1017 *
1018 *	If a network interface is not present and the process has suitable
1019 *	privileges this function loads the module. If module loading is not
1020 *	available in this kernel then it becomes a nop.
1021 */
1022
1023void dev_load(struct net *net, const char *name)
1024{
1025	struct net_device *dev;
1026
1027	read_lock(&dev_base_lock);
1028	dev = __dev_get_by_name(net, name);
1029	read_unlock(&dev_base_lock);
1030
1031	if (!dev && capable(CAP_SYS_MODULE))
1032		request_module("%s", name);
1033}
1034
1035/**
1036 *	dev_open	- prepare an interface for use.
1037 *	@dev:	device to open
1038 *
1039 *	Takes a device from down to up state. The device's private open
1040 *	function is invoked and then the multicast lists are loaded. Finally
1041 *	the device is moved into the up state and a %NETDEV_UP message is
1042 *	sent to the netdev notifier chain.
1043 *
1044 *	Calling this function on an active interface is a nop. On a failure
1045 *	a negative errno code is returned.
1046 */
1047int dev_open(struct net_device *dev)
1048{
1049	const struct net_device_ops *ops = dev->netdev_ops;
1050	int ret = 0;
1051
1052	ASSERT_RTNL();
1053
1054	/*
1055	 *	Is it already up?
1056	 */
1057
1058	if (dev->flags & IFF_UP)
1059		return 0;
1060
1061	/*
1062	 *	Is it even present?
1063	 */
1064	if (!netif_device_present(dev))
1065		return -ENODEV;
1066
1067	/*
1068	 *	Call device private open method
1069	 */
1070	set_bit(__LINK_STATE_START, &dev->state);
1071
1072	if (ops->ndo_validate_addr)
1073		ret = ops->ndo_validate_addr(dev);
1074
1075	if (!ret && ops->ndo_open)
1076		ret = ops->ndo_open(dev);
1077
1078	/*
1079	 *	If it went open OK then:
1080	 */
1081
1082	if (ret)
1083		clear_bit(__LINK_STATE_START, &dev->state);
1084	else {
1085		/*
1086		 *	Set the flags.
1087		 */
1088		dev->flags |= IFF_UP;
1089
1090		/*
1091		 *	Enable NET_DMA
1092		 */
1093		net_dmaengine_get();
1094
1095		/*
1096		 *	Initialize multicasting status
1097		 */
1098		dev_set_rx_mode(dev);
1099
1100		/*
1101		 *	Wakeup transmit queue engine
1102		 */
1103		dev_activate(dev);
1104
1105		/*
1106		 *	... and announce new interface.
1107		 */
1108		call_netdevice_notifiers(NETDEV_UP, dev);
1109	}
1110
1111	return ret;
1112}
1113
/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 *
 *	Calling this on a device that is not up is a nop.  Always returns 0.
 *	Must be called under RTNL; the function may sleep.
 */
int dev_close(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	ASSERT_RTNL();

	might_sleep();

	if (!(dev->flags & IFF_UP))
		return 0;

	/*
	 *	Tell people we are going down, so that they can
	 *	prepare for it while the device is still operating.
	 */
	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

	clear_bit(__LINK_STATE_START, &dev->state);

	/* Synchronize to scheduled poll. We cannot touch poll list,
	 * it can be even on different cpu. So just clear netif_running().
	 *
	 * dev->stop() will invoke napi_disable() on all of its
	 * napi_struct instances on this device.
	 */
	smp_mb__after_clear_bit(); /* Commit netif_running(). */

	dev_deactivate(dev);

	/*
	 *	Call the device specific close. This cannot fail.
	 *	Only if device is UP
	 *
	 *	We allow it to be called even after a DETACH hot-plug
	 *	event.
	 */
	if (ops->ndo_stop)
		ops->ndo_stop(dev);

	/*
	 *	Device is now down.
	 */

	dev->flags &= ~IFF_UP;

	/*
	 * Tell people we are down
	 */
	call_netdevice_notifiers(NETDEV_DOWN, dev);

	/*
	 *	Shutdown NET_DMA
	 */
	net_dmaengine_put();

	return 0;
}
1179
1180
1181/**
1182 *	dev_disable_lro - disable Large Receive Offload on a device
1183 *	@dev: device
1184 *
1185 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1186 *	called under RTNL.  This is needed if received packets may be
1187 *	forwarded to another interface.
1188 */
1189void dev_disable_lro(struct net_device *dev)
1190{
1191	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1192	    dev->ethtool_ops->set_flags) {
1193		u32 flags = dev->ethtool_ops->get_flags(dev);
1194		if (flags & ETH_FLAG_LRO) {
1195			flags &= ~ETH_FLAG_LRO;
1196			dev->ethtool_ops->set_flags(dev, flags);
1197		}
1198	}
1199	WARN_ON(dev->features & NETIF_F_LRO);
1200}
1201EXPORT_SYMBOL(dev_disable_lro);
1202
1203
/*
 * Initialised to 1.  While it is non-zero, register_netdevice_notifier()
 * does not replay events for existing devices to a new notifier.
 */
static int dev_boot_phase = 1;
1205
1206/*
1207 *	Device change register/unregister. These are not inline or static
1208 *	as we export them to the world.
1209 */
1210
1211/**
1212 *	register_netdevice_notifier - register a network notifier block
1213 *	@nb: notifier
1214 *
1215 *	Register a notifier to be called when network device events occur.
1216 *	The notifier passed is linked into the kernel structures and must
1217 *	not be reused until it has been unregistered. A negative errno code
1218 *	is returned on a failure.
1219 *
1220 * 	When registered all registration and up events are replayed
1221 *	to the new notifier to allow device to have a race free
1222 *	view of the network device list.
1223 */
1224
1225int register_netdevice_notifier(struct notifier_block *nb)
1226{
1227	struct net_device *dev;
1228	struct net_device *last;
1229	struct net *net;
1230	int err;
1231
1232	rtnl_lock();
1233	err = raw_notifier_chain_register(&netdev_chain, nb);
1234	if (err)
1235		goto unlock;
1236	if (dev_boot_phase)
1237		goto unlock;
1238	for_each_net(net) {
1239		for_each_netdev(net, dev) {
1240			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1241			err = notifier_to_errno(err);
1242			if (err)
1243				goto rollback;
1244
1245			if (!(dev->flags & IFF_UP))
1246				continue;
1247
1248			nb->notifier_call(nb, NETDEV_UP, dev);
1249		}
1250	}
1251
1252unlock:
1253	rtnl_unlock();
1254	return err;
1255
1256rollback:
1257	last = dev;
1258	for_each_net(net) {
1259		for_each_netdev(net, dev) {
1260			if (dev == last)
1261				break;
1262
1263			if (dev->flags & IFF_UP) {
1264				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1265				nb->notifier_call(nb, NETDEV_DOWN, dev);
1266			}
1267			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1268		}
1269	}
1270
1271	raw_notifier_chain_unregister(&netdev_chain, nb);
1272	goto unlock;
1273}
1274
/**
 *	unregister_netdevice_notifier - unregister a network notifier block
 *	@nb: notifier
 *
 *	Unregister a notifier previously registered by
 *	register_netdevice_notifier(). The notifier is unlinked from the
 *	kernel structures and may then be reused. A negative errno code
 *	is returned on a failure.
 *
 *	The RTNL lock is taken around the removal from the chain.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
	int err;

	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	rtnl_unlock();
	return err;
}
1294
/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *      @val: value passed unmodified to notifier function
 *      @dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain(), which is invoked on
 *	netdev_chain.
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	return raw_notifier_call_chain(&netdev_chain, val, dev);
}
1308
/*
 * When > 0 there are consumers of rx skb time stamps.  Raised by
 * net_enable_timestamp(), lowered by net_disable_timestamp() and
 * consulted by net_timestamp().
 */
static atomic_t netstamp_needed = ATOMIC_INIT(0);
1311
/* Register one more consumer of skb time stamps (bumps netstamp_needed). */
void net_enable_timestamp(void)
{
	atomic_inc(&netstamp_needed);
}
1316
/* Drop one consumer of skb time stamps (decrements netstamp_needed). */
void net_disable_timestamp(void)
{
	atomic_dec(&netstamp_needed);
}
1321
1322static inline void net_timestamp(struct sk_buff *skb)
1323{
1324	if (atomic_read(&netstamp_needed))
1325		__net_timestamp(skb);
1326	else
1327		skb->tstamp.tv64 = 0;
1328}
1329
1330/*
1331 *	Support routine. Sends outgoing frames to any network
1332 *	taps currently in use.
1333 */
1334
1335static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1336{
1337	struct packet_type *ptype;
1338
1339#ifdef CONFIG_NET_CLS_ACT
1340	if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
1341		net_timestamp(skb);
1342#else
1343	net_timestamp(skb);
1344#endif
1345
1346	rcu_read_lock();
1347	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1348		/* Never send packets back to the socket
1349		 * they originated from - MvS (miquels@drinkel.ow.org)
1350		 */
1351		if ((ptype->dev == dev || !ptype->dev) &&
1352		    (ptype->af_packet_priv == NULL ||
1353		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1354			struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1355			if (!skb2)
1356				break;
1357
1358			/* skb->nh should be correctly
1359			   set by sender, so that the second statement is
1360			   just protection against buggy protocols.
1361			 */
1362			skb_reset_mac_header(skb2);
1363
1364			if (skb_network_header(skb2) < skb2->data ||
1365			    skb2->network_header > skb2->tail) {
1366				if (net_ratelimit())
1367					printk(KERN_CRIT "protocol %04x is "
1368					       "buggy, dev %s\n",
1369					       skb2->protocol, dev->name);
1370				skb_reset_network_header(skb2);
1371			}
1372
1373			skb2->transport_header = skb2->network_header;
1374			skb2->pkt_type = PACKET_OUTGOING;
1375			ptype->func(skb2, skb->dev, ptype, skb->dev);
1376		}
1377	}
1378	rcu_read_unlock();
1379}
1380
1381
/*
 * Push @q onto the head of this CPU's softnet output_queue list and raise
 * NET_TX_SOFTIRQ.  Local interrupts are disabled around the list update.
 */
static inline void __netif_reschedule(struct Qdisc *q)
{
	struct softnet_data *sd;
	unsigned long flags;

	local_irq_save(flags);
	sd = &__get_cpu_var(softnet_data);
	q->next_sched = sd->output_queue;
	sd->output_queue = q;
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}
1394
1395void __netif_schedule(struct Qdisc *q)
1396{
1397	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1398		__netif_reschedule(q);
1399}
1400EXPORT_SYMBOL(__netif_schedule);
1401
1402void dev_kfree_skb_irq(struct sk_buff *skb)
1403{
1404	if (atomic_dec_and_test(&skb->users)) {
1405		struct softnet_data *sd;
1406		unsigned long flags;
1407
1408		local_irq_save(flags);
1409		sd = &__get_cpu_var(softnet_data);
1410		skb->next = sd->completion_queue;
1411		sd->completion_queue = skb;
1412		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1413		local_irq_restore(flags);
1414	}
1415}
1416EXPORT_SYMBOL(dev_kfree_skb_irq);
1417
/*
 * Free @skb from any context: uses dev_kfree_skb_irq() when running in
 * hard interrupt context or with interrupts disabled, dev_kfree_skb()
 * otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (!in_irq() && !irqs_disabled())
		dev_kfree_skb(skb);
	else
		dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1426
1427
1428/**
1429 * netif_device_detach - mark device as removed
1430 * @dev: network device
1431 *
1432 * Mark device as removed from system and therefore no longer available.
1433 */
1434void netif_device_detach(struct net_device *dev)
1435{
1436	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1437	    netif_running(dev)) {
1438		netif_tx_stop_all_queues(dev);
1439	}
1440}
1441EXPORT_SYMBOL(netif_device_detach);
1442
1443/**
1444 * netif_device_attach - mark device as attached
1445 * @dev: network device
1446 *
1447 * Mark device as attached from system and restart if needed.
1448 */
1449void netif_device_attach(struct net_device *dev)
1450{
1451	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1452	    netif_running(dev)) {
1453		netif_tx_wake_all_queues(dev);
1454		__netdev_watchdog_up(dev);
1455	}
1456}
1457EXPORT_SYMBOL(netif_device_attach);
1458
1459static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1460{
1461	return ((features & NETIF_F_GEN_CSUM) ||
1462		((features & NETIF_F_IP_CSUM) &&
1463		 protocol == htons(ETH_P_IP)) ||
1464		((features & NETIF_F_IPV6_CSUM) &&
1465		 protocol == htons(ETH_P_IPV6)) ||
1466		((features & NETIF_F_FCOE_CRC) &&
1467		 protocol == htons(ETH_P_FCOE)));
1468}
1469
1470static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1471{
1472	if (can_checksum_protocol(dev->features, skb->protocol))
1473		return true;
1474
1475	if (skb->protocol == htons(ETH_P_8021Q)) {
1476		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1477		if (can_checksum_protocol(dev->features & dev->vlan_features,
1478					  veh->h_vlan_encapsulated_proto))
1479			return true;
1480	}
1481
1482	return false;
1483}
1484
/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 *
 * For CHECKSUM_COMPLETE packets and for GSO packets nothing is computed;
 * ip_summed is simply reset to CHECKSUM_NONE.  Otherwise the checksum over
 * the data starting at csum_start is computed, folded and stored at
 * csum_offset behind that start.
 *
 * Returns 0 on success, or the error from pskb_expand_head() when a cloned
 * header had to be made writable and that failed (ip_summed is left
 * untouched in that case).
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	if (unlikely(skb_shinfo(skb)->gso_size)) {
		/* Let GSO fix up the checksum. */
		goto out_set_summed;
	}

	/* csum_start is relative to skb->head; make it relative to data. */
	offset = skb->csum_start - skb_headroom(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	/* From here on, offset addresses the checksum field itself. */
	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	/* Get a private copy of the header if we may not write to it. */
	if (skb_cloned(skb) &&
	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}
1522
/**
 *	skb_gso_segment - Perform segmentation on skb.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *
 *	This function segments the given skb and returns a list of segments.
 *
 *	It may return NULL if the skb requires no segmentation.  This is
 *	only possible when GSO is used for verifying header integrity.
 *
 *	An ERR_PTR() is returned on failure; -EPROTONOSUPPORT is the result
 *	when no packet type with a gso_segment handler matches the skb's
 *	protocol.
 */
struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
{
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_type *ptype;
	__be16 type = skb->protocol;
	int err;

	/* Record the MAC header length and step over the MAC header. */
	skb_reset_mac_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;
	__skb_pull(skb, skb->mac_len);

	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
		struct net_device *dev = skb->dev;
		struct ethtool_drvinfo info = {};

		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
			dev->ethtool_ops->get_drvinfo(dev, &info);

		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
			"ip_summed=%d",
		     info.driver, dev ? dev->features : 0L,
		     skb->sk ? skb->sk->sk_route_caps : 0L,
		     skb->len, skb->data_len, skb->ip_summed);

		if (skb_header_cloned(skb) &&
		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
			return ERR_PTR(err);
	}

	rcu_read_lock();
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
				err = ptype->gso_send_check(skb);
				segs = ERR_PTR(err);
				if (err || skb_gso_ok(skb, features))
					break;
				/* Move data back to the network header. */
				__skb_push(skb, (skb->data -
						 skb_network_header(skb)));
			}
			segs = ptype->gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	/* Restore skb->data to the MAC header for the caller. */
	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}

EXPORT_SYMBOL(skb_gso_segment);
1586
/*
 * Take action when hardware reception checksum errors are detected:
 * log a rate-limited error naming the device and dump the stack.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	const char *name;

	if (!net_ratelimit())
		return;

	name = dev ? dev->name : "<unknown>";
	printk(KERN_ERR "%s: hw csum failure.\n", name);
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1599
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 if @skb has a page fragment in high memory while @dev does
 * not advertise NETIF_F_HIGHDMA, 0 otherwise (always 0 without
 * CONFIG_HIGHMEM).
 */

static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		struct skb_shared_info *shinfo = skb_shinfo(skb);
		int i;

		for (i = 0; i < shinfo->nr_frags; i++) {
			if (PageHighMem(shinfo->frags[i].page))
				return 1;
		}
	}
#endif
	return 0;
}
1620
/*
 * Private data kept in skb->cb of an skb that dev_gso_segment() has
 * segmented.
 */
struct dev_gso_cb {
	/* The skb's own destructor, saved while ours is installed. */
	void (*destructor)(struct sk_buff *skb);
};

/* View the control buffer of @skb as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1626
1627static void dev_gso_skb_destructor(struct sk_buff *skb)
1628{
1629	struct dev_gso_cb *cb;
1630
1631	do {
1632		struct sk_buff *nskb = skb->next;
1633
1634		skb->next = nskb->next;
1635		nskb->next = NULL;
1636		kfree_skb(nskb);
1637	} while (skb->next);
1638
1639	cb = DEV_GSO_CB(skb);
1640	if (cb->destructor)
1641		cb->destructor(skb);
1642}
1643
1644/**
1645 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1646 *	@skb: buffer to segment
1647 *
1648 *	This function segments the given skb and stores the list of segments
1649 *	in skb->next.
1650 */
1651static int dev_gso_segment(struct sk_buff *skb)
1652{
1653	struct net_device *dev = skb->dev;
1654	struct sk_buff *segs;
1655	int features = dev->features & ~(illegal_highdma(dev, skb) ?
1656					 NETIF_F_SG : 0);
1657
1658	segs = skb_gso_segment(skb, features);
1659
1660	/* Verifying header integrity only. */
1661	if (!segs)
1662		return 0;
1663
1664	if (IS_ERR(segs))
1665		return PTR_ERR(segs);
1666
1667	skb->next = segs;
1668	DEV_GSO_CB(skb)->destructor = skb->destructor;
1669	skb->destructor = dev_gso_skb_destructor;
1670
1671	return 0;
1672}
1673
/*
 * Hand @skb to the driver of @dev through ndo_start_xmit.
 *
 * An skb without a segment list (skb->next == NULL) is first shown to the
 * network taps if any are registered, then software-segmented if the
 * device needs GSO for it.  If no segment list results, the skb itself is
 * passed to the driver and the driver's return code is returned.
 *
 * Otherwise the segments chained on skb->next are sent one by one.  When
 * the driver rejects a segment it is linked back to the head of the list
 * and the driver's code is returned; when @txq gets stopped while segments
 * remain, NETDEV_TX_BUSY is returned.  Once all segments are gone the
 * original destructor is restored, @skb is freed and 0 is returned.
 *
 * If software segmentation fails, @skb is freed and 0 is returned.
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
			struct netdev_queue *txq)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int rc;

	if (likely(!skb->next)) {
		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(skb, dev);

		if (netif_needs_gso(dev, skb)) {
			if (unlikely(dev_gso_segment(skb)))
				goto out_kfree_skb;
			if (skb->next)
				goto gso;
		}

		rc = ops->ndo_start_xmit(skb, dev);
		/*
		 * TODO: if skb_orphan() was called by
		 * dev->hard_start_xmit() (for example, the unmodified
		 * igb driver does that; bnx2 doesn't), then
		 * skb_tx_software_timestamp() will be unable to send
		 * back the time stamp.
		 *
		 * How can this be prevented? Always create another
		 * reference to the socket before calling
		 * dev->hard_start_xmit()? Prevent that skb_orphan()
		 * does anything in dev->hard_start_xmit() by clearing
		 * the skb destructor before the call and restoring it
		 * afterwards, then doing the skb_orphan() ourselves?
		 */
		return rc;
	}

gso:
	do {
		/* Detach the first pending segment from the list. */
		struct sk_buff *nskb = skb->next;

		skb->next = nskb->next;
		nskb->next = NULL;
		rc = ops->ndo_start_xmit(nskb, dev);
		if (unlikely(rc)) {
			/* Not taken: put the segment back for a retry. */
			nskb->next = skb->next;
			skb->next = nskb;
			return rc;
		}
		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
			return NETDEV_TX_BUSY;
	} while (skb->next);

	/* All segments sent: undo what dev_gso_segment() installed. */
	skb->destructor = DEV_GSO_CB(skb)->destructor;

out_kfree_skb:
	kfree_skb(skb);
	return 0;
}
1731
/* Initial value mixed into the jhash computed by skb_tx_hash(). */
static u32 skb_tx_hashrnd;
1733
1734u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
1735{
1736	u32 hash;
1737
1738	if (skb_rx_queue_recorded(skb)) {
1739		hash = skb_get_rx_queue(skb);
1740	} else if (skb->sk && skb->sk->sk_hash) {
1741		hash = skb->sk->sk_hash;
1742	} else
1743		hash = skb->protocol;
1744
1745	hash = jhash_1word(hash, skb_tx_hashrnd);
1746
1747	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
1748}
1749EXPORT_SYMBOL(skb_tx_hash);
1750
1751static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1752					struct sk_buff *skb)
1753{
1754	const struct net_device_ops *ops = dev->netdev_ops;
1755	u16 queue_index = 0;
1756
1757	if (ops->ndo_select_queue)
1758		queue_index = ops->ndo_select_queue(dev, skb);
1759	else if (dev->real_num_tx_queues > 1)
1760		queue_index = skb_tx_hash(dev, skb);
1761
1762	skb_set_queue_mapping(skb, queue_index);
1763	return netdev_get_tx_queue(dev, queue_index);
1764}
1765
/**
 *	dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 */
int dev_queue_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;

	/* GSO will handle the following emulations directly. */
	if (netif_needs_gso(dev, skb))
		goto gso;

	/* Linearize a frag_list skb if the device cannot take one. */
	if (skb_shinfo(skb)->frag_list &&
	    !(dev->features & NETIF_F_FRAGLIST) &&
	    __skb_linearize(skb))
		goto out_kfree_skb;

	/* Fragmented skb is linearized if device does not support SG,
	 * or if at least one of fragments is in highmem and device
	 * does not support DMA from it.
	 */
	if (skb_shinfo(skb)->nr_frags &&
	    (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
	    __skb_linearize(skb))
		goto out_kfree_skb;

	/* If packet is not checksummed and device does not support
	 * checksumming for this protocol, complete checksumming here.
	 */
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		skb_set_transport_header(skb, skb->csum_start -
					      skb_headroom(skb));
		if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
			goto out_kfree_skb;
	}

gso:
	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	txq = dev_pick_tx(dev, skb);
	q = rcu_dereference(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
#endif
	/* The qdisc has a queue: enqueue under its root lock and run it. */
	if (q->enqueue) {
		spinlock_t *root_lock = qdisc_lock(q);

		spin_lock(root_lock);

		if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
			kfree_skb(skb);
			rc = NET_XMIT_DROP;
		} else {
			rc = qdisc_enqueue_root(skb, q);
			qdisc_run(q);
		}
		spin_unlock(root_lock);

		goto out;
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all the sorts of tunnels...

	   Really, it is unlikely that netif_tx_lock protection is necessary
	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	   counters.)
	   However, it is possible, that they rely on protection
	   made by us here.

	   Check this and shot the lock. It is not prone from deadlocks.
	   Either shot noqueue qdisc, it is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		/* Owning the xmit lock already means we are recursing. */
		if (txq->xmit_lock_owner != cpu) {

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_tx_queue_stopped(txq)) {
				rc = 0;
				if (!dev_hard_start_xmit(skb, dev, txq)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			if (net_ratelimit())
				printk(KERN_CRIT "Virtual device %s asks to "
				       "queue packet!\n", dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately */
			if (net_ratelimit())
				printk(KERN_CRIT "Dead loop on virtual device "
				       "%s, fix it urgently!\n", dev->name);
		}
	}

	/* Device down, transmit failed or recursion: drop the packet. */
	rc = -ENETDOWN;
	rcu_read_unlock_bh();

out_kfree_skb:
	kfree_skb(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
1904
1905
1906/*=======================================================================
1907			Receiver routines
1908  =======================================================================*/
1909
/* netif_rx() drops packets once a CPU's input_pkt_queue is longer than this. */
int netdev_max_backlog __read_mostly = 1000;
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */

/* Per-CPU receive counters; netif_rx() updates .total and .dropped. */
DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1915
1916
1917/**
1918 *	netif_rx	-	post buffer to the network code
1919 *	@skb: buffer to post
1920 *
1921 *	This function receives a packet from a device driver and queues it for
1922 *	the upper (protocol) levels to process.  It always succeeds. The buffer
1923 *	may be dropped during processing for congestion control or by the
1924 *	protocol layers.
1925 *
1926 *	return values:
1927 *	NET_RX_SUCCESS	(no congestion)
1928 *	NET_RX_DROP     (packet was dropped)
1929 *
1930 */
1931
1932int netif_rx(struct sk_buff *skb)
1933{
1934	struct softnet_data *queue;
1935	unsigned long flags;
1936
1937	/* if netpoll wants it, pretend we never saw it */
1938	if (netpoll_rx(skb))
1939		return NET_RX_DROP;
1940
1941	if (!skb->tstamp.tv64)
1942		net_timestamp(skb);
1943
1944	/*
1945	 * The code is rearranged so that the path is the most
1946	 * short when CPU is congested, but is still operating.
1947	 */
1948	local_irq_save(flags);
1949	queue = &__get_cpu_var(softnet_data);
1950
1951	__get_cpu_var(netdev_rx_stat).total++;
1952	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1953		if (queue->input_pkt_queue.qlen) {
1954enqueue:
1955			__skb_queue_tail(&queue->input_pkt_queue, skb);
1956			local_irq_restore(flags);
1957			return NET_RX_SUCCESS;
1958		}
1959
1960		napi_schedule(&queue->backlog);
1961		goto enqueue;
1962	}
1963
1964	__get_cpu_var(netdev_rx_stat).dropped++;
1965	local_irq_restore(flags);
1966
1967	kfree_skb(skb);
1968	return NET_RX_DROP;
1969}
1970
/*
 * Variant of netif_rx() that, with preemption disabled, posts @skb and
 * then runs any pending softirqs itself.  Returns what netif_rx()
 * returned.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int err;

	preempt_disable();
	err = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();
	preempt_enable();

	return err;
}

EXPORT_SYMBOL(netif_rx_ni);
1985
/*
 * Transmit softirq handler.  It does two things for the current CPU:
 * frees the skbs parked on completion_queue, and runs the qdiscs that
 * were put on output_queue.
 */
static void net_tx_action(struct softirq_action *h)
{
	struct softnet_data *sd = &__get_cpu_var(softnet_data);

	if (sd->completion_queue) {
		struct sk_buff *clist;

		/* Take the whole list private with interrupts off. */
		local_irq_disable();
		clist = sd->completion_queue;
		sd->completion_queue = NULL;
		local_irq_enable();

		while (clist) {
			struct sk_buff *skb = clist;
			clist = clist->next;

			WARN_ON(atomic_read(&skb->users));
			__kfree_skb(skb);
		}
	}

	if (sd->output_queue) {
		struct Qdisc *head;

		/* Take the whole list private with interrupts off. */
		local_irq_disable();
		head = sd->output_queue;
		sd->output_queue = NULL;
		local_irq_enable();

		while (head) {
			struct Qdisc *q = head;
			spinlock_t *root_lock;

			head = head->next_sched;

			root_lock = qdisc_lock(q);
			if (spin_trylock(root_lock)) {
				smp_mb__before_clear_bit();
				clear_bit(__QDISC_STATE_SCHED,
					  &q->state);
				qdisc_run(q);
				spin_unlock(root_lock);
			} else {
				/*
				 * Lock is busy: try again later, unless the
				 * qdisc has been deactivated, in which case
				 * it is just marked as no longer scheduled.
				 */
				if (!test_bit(__QDISC_STATE_DEACTIVATED,
					      &q->state)) {
					__netif_reschedule(q);
				} else {
					smp_mb__before_clear_bit();
					clear_bit(__QDISC_STATE_SCHED,
						  &q->state);
				}
			}
		}
	}
}
2041
/*
 * Take an extra reference on @skb and pass it to the handler of
 * @pt_prev.  Returns whatever the handler returns.
 */
static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	atomic_inc(&skb->users);
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
2049
#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
/* These hooks defined here for ATM */
struct net_bridge;
struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
						unsigned char *addr);
void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly;

/*
 * If bridge module is loaded call bridging hook.
 *  returns NULL if packet was consumed.
 *
 * Loopback packets and packets from a device without a bridge port are
 * returned untouched.  A pending packet type in *pt_prev is delivered
 * (and cleared) before the hook runs.
 */
struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
					struct sk_buff *skb) __read_mostly;
static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
					    struct packet_type **pt_prev, int *ret,
					    struct net_device *orig_dev)
{
	struct net_bridge_port *port;

	if (skb->pkt_type == PACKET_LOOPBACK)
		return skb;

	port = rcu_dereference(skb->dev->br_port);
	if (port == NULL)
		return skb;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	return br_handle_frame_hook(port, skb);
}
#else
#define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
#endif
2083
#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);

/*
 * Pass @skb to the macvlan hook if its device has a macvlan port;
 * otherwise return it untouched.  A pending packet type in *pt_prev is
 * delivered (and cleared) before the hook runs.
 */
static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
					     struct packet_type **pt_prev,
					     int *ret,
					     struct net_device *orig_dev)
{
	struct packet_type *pending;

	if (!skb->dev->macvlan_port)
		return skb;

	pending = *pt_prev;
	if (pending) {
		*ret = deliver_skb(skb, pending, orig_dev);
		*pt_prev = NULL;
	}

	return macvlan_handle_frame_hook(skb);
}
#else
#define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
#endif
2105
2106#ifdef CONFIG_NET_CLS_ACT
/* TODO: Maybe we should just force sch_ingress to be compiled in
 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
 * instructions (a compare and 2 extra stores) right now if we don't
 * have it on but do have CONFIG_NET_CLS_ACT.
 * NOTE: This doesn't stop any functionality; if you don't have
 * the ingress scheduler, you just can't add policies on ingress.
 *
 */
2115static int ing_filter(struct sk_buff *skb)
2116{
2117	struct net_device *dev = skb->dev;
2118	u32 ttl = G_TC_RTTL(skb->tc_verd);
2119	struct netdev_queue *rxq;
2120	int result = TC_ACT_OK;
2121	struct Qdisc *q;
2122
2123	if (MAX_RED_LOOP < ttl++) {
2124		printk(KERN_WARNING
2125		       "Redir loop detected Dropping packet (%d->%d)\n",
2126		       skb->iif, dev->ifindex);
2127		return TC_ACT_SHOT;
2128	}
2129
2130	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2131	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2132
2133	rxq = &dev->rx_queue;
2134
2135	q = rxq->qdisc;
2136	if (q != &noop_qdisc) {
2137		spin_lock(qdisc_lock(q));
2138		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2139			result = qdisc_enqueue_root(skb, q);
2140		spin_unlock(qdisc_lock(q));
2141	}
2142
2143	return result;
2144}
2145
2146static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2147					 struct packet_type **pt_prev,
2148					 int *ret, struct net_device *orig_dev)
2149{
2150	if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2151		goto out;
2152
2153	if (*pt_prev) {
2154		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2155		*pt_prev = NULL;
2156	} else {
2157		/* Huh? Why does turning on AF_PACKET affect this? */
2158		skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2159	}
2160
2161	switch (ing_filter(skb)) {
2162	case TC_ACT_SHOT:
2163	case TC_ACT_STOLEN:
2164		kfree_skb(skb);
2165		return NULL;
2166	}
2167
2168out:
2169	skb->tc_verd = 0;
2170	return skb;
2171}
2172#endif
2173
2174/*
2175 * 	netif_nit_deliver - deliver received packets to network taps
2176 * 	@skb: buffer
2177 *
2178 * 	This function is used to deliver incoming packets to network
2179 * 	taps. It should be used when the normal netif_receive_skb path
2180 * 	is bypassed, for example because of VLAN acceleration.
2181 */
2182void netif_nit_deliver(struct sk_buff *skb)
2183{
2184	struct packet_type *ptype;
2185
2186	if (list_empty(&ptype_all))
2187		return;
2188
2189	skb_reset_network_header(skb);
2190	skb_reset_transport_header(skb);
2191	skb->mac_len = skb->network_header - skb->mac_header;
2192
2193	rcu_read_lock();
2194	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2195		if (!ptype->dev || ptype->dev == skb->dev)
2196			deliver_skb(skb, ptype, skb->dev);
2197	}
2198	rcu_read_unlock();
2199}
2200
2201/**
2202 *	netif_receive_skb - process receive buffer from network
2203 *	@skb: buffer to process
2204 *
2205 *	netif_receive_skb() is the main receive data processing function.
2206 *	It always succeeds. The buffer may be dropped during processing
2207 *	for congestion control or by the protocol layers.
2208 *
2209 *	This function may only be called from softirq context and interrupts
2210 *	should be enabled.
2211 *
2212 *	Return values (usually ignored):
2213 *	NET_RX_SUCCESS: no congestion
2214 *	NET_RX_DROP: packet was dropped
2215 */
2216int netif_receive_skb(struct sk_buff *skb)
2217{
2218	struct packet_type *ptype, *pt_prev;
2219	struct net_device *orig_dev;
2220	struct net_device *null_or_orig;
2221	int ret = NET_RX_DROP;
2222	__be16 type;
2223
2224	if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
2225		return NET_RX_SUCCESS;
2226
2227	/* if we've gotten here through NAPI, check netpoll */
2228	if (netpoll_receive_skb(skb))
2229		return NET_RX_DROP;
2230
2231	if (!skb->tstamp.tv64)
2232		net_timestamp(skb);
2233
2234	if (!skb->iif)
2235		skb->iif = skb->dev->ifindex;
2236
2237	null_or_orig = NULL;
2238	orig_dev = skb->dev;
2239	if (orig_dev->master) {
2240		if (skb_bond_should_drop(skb))
2241			null_or_orig = orig_dev; /* deliver only exact match */
2242		else
2243			skb->dev = orig_dev->master;
2244	}
2245
2246	__get_cpu_var(netdev_rx_stat).total++;
2247
2248	skb_reset_network_header(skb);
2249	skb_reset_transport_header(skb);
2250	skb->mac_len = skb->network_header - skb->mac_header;
2251
2252	pt_prev = NULL;
2253
2254	rcu_read_lock();
2255
2256#ifdef CONFIG_NET_CLS_ACT
2257	if (skb->tc_verd & TC_NCLS) {
2258		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2259		goto ncls;
2260	}
2261#endif
2262
2263	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2264		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2265		    ptype->dev == orig_dev) {
2266			if (pt_prev)
2267				ret = deliver_skb(skb, pt_prev, orig_dev);
2268			pt_prev = ptype;
2269		}
2270	}
2271
2272#ifdef CONFIG_NET_CLS_ACT
2273	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2274	if (!skb)
2275		goto out;
2276ncls:
2277#endif
2278
2279	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2280	if (!skb)
2281		goto out;
2282	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2283	if (!skb)
2284		goto out;
2285
2286	skb_orphan(skb);
2287
2288	type = skb->protocol;
2289	list_for_each_entry_rcu(ptype,
2290			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2291		if (ptype->type == type &&
2292		    (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2293		     ptype->dev == orig_dev)) {
2294			if (pt_prev)
2295				ret = deliver_skb(skb, pt_prev, orig_dev);
2296			pt_prev = ptype;
2297		}
2298	}
2299
2300	if (pt_prev) {
2301		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2302	} else {
2303		kfree_skb(skb);
2304		/* Jamal, now you will not able to escape explaining
2305		 * me how you were going to use this. :-)
2306		 */
2307		ret = NET_RX_DROP;
2308	}
2309
2310out:
2311	rcu_read_unlock();
2312	return ret;
2313}
2314
2315/* Network device is going away, flush any packets still pending  */
2316static void flush_backlog(void *arg)
2317{
2318	struct net_device *dev = arg;
2319	struct softnet_data *queue = &__get_cpu_var(softnet_data);
2320	struct sk_buff *skb, *tmp;
2321
2322	skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
2323		if (skb->dev == dev) {
2324			__skb_unlink(skb, &queue->input_pkt_queue);
2325			kfree_skb(skb);
2326		}
2327}
2328
2329static int napi_gro_complete(struct sk_buff *skb)
2330{
2331	struct packet_type *ptype;
2332	__be16 type = skb->protocol;
2333	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2334	int err = -ENOENT;
2335
2336	if (NAPI_GRO_CB(skb)->count == 1) {
2337		skb_shinfo(skb)->gso_size = 0;
2338		goto out;
2339	}
2340
2341	rcu_read_lock();
2342	list_for_each_entry_rcu(ptype, head, list) {
2343		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
2344			continue;
2345
2346		err = ptype->gro_complete(skb);
2347		break;
2348	}
2349	rcu_read_unlock();
2350
2351	if (err) {
2352		WARN_ON(&ptype->list == head);
2353		kfree_skb(skb);
2354		return NET_RX_SUCCESS;
2355	}
2356
2357out:
2358	return netif_receive_skb(skb);
2359}
2360
2361void napi_gro_flush(struct napi_struct *napi)
2362{
2363	struct sk_buff *skb, *next;
2364
2365	for (skb = napi->gro_list; skb; skb = next) {
2366		next = skb->next;
2367		skb->next = NULL;
2368		napi_gro_complete(skb);
2369	}
2370
2371	napi->gro_count = 0;
2372	napi->gro_list = NULL;
2373}
2374EXPORT_SYMBOL(napi_gro_flush);
2375
2376void *skb_gro_header(struct sk_buff *skb, unsigned int hlen)
2377{
2378	unsigned int offset = skb_gro_offset(skb);
2379
2380	hlen += offset;
2381	if (hlen <= skb_headlen(skb))
2382		return skb->data + offset;
2383
2384	if (unlikely(!skb_shinfo(skb)->nr_frags ||
2385		     skb_shinfo(skb)->frags[0].size <=
2386		     hlen - skb_headlen(skb) ||
2387		     PageHighMem(skb_shinfo(skb)->frags[0].page)))
2388		return pskb_may_pull(skb, hlen) ? skb->data + offset : NULL;
2389
2390	return page_address(skb_shinfo(skb)->frags[0].page) +
2391	       skb_shinfo(skb)->frags[0].page_offset +
2392	       offset - skb_headlen(skb);
2393}
2394EXPORT_SYMBOL(skb_gro_header);
2395
2396int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2397{
2398	struct sk_buff **pp = NULL;
2399	struct packet_type *ptype;
2400	__be16 type = skb->protocol;
2401	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2402	int same_flow;
2403	int mac_len;
2404	int ret;
2405
2406	if (!(skb->dev->features & NETIF_F_GRO))
2407		goto normal;
2408
2409	if (skb_is_gso(skb) || skb_shinfo(skb)->frag_list)
2410		goto normal;
2411
2412	rcu_read_lock();
2413	list_for_each_entry_rcu(ptype, head, list) {
2414		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
2415			continue;
2416
2417		skb_set_network_header(skb, skb_gro_offset(skb));
2418		mac_len = skb->network_header - skb->mac_header;
2419		skb->mac_len = mac_len;
2420		NAPI_GRO_CB(skb)->same_flow = 0;
2421		NAPI_GRO_CB(skb)->flush = 0;
2422		NAPI_GRO_CB(skb)->free = 0;
2423
2424		pp = ptype->gro_receive(&napi->gro_list, skb);
2425		break;
2426	}
2427	rcu_read_unlock();
2428
2429	if (&ptype->list == head)
2430		goto normal;
2431
2432	same_flow = NAPI_GRO_CB(skb)->same_flow;
2433	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
2434
2435	if (pp) {
2436		struct sk_buff *nskb = *pp;
2437
2438		*pp = nskb->next;
2439		nskb->next = NULL;
2440		napi_gro_complete(nskb);
2441		napi->gro_count--;
2442	}
2443
2444	if (same_flow)
2445		goto ok;
2446
2447	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
2448		goto normal;
2449
2450	napi->gro_count++;
2451	NAPI_GRO_CB(skb)->count = 1;
2452	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
2453	skb->next = napi->gro_list;
2454	napi->gro_list = skb;
2455	ret = GRO_HELD;
2456
2457pull:
2458	if (unlikely(!pskb_may_pull(skb, skb_gro_offset(skb)))) {
2459		if (napi->gro_list == skb)
2460			napi->gro_list = skb->next;
2461		ret = GRO_DROP;
2462	}
2463
2464ok:
2465	return ret;
2466
2467normal:
2468	ret = GRO_NORMAL;
2469	goto pull;
2470}
2471EXPORT_SYMBOL(dev_gro_receive);
2472
2473static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2474{
2475	struct sk_buff *p;
2476
2477	if (netpoll_rx_on(skb))
2478		return GRO_NORMAL;
2479
2480	for (p = napi->gro_list; p; p = p->next) {
2481		NAPI_GRO_CB(p)->same_flow = (p->dev == skb->dev)
2482			&& !compare_ether_header(skb_mac_header(p),
2483						 skb_gro_mac_header(skb));
2484		NAPI_GRO_CB(p)->flush = 0;
2485	}
2486
2487	return dev_gro_receive(napi, skb);
2488}
2489
2490int napi_skb_finish(int ret, struct sk_buff *skb)
2491{
2492	int err = NET_RX_SUCCESS;
2493
2494	switch (ret) {
2495	case GRO_NORMAL:
2496		return netif_receive_skb(skb);
2497
2498	case GRO_DROP:
2499		err = NET_RX_DROP;
2500		/* fall through */
2501
2502	case GRO_MERGED_FREE:
2503		kfree_skb(skb);
2504		break;
2505	}
2506
2507	return err;
2508}
2509EXPORT_SYMBOL(napi_skb_finish);
2510
/*
 * GRO entry point for a complete skb: reset its GRO offset, run it
 * through __napi_gro_receive() and act on the verdict.
 */
int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	int verdict;

	skb_gro_reset_offset(skb);
	verdict = __napi_gro_receive(napi, skb);

	return napi_skb_finish(verdict, skb);
}
2517EXPORT_SYMBOL(napi_gro_receive);
2518
2519void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
2520{
2521	__skb_pull(skb, skb_headlen(skb));
2522	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
2523
2524	napi->skb = skb;
2525}
2526EXPORT_SYMBOL(napi_reuse_skb);
2527
2528struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
2529				  struct napi_gro_fraginfo *info)
2530{
2531	struct net_device *dev = napi->dev;
2532	struct sk_buff *skb = napi->skb;
2533	struct ethhdr *eth;
2534	skb_frag_t *frag;
2535	int i;
2536
2537	napi->skb = NULL;
2538
2539	if (!skb) {
2540		skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN);
2541		if (!skb)
2542			goto out;
2543
2544		skb_reserve(skb, NET_IP_ALIGN);
2545	}
2546
2547	BUG_ON(info->nr_frags > MAX_SKB_FRAGS);
2548	frag = info->frags;
2549
2550	for (i = 0; i < info->nr_frags; i++) {
2551		skb_fill_page_desc(skb, i, frag->page, frag->page_offset,
2552				   frag->size);
2553		frag++;
2554	}
2555	skb_shinfo(skb)->nr_frags = info->nr_frags;
2556
2557	skb->data_len = info->len;
2558	skb->len += info->len;
2559	skb->truesize += info->len;
2560
2561	skb_reset_mac_header(skb);
2562	skb_gro_reset_offset(skb);
2563
2564	eth = skb_gro_header(skb, sizeof(*eth));
2565	if (!eth) {
2566		napi_reuse_skb(napi, skb);
2567		skb = NULL;
2568		goto out;
2569	}
2570
2571	skb_gro_pull(skb, sizeof(*eth));
2572
2573	/*
2574	 * This works because the only protocols we care about don't require
2575	 * special handling.  We'll fix it up properly at the end.
2576	 */
2577	skb->protocol = eth->h_proto;
2578
2579	skb->ip_summed = info->ip_summed;
2580	skb->csum = info->csum;
2581
2582out:
2583	return skb;
2584}
2585EXPORT_SYMBOL(napi_fraginfo_skb);
2586
2587int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret)
2588{
2589	int err = NET_RX_SUCCESS;
2590
2591	switch (ret) {
2592	case GRO_NORMAL:
2593	case GRO_HELD:
2594		skb->protocol = eth_type_trans(skb, napi->dev);
2595
2596		if (ret == GRO_NORMAL)
2597			return netif_receive_skb(skb);
2598
2599		skb_gro_pull(skb, -ETH_HLEN);
2600		break;
2601
2602	case GRO_DROP:
2603		err = NET_RX_DROP;
2604		/* fall through */
2605
2606	case GRO_MERGED_FREE:
2607		napi_reuse_skb(napi, skb);
2608		break;
2609	}
2610
2611	return err;
2612}
2613EXPORT_SYMBOL(napi_frags_finish);
2614
2615int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info)
2616{
2617	struct sk_buff *skb = napi_fraginfo_skb(napi, info);
2618
2619	if (!skb)
2620		return NET_RX_DROP;
2621
2622	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
2623}
2624EXPORT_SYMBOL(napi_gro_frags);
2625
2626static int process_backlog(struct napi_struct *napi, int quota)
2627{
2628	int work = 0;
2629	struct softnet_data *queue = &__get_cpu_var(softnet_data);
2630	unsigned long start_time = jiffies;
2631
2632	napi->weight = weight_p;
2633	do {
2634		struct sk_buff *skb;
2635
2636		local_irq_disable();
2637		skb = __skb_dequeue(&queue->input_pkt_queue);
2638		if (!skb) {
2639			__napi_complete(napi);
2640			local_irq_enable();
2641			break;
2642		}
2643		local_irq_enable();
2644
2645		netif_receive_skb(skb);
2646	} while (++work < quota && jiffies == start_time);
2647
2648	return work;
2649}
2650
2651/**
2652 * __napi_schedule - schedule for receive
2653 * @n: entry to schedule
2654 *
2655 * The entry's receive function will be scheduled to run
2656 */
2657void __napi_schedule(struct napi_struct *n)
2658{
2659	unsigned long flags;
2660
2661	local_irq_save(flags);
2662	list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2663	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2664	local_irq_restore(flags);
2665}
2666EXPORT_SYMBOL(__napi_schedule);
2667
2668void __napi_complete(struct napi_struct *n)
2669{
2670	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
2671	BUG_ON(n->gro_list);
2672
2673	list_del(&n->poll_list);
2674	smp_mb__before_clear_bit();
2675	clear_bit(NAPI_STATE_SCHED, &n->state);
2676}
2677EXPORT_SYMBOL(__napi_complete);
2678
2679void napi_complete(struct napi_struct *n)
2680{
2681	unsigned long flags;
2682
2683	/*
2684	 * don't let napi dequeue from the cpu poll list
2685	 * just in case its running on a different cpu
2686	 */
2687	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
2688		return;
2689
2690	napi_gro_flush(n);
2691	local_irq_save(flags);
2692	__napi_complete(n);
2693	local_irq_restore(flags);
2694}
2695EXPORT_SYMBOL(napi_complete);
2696
2697void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
2698		    int (*poll)(struct napi_struct *, int), int weight)
2699{
2700	INIT_LIST_HEAD(&napi->poll_list);
2701	napi->gro_count = 0;
2702	napi->gro_list = NULL;
2703	napi->skb = NULL;
2704	napi->poll = poll;
2705	napi->weight = weight;
2706	list_add(&napi->dev_list, &dev->napi_list);
2707	napi->dev = dev;
2708#ifdef CONFIG_NETPOLL
2709	spin_lock_init(&napi->poll_lock);
2710	napi->poll_owner = -1;
2711#endif
2712	set_bit(NAPI_STATE_SCHED, &napi->state);
2713}
2714EXPORT_SYMBOL(netif_napi_add);
2715
2716void netif_napi_del(struct napi_struct *napi)
2717{
2718	struct sk_buff *skb, *next;
2719
2720	list_del_init(&napi->dev_list);
2721	kfree_skb(napi->skb);
2722
2723	for (skb = napi->gro_list; skb; skb = next) {
2724		next = skb->next;
2725		skb->next = NULL;
2726		kfree_skb(skb);
2727	}
2728
2729	napi->gro_list = NULL;
2730	napi->gro_count = 0;
2731}
2732EXPORT_SYMBOL(netif_napi_del);
2733
2734
2735static void net_rx_action(struct softirq_action *h)
2736{
2737	struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
2738	unsigned long time_limit = jiffies + 2;
2739	int budget = netdev_budget;
2740	void *have;
2741
2742	local_irq_disable();
2743
2744	while (!list_empty(list)) {
2745		struct napi_struct *n;
2746		int work, weight;
2747
2748		/* If softirq window is exhuasted then punt.
2749		 * Allow this to run for 2 jiffies since which will allow
2750		 * an average latency of 1.5/HZ.
2751		 */
2752		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
2753			goto softnet_break;
2754
2755		local_irq_enable();
2756
2757		/* Even though interrupts have been re-enabled, this
2758		 * access is safe because interrupts can only add new
2759		 * entries to the tail of this list, and only ->poll()
2760		 * calls can remove this head entry from the list.
2761		 */
2762		n = list_entry(list->next, struct napi_struct, poll_list);
2763
2764		have = netpoll_poll_lock(n);
2765
2766		weight = n->weight;
2767
2768		/* This NAPI_STATE_SCHED test is for avoiding a race
2769		 * with netpoll's poll_napi().  Only the entity which
2770		 * obtains the lock and sees NAPI_STATE_SCHED set will
2771		 * actually make the ->poll() call.  Therefore we avoid
2772		 * accidently calling ->poll() when NAPI is not scheduled.
2773		 */
2774		work = 0;
2775		if (test_bit(NAPI_STATE_SCHED, &n->state))
2776			work = n->poll(n, weight);
2777
2778		WARN_ON_ONCE(work > weight);
2779
2780		budget -= work;
2781
2782		local_irq_disable();
2783
2784		/* Drivers must not modify the NAPI state if they
2785		 * consume the entire weight.  In such cases this code
2786		 * still "owns" the NAPI instance and therefore can
2787		 * move the instance around on the list at-will.
2788		 */
2789		if (unlikely(work == weight)) {
2790			if (unlikely(napi_disable_pending(n)))
2791				__napi_complete(n);
2792			else
2793				list_move_tail(&n->poll_list, list);
2794		}
2795
2796		netpoll_poll_unlock(have);
2797	}
2798out:
2799	local_irq_enable();
2800
2801#ifdef CONFIG_NET_DMA
2802	/*
2803	 * There may not be any more sk_buffs coming right now, so push
2804	 * any pending DMA copies to hardware
2805	 */
2806	dma_issue_pending_all();
2807#endif
2808
2809	return;
2810
2811softnet_break:
2812	__get_cpu_var(netdev_rx_stat).time_squeeze++;
2813	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2814	goto out;
2815}
2816
2817static gifconf_func_t * gifconf_list [NPROTO];
2818
2819/**
2820 *	register_gifconf	-	register a SIOCGIF handler
2821 *	@family: Address family
2822 *	@gifconf: Function handler
2823 *
2824 *	Register protocol dependent address dumping routines. The handler
2825 *	that is passed must not be freed or reused until it has been replaced
2826 *	by another handler.
2827 */
2828int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
2829{
2830	if (family >= NPROTO)
2831		return -EINVAL;
2832	gifconf_list[family] = gifconf;
2833	return 0;
2834}
2835
2836
2837/*
2838 *	Map an interface index to its name (SIOCGIFNAME)
2839 */
2840
2841/*
2842 *	We need this ioctl for efficient implementation of the
2843 *	if_indextoname() function required by the IPv6 API.  Without
2844 *	it, we would have to search all the interfaces to find a
2845 *	match.  --pb
2846 */
2847
2848static int dev_ifname(struct net *net, struct ifreq __user *arg)
2849{
2850	struct net_device *dev;
2851	struct ifreq ifr;
2852
2853	/*
2854	 *	Fetch the caller's info block.
2855	 */
2856
2857	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2858		return -EFAULT;
2859
2860	read_lock(&dev_base_lock);
2861	dev = __dev_get_by_index(net, ifr.ifr_ifindex);
2862	if (!dev) {
2863		read_unlock(&dev_base_lock);
2864		return -ENODEV;
2865	}
2866
2867	strcpy(ifr.ifr_name, dev->name);
2868	read_unlock(&dev_base_lock);
2869
2870	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2871		return -EFAULT;
2872	return 0;
2873}
2874
2875/*
2876 *	Perform a SIOCGIFCONF call. This structure will change
2877 *	size eventually, and there is nothing I can do about it.
2878 *	Thus we will need a 'compatibility mode'.
2879 */
2880
2881static int dev_ifconf(struct net *net, char __user *arg)
2882{
2883	struct ifconf ifc;
2884	struct net_device *dev;
2885	char __user *pos;
2886	int len;
2887	int total;
2888	int i;
2889
2890	/*
2891	 *	Fetch the caller's info block.
2892	 */
2893
2894	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2895		return -EFAULT;
2896
2897	pos = ifc.ifc_buf;
2898	len = ifc.ifc_len;
2899
2900	/*
2901	 *	Loop over the interfaces, and write an info block for each.
2902	 */
2903
2904	total = 0;
2905	for_each_netdev(net, dev) {
2906		for (i = 0; i < NPROTO; i++) {
2907			if (gifconf_list[i]) {
2908				int done;
2909				if (!pos)
2910					done = gifconf_list[i](dev, NULL, 0);
2911				else
2912					done = gifconf_list[i](dev, pos + total,
2913							       len - total);
2914				if (done < 0)
2915					return -EFAULT;
2916				total += done;
2917			}
2918		}
2919	}
2920
2921	/*
2922	 *	All done.  Write the updated control block back to the caller.
2923	 */
2924	ifc.ifc_len = total;
2925
2926	/*
2927	 * 	Both BSD and Solaris return 0 here, so we do too.
2928	 */
2929	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
2930}
2931
2932#ifdef CONFIG_PROC_FS
2933/*
2934 *	This is invoked by the /proc filesystem handler to display a device
2935 *	in detail.
2936 */
2937void *dev_seq_start(struct seq_file *seq, loff_t *pos)
2938	__acquires(dev_base_lock)
2939{
2940	struct net *net = seq_file_net(seq);
2941	loff_t off;
2942	struct net_device *dev;
2943
2944	read_lock(&dev_base_lock);
2945	if (!*pos)
2946		return SEQ_START_TOKEN;
2947
2948	off = 1;
2949	for_each_netdev(net, dev)
2950		if (off++ == *pos)
2951			return dev;
2952
2953	return NULL;
2954}
2955
2956void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2957{
2958	struct net *net = seq_file_net(seq);
2959	++*pos;
2960	return v == SEQ_START_TOKEN ?
2961		first_net_device(net) : next_net_device((struct net_device *)v);
2962}
2963
2964void dev_seq_stop(struct seq_file *seq, void *v)
2965	__releases(dev_base_lock)
2966{
2967	read_unlock(&dev_base_lock);
2968}
2969
2970static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
2971{
2972	const struct net_device_stats *stats = dev_get_stats(dev);
2973
2974	seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
2975		   "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
2976		   dev->name, stats->rx_bytes, stats->rx_packets,
2977		   stats->rx_errors,
2978		   stats->rx_dropped + stats->rx_missed_errors,
2979		   stats->rx_fifo_errors,
2980		   stats->rx_length_errors + stats->rx_over_errors +
2981		    stats->rx_crc_errors + stats->rx_frame_errors,
2982		   stats->rx_compressed, stats->multicast,
2983		   stats->tx_bytes, stats->tx_packets,
2984		   stats->tx_errors, stats->tx_dropped,
2985		   stats->tx_fifo_errors, stats->collisions,
2986		   stats->tx_carrier_errors +
2987		    stats->tx_aborted_errors +
2988		    stats->tx_window_errors +
2989		    stats->tx_heartbeat_errors,
2990		   stats->tx_compressed);
2991}
2992
2993/*
2994 *	Called from the PROCfs module. This now uses the new arbitrary sized
2995 *	/proc/net interface to create /proc/net/dev
2996 */
2997static int dev_seq_show(struct seq_file *seq, void *v)
2998{
2999	if (v == SEQ_START_TOKEN)
3000		seq_puts(seq, "Inter-|   Receive                            "
3001			      "                    |  Transmit\n"
3002			      " face |bytes    packets errs drop fifo frame "
3003			      "compressed multicast|bytes    packets errs "
3004			      "drop fifo colls carrier compressed\n");
3005	else
3006		dev_seq_printf_stats(seq, v);
3007	return 0;
3008}
3009
3010static struct netif_rx_stats *softnet_get_online(loff_t *pos)
3011{
3012	struct netif_rx_stats *rc = NULL;
3013
3014	while (*pos < nr_cpu_ids)
3015		if (cpu_online(*pos)) {
3016			rc = &per_cpu(netdev_rx_stat, *pos);
3017			break;
3018		} else
3019			++*pos;
3020	return rc;
3021}
3022
3023static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3024{
3025	return softnet_get_online(pos);
3026}
3027
3028static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3029{
3030	++*pos;
3031	return softnet_get_online(pos);
3032}
3033
/* seq_file ->stop: the start/next handlers take nothing that needs release. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
3037
3038static int softnet_seq_show(struct seq_file *seq, void *v)
3039{
3040	struct netif_rx_stats *s = v;
3041
3042	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
3043		   s->total, s->dropped, s->time_squeeze, 0,
3044		   0, 0, 0, 0, /* was fastroute */
3045		   s->cpu_collision );
3046	return 0;
3047}
3048
3049static const struct seq_operations dev_seq_ops = {
3050	.start = dev_seq_start,
3051	.next  = dev_seq_next,
3052	.stop  = dev_seq_stop,
3053	.show  = dev_seq_show,
3054};
3055
3056static int dev_seq_open(struct inode *inode, struct file *file)
3057{
3058	return seq_open_net(inode, file, &dev_seq_ops,
3059			    sizeof(struct seq_net_private));
3060}
3061
3062static const struct file_operations dev_seq_fops = {
3063	.owner	 = THIS_MODULE,
3064	.open    = dev_seq_open,
3065	.read    = seq_read,
3066	.llseek  = seq_lseek,
3067	.release = seq_release_net,
3068};
3069
3070static const struct seq_operations softnet_seq_ops = {
3071	.start = softnet_seq_start,
3072	.next  = softnet_seq_next,
3073	.stop  = softnet_seq_stop,
3074	.show  = softnet_seq_show,
3075};
3076
3077static int softnet_seq_open(struct inode *inode, struct file *file)
3078{
3079	return seq_open(file, &softnet_seq_ops);
3080}
3081
3082static const struct file_operations softnet_seq_fops = {
3083	.owner	 = THIS_MODULE,
3084	.open    = softnet_seq_open,
3085	.read    = seq_read,
3086	.llseek  = seq_lseek,
3087	.release = seq_release,
3088};
3089
3090static void *ptype_get_idx(loff_t pos)
3091{
3092	struct packet_type *pt = NULL;
3093	loff_t i = 0;
3094	int t;
3095
3096	list_for_each_entry_rcu(pt, &ptype_all, list) {
3097		if (i == pos)
3098			return pt;
3099		++i;
3100	}
3101
3102	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3103		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3104			if (i == pos)
3105				return pt;
3106			++i;
3107		}
3108	}
3109	return NULL;
3110}
3111
3112static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3113	__acquires(RCU)
3114{
3115	rcu_read_lock();
3116	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3117}
3118
3119static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3120{
3121	struct packet_type *pt;
3122	struct list_head *nxt;
3123	int hash;
3124
3125	++*pos;
3126	if (v == SEQ_START_TOKEN)
3127		return ptype_get_idx(0);
3128
3129	pt = v;
3130	nxt = pt->list.next;
3131	if (pt->type == htons(ETH_P_ALL)) {
3132		if (nxt != &ptype_all)
3133			goto found;
3134		hash = 0;
3135		nxt = ptype_base[0].next;
3136	} else
3137		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3138
3139	while (nxt == &ptype_base[hash]) {
3140		if (++hash >= PTYPE_HASH_SIZE)
3141			return NULL;
3142		nxt = ptype_base[hash].next;
3143	}
3144found:
3145	return list_entry(nxt, struct packet_type, list);
3146}
3147
3148static void ptype_seq_stop(struct seq_file *seq, void *v)
3149	__releases(RCU)
3150{
3151	rcu_read_unlock();
3152}
3153
3154static int ptype_seq_show(struct seq_file *seq, void *v)
3155{
3156	struct packet_type *pt = v;
3157
3158	if (v == SEQ_START_TOKEN)
3159		seq_puts(seq, "Type Device      Function\n");
3160	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3161		if (pt->type == htons(ETH_P_ALL))
3162			seq_puts(seq, "ALL ");
3163		else
3164			seq_printf(seq, "%04x", ntohs(pt->type));
3165
3166		seq_printf(seq, " %-8s %pF\n",
3167			   pt->dev ? pt->dev->name : "", pt->func);
3168	}
3169
3170	return 0;
3171}
3172
3173static const struct seq_operations ptype_seq_ops = {
3174	.start = ptype_seq_start,
3175	.next  = ptype_seq_next,
3176	.stop  = ptype_seq_stop,
3177	.show  = ptype_seq_show,
3178};
3179
3180static int ptype_seq_open(struct inode *inode, struct file *file)
3181{
3182	return seq_open_net(inode, file, &ptype_seq_ops,
3183			sizeof(struct seq_net_private));
3184}
3185
3186static const struct file_operations ptype_seq_fops = {
3187	.owner	 = THIS_MODULE,
3188	.open    = ptype_seq_open,
3189	.read    = seq_read,
3190	.llseek  = seq_lseek,
3191	.release = seq_release_net,
3192};
3193
3194
3195static int __net_init dev_proc_net_init(struct net *net)
3196{
3197	int rc = -ENOMEM;
3198
3199	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3200		goto out;
3201	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3202		goto out_dev;
3203	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3204		goto out_softnet;
3205
3206	if (wext_proc_init(net))
3207		goto out_ptype;
3208	rc = 0;
3209out:
3210	return rc;
3211out_ptype:
3212	proc_net_remove(net, "ptype");
3213out_softnet:
3214	proc_net_remove(net, "softnet_stat");
3215out_dev:
3216	proc_net_remove(net, "dev");
3217	goto out;
3218}
3219
3220static void __net_exit dev_proc_net_exit(struct net *net)
3221{
3222	wext_proc_exit(net);
3223
3224	proc_net_remove(net, "ptype");
3225	proc_net_remove(net, "softnet_stat");
3226	proc_net_remove(net, "dev");
3227}
3228
3229static struct pernet_operations __net_initdata dev_proc_ops = {
3230	.init = dev_proc_net_init,
3231	.exit = dev_proc_net_exit,
3232};
3233
3234static int __init dev_proc_init(void)
3235{
3236	return register_pernet_subsys(&dev_proc_ops);
3237}
3238#else
3239#define dev_proc_init() 0
3240#endif	/* CONFIG_PROC_FS */
3241
3242
3243/**
3244 *	netdev_set_master	-	set up master/slave pair
3245 *	@slave: slave device
3246 *	@master: new master device
3247 *
3248 *	Changes the master device of the slave. Pass %NULL to break the
3249 *	bonding. The caller must hold the RTNL semaphore. On a failure
3250 *	a negative errno code is returned. On success the reference counts
3251 *	are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3252 *	function returns zero.
3253 */
3254int netdev_set_master(struct net_device *slave, struct net_device *master)
3255{
3256	struct net_device *old = slave->master;
3257
3258	ASSERT_RTNL();
3259
3260	if (master) {
3261		if (old)
3262			return -EBUSY;
3263		dev_hold(master);
3264	}
3265
3266	slave->master = master;
3267
3268	synchronize_net();
3269
3270	if (old)
3271		dev_put(old);
3272
3273	if (master)
3274		slave->flags |= IFF_SLAVE;
3275	else
3276		slave->flags &= ~IFF_SLAVE;
3277
3278	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
3279	return 0;
3280}
3281
3282static void dev_change_rx_flags(struct net_device *dev, int flags)
3283{
3284	const struct net_device_ops *ops = dev->netdev_ops;
3285
3286	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
3287		ops->ndo_change_rx_flags(dev, flags);
3288}
3289
3290static int __dev_set_promiscuity(struct net_device *dev, int inc)
3291{
3292	unsigned short old_flags = dev->flags;
3293	uid_t uid;
3294	gid_t gid;
3295
3296	ASSERT_RTNL();
3297
3298	dev->flags |= IFF_PROMISC;
3299	dev->promiscuity += inc;
3300	if (dev->promiscuity == 0) {
3301		/*
3302		 * Avoid overflow.
3303		 * If inc causes overflow, untouch promisc and return error.
3304		 */
3305		if (inc < 0)
3306			dev->flags &= ~IFF_PROMISC;
3307		else {
3308			dev->promiscuity -= inc;
3309			printk(KERN_WARNING "%s: promiscuity touches roof, "
3310				"set promiscuity failed, promiscuity feature "
3311				"of device might be broken.\n", dev->name);
3312			return -EOVERFLOW;
3313		}
3314	}
3315	if (dev->flags != old_flags) {
3316		printk(KERN_INFO "device %s %s promiscuous mode\n",
3317		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
3318							       "left");
3319		if (audit_enabled) {
3320			current_uid_gid(&uid, &gid);
3321			audit_log(current->audit_context, GFP_ATOMIC,
3322				AUDIT_ANOM_PROMISCUOUS,
3323				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
3324				dev->name, (dev->flags & IFF_PROMISC),
3325				(old_flags & IFF_PROMISC),
3326				audit_get_loginuid(current),
3327				uid, gid,
3328				audit_get_sessionid(current));
3329		}
3330
3331		dev_change_rx_flags(dev, IFF_PROMISC);
3332	}
3333	return 0;
3334}
3335
3336/**
3337 *	dev_set_promiscuity	- update promiscuity count on a device
3338 *	@dev: device
3339 *	@inc: modifier
3340 *
3341 *	Add or remove promiscuity from a device. While the count in the device
3342 *	remains above zero the interface remains promiscuous. Once it hits zero
3343 *	the device reverts back to normal filtering operation. A negative inc
3344 *	value is used to drop promiscuity on the device.
3345 *	Return 0 if successful or a negative errno code on error.
3346 */
3347int dev_set_promiscuity(struct net_device *dev, int inc)
3348{
3349	unsigned short old_flags = dev->flags;
3350	int err;
3351
3352	err = __dev_set_promiscuity(dev, inc);
3353	if (err < 0)
3354		return err;
3355	if (dev->flags != old_flags)
3356		dev_set_rx_mode(dev);
3357	return err;
3358}
3359
3360/**
3361 *	dev_set_allmulti	- update allmulti count on a device
3362 *	@dev: device
3363 *	@inc: modifier
3364 *
3365 *	Add or remove reception of all multicast frames to a device. While the
3366 *	count in the device remains above zero the interface remains listening
3367 *	to all interfaces. Once it hits zero the device reverts back to normal
3368 *	filtering operation. A negative @inc value is used to drop the counter
3369 *	when releasing a resource needing all multicasts.
3370 *	Return 0 if successful or a negative errno code on error.
3371 */
3372
3373int dev_set_allmulti(struct net_device *dev, int inc)
3374{
3375	unsigned short old_flags = dev->flags;
3376
3377	ASSERT_RTNL();
3378
3379	dev->flags |= IFF_ALLMULTI;
3380	dev->allmulti += inc;
3381	if (dev->allmulti == 0) {
3382		/*
3383		 * Avoid overflow.
3384		 * If inc causes overflow, untouch allmulti and return error.
3385		 */
3386		if (inc < 0)
3387			dev->flags &= ~IFF_ALLMULTI;
3388		else {
3389			dev->allmulti -= inc;
3390			printk(KERN_WARNING "%s: allmulti touches roof, "
3391				"set allmulti failed, allmulti feature of "
3392				"device might be broken.\n", dev->name);
3393			return -EOVERFLOW;
3394		}
3395	}
3396	if (dev->flags ^ old_flags) {
3397		dev_change_rx_flags(dev, IFF_ALLMULTI);
3398		dev_set_rx_mode(dev);
3399	}
3400	return 0;
3401}
3402
3403/*
3404 *	Upload unicast and multicast address lists to device and
3405 *	configure RX filtering. When the device doesn't support unicast
3406 *	filtering it is put in promiscuous mode while unicast addresses
3407 *	are present.
3408 */
3409void __dev_set_rx_mode(struct net_device *dev)
3410{
3411	const struct net_device_ops *ops = dev->netdev_ops;
3412
3413	/* dev_open will call this function so the list will stay sane. */
3414	if (!(dev->flags&IFF_UP))
3415		return;
3416
3417	if (!netif_device_present(dev))
3418		return;
3419
3420	if (ops->ndo_set_rx_mode)
3421		ops->ndo_set_rx_mode(dev);
3422	else {
3423		/* Unicast addresses changes may only happen under the rtnl,
3424		 * therefore calling __dev_set_promiscuity here is safe.
3425		 */
3426		if (dev->uc_count > 0 && !dev->uc_promisc) {
3427			__dev_set_promiscuity(dev, 1);
3428			dev->uc_promisc = 1;
3429		} else if (dev->uc_count == 0 && dev->uc_promisc) {
3430			__dev_set_promiscuity(dev, -1);
3431			dev->uc_promisc = 0;
3432		}
3433
3434		if (ops->ndo_set_multicast_list)
3435			ops->ndo_set_multicast_list(dev);
3436	}
3437}
3438
/*
 *	Locked wrapper: run __dev_set_rx_mode() with the device's
 *	address list lock held (bottom halves disabled).
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
3445
3446int __dev_addr_delete(struct dev_addr_list **list, int *count,
3447		      void *addr, int alen, int glbl)
3448{
3449	struct dev_addr_list *da;
3450
3451	for (; (da = *list) != NULL; list = &da->next) {
3452		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3453		    alen == da->da_addrlen) {
3454			if (glbl) {
3455				int old_glbl = da->da_gusers;
3456				da->da_gusers = 0;
3457				if (old_glbl == 0)
3458					break;
3459			}
3460			if (--da->da_users)
3461				return 0;
3462
3463			*list = da->next;
3464			kfree(da);
3465			(*count)--;
3466			return 0;
3467		}
3468	}
3469	return -ENOENT;
3470}
3471
3472int __dev_addr_add(struct dev_addr_list **list, int *count,
3473		   void *addr, int alen, int glbl)
3474{
3475	struct dev_addr_list *da;
3476
3477	for (da = *list; da != NULL; da = da->next) {
3478		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3479		    da->da_addrlen == alen) {
3480			if (glbl) {
3481				int old_glbl = da->da_gusers;
3482				da->da_gusers = 1;
3483				if (old_glbl)
3484					return 0;
3485			}
3486			da->da_users++;
3487			return 0;
3488		}
3489	}
3490
3491	da = kzalloc(sizeof(*da), GFP_ATOMIC);
3492	if (da == NULL)
3493		return -ENOMEM;
3494	memcpy(da->da_addr, addr, alen);
3495	da->da_addrlen = alen;
3496	da->da_users = 1;
3497	da->da_gusers = glbl ? 1 : 0;
3498	da->next = *list;
3499	*list = da;
3500	(*count)++;
3501	return 0;
3502}
3503
3504/**
3505 *	dev_unicast_delete	- Release secondary unicast address.
3506 *	@dev: device
3507 *	@addr: address to delete
3508 *	@alen: length of @addr
3509 *
3510 *	Release reference to a secondary unicast address and remove it
3511 *	from the device if the reference count drops to zero.
3512 *
3513 * 	The caller must hold the rtnl_mutex.
3514 */
3515int dev_unicast_delete(struct net_device *dev, void *addr, int alen)
3516{
3517	int err;
3518
3519	ASSERT_RTNL();
3520
3521	netif_addr_lock_bh(dev);
3522	err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3523	if (!err)
3524		__dev_set_rx_mode(dev);
3525	netif_addr_unlock_bh(dev);
3526	return err;
3527}
3528EXPORT_SYMBOL(dev_unicast_delete);
3529
3530/**
3531 *	dev_unicast_add		- add a secondary unicast address
3532 *	@dev: device
3533 *	@addr: address to add
3534 *	@alen: length of @addr
3535 *
3536 *	Add a secondary unicast address to the device or increase
3537 *	the reference count if it already exists.
3538 *
3539 *	The caller must hold the rtnl_mutex.
3540 */
3541int dev_unicast_add(struct net_device *dev, void *addr, int alen)
3542{
3543	int err;
3544
3545	ASSERT_RTNL();
3546
3547	netif_addr_lock_bh(dev);
3548	err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3549	if (!err)
3550		__dev_set_rx_mode(dev);
3551	netif_addr_unlock_bh(dev);
3552	return err;
3553}
3554EXPORT_SYMBOL(dev_unicast_add);
3555
3556int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
3557		    struct dev_addr_list **from, int *from_count)
3558{
3559	struct dev_addr_list *da, *next;
3560	int err = 0;
3561
3562	da = *from;
3563	while (da != NULL) {
3564		next = da->next;
3565		if (!da->da_synced) {
3566			err = __dev_addr_add(to, to_count,
3567					     da->da_addr, da->da_addrlen, 0);
3568			if (err < 0)
3569				break;
3570			da->da_synced = 1;
3571			da->da_users++;
3572		} else if (da->da_users == 1) {
3573			__dev_addr_delete(to, to_count,
3574					  da->da_addr, da->da_addrlen, 0);
3575			__dev_addr_delete(from, from_count,
3576					  da->da_addr, da->da_addrlen, 0);
3577		}
3578		da = next;
3579	}
3580	return err;
3581}
3582
3583void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
3584		       struct dev_addr_list **from, int *from_count)
3585{
3586	struct dev_addr_list *da, *next;
3587
3588	da = *from;
3589	while (da != NULL) {
3590		next = da->next;
3591		if (da->da_synced) {
3592			__dev_addr_delete(to, to_count,
3593					  da->da_addr, da->da_addrlen, 0);
3594			da->da_synced = 0;
3595			__dev_addr_delete(from, from_count,
3596					  da->da_addr, da->da_addrlen, 0);
3597		}
3598		da = next;
3599	}
3600}
3601
3602/**
3603 *	dev_unicast_sync - Synchronize device's unicast list to another device
3604 *	@to: destination device
3605 *	@from: source device
3606 *
3607 *	Add newly added addresses to the destination device and release
3608 *	addresses that have no users left. The source device must be
3609 *	locked by netif_tx_lock_bh.
3610 *
3611 *	This function is intended to be called from the dev->set_rx_mode
3612 *	function of layered software devices.
3613 */
3614int dev_unicast_sync(struct net_device *to, struct net_device *from)
3615{
3616	int err = 0;
3617
3618	netif_addr_lock_bh(to);
3619	err = __dev_addr_sync(&to->uc_list, &to->uc_count,
3620			      &from->uc_list, &from->uc_count);
3621	if (!err)
3622		__dev_set_rx_mode(to);
3623	netif_addr_unlock_bh(to);
3624	return err;
3625}
3626EXPORT_SYMBOL(dev_unicast_sync);
3627
3628/**
3629 *	dev_unicast_unsync - Remove synchronized addresses from the destination device
3630 *	@to: destination device
3631 *	@from: source device
3632 *
3633 *	Remove all addresses that were added to the destination device by
3634 *	dev_unicast_sync(). This function is intended to be called from the
3635 *	dev->stop function of layered software devices.
3636 */
3637void dev_unicast_unsync(struct net_device *to, struct net_device *from)
3638{
3639	netif_addr_lock_bh(from);
3640	netif_addr_lock(to);
3641
3642	__dev_addr_unsync(&to->uc_list, &to->uc_count,
3643			  &from->uc_list, &from->uc_count);
3644	__dev_set_rx_mode(to);
3645
3646	netif_addr_unlock(to);
3647	netif_addr_unlock_bh(from);
3648}
3649EXPORT_SYMBOL(dev_unicast_unsync);
3650
3651static void __dev_addr_discard(struct dev_addr_list **list)
3652{
3653	struct dev_addr_list *tmp;
3654
3655	while (*list != NULL) {
3656		tmp = *list;
3657		*list = tmp->next;
3658		if (tmp->da_users > tmp->da_gusers)
3659			printk("__dev_addr_discard: address leakage! "
3660			       "da_users=%d\n", tmp->da_users);
3661		kfree(tmp);
3662	}
3663}
3664
3665static void dev_addr_discard(struct net_device *dev)
3666{
3667	netif_addr_lock_bh(dev);
3668
3669	__dev_addr_discard(&dev->uc_list);
3670	dev->uc_count = 0;
3671
3672	__dev_addr_discard(&dev->mc_list);
3673	dev->mc_count = 0;
3674
3675	netif_addr_unlock_bh(dev);
3676}
3677
3678/**
3679 *	dev_get_flags - get flags reported to userspace
3680 *	@dev: device
3681 *
3682 *	Get the combination of flag bits exported through APIs to userspace.
3683 */
3684unsigned dev_get_flags(const struct net_device *dev)
3685{
3686	unsigned flags;
3687
3688	flags = (dev->flags & ~(IFF_PROMISC |
3689				IFF_ALLMULTI |
3690				IFF_RUNNING |
3691				IFF_LOWER_UP |
3692				IFF_DORMANT)) |
3693		(dev->gflags & (IFF_PROMISC |
3694				IFF_ALLMULTI));
3695
3696	if (netif_running(dev)) {
3697		if (netif_oper_up(dev))
3698			flags |= IFF_RUNNING;
3699		if (netif_carrier_ok(dev))
3700			flags |= IFF_LOWER_UP;
3701		if (netif_dormant(dev))
3702			flags |= IFF_DORMANT;
3703	}
3704
3705	return flags;
3706}
3707
/**
 *	dev_change_flags - change device settings
 *	@dev: device
 *	@flags: device state flags
 *
 *	Change settings on device based state flags. The flags are
 *	in the userspace exported format.
 *
 *	Asserts that the RTNL is held. Returns 0, or the error from
 *	dev_open()/dev_close() when the IFF_UP bit was toggled. The results
 *	of dev_set_promiscuity() and dev_set_allmulti() are not propagated.
 */
int dev_change_flags(struct net_device *dev, unsigned flags)
{
	int ret, changes;
	int old_flags = dev->flags;

	ASSERT_RTNL();

	/*
	 *	Set the flags on our device.
	 *
	 *	Only the directly settable bits are taken from @flags;
	 *	IFF_UP, IFF_VOLATILE, IFF_PROMISC and IFF_ALLMULTI keep their
	 *	current values here and are dealt with further down.
	 */

	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
			       IFF_AUTOMEDIA)) |
		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
				    IFF_ALLMULTI));

	/*
	 *	Load in the correct multicast list now the flags have changed.
	 */

	if ((old_flags ^ flags) & IFF_MULTICAST)
		dev_change_rx_flags(dev, IFF_MULTICAST);

	dev_set_rx_mode(dev);

	/*
	 *	Have we downed the interface. We handle IFF_UP ourselves
	 *	according to user attempts to set it, rather than blindly
	 *	setting it.
	 */

	ret = 0;
	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
		/* Was up -> close it; was down -> open it. */
		ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);

		if (!ret)
			dev_set_rx_mode(dev);
	}

	/*
	 * Send NETDEV_CHANGE only while the device is up and only for
	 * changes outside IFF_UP, IFF_PROMISC, IFF_ALLMULTI and the
	 * volatile bits.
	 */
	if (dev->flags & IFF_UP &&
	    ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
					  IFF_VOLATILE)))
		call_netdevice_notifiers(NETDEV_CHANGE, dev);

	/*
	 * dev->gflags records what was requested through this interface;
	 * when the request differs, flip the recorded bit and adjust the
	 * promiscuity counter by one in the matching direction.
	 */
	if ((flags ^ dev->gflags) & IFF_PROMISC) {
		int inc = (flags & IFF_PROMISC) ? +1 : -1;
		dev->gflags ^= IFF_PROMISC;
		dev_set_promiscuity(dev, inc);
	}

	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
	   is important. Some (broken) drivers set IFF_PROMISC, when
	   IFF_ALLMULTI is requested not asking us and not reporting.
	 */
	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
		int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
		dev->gflags ^= IFF_ALLMULTI;
		dev_set_allmulti(dev, inc);
	}

	/* Exclude state transition flags, already notified */
	changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
	if (changes)
		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);

	return ret;
}
3784
3785/**
3786 *	dev_set_mtu - Change maximum transfer unit
3787 *	@dev: device
3788 *	@new_mtu: new transfer unit
3789 *
3790 *	Change the maximum transfer size of the network device.
3791 */
3792int dev_set_mtu(struct net_device *dev, int new_mtu)
3793{
3794	const struct net_device_ops *ops = dev->netdev_ops;
3795	int err;
3796
3797	if (new_mtu == dev->mtu)
3798		return 0;
3799
3800	/*	MTU must be positive.	 */
3801	if (new_mtu < 0)
3802		return -EINVAL;
3803
3804	if (!netif_device_present(dev))
3805		return -ENODEV;
3806
3807	err = 0;
3808	if (ops->ndo_change_mtu)
3809		err = ops->ndo_change_mtu(dev, new_mtu);
3810	else
3811		dev->mtu = new_mtu;
3812
3813	if (!err && dev->flags & IFF_UP)
3814		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
3815	return err;
3816}
3817
3818/**
3819 *	dev_set_mac_address - Change Media Access Control Address
3820 *	@dev: device
3821 *	@sa: new address
3822 *
3823 *	Change the hardware (MAC) address of the device
3824 */
3825int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
3826{
3827	const struct net_device_ops *ops = dev->netdev_ops;
3828	int err;
3829
3830	if (!ops->ndo_set_mac_address)
3831		return -EOPNOTSUPP;
3832	if (sa->sa_family != dev->type)
3833		return -EINVAL;
3834	if (!netif_device_present(dev))
3835		return -ENODEV;
3836	err = ops->ndo_set_mac_address(dev, sa);
3837	if (!err)
3838		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3839	return err;
3840}
3841
3842/*
3843 *	Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
3844 */
3845static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
3846{
3847	int err;
3848	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3849
3850	if (!dev)
3851		return -ENODEV;
3852
3853	switch (cmd) {
3854		case SIOCGIFFLAGS:	/* Get interface flags */
3855			ifr->ifr_flags = dev_get_flags(dev);
3856			return 0;
3857
3858		case SIOCGIFMETRIC:	/* Get the metric on the interface
3859					   (currently unused) */
3860			ifr->ifr_metric = 0;
3861			return 0;
3862
3863		case SIOCGIFMTU:	/* Get the MTU of a device */
3864			ifr->ifr_mtu = dev->mtu;
3865			return 0;
3866
3867		case SIOCGIFHWADDR:
3868			if (!dev->addr_len)
3869				memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
3870			else
3871				memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
3872				       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3873			ifr->ifr_hwaddr.sa_family = dev->type;
3874			return 0;
3875
3876		case SIOCGIFSLAVE:
3877			err = -EINVAL;
3878			break;
3879
3880		case SIOCGIFMAP:
3881			ifr->ifr_map.mem_start = dev->mem_start;
3882			ifr->ifr_map.mem_end   = dev->mem_end;
3883			ifr->ifr_map.base_addr = dev->base_addr;
3884			ifr->ifr_map.irq       = dev->irq;
3885			ifr->ifr_map.dma       = dev->dma;
3886			ifr->ifr_map.port      = dev->if_port;
3887			return 0;
3888
3889		case SIOCGIFINDEX:
3890			ifr->ifr_ifindex = dev->ifindex;
3891			return 0;
3892
3893		case SIOCGIFTXQLEN:
3894			ifr->ifr_qlen = dev->tx_queue_len;
3895			return 0;
3896
3897		default:
3898			/* dev_ioctl() should ensure this case
3899			 * is never reached
3900			 */
3901			WARN_ON(1);
3902			err = -EINVAL;
3903			break;
3904
3905	}
3906	return err;
3907}
3908
3909/*
3910 *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
3911 */
3912static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
3913{
3914	int err;
3915	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3916	const struct net_device_ops *ops;
3917
3918	if (!dev)
3919		return -ENODEV;
3920
3921	ops = dev->netdev_ops;
3922
3923	switch (cmd) {
3924		case SIOCSIFFLAGS:	/* Set interface flags */
3925			return dev_change_flags(dev, ifr->ifr_flags);
3926
3927		case SIOCSIFMETRIC:	/* Set the metric on the interface
3928					   (currently unused) */
3929			return -EOPNOTSUPP;
3930
3931		case SIOCSIFMTU:	/* Set the MTU of a device */
3932			return dev_set_mtu(dev, ifr->ifr_mtu);
3933
3934		case SIOCSIFHWADDR:
3935			return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
3936
3937		case SIOCSIFHWBROADCAST:
3938			if (ifr->ifr_hwaddr.sa_family != dev->type)
3939				return -EINVAL;
3940			memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
3941			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3942			call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3943			return 0;
3944
3945		case SIOCSIFMAP:
3946			if (ops->ndo_set_config) {
3947				if (!netif_device_present(dev))
3948					return -ENODEV;
3949				return ops->ndo_set_config(dev, &ifr->ifr_map);
3950			}
3951			return -EOPNOTSUPP;
3952
3953		case SIOCADDMULTI:
3954			if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
3955			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3956				return -EINVAL;
3957			if (!netif_device_present(dev))
3958				return -ENODEV;
3959			return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
3960					  dev->addr_len, 1);
3961
3962		case SIOCDELMULTI:
3963			if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
3964			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3965				return -EINVAL;
3966			if (!netif_device_present(dev))
3967				return -ENODEV;
3968			return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
3969					     dev->addr_len, 1);
3970
3971		case SIOCSIFTXQLEN:
3972			if (ifr->ifr_qlen < 0)
3973				return -EINVAL;
3974			dev->tx_queue_len = ifr->ifr_qlen;
3975			return 0;
3976
3977		case SIOCSIFNAME:
3978			ifr->ifr_newname[IFNAMSIZ-1] = '\0';
3979			return dev_change_name(dev, ifr->ifr_newname);
3980
3981		/*
3982		 *	Unknown or private ioctl
3983		 */
3984
3985		default:
3986			if ((cmd >= SIOCDEVPRIVATE &&
3987			    cmd <= SIOCDEVPRIVATE + 15) ||
3988			    cmd == SIOCBONDENSLAVE ||
3989			    cmd == SIOCBONDRELEASE ||
3990			    cmd == SIOCBONDSETHWADDR ||
3991			    cmd == SIOCBONDSLAVEINFOQUERY ||
3992			    cmd == SIOCBONDINFOQUERY ||
3993			    cmd == SIOCBONDCHANGEACTIVE ||
3994			    cmd == SIOCGMIIPHY ||
3995			    cmd == SIOCGMIIREG ||
3996			    cmd == SIOCSMIIREG ||
3997			    cmd == SIOCBRADDIF ||
3998			    cmd == SIOCBRDELIF ||
3999			    cmd == SIOCSHWTSTAMP ||
4000			    cmd == SIOCWANDEV) {
4001				err = -EOPNOTSUPP;
4002				if (ops->ndo_do_ioctl) {
4003					if (netif_device_present(dev))
4004						err = ops->ndo_do_ioctl(dev, ifr, cmd);
4005					else
4006						err = -ENODEV;
4007				}
4008			} else
4009				err = -EINVAL;
4010
4011	}
4012	return err;
4013}
4014
/*
 *	This function handles all "interface"-type I/O control requests. The actual
 *	'doing' part of this is dev_ifsioc and dev_ifsioc_locked above.
 */

/**
 *	dev_ioctl	-	network device ioctl
 *	@net: the applicable net namespace
 *	@cmd: command to issue
 *	@arg: pointer to a struct ifreq in user space
 *
 *	Issue ioctl functions to devices. This is normally called by the
 *	user space syscall interfaces but can sometimes be useful for
 *	other purposes. The return value is the return from the syscall if
 *	positive or a negative errno code on error.
 *
 *	Apart from SIOCGIFCONF and SIOCGIFNAME, the request is first copied
 *	into a local struct ifreq; an alias suffix (":...") is cut off the
 *	interface name for the lookup and put back before results are
 *	copied out again.
 */

int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct ifreq ifr;
	int ret;
	char *colon;

	/* One special case: SIOCGIFCONF takes ifconf argument
	   and requires shared lock, because it sleeps writing
	   to user space.
	 */

	if (cmd == SIOCGIFCONF) {
		rtnl_lock();
		ret = dev_ifconf(net, (char __user *) arg);
		rtnl_unlock();
		return ret;
	}
	if (cmd == SIOCGIFNAME)
		return dev_ifname(net, (struct ifreq __user *)arg);

	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
		return -EFAULT;

	/* Never trust user space to have terminated the name. */
	ifr.ifr_name[IFNAMSIZ-1] = 0;

	/* Temporarily strip an alias suffix; restored before copy-out. */
	colon = strchr(ifr.ifr_name, ':');
	if (colon)
		*colon = 0;

	/*
	 *	See which interface the caller is talking about.
	 */

	switch (cmd) {
		/*
		 *	These ioctl calls:
		 *	- can be done by all.
		 *	- atomic and do not require locking
		 *	  (only dev_base_lock is read-held).
		 *	- return a value
		 */
		case SIOCGIFFLAGS:
		case SIOCGIFMETRIC:
		case SIOCGIFMTU:
		case SIOCGIFHWADDR:
		case SIOCGIFSLAVE:
		case SIOCGIFMAP:
		case SIOCGIFINDEX:
		case SIOCGIFTXQLEN:
			dev_load(net, ifr.ifr_name);
			read_lock(&dev_base_lock);
			ret = dev_ifsioc_locked(net, &ifr, cmd);
			read_unlock(&dev_base_lock);
			if (!ret) {
				if (colon)
					*colon = ':';
				if (copy_to_user(arg, &ifr,
						 sizeof(struct ifreq)))
					ret = -EFAULT;
			}
			return ret;

		/* ethtool requests run under the RTNL and return a value. */
		case SIOCETHTOOL:
			dev_load(net, ifr.ifr_name);
			rtnl_lock();
			ret = dev_ethtool(net, &ifr);
			rtnl_unlock();
			if (!ret) {
				if (colon)
					*colon = ':';
				if (copy_to_user(arg, &ifr,
						 sizeof(struct ifreq)))
					ret = -EFAULT;
			}
			return ret;

		/*
		 *	These ioctl calls:
		 *	- require superuser power (CAP_NET_ADMIN).
		 *	- require strict serialization.
		 *	- return a value
		 */
		case SIOCGMIIPHY:
		case SIOCGMIIREG:
		case SIOCSIFNAME:
			if (!capable(CAP_NET_ADMIN))
				return -EPERM;
			dev_load(net, ifr.ifr_name);
			rtnl_lock();
			ret = dev_ifsioc(net, &ifr, cmd);
			rtnl_unlock();
			if (!ret) {
				if (colon)
					*colon = ':';
				if (copy_to_user(arg, &ifr,
						 sizeof(struct ifreq)))
					ret = -EFAULT;
			}
			return ret;

		/*
		 *	These ioctl calls:
		 *	- require superuser power (CAP_NET_ADMIN).
		 *	- require strict serialization.
		 *	- do not return a value
		 */
		case SIOCSIFFLAGS:
		case SIOCSIFMETRIC:
		case SIOCSIFMTU:
		case SIOCSIFMAP:
		case SIOCSIFHWADDR:
		case SIOCSIFSLAVE:
		case SIOCADDMULTI:
		case SIOCDELMULTI:
		case SIOCSIFHWBROADCAST:
		case SIOCSIFTXQLEN:
		case SIOCSMIIREG:
		case SIOCBONDENSLAVE:
		case SIOCBONDRELEASE:
		case SIOCBONDSETHWADDR:
		case SIOCBONDCHANGEACTIVE:
		case SIOCBRADDIF:
		case SIOCBRDELIF:
		case SIOCSHWTSTAMP:
			if (!capable(CAP_NET_ADMIN))
				return -EPERM;
			/* fall through */
		/* The two bonding queries skip the capability check. */
		case SIOCBONDSLAVEINFOQUERY:
		case SIOCBONDINFOQUERY:
			dev_load(net, ifr.ifr_name);
			rtnl_lock();
			ret = dev_ifsioc(net, &ifr, cmd);
			rtnl_unlock();
			return ret;

		case SIOCGIFMEM:
			/* Get the per device memory space. We can add this but
			 * currently do not support it */
		case SIOCSIFMEM:
			/* Set the per device memory buffer space.
			 * Not applicable in our case */
		case SIOCSIFLINK:
			return -EINVAL;

		/*
		 *	Unknown or private ioctl.
		 */
		default:
			if (cmd == SIOCWANDEV ||
			    (cmd >= SIOCDEVPRIVATE &&
			     cmd <= SIOCDEVPRIVATE + 15)) {
				dev_load(net, ifr.ifr_name);
				rtnl_lock();
				ret = dev_ifsioc(net, &ifr, cmd);
				rtnl_unlock();
				/*
				 * NOTE(review): unlike the cases above, the
				 * alias colon is not restored before the
				 * copy-out here.
				 */
				if (!ret && copy_to_user(arg, &ifr,
							 sizeof(struct ifreq)))
					ret = -EFAULT;
				return ret;
			}
			/* Take care of Wireless Extensions */
			if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
				return wext_handle_ioctl(net, &ifr, cmd, arg);
			return -EINVAL;
	}
}
4197
4198
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns the next interface index that is not in use in @net.
 *	Candidates come from a running counter that restarts at 1 when it
 *	is no longer positive. The caller must hold the rtnl semaphore or
 *	the dev_base_lock to be sure the value remains unique.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	do {
		if (++ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
4217
4218/* Delayed registration/unregisteration */
4219static LIST_HEAD(net_todo_list);
4220
4221static void net_set_todo(struct net_device *dev)
4222{
4223	list_add_tail(&dev->todo_list, &net_todo_list);
4224}
4225
/*
 *	Tear down a registered device: close it, unlink it, shut down its
 *	queueing discipline, notify protocols, flush its address lists,
 *	call the driver's ndo_uninit hook, remove its kobject and drop one
 *	device reference. Must not run during the boot phase and asserts
 *	that the RTNL is held.
 *
 *	A device that was never registered only triggers a warning.
 */
static void rollback_registered(struct net_device *dev)
{
	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	/* Some devices call without registering for initialization unwind. */
	if (dev->reg_state == NETREG_UNINITIALIZED) {
		printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
				  "was registered\n", dev->name, dev);

		WARN_ON(1);
		return;
	}

	/* Any state other than "registered" is a caller bug at this point. */
	BUG_ON(dev->reg_state != NETREG_REGISTERED);

	/* If device is running, close it first. */
	dev_close(dev);

	/* And unlink it from device chain. */
	unlist_netdevice(dev);

	dev->reg_state = NETREG_UNREGISTERING;

	synchronize_net();

	/* Shutdown queueing discipline. */
	dev_shutdown(dev);


	/* Notify protocols, that we are about to destroy
	   this device. They should clean all the things.
	*/
	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

	/*
	 *	Flush the unicast and multicast chains
	 */
	dev_addr_discard(dev);

	/* Give the driver a chance to undo its ndo_init work. */
	if (dev->netdev_ops->ndo_uninit)
		dev->netdev_ops->ndo_uninit(dev);

	/* Notifier chain MUST detach us from master device. */
	WARN_ON(dev->master);

	/* Remove entries from kobject tree */
	netdev_unregister_kobject(dev);

	synchronize_net();

	/* Drop one reference on the device. */
	dev_put(dev);
}
4279
4280static void __netdev_init_queue_locks_one(struct net_device *dev,
4281					  struct netdev_queue *dev_queue,
4282					  void *_unused)
4283{
4284	spin_lock_init(&dev_queue->_xmit_lock);
4285	netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4286	dev_queue->xmit_lock_owner = -1;
4287}
4288
4289static void netdev_init_queue_locks(struct net_device *dev)
4290{
4291	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4292	__netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
4293}
4294
4295unsigned long netdev_fix_features(unsigned long features, const char *name)
4296{
4297	/* Fix illegal SG+CSUM combinations. */
4298	if ((features & NETIF_F_SG) &&
4299	    !(features & NETIF_F_ALL_CSUM)) {
4300		if (name)
4301			printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4302			       "checksum feature.\n", name);
4303		features &= ~NETIF_F_SG;
4304	}
4305
4306	/* TSO requires that SG is present as well. */
4307	if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4308		if (name)
4309			printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4310			       "SG feature.\n", name);
4311		features &= ~NETIF_F_TSO;
4312	}
4313
4314	if (features & NETIF_F_UFO) {
4315		if (!(features & NETIF_F_GEN_CSUM)) {
4316			if (name)
4317				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4318				       "since no NETIF_F_HW_CSUM feature.\n",
4319				       name);
4320			features &= ~NETIF_F_UFO;
4321		}
4322
4323		if (!(features & NETIF_F_SG)) {
4324			if (name)
4325				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4326				       "since no NETIF_F_SG feature.\n", name);
4327			features &= ~NETIF_F_UFO;
4328		}
4329	}
4330
4331	return features;
4332}
4333EXPORT_SYMBOL(netdev_fix_features);
4334
/* Drivers that (re-)assign dev->netdev_ops from ->init() or a similar
 * late hook must call this afterwards so that the legacy per-device
 * function pointers mirror the new ops table again. Does nothing
 * unless CONFIG_COMPAT_NET_DEV_OPS is enabled.
 */
void netdev_resync_ops(struct net_device *dev)
{
#ifdef CONFIG_COMPAT_NET_DEV_OPS
	const struct net_device_ops *nops = dev->netdev_ops;

	/* lifecycle */
	dev->init = nops->ndo_init;
	dev->uninit = nops->ndo_uninit;
	dev->open = nops->ndo_open;

	/* receive filtering and addressing */
	dev->change_rx_flags = nops->ndo_change_rx_flags;
	dev->set_rx_mode = nops->ndo_set_rx_mode;
	dev->set_multicast_list = nops->ndo_set_multicast_list;
	dev->set_mac_address = nops->ndo_set_mac_address;
	dev->validate_addr = nops->ndo_validate_addr;

	/* configuration */
	dev->do_ioctl = nops->ndo_do_ioctl;
	dev->set_config = nops->ndo_set_config;
	dev->change_mtu = nops->ndo_change_mtu;
	dev->neigh_setup = nops->ndo_neigh_setup;

	/* transmit watchdog and statistics */
	dev->tx_timeout = nops->ndo_tx_timeout;
	dev->get_stats = nops->ndo_get_stats;

	/* VLAN acceleration */
	dev->vlan_rx_register = nops->ndo_vlan_rx_register;
	dev->vlan_rx_add_vid = nops->ndo_vlan_rx_add_vid;
	dev->vlan_rx_kill_vid = nops->ndo_vlan_rx_kill_vid;

#ifdef CONFIG_NET_POLL_CONTROLLER
	dev->poll_controller = nops->ndo_poll_controller;
#endif
#endif
}
EXPORT_SYMBOL(netdev_resync_ops);
4367
/**
 *	register_netdevice	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	Callers must hold the rtnl semaphore. You may want
 *	register_netdev() instead of this.
 *
 *	BUGS:
 *	The locking appears insufficient to guarantee two parallel registers
 *	will not get the same name.
 */

int register_netdevice(struct net_device *dev)
{
	struct hlist_head *head;
	struct hlist_node *p;
	int ret;
	struct net *net = dev_net(dev);

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	might_sleep();

	/* When net_device's are persistent, this will be fatal. */
	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
	BUG_ON(!net);

	/* Set up the address list lock and the per-queue xmit locks. */
	spin_lock_init(&dev->addr_list_lock);
	netdev_set_addr_lockdep_class(dev);
	netdev_init_queue_locks(dev);

	/* -1 means "not set"; ndo_init may override it, see below. */
	dev->iflink = -1;

#ifdef CONFIG_COMPAT_NET_DEV_OPS
	/* Netdevice_ops API compatibility support.
	 * This is temporary until all network devices are converted.
	 */
	if (dev->netdev_ops) {
		netdev_resync_ops(dev);
	} else {
		char drivername[64];
		pr_info("%s (%s): not using net_device_ops yet\n",
			dev->name, netdev_drivername(dev, drivername, 64));

		/* This works only because net_device_ops and the
		   compatibility structure are the same. */
		dev->netdev_ops = (void *) &(dev->init);
	}
#endif

	/* Init, if this function is available */
	if (dev->netdev_ops->ndo_init) {
		ret = dev->netdev_ops->ndo_init(dev);
		if (ret) {
			/* Positive driver results are turned into -EIO. */
			if (ret > 0)
				ret = -EIO;
			goto out;
		}
	}

	/* From here on, failures must undo ndo_init via err_uninit. */
	if (!dev_valid_name(dev->name)) {
		ret = -EINVAL;
		goto err_uninit;
	}

	dev->ifindex = dev_new_index(net);
	/* Default iflink to the device's own index if still unset. */
	if (dev->iflink == -1)
		dev->iflink = dev->ifindex;

	/* Check for existence of name */
	head = dev_name_hash(net, dev->name);
	hlist_for_each(p, head) {
		struct net_device *d
			= hlist_entry(p, struct net_device, name_hlist);
		if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
			ret = -EEXIST;
			goto err_uninit;
		}
	}

	/* Fix illegal checksum combinations */
	if ((dev->features & NETIF_F_HW_CSUM) &&
	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
		printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
		       dev->name);
		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
	}

	if ((dev->features & NETIF_F_NO_CSUM) &&
	    (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
		printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
		       dev->name);
		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
	}

	/* Drop SG/TSO/UFO bits whose prerequisites are missing. */
	dev->features = netdev_fix_features(dev->features, dev->name);

	/* Enable software GSO if SG is supported. */
	if (dev->features & NETIF_F_SG)
		dev->features |= NETIF_F_GSO;

	netdev_initialize_kobject(dev);
	ret = netdev_register_kobject(dev);
	if (ret)
		goto err_uninit;
	dev->reg_state = NETREG_REGISTERED;

	/*
	 *	Default initial state at registry is that the
	 *	device is present.
	 */

	set_bit(__LINK_STATE_PRESENT, &dev->state);

	dev_init_scheduler(dev);
	/* Take a device reference; rollback_registered() has a dev_put(). */
	dev_hold(dev);
	list_netdevice(dev);

	/* Notify protocols, that a new device appeared. */
	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
	ret = notifier_to_errno(ret);
	if (ret) {
		/* A notifier vetoed the device: undo the registration. */
		rollback_registered(dev);
		dev->reg_state = NETREG_UNREGISTERED;
	}

out:
	return ret;

err_uninit:
	/* Undo ndo_init before reporting the failure. */
	if (dev->netdev_ops->ndo_uninit)
		dev->netdev_ops->ndo_uninit(dev);
	goto out;
}
4508
4509/**
4510 *	init_dummy_netdev	- init a dummy network device for NAPI
4511 *	@dev: device to init
4512 *
4513 *	This takes a network device structure and initialize the minimum
4514 *	amount of fields so it can be used to schedule NAPI polls without
4515 *	registering a full blown interface. This is to be used by drivers
4516 *	that need to tie several hardware interfaces to a single NAPI
4517 *	poll scheduler due to HW limitations.
4518 */
4519int init_dummy_netdev(struct net_device *dev)
4520{
4521	/* Clear everything. Note we don't initialize spinlocks
4522	 * are they aren't supposed to be taken by any of the
4523	 * NAPI code and this dummy netdev is supposed to be
4524	 * only ever used for NAPI polls
4525	 */
4526	memset(dev, 0, sizeof(struct net_device));
4527
4528	/* make sure we BUG if trying to hit standard
4529	 * register/unregister code path
4530	 */
4531	dev->reg_state = NETREG_DUMMY;
4532
4533	/* initialize the ref count */
4534	atomic_set(&dev->refcnt, 1);
4535
4536	/* NAPI wants this */
4537	INIT_LIST_HEAD(&dev->napi_list);
4538
4539	/* a dummy interface is started by default */
4540	set_bit(__LINK_STATE_PRESENT, &dev->state);
4541	set_bit(__LINK_STATE_START, &dev->state);
4542
4543	return 0;
4544}
4545EXPORT_SYMBOL_GPL(init_dummy_netdev);
4546
4547
4548/**
4549 *	register_netdev	- register a network device
4550 *	@dev: device to register
4551 *
4552 *	Take a completed network device structure and add it to the kernel
4553 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4554 *	chain. 0 is returned on success. A negative errno code is returned
4555 *	on a failure to set up the device, or if the name is a duplicate.
4556 *
4557 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
4558 *	and expands the device name if you passed a format string to
4559 *	alloc_netdev.
4560 */
4561int register_netdev(struct net_device *dev)
4562{
4563	int err;
4564
4565	rtnl_lock();
4566
4567	/*
4568	 * If the name is a format string the caller wants us to do a
4569	 * name allocation.
4570	 */
4571	if (strchr(dev->name, '%')) {
4572		err = dev_alloc_name(dev, dev->name);
4573		if (err < 0)
4574			goto out;
4575	}
4576
4577	err = register_netdevice(dev);
4578out:
4579	rtnl_unlock();
4580	return err;
4581}
4582EXPORT_SYMBOL(register_netdev);
4583
4584/*
4585 * netdev_wait_allrefs - wait until all references are gone.
4586 *
4587 * This is called when unregistering network devices.
4588 *
4589 * Any protocol or device that holds a reference should register
4590 * for netdevice notification, and cleanup and put back the
4591 * reference if they receive an UNREGISTER event.
4592 * We can get stuck here if buggy protocols don't correctly
4593 * call dev_put.
4594 */
4595static void netdev_wait_allrefs(struct net_device *dev)
4596{
4597	unsigned long rebroadcast_time, warning_time;
4598
4599	rebroadcast_time = warning_time = jiffies;
4600	while (atomic_read(&dev->refcnt) != 0) {
4601		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
4602			rtnl_lock();
4603
4604			/* Rebroadcast unregister notification */
4605			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4606
4607			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
4608				     &dev->state)) {
4609				/* We must not have linkwatch events
4610				 * pending on unregister. If this
4611				 * happens, we simply run the queue
4612				 * unscheduled, resulting in a noop
4613				 * for this device.
4614				 */
4615				linkwatch_run_queue();
4616			}
4617
4618			__rtnl_unlock();
4619
4620			rebroadcast_time = jiffies;
4621		}
4622
4623		msleep(250);
4624
4625		if (time_after(jiffies, warning_time + 10 * HZ)) {
4626			printk(KERN_EMERG "unregister_netdevice: "
4627			       "waiting for %s to become free. Usage "
4628			       "count = %d\n",
4629			       dev->name, atomic_read(&dev->refcnt));
4630			warning_time = jiffies;
4631		}
4632	}
4633}
4634
4635/* The sequence is:
4636 *
4637 *	rtnl_lock();
4638 *	...
4639 *	register_netdevice(x1);
4640 *	register_netdevice(x2);
4641 *	...
4642 *	unregister_netdevice(y1);
4643 *	unregister_netdevice(y2);
4644 *      ...
4645 *	rtnl_unlock();
4646 *	free_netdev(y1);
4647 *	free_netdev(y2);
4648 *
4649 * We are invoked by rtnl_unlock().
4650 * This allows us to deal with problems:
4651 * 1) We can delete sysfs objects which invoke hotplug
4652 *    without deadlocking with linkwatch via keventd.
4653 * 2) Since we run with the RTNL semaphore not held, we can sleep
4654 *    safely in order to wait for the netdev refcnt to drop to zero.
4655 *
4656 * We must not return until all unregister events added during
4657 * the interval the lock was held have been completed.
4658 */
4659void netdev_run_todo(void)
4660{
4661	struct list_head list;
4662
4663	/* Snapshot list, allow later requests */
4664	list_replace_init(&net_todo_list, &list);
4665
4666	__rtnl_unlock();
4667
4668	while (!list_empty(&list)) {
4669		struct net_device *dev
4670			= list_entry(list.next, struct net_device, todo_list);
4671		list_del(&dev->todo_list);
4672
4673		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
4674			printk(KERN_ERR "network todo '%s' but state %d\n",
4675			       dev->name, dev->reg_state);
4676			dump_stack();
4677			continue;
4678		}
4679
4680		dev->reg_state = NETREG_UNREGISTERED;
4681
4682		on_each_cpu(flush_backlog, dev, 1);
4683
4684		netdev_wait_allrefs(dev);
4685
4686		/* paranoia */
4687		BUG_ON(atomic_read(&dev->refcnt));
4688		WARN_ON(dev->ip_ptr);
4689		WARN_ON(dev->ip6_ptr);
4690		WARN_ON(dev->dn_ptr);
4691
4692		if (dev->destructor)
4693			dev->destructor(dev);
4694
4695		/* Free network device */
4696		kobject_put(&dev->dev.kobj);
4697	}
4698}
4699
4700/**
4701 *	dev_get_stats	- get network device statistics
4702 *	@dev: device to get statistics from
4703 *
4704 *	Get network statistics from device. The device driver may provide
4705 *	its own method by setting dev->netdev_ops->get_stats; otherwise
4706 *	the internal statistics structure is used.
4707 */
4708const struct net_device_stats *dev_get_stats(struct net_device *dev)
4709 {
4710	const struct net_device_ops *ops = dev->netdev_ops;
4711
4712	if (ops->ndo_get_stats)
4713		return ops->ndo_get_stats(dev);
4714	else
4715		return &dev->stats;
4716}
4717EXPORT_SYMBOL(dev_get_stats);
4718
/*
 * Per-queue initializer: point @queue back at the device that owns it.
 * The third argument is not used; the three-argument form lets this
 * function be handed to netdev_for_each_tx_queue().
 */
static void netdev_init_one_queue(struct net_device *dev,
				  struct netdev_queue *queue,
				  void *_unused)
{
	queue->dev = dev;
}
4725
4726static void netdev_init_queues(struct net_device *dev)
4727{
4728	netdev_init_one_queue(dev, &dev->rx_queue, NULL);
4729	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
4730	spin_lock_init(&dev->tx_global_lock);
4731}
4732
4733/**
4734 *	alloc_netdev_mq - allocate network device
4735 *	@sizeof_priv:	size of private data to allocate space for
4736 *	@name:		device name format string
4737 *	@setup:		callback to initialize device
4738 *	@queue_count:	the number of subqueues to allocate
4739 *
4740 *	Allocates a struct net_device with private data area for driver use
4741 *	and performs basic initialization.  Also allocates subquue structs
4742 *	for each queue on the device at the end of the netdevice.
4743 */
4744struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
4745		void (*setup)(struct net_device *), unsigned int queue_count)
4746{
4747	struct netdev_queue *tx;
4748	struct net_device *dev;
4749	size_t alloc_size;
4750	void *p;
4751
4752	BUG_ON(strlen(name) >= sizeof(dev->name));
4753
4754	alloc_size = sizeof(struct net_device);
4755	if (sizeof_priv) {
4756		/* ensure 32-byte alignment of private area */
4757		alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
4758		alloc_size += sizeof_priv;
4759	}
4760	/* ensure 32-byte alignment of whole construct */
4761	alloc_size += NETDEV_ALIGN_CONST;
4762
4763	p = kzalloc(alloc_size, GFP_KERNEL);
4764	if (!p) {
4765		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
4766		return NULL;
4767	}
4768
4769	tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
4770	if (!tx) {
4771		printk(KERN_ERR "alloc_netdev: Unable to allocate "
4772		       "tx qdiscs.\n");
4773		kfree(p);
4774		return NULL;
4775	}
4776
4777	dev = (struct net_device *)
4778		(((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
4779	dev->padded = (char *)dev - (char *)p;
4780	dev_net_set(dev, &init_net);
4781
4782	dev->_tx = tx;
4783	dev->num_tx_queues = queue_count;
4784	dev->real_num_tx_queues = queue_count;
4785
4786	dev->gso_max_size = GSO_MAX_SIZE;
4787
4788	netdev_init_queues(dev);
4789
4790	INIT_LIST_HEAD(&dev->napi_list);
4791	setup(dev);
4792	strcpy(dev->name, name);
4793	return dev;
4794}
4795EXPORT_SYMBOL(alloc_netdev_mq);
4796
4797/**
4798 *	free_netdev - free network device
4799 *	@dev: device
4800 *
4801 *	This function does the last stage of destroying an allocated device
4802 * 	interface. The reference to the device object is released.
4803 *	If this is the last reference then it will be freed.
4804 */
4805void free_netdev(struct net_device *dev)
4806{
4807	struct napi_struct *p, *n;
4808
4809	release_net(dev_net(dev));
4810
4811	kfree(dev->_tx);
4812
4813	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
4814		netif_napi_del(p);
4815
4816	/*  Compatibility with error handling in drivers */
4817	if (dev->reg_state == NETREG_UNINITIALIZED) {
4818		kfree((char *)dev - dev->padded);
4819		return;
4820	}
4821
4822	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
4823	dev->reg_state = NETREG_RELEASED;
4824
4825	/* will free via device release */
4826	put_device(&dev->dev);
4827}
4828
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	Implemented as might_sleep() followed by synchronize_rcu().
 */
void synchronize_net(void)
{
	might_sleep();
	synchronize_rcu();
}
4840
/**
 *	unregister_netdevice - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	The work is split in two: rollback_registered() runs immediately,
 *	then the device is put on the todo list with net_set_todo() so
 *	that the remainder is processed after the rtnl semaphore is
 *	released.
 *
 *	Callers must hold the rtnl semaphore (checked with ASSERT_RTNL()).
 *	You may want unregister_netdev() instead of this.
 */

void unregister_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	rollback_registered(dev);
	/* Finish processing unregister after unlock */
	net_set_todo(dev);
}
4860
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	This is just a wrapper for unregister_netdevice() that takes
 *	the rtnl semaphore around the call.  In general you want to use
 *	this and not unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}

EXPORT_SYMBOL(unregister_netdev);
4880
4881/**
4882 *	dev_change_net_namespace - move device to different nethost namespace
4883 *	@dev: device
4884 *	@net: network namespace
4885 *	@pat: If not NULL name pattern to try if the current device name
4886 *	      is already taken in the destination network namespace.
4887 *
4888 *	This function shuts down a device interface and moves it
4889 *	to a new network namespace. On success 0 is returned, on
4890 *	a failure a netagive errno code is returned.
4891 *
4892 *	Callers must hold the rtnl semaphore.
4893 */
4894
4895int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
4896{
4897	char buf[IFNAMSIZ];
4898	const char *destname;
4899	int err;
4900
4901	ASSERT_RTNL();
4902
4903	/* Don't allow namespace local devices to be moved. */
4904	err = -EINVAL;
4905	if (dev->features & NETIF_F_NETNS_LOCAL)
4906		goto out;
4907
4908#ifdef CONFIG_SYSFS
4909	/* Don't allow real devices to be moved when sysfs
4910	 * is enabled.
4911	 */
4912	err = -EINVAL;
4913	if (dev->dev.parent)
4914		goto out;
4915#endif
4916
4917	/* Ensure the device has been registrered */
4918	err = -EINVAL;
4919	if (dev->reg_state != NETREG_REGISTERED)
4920		goto out;
4921
4922	/* Get out if there is nothing todo */
4923	err = 0;
4924	if (net_eq(dev_net(dev), net))
4925		goto out;
4926
4927	/* Pick the destination device name, and ensure
4928	 * we can use it in the destination network namespace.
4929	 */
4930	err = -EEXIST;
4931	destname = dev->name;
4932	if (__dev_get_by_name(net, destname)) {
4933		/* We get here if we can't use the current device name */
4934		if (!pat)
4935			goto out;
4936		if (!dev_valid_name(pat))
4937			goto out;
4938		if (strchr(pat, '%')) {
4939			if (__dev_alloc_name(net, pat, buf) < 0)
4940				goto out;
4941			destname = buf;
4942		} else
4943			destname = pat;
4944		if (__dev_get_by_name(net, destname))
4945			goto out;
4946	}
4947
4948	/*
4949	 * And now a mini version of register_netdevice unregister_netdevice.
4950	 */
4951
4952	/* If device is running close it first. */
4953	dev_close(dev);
4954
4955	/* And unlink it from device chain */
4956	err = -ENODEV;
4957	unlist_netdevice(dev);
4958
4959	synchronize_net();
4960
4961	/* Shutdown queueing discipline. */
4962	dev_shutdown(dev);
4963
4964	/* Notify protocols, that we are about to destroy
4965	   this device. They should clean all the things.
4966	*/
4967	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4968
4969	/*
4970	 *	Flush the unicast and multicast chains
4971	 */
4972	dev_addr_discard(dev);
4973
4974	netdev_unregister_kobject(dev);
4975
4976	/* Actually switch the network namespace */
4977	dev_net_set(dev, net);
4978
4979	/* Assign the new device name */
4980	if (destname != dev->name)
4981		strcpy(dev->name, destname);
4982
4983	/* If there is an ifindex conflict assign a new one */
4984	if (__dev_get_by_index(net, dev->ifindex)) {
4985		int iflink = (dev->iflink == dev->ifindex);
4986		dev->ifindex = dev_new_index(net);
4987		if (iflink)
4988			dev->iflink = dev->ifindex;
4989	}
4990
4991	/* Fixup kobjects */
4992	err = netdev_register_kobject(dev);
4993	WARN_ON(err);
4994
4995	/* Add the device back in the hashes */
4996	list_netdevice(dev);
4997
4998	/* Notify protocols, that a new device appeared. */
4999	call_netdevice_notifiers(NETDEV_REGISTER, dev);
5000
5001	synchronize_net();
5002	err = 0;
5003out:
5004	return err;
5005}
5006
5007static int dev_cpu_callback(struct notifier_block *nfb,
5008			    unsigned long action,
5009			    void *ocpu)
5010{
5011	struct sk_buff **list_skb;
5012	struct Qdisc **list_net;
5013	struct sk_buff *skb;
5014	unsigned int cpu, oldcpu = (unsigned long)ocpu;
5015	struct softnet_data *sd, *oldsd;
5016
5017	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5018		return NOTIFY_OK;
5019
5020	local_irq_disable();
5021	cpu = smp_processor_id();
5022	sd = &per_cpu(softnet_data, cpu);
5023	oldsd = &per_cpu(softnet_data, oldcpu);
5024
5025	/* Find end of our completion_queue. */
5026	list_skb = &sd->completion_queue;
5027	while (*list_skb)
5028		list_skb = &(*list_skb)->next;
5029	/* Append completion queue from offline CPU. */
5030	*list_skb = oldsd->completion_queue;
5031	oldsd->completion_queue = NULL;
5032
5033	/* Find end of our output_queue. */
5034	list_net = &sd->output_queue;
5035	while (*list_net)
5036		list_net = &(*list_net)->next_sched;
5037	/* Append output queue from offline CPU. */
5038	*list_net = oldsd->output_queue;
5039	oldsd->output_queue = NULL;
5040
5041	raise_softirq_irqoff(NET_TX_SOFTIRQ);
5042	local_irq_enable();
5043
5044	/* Process offline CPU's input_pkt_queue */
5045	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
5046		netif_rx(skb);
5047
5048	return NOTIFY_OK;
5049}
5050
5051
5052/**
5053 *	netdev_increment_features - increment feature set by one
5054 *	@all: current feature set
5055 *	@one: new feature set
5056 *	@mask: mask feature set
5057 *
5058 *	Computes a new feature set after adding a device with feature set
5059 *	@one to the master device with current feature set @all.  Will not
5060 *	enable anything that is off in @mask. Returns the new feature set.
5061 */
5062unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5063					unsigned long mask)
5064{
5065	/* If device needs checksumming, downgrade to it. */
5066        if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5067		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5068	else if (mask & NETIF_F_ALL_CSUM) {
5069		/* If one device supports v4/v6 checksumming, set for all. */
5070		if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5071		    !(all & NETIF_F_GEN_CSUM)) {
5072			all &= ~NETIF_F_ALL_CSUM;
5073			all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5074		}
5075
5076		/* If one device supports hw checksumming, set for all. */
5077		if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5078			all &= ~NETIF_F_ALL_CSUM;
5079			all |= NETIF_F_HW_CSUM;
5080		}
5081	}
5082
5083	one |= NETIF_F_ALL_CSUM;
5084
5085	one |= all & NETIF_F_ONE_FOR_ALL;
5086	all &= one | NETIF_F_LLTX | NETIF_F_GSO;
5087	all |= one & mask & NETIF_F_ONE_FOR_ALL;
5088
5089	return all;
5090}
5091EXPORT_SYMBOL(netdev_increment_features);
5092
5093static struct hlist_head *netdev_create_hash(void)
5094{
5095	int i;
5096	struct hlist_head *hash;
5097
5098	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5099	if (hash != NULL)
5100		for (i = 0; i < NETDEV_HASHENTRIES; i++)
5101			INIT_HLIST_HEAD(&hash[i]);
5102
5103	return hash;
5104}
5105
5106/* Initialize per network namespace state */
5107static int __net_init netdev_init(struct net *net)
5108{
5109	INIT_LIST_HEAD(&net->dev_base_head);
5110
5111	net->dev_name_head = netdev_create_hash();
5112	if (net->dev_name_head == NULL)
5113		goto err_name;
5114
5115	net->dev_index_head = netdev_create_hash();
5116	if (net->dev_index_head == NULL)
5117		goto err_idx;
5118
5119	return 0;
5120
5121err_idx:
5122	kfree(net->dev_name_head);
5123err_name:
5124	return -ENOMEM;
5125}
5126
5127/**
5128 *	netdev_drivername - network driver for the device
5129 *	@dev: network device
5130 *	@buffer: buffer for resulting name
5131 *	@len: size of buffer
5132 *
5133 *	Determine network driver for device.
5134 */
5135char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5136{
5137	const struct device_driver *driver;
5138	const struct device *parent;
5139
5140	if (len <= 0 || !buffer)
5141		return buffer;
5142	buffer[0] = 0;
5143
5144	parent = dev->dev.parent;
5145
5146	if (!parent)
5147		return buffer;
5148
5149	driver = parent->driver;
5150	if (driver && driver->name)
5151		strlcpy(buffer, driver->name, len);
5152	return buffer;
5153}
5154
5155static void __net_exit netdev_exit(struct net *net)
5156{
5157	kfree(net->dev_name_head);
5158	kfree(net->dev_index_head);
5159}
5160
5161static struct pernet_operations __net_initdata netdev_net_ops = {
5162	.init = netdev_init,
5163	.exit = netdev_exit,
5164};
5165
5166static void __net_exit default_device_exit(struct net *net)
5167{
5168	struct net_device *dev;
5169	/*
5170	 * Push all migratable of the network devices back to the
5171	 * initial network namespace
5172	 */
5173	rtnl_lock();
5174restart:
5175	for_each_netdev(net, dev) {
5176		int err;
5177		char fb_name[IFNAMSIZ];
5178
5179		/* Ignore unmoveable devices (i.e. loopback) */
5180		if (dev->features & NETIF_F_NETNS_LOCAL)
5181			continue;
5182
5183		/* Delete virtual devices */
5184		if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) {
5185			dev->rtnl_link_ops->dellink(dev);
5186			goto restart;
5187		}
5188
5189		/* Push remaing network devices to init_net */
5190		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5191		err = dev_change_net_namespace(dev, &init_net, fb_name);
5192		if (err) {
5193			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
5194				__func__, dev->name, err);
5195			BUG();
5196		}
5197		goto restart;
5198	}
5199	rtnl_unlock();
5200}
5201
/*
 * Exit-only per-namespace hooks; registered from net_dev_init() with
 * register_pernet_device(), right after loopback_net_ops.
 */
static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
};
5205
/*
 *	Initialize the DEV module: proc and kobject support, the packet
 *	type lists, the per network namespace operations, the per-cpu
 *	receive queues, the networking softirqs and the cpu notifier.
 *
 *	Returns 0 on success.  Every failure is reported as -ENOMEM,
 *	whatever the failing helper returned.
 *
 *	NOTE(review): the error paths jump straight to the return and do
 *	not undo the steps that already succeeded.
 */

/*
 *       This is called single threaded during boot, so no need
 *       to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	/* May only run while dev_boot_phase is still set (it is
	 * cleared further down).
	 */
	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	/* Start with empty packet type lists */
	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 *	Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct softnet_data *queue;

		queue = &per_cpu(softnet_data, i);
		skb_queue_head_init(&queue->input_pkt_queue);
		queue->completion_queue = NULL;
		INIT_LIST_HEAD(&queue->poll_list);

		/* The backlog NAPI instance is polled by process_backlog */
		queue->backlog.poll = process_backlog;
		queue->backlog.weight = weight_p;
		queue->backlog.gro_list = NULL;
		queue->backlog.gro_count = 0;
	}

	dev_boot_phase = 0;

	/* The loopback device is special: if any other network device
	 * is present in a network namespace the loopback device must
	 * be present too. Since we now dynamically allocate and free the
	 * loopback device, ensure this invariant is maintained by
	 * keeping the loopback device as the first device on the
	 * list of network devices.  This ensures the loopback device
	 * is the first device that appears and the last network device
	 * that disappears.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	hotcpu_notifier(dev_cpu_callback, 0);
	dst_init();
	dev_mcast_init();
	rc = 0;
out:
	return rc;
}

subsys_initcall(net_dev_init);
5283
/*
 * Fill skb_tx_hashrnd with random bytes.  Hooked up through
 * late_initcall_sync() below; always returns 0.
 */
static int __init initialize_hashrnd(void)
{
	get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd));
	return 0;
}

late_initcall_sync(initialize_hashrnd);
5291
/*
 * Symbols of this file exported for use by modules and not already
 * exported next to their definition.
 */
EXPORT_SYMBOL(__dev_get_by_index);
EXPORT_SYMBOL(__dev_get_by_name);
EXPORT_SYMBOL(__dev_remove_pack);
EXPORT_SYMBOL(dev_valid_name);
EXPORT_SYMBOL(dev_add_pack);
EXPORT_SYMBOL(dev_alloc_name);
EXPORT_SYMBOL(dev_close);
EXPORT_SYMBOL(dev_get_by_flags);
EXPORT_SYMBOL(dev_get_by_index);
EXPORT_SYMBOL(dev_get_by_name);
EXPORT_SYMBOL(dev_open);
EXPORT_SYMBOL(dev_queue_xmit);
EXPORT_SYMBOL(dev_remove_pack);
EXPORT_SYMBOL(dev_set_allmulti);
EXPORT_SYMBOL(dev_set_promiscuity);
EXPORT_SYMBOL(dev_change_flags);
EXPORT_SYMBOL(dev_set_mtu);
EXPORT_SYMBOL(dev_set_mac_address);
EXPORT_SYMBOL(free_netdev);
EXPORT_SYMBOL(netdev_boot_setup_check);
EXPORT_SYMBOL(netdev_set_master);
EXPORT_SYMBOL(netdev_state_change);
EXPORT_SYMBOL(netif_receive_skb);
EXPORT_SYMBOL(netif_rx);
EXPORT_SYMBOL(register_gifconf);
EXPORT_SYMBOL(register_netdevice);
EXPORT_SYMBOL(register_netdevice_notifier);
EXPORT_SYMBOL(skb_checksum_help);
EXPORT_SYMBOL(synchronize_net);
EXPORT_SYMBOL(unregister_netdevice);
EXPORT_SYMBOL(unregister_netdevice_notifier);
EXPORT_SYMBOL(net_enable_timestamp);
EXPORT_SYMBOL(net_disable_timestamp);
EXPORT_SYMBOL(dev_get_flags);

/* The bridge hooks are exported only when bridging is built in or modular */
#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
EXPORT_SYMBOL(br_handle_frame_hook);
EXPORT_SYMBOL(br_fdb_get_hook);
EXPORT_SYMBOL(br_fdb_put_hook);
#endif

EXPORT_SYMBOL(dev_load);

EXPORT_PER_CPU_SYMBOL(softnet_data);