net/core/dev.c at v2.6.34 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / core / dev.c
at v2.6.34 149 kB view raw
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <asm/system.h>
  77#include <linux/bitops.h>
  78#include <linux/capability.h>
  79#include <linux/cpu.h>
  80#include <linux/types.h>
  81#include <linux/kernel.h>
  82#include <linux/hash.h>
  83#include <linux/slab.h>
  84#include <linux/sched.h>
  85#include <linux/mutex.h>
  86#include <linux/string.h>
  87#include <linux/mm.h>
  88#include <linux/socket.h>
  89#include <linux/sockios.h>
  90#include <linux/errno.h>
  91#include <linux/interrupt.h>
  92#include <linux/if_ether.h>
  93#include <linux/netdevice.h>
  94#include <linux/etherdevice.h>
  95#include <linux/ethtool.h>
  96#include <linux/notifier.h>
  97#include <linux/skbuff.h>
  98#include <net/net_namespace.h>
  99#include <net/sock.h>
 100#include <linux/rtnetlink.h>
 101#include <linux/proc_fs.h>
 102#include <linux/seq_file.h>
 103#include <linux/stat.h>
 104#include <linux/if_bridge.h>
 105#include <linux/if_macvlan.h>
 106#include <net/dst.h>
 107#include <net/pkt_sched.h>
 108#include <net/checksum.h>
 109#include <net/xfrm.h>
 110#include <linux/highmem.h>
 111#include <linux/init.h>
 112#include <linux/kmod.h>
 113#include <linux/module.h>
 114#include <linux/netpoll.h>
 115#include <linux/rcupdate.h>
 116#include <linux/delay.h>
 117#include <net/wext.h>
 118#include <net/iw_handler.h>
 119#include <asm/current.h>
 120#include <linux/audit.h>
 121#include <linux/dmaengine.h>
 122#include <linux/err.h>
 123#include <linux/ctype.h>
 124#include <linux/if_arp.h>
 125#include <linux/if_vlan.h>
 126#include <linux/ip.h>
 127#include <net/ip.h>
 128#include <linux/ipv6.h>
 129#include <linux/in.h>
 130#include <linux/jhash.h>
 131#include <linux/random.h>
 132#include <trace/events/napi.h>
 133
 134#include "net-sysfs.h"
 135
  136/* Instead of increasing this, you should create a hash table. */
/* NOTE(review): presumably the cap on skbs held on one GRO list; the code
 * that uses this constant is outside this view - confirm there.
 */
  137#define MAX_GRO_SKBS 8
  138
  139/* This should be increased if a protocol with a bigger head is added. */
/* Defined as MAX_HEADER plus a fixed 128 bytes. */
  140#define GRO_MAX_HEAD (MAX_HEADER + 128)
 141
 142/*
 143 *	The list of packet types we will receive (as opposed to discard)
 144 *	and the routines to invoke.
 145 *
 146 *	Why 16. Because with 16 the only overlap we get on a hash of the
 147 *	low nibble of the protocol value is RARP/SNAP/X.25.
 148 *
 149 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 150 *             sure which should go first, but I bet it won't make much
 151 *             difference if we are running VLANs.  The good news is that
 152 *             this protocol won't be in the list unless compiled in, so
 153 *             the average user (w/out VLANs) will not be adversely affected.
 154 *             --BLG
 155 *
 156 *		0800	IP
 157 *		8100    802.1Q VLAN
 158 *		0001	802.3
 159 *		0002	AX.25
 160 *		0004	802.2
 161 *		8035	RARP
 162 *		0005	SNAP
 163 *		0805	X.25
 164 *		0806	ARP
 165 *		8137	IPX
 166 *		0009	Localtalk
 167 *		86DD	IPv6
 168 */
 169
/* Number of buckets in ptype_base[]; the bucket index is
 * ntohs(type) & PTYPE_HASH_MASK (see dev_add_pack()).
 */
  170#define PTYPE_HASH_SIZE	(16)
  171#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
  172
/* Taken (with BHs disabled) by dev_add_pack() and __dev_remove_pack()
 * around every modification of ptype_base[] and ptype_all.
 */
  173static DEFINE_SPINLOCK(ptype_lock);
/* Handlers for one specific protocol, hashed by protocol value. */
  174static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
  175static struct list_head ptype_all __read_mostly;	/* Taps: handlers registered for ETH_P_ALL */
 176
 177/*
 178 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 179 * semaphore.
 180 *
 181 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 182 *
 183 * Writers must hold the rtnl semaphore while they loop through the
 184 * dev_base_head list, and hold dev_base_lock for writing when they do the
 185 * actual updates.  This allows pure readers to access the list even
 186 * while a writer is preparing to update it.
 187 *
 188 * To put it another way, dev_base_lock is held for writing only to
 189 * protect against pure readers; the rtnl semaphore provides the
 190 * protection against other writers.
 191 *
 192 * See, for example usages, register_netdevice() and
 193 * unregister_netdevice(), which must be called with the rtnl
 194 * semaphore held.
 195 */
/* In this file the lock is taken for writing, with BHs disabled, around the
 * list and hash updates in list_netdevice(), unlist_netdevice() and
 * dev_change_name().
 */
  196DEFINE_RWLOCK(dev_base_lock);
  197EXPORT_SYMBOL(dev_base_lock);
 198
 199static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 200{
 201	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 202	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 203}
 204
 205static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 206{
 207	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 208}
 209
 210/* Device list insertion */
 211static int list_netdevice(struct net_device *dev)
 212{
 213	struct net *net = dev_net(dev);
 214
 215	ASSERT_RTNL();
 216
 217	write_lock_bh(&dev_base_lock);
 218	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 219	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 220	hlist_add_head_rcu(&dev->index_hlist,
 221			   dev_index_hash(net, dev->ifindex));
 222	write_unlock_bh(&dev_base_lock);
 223	return 0;
 224}
 225
 226/* Device list removal
 227 * caller must respect a RCU grace period before freeing/reusing dev
 228 */
 229static void unlist_netdevice(struct net_device *dev)
 230{
 231	ASSERT_RTNL();
 232
 233	/* Unlink dev from the device chain */
 234	write_lock_bh(&dev_base_lock);
 235	list_del_rcu(&dev->dev_list);
 236	hlist_del_rcu(&dev->name_hlist);
 237	hlist_del_rcu(&dev->index_hlist);
 238	write_unlock_bh(&dev_base_lock);
 239}
 240
 241/*
 242 *	Our notifier list
 243 */
 244
/* NOTE(review): presumably the chain walked by call_netdevice_notifiers();
 * its users are outside this view - confirm.
 */
  245static RAW_NOTIFIER_HEAD(netdev_chain);
 246
 247/*
 248 *	Device drivers call our routines to queue packets here. We empty the
 249 *	queue in the local softnet handler.
 250 */
 251
/* One struct softnet_data instance per CPU, exported as a per-CPU symbol. */
  252DEFINE_PER_CPU(struct softnet_data, softnet_data);
  253EXPORT_PER_CPU_SYMBOL(softnet_data);
 254
 255#ifdef CONFIG_LOCKDEP
 256/*
 257 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 258 * according to dev->type
 259 */
 260static const unsigned short netdev_lock_type[] =
 261	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 262	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 263	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 264	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 265	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 266	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 267	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 268	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 269	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 270	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 271	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 272	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 273	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 274	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 275	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
 276	 ARPHRD_VOID, ARPHRD_NONE};
 277
 278static const char *const netdev_lock_name[] =
 279	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 280	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 281	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 282	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 283	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 284	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 285	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 286	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 287	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 288	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 289	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 290	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 291	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 292	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 293	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
 294	 "_xmit_VOID", "_xmit_NONE"};
 295
 296static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 297static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 298
 299static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 300{
 301	int i;
 302
 303	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 304		if (netdev_lock_type[i] == dev_type)
 305			return i;
 306	/* the last key is used by default */
 307	return ARRAY_SIZE(netdev_lock_type) - 1;
 308}
 309
 310static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 311						 unsigned short dev_type)
 312{
 313	int i;
 314
 315	i = netdev_lock_pos(dev_type);
 316	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 317				   netdev_lock_name[i]);
 318}
 319
 320static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 321{
 322	int i;
 323
 324	i = netdev_lock_pos(dev->type);
 325	lockdep_set_class_and_name(&dev->addr_list_lock,
 326				   &netdev_addr_lock_key[i],
 327				   netdev_lock_name[i]);
 328}
 329#else
 330static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 331						 unsigned short dev_type)
 332{
 333}
 334static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 335{
 336}
 337#endif
 338
 339/*******************************************************************************
 340
 341		Protocol management and registration routines
 342
 343*******************************************************************************/
 344
 345/*
 346 *	Add a protocol ID to the list. Now that the input handler is
 347 *	smarter we can dispense with all the messy stuff that used to be
 348 *	here.
 349 *
 350 *	BEWARE!!! Protocol handlers, mangling input packets,
 351 *	MUST BE last in hash buckets and checking protocol handlers
 352 *	MUST start from promiscuous ptype_all chain in net_bh.
 353 *	It is true now, do not change it.
 354 *	Explanation follows: if protocol handler, mangling packet, will
 355 *	be the first on list, it is not able to sense, that packet
 356 *	is cloned and should be copied-on-write, so that it will
 357 *	change it and subsequent readers will get broken packet.
 358 *							--ANK (980803)
 359 */
 360
 361/**
 362 *	dev_add_pack - add packet handler
 363 *	@pt: packet type declaration
 364 *
 365 *	Add a protocol handler to the networking stack. The passed &packet_type
 366 *	is linked into kernel lists and may not be freed until it has been
 367 *	removed from the kernel lists.
 368 *
 369 *	This call does not sleep therefore it can not
 370 *	guarantee all CPU's that are in middle of receiving packets
 371 *	will see the new packet type (until the next received packet).
 372 */
 373
 374void dev_add_pack(struct packet_type *pt)
 375{
 376	int hash;
 377
 378	spin_lock_bh(&ptype_lock);
 379	if (pt->type == htons(ETH_P_ALL))
 380		list_add_rcu(&pt->list, &ptype_all);
 381	else {
 382		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
 383		list_add_rcu(&pt->list, &ptype_base[hash]);
 384	}
 385	spin_unlock_bh(&ptype_lock);
 386}
 387EXPORT_SYMBOL(dev_add_pack);
 388
 389/**
 390 *	__dev_remove_pack	 - remove packet handler
 391 *	@pt: packet type declaration
 392 *
 393 *	Remove a protocol handler that was previously added to the kernel
 394 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 395 *	from the kernel lists and can be freed or reused once this function
 396 *	returns.
 397 *
 398 *      The packet type might still be in use by receivers
 399 *	and must not be freed until after all the CPU's have gone
 400 *	through a quiescent state.
 401 */
 402void __dev_remove_pack(struct packet_type *pt)
 403{
 404	struct list_head *head;
 405	struct packet_type *pt1;
 406
 407	spin_lock_bh(&ptype_lock);
 408
 409	if (pt->type == htons(ETH_P_ALL))
 410		head = &ptype_all;
 411	else
 412		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 413
 414	list_for_each_entry(pt1, head, list) {
 415		if (pt == pt1) {
 416			list_del_rcu(&pt->list);
 417			goto out;
 418		}
 419	}
 420
 421	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 422out:
 423	spin_unlock_bh(&ptype_lock);
 424}
 425EXPORT_SYMBOL(__dev_remove_pack);
 426
 427/**
 428 *	dev_remove_pack	 - remove packet handler
 429 *	@pt: packet type declaration
 430 *
 431 *	Remove a protocol handler that was previously added to the kernel
 432 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 433 *	from the kernel lists and can be freed or reused once this function
 434 *	returns.
 435 *
 436 *	This call sleeps to guarantee that no CPU is looking at the packet
 437 *	type after return.
 438 */
 439void dev_remove_pack(struct packet_type *pt)
 440{
 441	__dev_remove_pack(pt);
 442
 443	synchronize_net();
 444}
 445EXPORT_SYMBOL(dev_remove_pack);
 446
 447/******************************************************************************
 448
 449		      Device Boot-time Settings Routines
 450
 451*******************************************************************************/
 452
  453/* Boot time configuration table
 *
 * Fixed-size array of NETDEV_BOOT_SETUP_MAX entries.  An entry whose name
 * starts with '\0' or ' ' is treated as free by netdev_boot_setup_add()
 * and is skipped by netdev_boot_setup_check().
 */
  454static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 455
 456/**
 457 *	netdev_boot_setup_add	- add new setup entry
 458 *	@name: name of the device
 459 *	@map: configured settings for the device
 460 *
 461 *	Adds new setup entry to the dev_boot_setup list.  The function
 462 *	returns 0 on error and 1 on success.  This is a generic routine to
 463 *	all netdevices.
 464 */
 465static int netdev_boot_setup_add(char *name, struct ifmap *map)
 466{
 467	struct netdev_boot_setup *s;
 468	int i;
 469
 470	s = dev_boot_setup;
 471	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 472		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 473			memset(s[i].name, 0, sizeof(s[i].name));
 474			strlcpy(s[i].name, name, IFNAMSIZ);
 475			memcpy(&s[i].map, map, sizeof(s[i].map));
 476			break;
 477		}
 478	}
 479
 480	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 481}
 482
 483/**
 484 *	netdev_boot_setup_check	- check boot time settings
 485 *	@dev: the netdevice
 486 *
 487 * 	Check boot time settings for the device.
 488 *	The found settings are set for the device to be used
 489 *	later in the device probing.
 490 *	Returns 0 if no settings found, 1 if they are.
 491 */
 492int netdev_boot_setup_check(struct net_device *dev)
 493{
 494	struct netdev_boot_setup *s = dev_boot_setup;
 495	int i;
 496
 497	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 498		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 499		    !strcmp(dev->name, s[i].name)) {
 500			dev->irq 	= s[i].map.irq;
 501			dev->base_addr 	= s[i].map.base_addr;
 502			dev->mem_start 	= s[i].map.mem_start;
 503			dev->mem_end 	= s[i].map.mem_end;
 504			return 1;
 505		}
 506	}
 507	return 0;
 508}
 509EXPORT_SYMBOL(netdev_boot_setup_check);
 510
 511
 512/**
 513 *	netdev_boot_base	- get address from boot time settings
 514 *	@prefix: prefix for network device
 515 *	@unit: id for network device
 516 *
 517 * 	Check boot time settings for the base address of device.
 518 *	The found settings are set for the device to be used
 519 *	later in the device probing.
 520 *	Returns 0 if no settings found.
 521 */
 522unsigned long netdev_boot_base(const char *prefix, int unit)
 523{
 524	const struct netdev_boot_setup *s = dev_boot_setup;
 525	char name[IFNAMSIZ];
 526	int i;
 527
 528	sprintf(name, "%s%d", prefix, unit);
 529
 530	/*
 531	 * If device already registered then return base of 1
 532	 * to indicate not to probe for this interface
 533	 */
 534	if (__dev_get_by_name(&init_net, name))
 535		return 1;
 536
 537	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 538		if (!strcmp(name, s[i].name))
 539			return s[i].map.base_addr;
 540	return 0;
 541}
 542
 543/*
 544 * Saves at boot time configured settings for any netdevice.
 545 */
 546int __init netdev_boot_setup(char *str)
 547{
 548	int ints[5];
 549	struct ifmap map;
 550
 551	str = get_options(str, ARRAY_SIZE(ints), ints);
 552	if (!str || !*str)
 553		return 0;
 554
 555	/* Save settings */
 556	memset(&map, 0, sizeof(map));
 557	if (ints[0] > 0)
 558		map.irq = ints[1];
 559	if (ints[0] > 1)
 560		map.base_addr = ints[2];
 561	if (ints[0] > 2)
 562		map.mem_start = ints[3];
 563	if (ints[0] > 3)
 564		map.mem_end = ints[4];
 565
 566	/* Add new entry to the list */
 567	return netdev_boot_setup_add(str, &map);
 568}
 569
 570__setup("netdev=", netdev_boot_setup);
 571
 572/*******************************************************************************
 573
 574			    Device Interface Subroutines
 575
 576*******************************************************************************/
 577
 578/**
 579 *	__dev_get_by_name	- find a device by its name
 580 *	@net: the applicable net namespace
 581 *	@name: name to find
 582 *
 583 *	Find an interface by name. Must be called under RTNL semaphore
 584 *	or @dev_base_lock. If the name is found a pointer to the device
 585 *	is returned. If the name is not found then %NULL is returned. The
 586 *	reference counters are not incremented so the caller must be
 587 *	careful with locks.
 588 */
 589
 590struct net_device *__dev_get_by_name(struct net *net, const char *name)
 591{
 592	struct hlist_node *p;
 593	struct net_device *dev;
 594	struct hlist_head *head = dev_name_hash(net, name);
 595
 596	hlist_for_each_entry(dev, p, head, name_hlist)
 597		if (!strncmp(dev->name, name, IFNAMSIZ))
 598			return dev;
 599
 600	return NULL;
 601}
 602EXPORT_SYMBOL(__dev_get_by_name);
 603
 604/**
 605 *	dev_get_by_name_rcu	- find a device by its name
 606 *	@net: the applicable net namespace
 607 *	@name: name to find
 608 *
 609 *	Find an interface by name.
 610 *	If the name is found a pointer to the device is returned.
 611 * 	If the name is not found then %NULL is returned.
 612 *	The reference counters are not incremented so the caller must be
 613 *	careful with locks. The caller must hold RCU lock.
 614 */
 615
 616struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 617{
 618	struct hlist_node *p;
 619	struct net_device *dev;
 620	struct hlist_head *head = dev_name_hash(net, name);
 621
 622	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 623		if (!strncmp(dev->name, name, IFNAMSIZ))
 624			return dev;
 625
 626	return NULL;
 627}
 628EXPORT_SYMBOL(dev_get_by_name_rcu);
 629
 630/**
 631 *	dev_get_by_name		- find a device by its name
 632 *	@net: the applicable net namespace
 633 *	@name: name to find
 634 *
 635 *	Find an interface by name. This can be called from any
 636 *	context and does its own locking. The returned handle has
 637 *	the usage count incremented and the caller must use dev_put() to
 638 *	release it when it is no longer needed. %NULL is returned if no
 639 *	matching device is found.
 640 */
 641
 642struct net_device *dev_get_by_name(struct net *net, const char *name)
 643{
 644	struct net_device *dev;
 645
 646	rcu_read_lock();
 647	dev = dev_get_by_name_rcu(net, name);
 648	if (dev)
 649		dev_hold(dev);
 650	rcu_read_unlock();
 651	return dev;
 652}
 653EXPORT_SYMBOL(dev_get_by_name);
 654
 655/**
 656 *	__dev_get_by_index - find a device by its ifindex
 657 *	@net: the applicable net namespace
 658 *	@ifindex: index of device
 659 *
 660 *	Search for an interface by index. Returns %NULL if the device
 661 *	is not found or a pointer to the device. The device has not
 662 *	had its reference counter increased so the caller must be careful
 663 *	about locking. The caller must hold either the RTNL semaphore
 664 *	or @dev_base_lock.
 665 */
 666
 667struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 668{
 669	struct hlist_node *p;
 670	struct net_device *dev;
 671	struct hlist_head *head = dev_index_hash(net, ifindex);
 672
 673	hlist_for_each_entry(dev, p, head, index_hlist)
 674		if (dev->ifindex == ifindex)
 675			return dev;
 676
 677	return NULL;
 678}
 679EXPORT_SYMBOL(__dev_get_by_index);
 680
 681/**
 682 *	dev_get_by_index_rcu - find a device by its ifindex
 683 *	@net: the applicable net namespace
 684 *	@ifindex: index of device
 685 *
 686 *	Search for an interface by index. Returns %NULL if the device
 687 *	is not found or a pointer to the device. The device has not
 688 *	had its reference counter increased so the caller must be careful
 689 *	about locking. The caller must hold RCU lock.
 690 */
 691
 692struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 693{
 694	struct hlist_node *p;
 695	struct net_device *dev;
 696	struct hlist_head *head = dev_index_hash(net, ifindex);
 697
 698	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 699		if (dev->ifindex == ifindex)
 700			return dev;
 701
 702	return NULL;
 703}
 704EXPORT_SYMBOL(dev_get_by_index_rcu);
 705
 706
 707/**
 708 *	dev_get_by_index - find a device by its ifindex
 709 *	@net: the applicable net namespace
 710 *	@ifindex: index of device
 711 *
 712 *	Search for an interface by index. Returns NULL if the device
 713 *	is not found or a pointer to the device. The device returned has
 714 *	had a reference added and the pointer is safe until the user calls
 715 *	dev_put to indicate they have finished with it.
 716 */
 717
 718struct net_device *dev_get_by_index(struct net *net, int ifindex)
 719{
 720	struct net_device *dev;
 721
 722	rcu_read_lock();
 723	dev = dev_get_by_index_rcu(net, ifindex);
 724	if (dev)
 725		dev_hold(dev);
 726	rcu_read_unlock();
 727	return dev;
 728}
 729EXPORT_SYMBOL(dev_get_by_index);
 730
 731/**
 732 *	dev_getbyhwaddr - find a device by its hardware address
 733 *	@net: the applicable net namespace
 734 *	@type: media type of device
 735 *	@ha: hardware address
 736 *
 737 *	Search for an interface by MAC address. Returns NULL if the device
 738 *	is not found or a pointer to the device. The caller must hold the
 739 *	rtnl semaphore. The returned device has not had its ref count increased
 740 *	and the caller must therefore be careful about locking
 741 *
 742 *	BUGS:
 743 *	If the API was consistent this would be __dev_get_by_hwaddr
 744 */
 745
 746struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
 747{
 748	struct net_device *dev;
 749
 750	ASSERT_RTNL();
 751
 752	for_each_netdev(net, dev)
 753		if (dev->type == type &&
 754		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 755			return dev;
 756
 757	return NULL;
 758}
 759EXPORT_SYMBOL(dev_getbyhwaddr);
 760
 761struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 762{
 763	struct net_device *dev;
 764
 765	ASSERT_RTNL();
 766	for_each_netdev(net, dev)
 767		if (dev->type == type)
 768			return dev;
 769
 770	return NULL;
 771}
 772EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 773
 774struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 775{
 776	struct net_device *dev;
 777
 778	rtnl_lock();
 779	dev = __dev_getfirstbyhwtype(net, type);
 780	if (dev)
 781		dev_hold(dev);
 782	rtnl_unlock();
 783	return dev;
 784}
 785EXPORT_SYMBOL(dev_getfirstbyhwtype);
 786
 787/**
 788 *	dev_get_by_flags - find any device with given flags
 789 *	@net: the applicable net namespace
 790 *	@if_flags: IFF_* values
 791 *	@mask: bitmask of bits in if_flags to check
 792 *
 793 *	Search for any interface with the given flags. Returns NULL if a device
 794 *	is not found or a pointer to the device. The device returned has
 795 *	had a reference added and the pointer is safe until the user calls
 796 *	dev_put to indicate they have finished with it.
 797 */
 798
 799struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags,
 800				    unsigned short mask)
 801{
 802	struct net_device *dev, *ret;
 803
 804	ret = NULL;
 805	rcu_read_lock();
 806	for_each_netdev_rcu(net, dev) {
 807		if (((dev->flags ^ if_flags) & mask) == 0) {
 808			dev_hold(dev);
 809			ret = dev;
 810			break;
 811		}
 812	}
 813	rcu_read_unlock();
 814	return ret;
 815}
 816EXPORT_SYMBOL(dev_get_by_flags);
 817
 818/**
 819 *	dev_valid_name - check if name is okay for network device
 820 *	@name: name string
 821 *
 822 *	Network device names need to be valid file names to
 823 *	to allow sysfs to work.  We also disallow any kind of
 824 *	whitespace.
 825 */
 826int dev_valid_name(const char *name)
 827{
 828	if (*name == '\0')
 829		return 0;
 830	if (strlen(name) >= IFNAMSIZ)
 831		return 0;
 832	if (!strcmp(name, ".") || !strcmp(name, ".."))
 833		return 0;
 834
 835	while (*name) {
 836		if (*name == '/' || isspace(*name))
 837			return 0;
 838		name++;
 839	}
 840	return 1;
 841}
 842EXPORT_SYMBOL(dev_valid_name);
 843
 844/**
 845 *	__dev_alloc_name - allocate a name for a device
 846 *	@net: network namespace to allocate the device name in
 847 *	@name: name format string
 848 *	@buf:  scratch buffer and result name string
 849 *
 850 *	Passed a format string - eg "lt%d" it will try and find a suitable
 851 *	id. It scans list of devices to build up a free map, then chooses
 852 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 853 *	while allocating the name and adding the device in order to avoid
 854 *	duplicates.
 855 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 856 *	Returns the number of the unit assigned or a negative errno code.
 857 */
 858
 859static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 860{
 861	int i = 0;
 862	const char *p;
 863	const int max_netdevices = 8*PAGE_SIZE;
 864	unsigned long *inuse;
 865	struct net_device *d;
 866
 867	p = strnchr(name, IFNAMSIZ-1, '%');
 868	if (p) {
 869		/*
 870		 * Verify the string as this thing may have come from
 871		 * the user.  There must be either one "%d" and no other "%"
 872		 * characters.
 873		 */
 874		if (p[1] != 'd' || strchr(p + 2, '%'))
 875			return -EINVAL;
 876
 877		/* Use one page as a bit array of possible slots */
 878		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 879		if (!inuse)
 880			return -ENOMEM;
 881
 882		for_each_netdev(net, d) {
 883			if (!sscanf(d->name, name, &i))
 884				continue;
 885			if (i < 0 || i >= max_netdevices)
 886				continue;
 887
 888			/*  avoid cases where sscanf is not exact inverse of printf */
 889			snprintf(buf, IFNAMSIZ, name, i);
 890			if (!strncmp(buf, d->name, IFNAMSIZ))
 891				set_bit(i, inuse);
 892		}
 893
 894		i = find_first_zero_bit(inuse, max_netdevices);
 895		free_page((unsigned long) inuse);
 896	}
 897
 898	if (buf != name)
 899		snprintf(buf, IFNAMSIZ, name, i);
 900	if (!__dev_get_by_name(net, buf))
 901		return i;
 902
 903	/* It is possible to run out of possible slots
 904	 * when the name is long and there isn't enough space left
 905	 * for the digits, or if all bits are used.
 906	 */
 907	return -ENFILE;
 908}
 909
 910/**
 911 *	dev_alloc_name - allocate a name for a device
 912 *	@dev: device
 913 *	@name: name format string
 914 *
 915 *	Passed a format string - eg "lt%d" it will try and find a suitable
 916 *	id. It scans list of devices to build up a free map, then chooses
 917 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 918 *	while allocating the name and adding the device in order to avoid
 919 *	duplicates.
 920 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 921 *	Returns the number of the unit assigned or a negative errno code.
 922 */
 923
 924int dev_alloc_name(struct net_device *dev, const char *name)
 925{
 926	char buf[IFNAMSIZ];
 927	struct net *net;
 928	int ret;
 929
 930	BUG_ON(!dev_net(dev));
 931	net = dev_net(dev);
 932	ret = __dev_alloc_name(net, name, buf);
 933	if (ret >= 0)
 934		strlcpy(dev->name, buf, IFNAMSIZ);
 935	return ret;
 936}
 937EXPORT_SYMBOL(dev_alloc_name);
 938
 939static int dev_get_valid_name(struct net *net, const char *name, char *buf,
 940			      bool fmt)
 941{
 942	if (!dev_valid_name(name))
 943		return -EINVAL;
 944
 945	if (fmt && strchr(name, '%'))
 946		return __dev_alloc_name(net, name, buf);
 947	else if (__dev_get_by_name(net, name))
 948		return -EEXIST;
 949	else if (buf != name)
 950		strlcpy(buf, name, IFNAMSIZ);
 951
 952	return 0;
 953}
 954
 955/**
 956 *	dev_change_name - change name of a device
 957 *	@dev: device
 958 *	@newname: name (or format string) must be at least IFNAMSIZ
 959 *
 960 *	Change name of a device, can pass format strings "eth%d".
 961 *	for wildcarding.
 962 */
 963int dev_change_name(struct net_device *dev, const char *newname)
 964{
 965	char oldname[IFNAMSIZ];
 966	int err = 0;
 967	int ret;
 968	struct net *net;
 969
 970	ASSERT_RTNL();
 971	BUG_ON(!dev_net(dev));
 972
 973	net = dev_net(dev);
 974	if (dev->flags & IFF_UP)
 975		return -EBUSY;
 976
 977	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 978		return 0;
 979
 980	memcpy(oldname, dev->name, IFNAMSIZ);
 981
 982	err = dev_get_valid_name(net, newname, dev->name, 1);
 983	if (err < 0)
 984		return err;
 985
 986rollback:
 987	/* For now only devices in the initial network namespace
 988	 * are in sysfs.
 989	 */
 990	if (net_eq(net, &init_net)) {
 991		ret = device_rename(&dev->dev, dev->name);
 992		if (ret) {
 993			memcpy(dev->name, oldname, IFNAMSIZ);
 994			return ret;
 995		}
 996	}
 997
 998	write_lock_bh(&dev_base_lock);
 999	hlist_del(&dev->name_hlist);
1000	write_unlock_bh(&dev_base_lock);
1001
1002	synchronize_rcu();
1003
1004	write_lock_bh(&dev_base_lock);
1005	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1006	write_unlock_bh(&dev_base_lock);
1007
1008	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1009	ret = notifier_to_errno(ret);
1010
1011	if (ret) {
1012		/* err >= 0 after dev_alloc_name() or stores the first errno */
1013		if (err >= 0) {
1014			err = ret;
1015			memcpy(dev->name, oldname, IFNAMSIZ);
1016			goto rollback;
1017		} else {
1018			printk(KERN_ERR
1019			       "%s: name change rollback failed: %d.\n",
1020			       dev->name, ret);
1021		}
1022	}
1023
1024	return err;
1025}
1026
1027/**
1028 *	dev_set_alias - change ifalias of a device
1029 *	@dev: device
1030 *	@alias: name up to IFALIASZ
1031 *	@len: limit of bytes to copy from info
1032 *
1033 *	Set ifalias for a device,
1034 */
1035int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1036{
1037	ASSERT_RTNL();
1038
1039	if (len >= IFALIASZ)
1040		return -EINVAL;
1041
1042	if (!len) {
1043		if (dev->ifalias) {
1044			kfree(dev->ifalias);
1045			dev->ifalias = NULL;
1046		}
1047		return 0;
1048	}
1049
1050	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1051	if (!dev->ifalias)
1052		return -ENOMEM;
1053
1054	strlcpy(dev->ifalias, alias, len+1);
1055	return len;
1056}
1057
1058
1059/**
1060 *	netdev_features_change - device changes features
1061 *	@dev: device to cause notification
1062 *
1063 *	Called to indicate a device has changed features.
1064 */
1065void netdev_features_change(struct net_device *dev)
1066{
1067	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1068}
1069EXPORT_SYMBOL(netdev_features_change);
1070
1071/**
1072 *	netdev_state_change - device changes state
1073 *	@dev: device to cause notification
1074 *
1075 *	Called to indicate a device has changed state. This function calls
1076 *	the notifier chains for netdev_chain and sends a NEWLINK message
1077 *	to the routing socket.
1078 */
1079void netdev_state_change(struct net_device *dev)
1080{
1081	if (dev->flags & IFF_UP) {
1082		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1083		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1084	}
1085}
1086EXPORT_SYMBOL(netdev_state_change);
1087
/*
 *	netdev_bonding_change - raise @event for @dev on the netdev notifier
 *	chain.  The chain's return value is discarded.
 */
void netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	call_netdevice_notifiers(event, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);
1093
1094/**
1095 *	dev_load 	- load a network module
1096 *	@net: the applicable net namespace
1097 *	@name: name of interface
1098 *
1099 *	If a network interface is not present and the process has suitable
1100 *	privileges this function loads the module. If module loading is not
1101 *	available in this kernel then it becomes a nop.
1102 */
1103
1104void dev_load(struct net *net, const char *name)
1105{
1106	struct net_device *dev;
1107
1108	rcu_read_lock();
1109	dev = dev_get_by_name_rcu(net, name);
1110	rcu_read_unlock();
1111
1112	if (!dev && capable(CAP_NET_ADMIN))
1113		request_module("%s", name);
1114}
1115EXPORT_SYMBOL(dev_load);
1116
1117static int __dev_open(struct net_device *dev)
1118{
1119	const struct net_device_ops *ops = dev->netdev_ops;
1120	int ret;
1121
1122	ASSERT_RTNL();
1123
1124	/*
1125	 *	Is it even present?
1126	 */
1127	if (!netif_device_present(dev))
1128		return -ENODEV;
1129
1130	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1131	ret = notifier_to_errno(ret);
1132	if (ret)
1133		return ret;
1134
1135	/*
1136	 *	Call device private open method
1137	 */
1138	set_bit(__LINK_STATE_START, &dev->state);
1139
1140	if (ops->ndo_validate_addr)
1141		ret = ops->ndo_validate_addr(dev);
1142
1143	if (!ret && ops->ndo_open)
1144		ret = ops->ndo_open(dev);
1145
1146	/*
1147	 *	If it went open OK then:
1148	 */
1149
1150	if (ret)
1151		clear_bit(__LINK_STATE_START, &dev->state);
1152	else {
1153		/*
1154		 *	Set the flags.
1155		 */
1156		dev->flags |= IFF_UP;
1157
1158		/*
1159		 *	Enable NET_DMA
1160		 */
1161		net_dmaengine_get();
1162
1163		/*
1164		 *	Initialize multicasting status
1165		 */
1166		dev_set_rx_mode(dev);
1167
1168		/*
1169		 *	Wakeup transmit queue engine
1170		 */
1171		dev_activate(dev);
1172	}
1173
1174	return ret;
1175}
1176
1177/**
1178 *	dev_open	- prepare an interface for use.
1179 *	@dev:	device to open
1180 *
1181 *	Takes a device from down to up state. The device's private open
1182 *	function is invoked and then the multicast lists are loaded. Finally
1183 *	the device is moved into the up state and a %NETDEV_UP message is
1184 *	sent to the netdev notifier chain.
1185 *
1186 *	Calling this function on an active interface is a nop. On a failure
1187 *	a negative errno code is returned.
1188 */
1189int dev_open(struct net_device *dev)
1190{
1191	int ret;
1192
1193	/*
1194	 *	Is it already up?
1195	 */
1196	if (dev->flags & IFF_UP)
1197		return 0;
1198
1199	/*
1200	 *	Open device
1201	 */
1202	ret = __dev_open(dev);
1203	if (ret < 0)
1204		return ret;
1205
1206	/*
1207	 *	... and announce new interface.
1208	 */
1209	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1210	call_netdevice_notifiers(NETDEV_UP, dev);
1211
1212	return ret;
1213}
1214EXPORT_SYMBOL(dev_open);
1215
1216static int __dev_close(struct net_device *dev)
1217{
1218	const struct net_device_ops *ops = dev->netdev_ops;
1219
1220	ASSERT_RTNL();
1221	might_sleep();
1222
1223	/*
1224	 *	Tell people we are going down, so that they can
1225	 *	prepare to death, when device is still operating.
1226	 */
1227	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1228
1229	clear_bit(__LINK_STATE_START, &dev->state);
1230
1231	/* Synchronize to scheduled poll. We cannot touch poll list,
1232	 * it can be even on different cpu. So just clear netif_running().
1233	 *
1234	 * dev->stop() will invoke napi_disable() on all of it's
1235	 * napi_struct instances on this device.
1236	 */
1237	smp_mb__after_clear_bit(); /* Commit netif_running(). */
1238
1239	dev_deactivate(dev);
1240
1241	/*
1242	 *	Call the device specific close. This cannot fail.
1243	 *	Only if device is UP
1244	 *
1245	 *	We allow it to be called even after a DETACH hot-plug
1246	 *	event.
1247	 */
1248	if (ops->ndo_stop)
1249		ops->ndo_stop(dev);
1250
1251	/*
1252	 *	Device is now down.
1253	 */
1254
1255	dev->flags &= ~IFF_UP;
1256
1257	/*
1258	 *	Shutdown NET_DMA
1259	 */
1260	net_dmaengine_put();
1261
1262	return 0;
1263}
1264
1265/**
1266 *	dev_close - shutdown an interface.
1267 *	@dev: device to shutdown
1268 *
1269 *	This function moves an active device into down state. A
1270 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1271 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1272 *	chain.
1273 */
1274int dev_close(struct net_device *dev)
1275{
1276	if (!(dev->flags & IFF_UP))
1277		return 0;
1278
1279	__dev_close(dev);
1280
1281	/*
1282	 * Tell people we are down
1283	 */
1284	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1285	call_netdevice_notifiers(NETDEV_DOWN, dev);
1286
1287	return 0;
1288}
1289EXPORT_SYMBOL(dev_close);
1290
1291
1292/**
1293 *	dev_disable_lro - disable Large Receive Offload on a device
1294 *	@dev: device
1295 *
1296 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1297 *	called under RTNL.  This is needed if received packets may be
1298 *	forwarded to another interface.
1299 */
1300void dev_disable_lro(struct net_device *dev)
1301{
1302	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1303	    dev->ethtool_ops->set_flags) {
1304		u32 flags = dev->ethtool_ops->get_flags(dev);
1305		if (flags & ETH_FLAG_LRO) {
1306			flags &= ~ETH_FLAG_LRO;
1307			dev->ethtool_ops->set_flags(dev, flags);
1308		}
1309	}
1310	WARN_ON(dev->features & NETIF_F_LRO);
1311}
1312EXPORT_SYMBOL(dev_disable_lro);
1313
1314
1315static int dev_boot_phase = 1;
1316
1317/*
1318 *	Device change register/unregister. These are not inline or static
1319 *	as we export them to the world.
1320 */
1321
1322/**
1323 *	register_netdevice_notifier - register a network notifier block
1324 *	@nb: notifier
1325 *
1326 *	Register a notifier to be called when network device events occur.
1327 *	The notifier passed is linked into the kernel structures and must
1328 *	not be reused until it has been unregistered. A negative errno code
1329 *	is returned on a failure.
1330 *
1331 * 	When registered all registration and up events are replayed
1332 *	to the new notifier to allow device to have a race free
1333 *	view of the network device list.
1334 */
1335
1336int register_netdevice_notifier(struct notifier_block *nb)
1337{
1338	struct net_device *dev;
1339	struct net_device *last;
1340	struct net *net;
1341	int err;
1342
1343	rtnl_lock();
1344	err = raw_notifier_chain_register(&netdev_chain, nb);
1345	if (err)
1346		goto unlock;
1347	if (dev_boot_phase)
1348		goto unlock;
1349	for_each_net(net) {
1350		for_each_netdev(net, dev) {
1351			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1352			err = notifier_to_errno(err);
1353			if (err)
1354				goto rollback;
1355
1356			if (!(dev->flags & IFF_UP))
1357				continue;
1358
1359			nb->notifier_call(nb, NETDEV_UP, dev);
1360		}
1361	}
1362
1363unlock:
1364	rtnl_unlock();
1365	return err;
1366
1367rollback:
1368	last = dev;
1369	for_each_net(net) {
1370		for_each_netdev(net, dev) {
1371			if (dev == last)
1372				break;
1373
1374			if (dev->flags & IFF_UP) {
1375				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1376				nb->notifier_call(nb, NETDEV_DOWN, dev);
1377			}
1378			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1379			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1380		}
1381	}
1382
1383	raw_notifier_chain_unregister(&netdev_chain, nb);
1384	goto unlock;
1385}
1386EXPORT_SYMBOL(register_netdevice_notifier);
1387
1388/**
1389 *	unregister_netdevice_notifier - unregister a network notifier block
1390 *	@nb: notifier
1391 *
1392 *	Unregister a notifier previously registered by
1393 *	register_netdevice_notifier(). The notifier is unlinked into the
1394 *	kernel structures and may then be reused. A negative errno code
1395 *	is returned on a failure.
1396 */
1397
1398int unregister_netdevice_notifier(struct notifier_block *nb)
1399{
1400	int err;
1401
1402	rtnl_lock();
1403	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1404	rtnl_unlock();
1405	return err;
1406}
1407EXPORT_SYMBOL(unregister_netdevice_notifier);
1408
1409/**
1410 *	call_netdevice_notifiers - call all network notifier blocks
1411 *      @val: value passed unmodified to notifier function
1412 *      @dev: net_device pointer passed unmodified to notifier function
1413 *
1414 *	Call all network notifier blocks.  Parameters and return value
1415 *	are as for raw_notifier_call_chain().
1416 */
1417
1418int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1419{
1420	return raw_notifier_call_chain(&netdev_chain, val, dev);
1421}
1422
1423/* When > 0 there are consumers of rx skb time stamps */
1424static atomic_t netstamp_needed = ATOMIC_INIT(0);
1425
1426void net_enable_timestamp(void)
1427{
1428	atomic_inc(&netstamp_needed);
1429}
1430EXPORT_SYMBOL(net_enable_timestamp);
1431
1432void net_disable_timestamp(void)
1433{
1434	atomic_dec(&netstamp_needed);
1435}
1436EXPORT_SYMBOL(net_disable_timestamp);
1437
1438static inline void net_timestamp(struct sk_buff *skb)
1439{
1440	if (atomic_read(&netstamp_needed))
1441		__net_timestamp(skb);
1442	else
1443		skb->tstamp.tv64 = 0;
1444}
1445
1446/**
1447 * dev_forward_skb - loopback an skb to another netif
1448 *
1449 * @dev: destination network device
1450 * @skb: buffer to forward
1451 *
1452 * return values:
1453 *	NET_RX_SUCCESS	(no congestion)
1454 *	NET_RX_DROP     (packet was dropped, but freed)
1455 *
1456 * dev_forward_skb can be used for injecting an skb from the
1457 * start_xmit function of one device into the receive queue
1458 * of another device.
1459 *
1460 * The receiving device may be in another namespace, so
1461 * we have to clear all information in the skb that could
1462 * impact namespace isolation.
1463 */
1464int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1465{
1466	skb_orphan(skb);
1467
1468	if (!(dev->flags & IFF_UP) ||
1469	    (skb->len > (dev->mtu + dev->hard_header_len))) {
1470		kfree_skb(skb);
1471		return NET_RX_DROP;
1472	}
1473	skb_set_dev(skb, dev);
1474	skb->tstamp.tv64 = 0;
1475	skb->pkt_type = PACKET_HOST;
1476	skb->protocol = eth_type_trans(skb, dev);
1477	return netif_rx(skb);
1478}
1479EXPORT_SYMBOL_GPL(dev_forward_skb);
1480
1481/*
1482 *	Support routine. Sends outgoing frames to any network
1483 *	taps currently in use.
1484 */
1485
1486static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1487{
1488	struct packet_type *ptype;
1489
1490#ifdef CONFIG_NET_CLS_ACT
1491	if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
1492		net_timestamp(skb);
1493#else
1494	net_timestamp(skb);
1495#endif
1496
1497	rcu_read_lock();
1498	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1499		/* Never send packets back to the socket
1500		 * they originated from - MvS (miquels@drinkel.ow.org)
1501		 */
1502		if ((ptype->dev == dev || !ptype->dev) &&
1503		    (ptype->af_packet_priv == NULL ||
1504		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1505			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1506			if (!skb2)
1507				break;
1508
1509			/* skb->nh should be correctly
1510			   set by sender, so that the second statement is
1511			   just protection against buggy protocols.
1512			 */
1513			skb_reset_mac_header(skb2);
1514
1515			if (skb_network_header(skb2) < skb2->data ||
1516			    skb2->network_header > skb2->tail) {
1517				if (net_ratelimit())
1518					printk(KERN_CRIT "protocol %04x is "
1519					       "buggy, dev %s\n",
1520					       skb2->protocol, dev->name);
1521				skb_reset_network_header(skb2);
1522			}
1523
1524			skb2->transport_header = skb2->network_header;
1525			skb2->pkt_type = PACKET_OUTGOING;
1526			ptype->func(skb2, skb->dev, ptype, skb->dev);
1527		}
1528	}
1529	rcu_read_unlock();
1530}
1531
1532
1533static inline void __netif_reschedule(struct Qdisc *q)
1534{
1535	struct softnet_data *sd;
1536	unsigned long flags;
1537
1538	local_irq_save(flags);
1539	sd = &__get_cpu_var(softnet_data);
1540	q->next_sched = sd->output_queue;
1541	sd->output_queue = q;
1542	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1543	local_irq_restore(flags);
1544}
1545
1546void __netif_schedule(struct Qdisc *q)
1547{
1548	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1549		__netif_reschedule(q);
1550}
1551EXPORT_SYMBOL(__netif_schedule);
1552
1553void dev_kfree_skb_irq(struct sk_buff *skb)
1554{
1555	if (atomic_dec_and_test(&skb->users)) {
1556		struct softnet_data *sd;
1557		unsigned long flags;
1558
1559		local_irq_save(flags);
1560		sd = &__get_cpu_var(softnet_data);
1561		skb->next = sd->completion_queue;
1562		sd->completion_queue = skb;
1563		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1564		local_irq_restore(flags);
1565	}
1566}
1567EXPORT_SYMBOL(dev_kfree_skb_irq);
1568
/*
 * Free @skb from any context: uses dev_kfree_skb_irq() in hard
 * interrupt context or with interrupts disabled, dev_kfree_skb()
 * otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (in_irq() || irqs_disabled()) {
		dev_kfree_skb_irq(skb);
		return;
	}

	dev_kfree_skb(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1577
1578
1579/**
1580 * netif_device_detach - mark device as removed
1581 * @dev: network device
1582 *
1583 * Mark device as removed from system and therefore no longer available.
1584 */
1585void netif_device_detach(struct net_device *dev)
1586{
1587	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1588	    netif_running(dev)) {
1589		netif_tx_stop_all_queues(dev);
1590	}
1591}
1592EXPORT_SYMBOL(netif_device_detach);
1593
1594/**
1595 * netif_device_attach - mark device as attached
1596 * @dev: network device
1597 *
1598 * Mark device as attached from system and restart if needed.
1599 */
1600void netif_device_attach(struct net_device *dev)
1601{
1602	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1603	    netif_running(dev)) {
1604		netif_tx_wake_all_queues(dev);
1605		__netdev_watchdog_up(dev);
1606	}
1607}
1608EXPORT_SYMBOL(netif_device_attach);
1609
1610static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1611{
1612	return ((features & NETIF_F_GEN_CSUM) ||
1613		((features & NETIF_F_IP_CSUM) &&
1614		 protocol == htons(ETH_P_IP)) ||
1615		((features & NETIF_F_IPV6_CSUM) &&
1616		 protocol == htons(ETH_P_IPV6)) ||
1617		((features & NETIF_F_FCOE_CRC) &&
1618		 protocol == htons(ETH_P_FCOE)));
1619}
1620
1621static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1622{
1623	if (can_checksum_protocol(dev->features, skb->protocol))
1624		return true;
1625
1626	if (skb->protocol == htons(ETH_P_8021Q)) {
1627		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1628		if (can_checksum_protocol(dev->features & dev->vlan_features,
1629					  veh->h_vlan_encapsulated_proto))
1630			return true;
1631	}
1632
1633	return false;
1634}
1635
/**
 * skb_set_dev -- assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * The dst reference of the skb is always dropped.  If the skb is
 * already owned by a device in a different network namespace, all
 * data private to that namespace (secpath, netfilter state, secmark,
 * mark, priority, trace/ipvs bits, tc index) is reset before the new
 * device is assigned.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
	struct net_device *prev;

	skb_dst_drop(skb);

	prev = skb->dev;
	if (prev && !net_eq(dev_net(prev), dev_net(dev))) {
		/* Crossing namespaces: scrub per-namespace state. */
		secpath_reset(skb);
		nf_reset(skb);
		skb_init_secmark(skb);
		skb->mark = 0;
		skb->priority = 0;
		skb->nf_trace = 0;
		skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
		skb->tc_index = 0;
#endif
	}

	skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */
1665
1666/*
1667 * Invalidate hardware checksum when packet is to be mangled, and
1668 * complete checksum manually on outgoing path.
1669 */
1670int skb_checksum_help(struct sk_buff *skb)
1671{
1672	__wsum csum;
1673	int ret = 0, offset;
1674
1675	if (skb->ip_summed == CHECKSUM_COMPLETE)
1676		goto out_set_summed;
1677
1678	if (unlikely(skb_shinfo(skb)->gso_size)) {
1679		/* Let GSO fix up the checksum. */
1680		goto out_set_summed;
1681	}
1682
1683	offset = skb->csum_start - skb_headroom(skb);
1684	BUG_ON(offset >= skb_headlen(skb));
1685	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1686
1687	offset += skb->csum_offset;
1688	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1689
1690	if (skb_cloned(skb) &&
1691	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1692		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1693		if (ret)
1694			goto out;
1695	}
1696
1697	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1698out_set_summed:
1699	skb->ip_summed = CHECKSUM_NONE;
1700out:
1701	return ret;
1702}
1703EXPORT_SYMBOL(skb_checksum_help);
1704
1705/**
1706 *	skb_gso_segment - Perform segmentation on skb.
1707 *	@skb: buffer to segment
1708 *	@features: features for the output path (see dev->features)
1709 *
1710 *	This function segments the given skb and returns a list of segments.
1711 *
1712 *	It may return NULL if the skb requires no segmentation.  This is
1713 *	only possible when GSO is used for verifying header integrity.
1714 */
1715struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1716{
1717	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1718	struct packet_type *ptype;
1719	__be16 type = skb->protocol;
1720	int err;
1721
1722	skb_reset_mac_header(skb);
1723	skb->mac_len = skb->network_header - skb->mac_header;
1724	__skb_pull(skb, skb->mac_len);
1725
1726	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1727		struct net_device *dev = skb->dev;
1728		struct ethtool_drvinfo info = {};
1729
1730		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1731			dev->ethtool_ops->get_drvinfo(dev, &info);
1732
1733		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1734			"ip_summed=%d",
1735		     info.driver, dev ? dev->features : 0L,
1736		     skb->sk ? skb->sk->sk_route_caps : 0L,
1737		     skb->len, skb->data_len, skb->ip_summed);
1738
1739		if (skb_header_cloned(skb) &&
1740		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1741			return ERR_PTR(err);
1742	}
1743
1744	rcu_read_lock();
1745	list_for_each_entry_rcu(ptype,
1746			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1747		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1748			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1749				err = ptype->gso_send_check(skb);
1750				segs = ERR_PTR(err);
1751				if (err || skb_gso_ok(skb, features))
1752					break;
1753				__skb_push(skb, (skb->data -
1754						 skb_network_header(skb)));
1755			}
1756			segs = ptype->gso_segment(skb, features);
1757			break;
1758		}
1759	}
1760	rcu_read_unlock();
1761
1762	__skb_push(skb, skb->data - skb_mac_header(skb));
1763
1764	return segs;
1765}
1766EXPORT_SYMBOL(skb_gso_segment);
1767
/*
 * Take action when hardware reception checksum errors are detected:
 * log a rate-limited error naming the device (or "<unknown>" when @dev
 * is NULL) and dump the stack.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	printk(KERN_ERR "%s: hw csum failure.\n",
		dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1780
/*
 * Return 1 if @skb has a page fragment in high memory while @dev does
 * not advertise NETIF_F_HIGHDMA, 0 otherwise.  Always 0 without
 * CONFIG_HIGHMEM.
 *
 * Actually, this check should be eliminated as soon as we know that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */

static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int i;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			if (PageHighMem(skb_shinfo(skb)->frags[i].page))
				return 1;
		}
	}

#endif
	return 0;
}
1801
1802struct dev_gso_cb {
1803	void (*destructor)(struct sk_buff *skb);
1804};
1805
1806#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1807
1808static void dev_gso_skb_destructor(struct sk_buff *skb)
1809{
1810	struct dev_gso_cb *cb;
1811
1812	do {
1813		struct sk_buff *nskb = skb->next;
1814
1815		skb->next = nskb->next;
1816		nskb->next = NULL;
1817		kfree_skb(nskb);
1818	} while (skb->next);
1819
1820	cb = DEV_GSO_CB(skb);
1821	if (cb->destructor)
1822		cb->destructor(skb);
1823}
1824
1825/**
1826 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1827 *	@skb: buffer to segment
1828 *
1829 *	This function segments the given skb and stores the list of segments
1830 *	in skb->next.
1831 */
1832static int dev_gso_segment(struct sk_buff *skb)
1833{
1834	struct net_device *dev = skb->dev;
1835	struct sk_buff *segs;
1836	int features = dev->features & ~(illegal_highdma(dev, skb) ?
1837					 NETIF_F_SG : 0);
1838
1839	segs = skb_gso_segment(skb, features);
1840
1841	/* Verifying header integrity only. */
1842	if (!segs)
1843		return 0;
1844
1845	if (IS_ERR(segs))
1846		return PTR_ERR(segs);
1847
1848	skb->next = segs;
1849	DEV_GSO_CB(skb)->destructor = skb->destructor;
1850	skb->destructor = dev_gso_skb_destructor;
1851
1852	return 0;
1853}
1854
1855int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1856			struct netdev_queue *txq)
1857{
1858	const struct net_device_ops *ops = dev->netdev_ops;
1859	int rc = NETDEV_TX_OK;
1860
1861	if (likely(!skb->next)) {
1862		if (!list_empty(&ptype_all))
1863			dev_queue_xmit_nit(skb, dev);
1864
1865		if (netif_needs_gso(dev, skb)) {
1866			if (unlikely(dev_gso_segment(skb)))
1867				goto out_kfree_skb;
1868			if (skb->next)
1869				goto gso;
1870		}
1871
1872		/*
1873		 * If device doesnt need skb->dst, release it right now while
1874		 * its hot in this cpu cache
1875		 */
1876		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1877			skb_dst_drop(skb);
1878
1879		rc = ops->ndo_start_xmit(skb, dev);
1880		if (rc == NETDEV_TX_OK)
1881			txq_trans_update(txq);
1882		/*
1883		 * TODO: if skb_orphan() was called by
1884		 * dev->hard_start_xmit() (for example, the unmodified
1885		 * igb driver does that; bnx2 doesn't), then
1886		 * skb_tx_software_timestamp() will be unable to send
1887		 * back the time stamp.
1888		 *
1889		 * How can this be prevented? Always create another
1890		 * reference to the socket before calling
1891		 * dev->hard_start_xmit()? Prevent that skb_orphan()
1892		 * does anything in dev->hard_start_xmit() by clearing
1893		 * the skb destructor before the call and restoring it
1894		 * afterwards, then doing the skb_orphan() ourselves?
1895		 */
1896		return rc;
1897	}
1898
1899gso:
1900	do {
1901		struct sk_buff *nskb = skb->next;
1902
1903		skb->next = nskb->next;
1904		nskb->next = NULL;
1905
1906		/*
1907		 * If device doesnt need nskb->dst, release it right now while
1908		 * its hot in this cpu cache
1909		 */
1910		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1911			skb_dst_drop(nskb);
1912
1913		rc = ops->ndo_start_xmit(nskb, dev);
1914		if (unlikely(rc != NETDEV_TX_OK)) {
1915			if (rc & ~NETDEV_TX_MASK)
1916				goto out_kfree_gso_skb;
1917			nskb->next = skb->next;
1918			skb->next = nskb;
1919			return rc;
1920		}
1921		txq_trans_update(txq);
1922		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
1923			return NETDEV_TX_BUSY;
1924	} while (skb->next);
1925
1926out_kfree_gso_skb:
1927	if (likely(skb->next == NULL))
1928		skb->destructor = DEV_GSO_CB(skb)->destructor;
1929out_kfree_skb:
1930	kfree_skb(skb);
1931	return rc;
1932}
1933
1934static u32 skb_tx_hashrnd;
1935
1936u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
1937{
1938	u32 hash;
1939
1940	if (skb_rx_queue_recorded(skb)) {
1941		hash = skb_get_rx_queue(skb);
1942		while (unlikely(hash >= dev->real_num_tx_queues))
1943			hash -= dev->real_num_tx_queues;
1944		return hash;
1945	}
1946
1947	if (skb->sk && skb->sk->sk_hash)
1948		hash = skb->sk->sk_hash;
1949	else
1950		hash = skb->protocol;
1951
1952	hash = jhash_1word(hash, skb_tx_hashrnd);
1953
1954	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
1955}
1956EXPORT_SYMBOL(skb_tx_hash);
1957
1958static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
1959{
1960	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
1961		if (net_ratelimit()) {
1962			WARN(1, "%s selects TX queue %d, but "
1963			     "real number of TX queues is %d\n",
1964			     dev->name, queue_index,
1965			     dev->real_num_tx_queues);
1966		}
1967		return 0;
1968	}
1969	return queue_index;
1970}
1971
1972static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1973					struct sk_buff *skb)
1974{
1975	u16 queue_index;
1976	struct sock *sk = skb->sk;
1977
1978	if (sk_tx_queue_recorded(sk)) {
1979		queue_index = sk_tx_queue_get(sk);
1980	} else {
1981		const struct net_device_ops *ops = dev->netdev_ops;
1982
1983		if (ops->ndo_select_queue) {
1984			queue_index = ops->ndo_select_queue(dev, skb);
1985			queue_index = dev_cap_txqueue(dev, queue_index);
1986		} else {
1987			queue_index = 0;
1988			if (dev->real_num_tx_queues > 1)
1989				queue_index = skb_tx_hash(dev, skb);
1990
1991			if (sk) {
1992				struct dst_entry *dst = rcu_dereference_bh(sk->sk_dst_cache);
1993
1994				if (dst && skb_dst(skb) == dst)
1995					sk_tx_queue_set(sk, queue_index);
1996			}
1997		}
1998	}
1999
2000	skb_set_queue_mapping(skb, queue_index);
2001	return netdev_get_tx_queue(dev, queue_index);
2002}
2003
2004static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2005				 struct net_device *dev,
2006				 struct netdev_queue *txq)
2007{
2008	spinlock_t *root_lock = qdisc_lock(q);
2009	int rc;
2010
2011	spin_lock(root_lock);
2012	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2013		kfree_skb(skb);
2014		rc = NET_XMIT_DROP;
2015	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2016		   !test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) {
2017		/*
2018		 * This is a work-conserving queue; there are no old skbs
2019		 * waiting to be sent out; and the qdisc is not running -
2020		 * xmit the skb directly.
2021		 */
2022		__qdisc_update_bstats(q, skb->len);
2023		if (sch_direct_xmit(skb, q, dev, txq, root_lock))
2024			__qdisc_run(q);
2025		else
2026			clear_bit(__QDISC_STATE_RUNNING, &q->state);
2027
2028		rc = NET_XMIT_SUCCESS;
2029	} else {
2030		rc = qdisc_enqueue_root(skb, q);
2031		qdisc_run(q);
2032	}
2033	spin_unlock(root_lock);
2034
2035	return rc;
2036}
2037
2038/*
2039 * Returns true if either:
2040 *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2041 *	2. skb is fragmented and the device does not support SG, or if
2042 *	   at least one of fragments is in highmem and device does not
2043 *	   support DMA from it.
2044 */
2045static inline int skb_needs_linearize(struct sk_buff *skb,
2046				      struct net_device *dev)
2047{
2048	return (skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
2049	       (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
2050					      illegal_highdma(dev, skb)));
2051}
2052
2053/**
2054 *	dev_queue_xmit - transmit a buffer
2055 *	@skb: buffer to transmit
2056 *
2057 *	Queue a buffer for transmission to a network device. The caller must
2058 *	have set the device and priority and built the buffer before calling
2059 *	this function. The function can be called from an interrupt.
2060 *
2061 *	A negative errno code is returned on a failure. A success does not
2062 *	guarantee the frame will be transmitted as it may be dropped due
2063 *	to congestion or traffic shaping.
2064 *
2065 * -----------------------------------------------------------------------------------
2066 *      I notice this method can also return errors from the queue disciplines,
2067 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2068 *      be positive.
2069 *
2070 *      Regardless of the return value, the skb is consumed, so it is currently
2071 *      difficult to retry a send to this method.  (You can bump the ref count
2072 *      before sending to hold a reference for retry if you are careful.)
2073 *
2074 *      When calling this method, interrupts MUST be enabled.  This is because
2075 *      the BH enable code must have IRQs enabled so that it will not deadlock.
2076 *          --BLG
2077 */
int dev_queue_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	/* Returned as-is when linearizing or checksumming fails below. */
	int rc = -ENOMEM;

	/* GSO will handle the following emulations directly. */
	if (netif_needs_gso(dev, skb))
		goto gso;

	/* Convert a paged skb to linear, if required */
	if (skb_needs_linearize(skb, dev) && __skb_linearize(skb))
		goto out_kfree_skb;

	/* If packet is not checksummed and device does not support
	 * checksumming for this protocol, complete checksumming here.
	 */
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		skb_set_transport_header(skb, skb->csum_start -
					      skb_headroom(skb));
		if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
			goto out_kfree_skb;
	}

gso:
	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	/* Pick the transmit queue and fetch its qdisc under RCU-bh. */
	txq = dev_pick_tx(dev, skb);
	q = rcu_dereference_bh(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
	/* A qdisc with an enqueue op takes the skb; its result is
	 * returned to the caller unchanged.
	 */
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all the sorts of tunnels...

	   Really, it is unlikely that netif_tx_lock protection is necessary
	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	   counters.)
	   However, it is possible, that they rely on protection
	   made by us here.

	   Check this and shot the lock. It is not prone from deadlocks.
	   Either shot noqueue qdisc, it is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		/* Only take the tx lock if this CPU does not already own it;
		 * owning it here means we recursed into ourselves.
		 */
		if (txq->xmit_lock_owner != cpu) {

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_tx_queue_stopped(txq)) {
				rc = dev_hard_start_xmit(skb, dev, txq);
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			if (net_ratelimit())
				printk(KERN_CRIT "Virtual device %s asks to "
				       "queue packet!\n", dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately */
			if (net_ratelimit())
				printk(KERN_CRIT "Dead loop on virtual device "
				       "%s, fix it urgently!\n", dev->name);
		}
	}

	/* Reached when the device is down, the queue is stopped, the
	 * transmit did not complete, or recursion was detected: the skb
	 * is freed and -ENETDOWN is reported.
	 */
	rc = -ENETDOWN;
	rcu_read_unlock_bh();

out_kfree_skb:
	kfree_skb(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
2169EXPORT_SYMBOL(dev_queue_xmit);
2170
2171
2172/*=======================================================================
2173			Receiver routines
2174  =======================================================================*/
2175
/* Limit on the per-CPU input_pkt_queue length: netif_rx() drops a packet
 * once the queue length exceeds this value.
 */
int netdev_max_backlog __read_mostly = 1000;
/* Initial packet budget for a single net_rx_action() run. */
int netdev_budget __read_mostly = 300;
/* process_backlog() copies this into the backlog napi's weight. */
int weight_p __read_mostly = 64;            /* old backlog weight */

/* Per-CPU receive counters (total, dropped, time_squeeze, ...). */
DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
2181
2182
2183/**
2184 *	netif_rx	-	post buffer to the network code
2185 *	@skb: buffer to post
2186 *
2187 *	This function receives a packet from a device driver and queues it for
2188 *	the upper (protocol) levels to process.  It always succeeds. The buffer
2189 *	may be dropped during processing for congestion control or by the
2190 *	protocol layers.
2191 *
2192 *	return values:
2193 *	NET_RX_SUCCESS	(no congestion)
2194 *	NET_RX_DROP     (packet was dropped)
2195 *
2196 */
2197
2198int netif_rx(struct sk_buff *skb)
2199{
2200	struct softnet_data *queue;
2201	unsigned long flags;
2202
2203	/* if netpoll wants it, pretend we never saw it */
2204	if (netpoll_rx(skb))
2205		return NET_RX_DROP;
2206
2207	if (!skb->tstamp.tv64)
2208		net_timestamp(skb);
2209
2210	/*
2211	 * The code is rearranged so that the path is the most
2212	 * short when CPU is congested, but is still operating.
2213	 */
2214	local_irq_save(flags);
2215	queue = &__get_cpu_var(softnet_data);
2216
2217	__get_cpu_var(netdev_rx_stat).total++;
2218	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
2219		if (queue->input_pkt_queue.qlen) {
2220enqueue:
2221			__skb_queue_tail(&queue->input_pkt_queue, skb);
2222			local_irq_restore(flags);
2223			return NET_RX_SUCCESS;
2224		}
2225
2226		napi_schedule(&queue->backlog);
2227		goto enqueue;
2228	}
2229
2230	__get_cpu_var(netdev_rx_stat).dropped++;
2231	local_irq_restore(flags);
2232
2233	kfree_skb(skb);
2234	return NET_RX_DROP;
2235}
2236EXPORT_SYMBOL(netif_rx);
2237
/*
 * Variant of netif_rx() that, with preemption disabled, also runs any
 * softirq left pending after the packet has been queued.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	preempt_disable();

	ret = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
2250EXPORT_SYMBOL(netif_rx_ni);
2251
/*
 * Transmit softirq handler.  Works on this CPU's softnet_data in two
 * steps: free the skbs parked on completion_queue, then run (or
 * reschedule) every qdisc chained on output_queue.
 */
static void net_tx_action(struct softirq_action *h)
{
	struct softnet_data *sd = &__get_cpu_var(softnet_data);

	if (sd->completion_queue) {
		struct sk_buff *clist;

		/* Detach the whole list with interrupts off, then free
		 * the entries with interrupts enabled again.
		 */
		local_irq_disable();
		clist = sd->completion_queue;
		sd->completion_queue = NULL;
		local_irq_enable();

		while (clist) {
			struct sk_buff *skb = clist;
			clist = clist->next;

			/* Entries here are expected to have no users left. */
			WARN_ON(atomic_read(&skb->users));
			__kfree_skb(skb);
		}
	}

	if (sd->output_queue) {
		struct Qdisc *head;

		/* Same detach-then-process pattern for scheduled qdiscs. */
		local_irq_disable();
		head = sd->output_queue;
		sd->output_queue = NULL;
		local_irq_enable();

		while (head) {
			struct Qdisc *q = head;
			spinlock_t *root_lock;

			head = head->next_sched;

			root_lock = qdisc_lock(q);
			if (spin_trylock(root_lock)) {
				/* Got the lock: clear the SCHED bit and
				 * run the qdisc.
				 */
				smp_mb__before_clear_bit();
				clear_bit(__QDISC_STATE_SCHED,
					  &q->state);
				qdisc_run(q);
				spin_unlock(root_lock);
			} else {
				/* Lock is busy: reschedule the qdisc unless
				 * it has been deactivated, in which case
				 * only the SCHED bit is cleared.
				 */
				if (!test_bit(__QDISC_STATE_DEACTIVATED,
					      &q->state)) {
					__netif_reschedule(q);
				} else {
					smp_mb__before_clear_bit();
					clear_bit(__QDISC_STATE_SCHED,
						  &q->state);
				}
			}
		}
	}
}
2307
2308static inline int deliver_skb(struct sk_buff *skb,
2309			      struct packet_type *pt_prev,
2310			      struct net_device *orig_dev)
2311{
2312	atomic_inc(&skb->users);
2313	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2314}
2315
#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)

#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
/* Hook pointer kept here on behalf of ATM LANE */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif

/*
 * Bridging hook, called when the receiving device has a bridge port.
 *  returns NULL if packet was consumed.
 */
struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
					struct sk_buff *skb) __read_mostly;
EXPORT_SYMBOL_GPL(br_handle_frame_hook);

/*
 * Pass @skb to the bridge hook when its device is a bridge port.
 * Loopback packets and non-port devices get the skb back untouched.
 * A pending *pt_prev handler is delivered to first and then cleared.
 */
static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
					    struct packet_type **pt_prev, int *ret,
					    struct net_device *orig_dev)
{
	struct net_bridge_port *port;

	if (skb->pkt_type == PACKET_LOOPBACK)
		return skb;

	port = rcu_dereference(skb->dev->br_port);
	if (port == NULL)
		return skb;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	return br_handle_frame_hook(port, skb);
}
#else
#define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
#endif
2353
#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);

/*
 * Pass @skb to the macvlan hook when its device has a macvlan port;
 * otherwise return the skb untouched.  A pending *pt_prev handler is
 * delivered to first and then cleared.
 */
static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
					     struct packet_type **pt_prev,
					     int *ret,
					     struct net_device *orig_dev)
{
	if (!skb->dev->macvlan_port)
		return skb;

	if (*pt_prev != NULL) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	return macvlan_handle_frame_hook(skb);
}
#else
#define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
#endif
2375
2376#ifdef CONFIG_NET_CLS_ACT
2377/* TODO: Maybe we should just force sch_ingress to be compiled in
2378 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2379 * a compare and 2 stores extra right now if we dont have it on
2380 * but have CONFIG_NET_CLS_ACT
2381 * NOTE: This doesnt stop any functionality; if you dont have
2382 * the ingress scheduler, you just cant add policies on ingress.
2383 *
2384 */
2385static int ing_filter(struct sk_buff *skb)
2386{
2387	struct net_device *dev = skb->dev;
2388	u32 ttl = G_TC_RTTL(skb->tc_verd);
2389	struct netdev_queue *rxq;
2390	int result = TC_ACT_OK;
2391	struct Qdisc *q;
2392
2393	if (MAX_RED_LOOP < ttl++) {
2394		printk(KERN_WARNING
2395		       "Redir loop detected Dropping packet (%d->%d)\n",
2396		       skb->skb_iif, dev->ifindex);
2397		return TC_ACT_SHOT;
2398	}
2399
2400	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2401	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2402
2403	rxq = &dev->rx_queue;
2404
2405	q = rxq->qdisc;
2406	if (q != &noop_qdisc) {
2407		spin_lock(qdisc_lock(q));
2408		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2409			result = qdisc_enqueue_root(skb, q);
2410		spin_unlock(qdisc_lock(q));
2411	}
2412
2413	return result;
2414}
2415
2416static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2417					 struct packet_type **pt_prev,
2418					 int *ret, struct net_device *orig_dev)
2419{
2420	if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2421		goto out;
2422
2423	if (*pt_prev) {
2424		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2425		*pt_prev = NULL;
2426	} else {
2427		/* Huh? Why does turning on AF_PACKET affect this? */
2428		skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2429	}
2430
2431	switch (ing_filter(skb)) {
2432	case TC_ACT_SHOT:
2433	case TC_ACT_STOLEN:
2434		kfree_skb(skb);
2435		return NULL;
2436	}
2437
2438out:
2439	skb->tc_verd = 0;
2440	return skb;
2441}
2442#endif
2443
2444/*
2445 * 	netif_nit_deliver - deliver received packets to network taps
2446 * 	@skb: buffer
2447 *
2448 * 	This function is used to deliver incoming packets to network
2449 * 	taps. It should be used when the normal netif_receive_skb path
2450 * 	is bypassed, for example because of VLAN acceleration.
2451 */
2452void netif_nit_deliver(struct sk_buff *skb)
2453{
2454	struct packet_type *ptype;
2455
2456	if (list_empty(&ptype_all))
2457		return;
2458
2459	skb_reset_network_header(skb);
2460	skb_reset_transport_header(skb);
2461	skb->mac_len = skb->network_header - skb->mac_header;
2462
2463	rcu_read_lock();
2464	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2465		if (!ptype->dev || ptype->dev == skb->dev)
2466			deliver_skb(skb, ptype, skb->dev);
2467	}
2468	rcu_read_unlock();
2469}
2470
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	netif_receive_skb() is the main receive data processing function.
 *	It always succeeds. The buffer may be dropped during processing
 *	for congestion control or by the protocol layers.
 *
 *	The packet is offered, in order, to the ptype_all taps, the
 *	ingress classifier (CONFIG_NET_CLS_ACT), the bridge and macvlan
 *	hooks, and finally the protocol handlers registered in ptype_base
 *	for skb->protocol.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	struct packet_type *ptype, *pt_prev;
	struct net_device *orig_dev;
	struct net_device *master;
	struct net_device *null_or_orig;
	struct net_device *null_or_bond;
	int ret = NET_RX_DROP;
	__be16 type;

	if (!skb->tstamp.tv64)
		net_timestamp(skb);

	if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
		return NET_RX_SUCCESS;

	/* if we've gotten here through NAPI, check netpoll */
	if (netpoll_receive_skb(skb))
		return NET_RX_DROP;

	/* Record the incoming interface unless already set. */
	if (!skb->skb_iif)
		skb->skb_iif = skb->dev->ifindex;

	/* If the receiving device has a master, either re-home the skb
	 * onto the master or, when the bonding check says to drop,
	 * restrict delivery to handlers bound to the original device.
	 */
	null_or_orig = NULL;
	orig_dev = skb->dev;
	master = ACCESS_ONCE(orig_dev->master);
	if (master) {
		if (skb_bond_should_drop(skb, master))
			null_or_orig = orig_dev; /* deliver only exact match */
		else
			skb->dev = master;
	}

	__get_cpu_var(netdev_rx_stat).total++;

	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;

	/* pt_prev holds the last matching handler; delivery to it is
	 * deferred until the next match (or the end) so the final handler
	 * can be called without taking an extra skb reference.
	 */
	pt_prev = NULL;

	rcu_read_lock();

#ifdef CONFIG_NET_CLS_ACT
	/* TC_NCLS set: clear the flag and skip taps and ingress handling. */
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	/* Taps registered for all protocols. */
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
		    ptype->dev == orig_dev) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

#ifdef CONFIG_NET_CLS_ACT
	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
ncls:
#endif

	/* Either hook may consume the packet, signalled by a NULL skb. */
	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;

	/*
	 * Make sure frames received on VLAN interfaces stacked on
	 * bonding interfaces still make their way to any base bonding
	 * device that may have registered for a specific ptype.  The
	 * handler may have to adjust skb->dev and orig_dev.
	 */
	null_or_bond = NULL;
	if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
	    (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
		null_or_bond = vlan_dev_real_dev(skb->dev);
	}

	/* Protocol-specific handlers for this packet type. */
	type = skb->protocol;
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type && (ptype->dev == null_or_orig ||
		     ptype->dev == skb->dev || ptype->dev == orig_dev ||
		     ptype->dev == null_or_bond)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	/* Final delivery goes straight to the handler without the extra
	 * reference deliver_skb() would take; with no handler at all the
	 * skb is freed here.
	 */
	if (pt_prev) {
		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

out:
	rcu_read_unlock();
	return ret;
}
2597EXPORT_SYMBOL(netif_receive_skb);
2598
2599/* Network device is going away, flush any packets still pending  */
2600static void flush_backlog(void *arg)
2601{
2602	struct net_device *dev = arg;
2603	struct softnet_data *queue = &__get_cpu_var(softnet_data);
2604	struct sk_buff *skb, *tmp;
2605
2606	skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
2607		if (skb->dev == dev) {
2608			__skb_unlink(skb, &queue->input_pkt_queue);
2609			kfree_skb(skb);
2610		}
2611}
2612
2613static int napi_gro_complete(struct sk_buff *skb)
2614{
2615	struct packet_type *ptype;
2616	__be16 type = skb->protocol;
2617	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2618	int err = -ENOENT;
2619
2620	if (NAPI_GRO_CB(skb)->count == 1) {
2621		skb_shinfo(skb)->gso_size = 0;
2622		goto out;
2623	}
2624
2625	rcu_read_lock();
2626	list_for_each_entry_rcu(ptype, head, list) {
2627		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
2628			continue;
2629
2630		err = ptype->gro_complete(skb);
2631		break;
2632	}
2633	rcu_read_unlock();
2634
2635	if (err) {
2636		WARN_ON(&ptype->list == head);
2637		kfree_skb(skb);
2638		return NET_RX_SUCCESS;
2639	}
2640
2641out:
2642	return netif_receive_skb(skb);
2643}
2644
2645static void napi_gro_flush(struct napi_struct *napi)
2646{
2647	struct sk_buff *skb, *next;
2648
2649	for (skb = napi->gro_list; skb; skb = next) {
2650		next = skb->next;
2651		skb->next = NULL;
2652		napi_gro_complete(skb);
2653	}
2654
2655	napi->gro_count = 0;
2656	napi->gro_list = NULL;
2657}
2658
2659enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2660{
2661	struct sk_buff **pp = NULL;
2662	struct packet_type *ptype;
2663	__be16 type = skb->protocol;
2664	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2665	int same_flow;
2666	int mac_len;
2667	enum gro_result ret;
2668
2669	if (!(skb->dev->features & NETIF_F_GRO))
2670		goto normal;
2671
2672	if (skb_is_gso(skb) || skb_has_frags(skb))
2673		goto normal;
2674
2675	rcu_read_lock();
2676	list_for_each_entry_rcu(ptype, head, list) {
2677		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
2678			continue;
2679
2680		skb_set_network_header(skb, skb_gro_offset(skb));
2681		mac_len = skb->network_header - skb->mac_header;
2682		skb->mac_len = mac_len;
2683		NAPI_GRO_CB(skb)->same_flow = 0;
2684		NAPI_GRO_CB(skb)->flush = 0;
2685		NAPI_GRO_CB(skb)->free = 0;
2686
2687		pp = ptype->gro_receive(&napi->gro_list, skb);
2688		break;
2689	}
2690	rcu_read_unlock();
2691
2692	if (&ptype->list == head)
2693		goto normal;
2694
2695	same_flow = NAPI_GRO_CB(skb)->same_flow;
2696	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
2697
2698	if (pp) {
2699		struct sk_buff *nskb = *pp;
2700
2701		*pp = nskb->next;
2702		nskb->next = NULL;
2703		napi_gro_complete(nskb);
2704		napi->gro_count--;
2705	}
2706
2707	if (same_flow)
2708		goto ok;
2709
2710	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
2711		goto normal;
2712
2713	napi->gro_count++;
2714	NAPI_GRO_CB(skb)->count = 1;
2715	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
2716	skb->next = napi->gro_list;
2717	napi->gro_list = skb;
2718	ret = GRO_HELD;
2719
2720pull:
2721	if (skb_headlen(skb) < skb_gro_offset(skb)) {
2722		int grow = skb_gro_offset(skb) - skb_headlen(skb);
2723
2724		BUG_ON(skb->end - skb->tail < grow);
2725
2726		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
2727
2728		skb->tail += grow;
2729		skb->data_len -= grow;
2730
2731		skb_shinfo(skb)->frags[0].page_offset += grow;
2732		skb_shinfo(skb)->frags[0].size -= grow;
2733
2734		if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
2735			put_page(skb_shinfo(skb)->frags[0].page);
2736			memmove(skb_shinfo(skb)->frags,
2737				skb_shinfo(skb)->frags + 1,
2738				--skb_shinfo(skb)->nr_frags);
2739		}
2740	}
2741
2742ok:
2743	return ret;
2744
2745normal:
2746	ret = GRO_NORMAL;
2747	goto pull;
2748}
2749EXPORT_SYMBOL(dev_gro_receive);
2750
2751static gro_result_t
2752__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2753{
2754	struct sk_buff *p;
2755
2756	if (netpoll_rx_on(skb))
2757		return GRO_NORMAL;
2758
2759	for (p = napi->gro_list; p; p = p->next) {
2760		NAPI_GRO_CB(p)->same_flow =
2761			(p->dev == skb->dev) &&
2762			!compare_ether_header(skb_mac_header(p),
2763					      skb_gro_mac_header(skb));
2764		NAPI_GRO_CB(p)->flush = 0;
2765	}
2766
2767	return dev_gro_receive(napi, skb);
2768}
2769
2770gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
2771{
2772	switch (ret) {
2773	case GRO_NORMAL:
2774		if (netif_receive_skb(skb))
2775			ret = GRO_DROP;
2776		break;
2777
2778	case GRO_DROP:
2779	case GRO_MERGED_FREE:
2780		kfree_skb(skb);
2781		break;
2782
2783	case GRO_HELD:
2784	case GRO_MERGED:
2785		break;
2786	}
2787
2788	return ret;
2789}
2790EXPORT_SYMBOL(napi_skb_finish);
2791
2792void skb_gro_reset_offset(struct sk_buff *skb)
2793{
2794	NAPI_GRO_CB(skb)->data_offset = 0;
2795	NAPI_GRO_CB(skb)->frag0 = NULL;
2796	NAPI_GRO_CB(skb)->frag0_len = 0;
2797
2798	if (skb->mac_header == skb->tail &&
2799	    !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
2800		NAPI_GRO_CB(skb)->frag0 =
2801			page_address(skb_shinfo(skb)->frags[0].page) +
2802			skb_shinfo(skb)->frags[0].page_offset;
2803		NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
2804	}
2805}
2806EXPORT_SYMBOL(skb_gro_reset_offset);
2807
2808gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2809{
2810	skb_gro_reset_offset(skb);
2811
2812	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
2813}
2814EXPORT_SYMBOL(napi_gro_receive);
2815
2816void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
2817{
2818	__skb_pull(skb, skb_headlen(skb));
2819	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
2820
2821	napi->skb = skb;
2822}
2823EXPORT_SYMBOL(napi_reuse_skb);
2824
2825struct sk_buff *napi_get_frags(struct napi_struct *napi)
2826{
2827	struct sk_buff *skb = napi->skb;
2828
2829	if (!skb) {
2830		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
2831		if (skb)
2832			napi->skb = skb;
2833	}
2834	return skb;
2835}
2836EXPORT_SYMBOL(napi_get_frags);
2837
2838gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
2839			       gro_result_t ret)
2840{
2841	switch (ret) {
2842	case GRO_NORMAL:
2843	case GRO_HELD:
2844		skb->protocol = eth_type_trans(skb, skb->dev);
2845
2846		if (ret == GRO_HELD)
2847			skb_gro_pull(skb, -ETH_HLEN);
2848		else if (netif_receive_skb(skb))
2849			ret = GRO_DROP;
2850		break;
2851
2852	case GRO_DROP:
2853	case GRO_MERGED_FREE:
2854		napi_reuse_skb(napi, skb);
2855		break;
2856
2857	case GRO_MERGED:
2858		break;
2859	}
2860
2861	return ret;
2862}
2863EXPORT_SYMBOL(napi_frags_finish);
2864
2865struct sk_buff *napi_frags_skb(struct napi_struct *napi)
2866{
2867	struct sk_buff *skb = napi->skb;
2868	struct ethhdr *eth;
2869	unsigned int hlen;
2870	unsigned int off;
2871
2872	napi->skb = NULL;
2873
2874	skb_reset_mac_header(skb);
2875	skb_gro_reset_offset(skb);
2876
2877	off = skb_gro_offset(skb);
2878	hlen = off + sizeof(*eth);
2879	eth = skb_gro_header_fast(skb, off);
2880	if (skb_gro_header_hard(skb, hlen)) {
2881		eth = skb_gro_header_slow(skb, hlen, off);
2882		if (unlikely(!eth)) {
2883			napi_reuse_skb(napi, skb);
2884			skb = NULL;
2885			goto out;
2886		}
2887	}
2888
2889	skb_gro_pull(skb, sizeof(*eth));
2890
2891	/*
2892	 * This works because the only protocols we care about don't require
2893	 * special handling.  We'll fix it up properly at the end.
2894	 */
2895	skb->protocol = eth->h_proto;
2896
2897out:
2898	return skb;
2899}
2900EXPORT_SYMBOL(napi_frags_skb);
2901
2902gro_result_t napi_gro_frags(struct napi_struct *napi)
2903{
2904	struct sk_buff *skb = napi_frags_skb(napi);
2905
2906	if (!skb)
2907		return GRO_DROP;
2908
2909	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
2910}
2911EXPORT_SYMBOL(napi_gro_frags);
2912
2913static int process_backlog(struct napi_struct *napi, int quota)
2914{
2915	int work = 0;
2916	struct softnet_data *queue = &__get_cpu_var(softnet_data);
2917	unsigned long start_time = jiffies;
2918
2919	napi->weight = weight_p;
2920	do {
2921		struct sk_buff *skb;
2922
2923		local_irq_disable();
2924		skb = __skb_dequeue(&queue->input_pkt_queue);
2925		if (!skb) {
2926			__napi_complete(napi);
2927			local_irq_enable();
2928			break;
2929		}
2930		local_irq_enable();
2931
2932		netif_receive_skb(skb);
2933	} while (++work < quota && jiffies == start_time);
2934
2935	return work;
2936}
2937
2938/**
2939 * __napi_schedule - schedule for receive
2940 * @n: entry to schedule
2941 *
2942 * The entry's receive function will be scheduled to run
2943 */
2944void __napi_schedule(struct napi_struct *n)
2945{
2946	unsigned long flags;
2947
2948	local_irq_save(flags);
2949	list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2950	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2951	local_irq_restore(flags);
2952}
2953EXPORT_SYMBOL(__napi_schedule);
2954
/*
 * Take @n off its poll list and clear NAPI_STATE_SCHED.  It is a bug
 * to call this when the entry is not scheduled or still holds GRO
 * packets.  The callers in this file invoke it with local interrupts
 * disabled.
 */
void __napi_complete(struct napi_struct *n)
{
	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
	BUG_ON(n->gro_list);

	list_del(&n->poll_list);
	/* Barrier before the SCHED bit is cleared. */
	smp_mb__before_clear_bit();
	clear_bit(NAPI_STATE_SCHED, &n->state);
}
2964EXPORT_SYMBOL(__napi_complete);
2965
2966void napi_complete(struct napi_struct *n)
2967{
2968	unsigned long flags;
2969
2970	/*
2971	 * don't let napi dequeue from the cpu poll list
2972	 * just in case its running on a different cpu
2973	 */
2974	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
2975		return;
2976
2977	napi_gro_flush(n);
2978	local_irq_save(flags);
2979	__napi_complete(n);
2980	local_irq_restore(flags);
2981}
2982EXPORT_SYMBOL(napi_complete);
2983
2984void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
2985		    int (*poll)(struct napi_struct *, int), int weight)
2986{
2987	INIT_LIST_HEAD(&napi->poll_list);
2988	napi->gro_count = 0;
2989	napi->gro_list = NULL;
2990	napi->skb = NULL;
2991	napi->poll = poll;
2992	napi->weight = weight;
2993	list_add(&napi->dev_list, &dev->napi_list);
2994	napi->dev = dev;
2995#ifdef CONFIG_NETPOLL
2996	spin_lock_init(&napi->poll_lock);
2997	napi->poll_owner = -1;
2998#endif
2999	set_bit(NAPI_STATE_SCHED, &napi->state);
3000}
3001EXPORT_SYMBOL(netif_napi_add);
3002
3003void netif_napi_del(struct napi_struct *napi)
3004{
3005	struct sk_buff *skb, *next;
3006
3007	list_del_init(&napi->dev_list);
3008	napi_free_frags(napi);
3009
3010	for (skb = napi->gro_list; skb; skb = next) {
3011		next = skb->next;
3012		skb->next = NULL;
3013		kfree_skb(skb);
3014	}
3015
3016	napi->gro_list = NULL;
3017	napi->gro_count = 0;
3018}
3019EXPORT_SYMBOL(netif_napi_del);
3020
3021
/*
 * Receive softirq handler: polls the NAPI instances on this CPU's
 * poll list until the list is empty, the packet budget (netdev_budget)
 * is used up, or the time limit passes.  In the last two cases the
 * softirq is raised again and time_squeeze is incremented.
 */
static void net_rx_action(struct softirq_action *h)
{
	struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
	unsigned long time_limit = jiffies + 2;
	int budget = netdev_budget;
	void *have;

	local_irq_disable();

	while (!list_empty(list)) {
		struct napi_struct *n;
		int work, weight;

		/* If the softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies, which will allow
		 * an average latency of 1.5/HZ.
		 */
		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
			goto softnet_break;

		local_irq_enable();

		/* Even though interrupts have been re-enabled, this
		 * access is safe because interrupts can only add new
		 * entries to the tail of this list, and only ->poll()
		 * calls can remove this head entry from the list.
		 */
		n = list_first_entry(list, struct napi_struct, poll_list);

		have = netpoll_poll_lock(n);

		weight = n->weight;

		/* This NAPI_STATE_SCHED test is for avoiding a race
		 * with netpoll's poll_napi().  Only the entity which
		 * obtains the lock and sees NAPI_STATE_SCHED set will
		 * actually make the ->poll() call.  Therefore we avoid
		 * accidentally calling ->poll() when NAPI is not scheduled.
		 */
		work = 0;
		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
			work = n->poll(n, weight);
			trace_napi_poll(n);
		}

		/* A poll routine must never report more than its weight. */
		WARN_ON_ONCE(work > weight);

		budget -= work;

		local_irq_disable();

		/* Drivers must not modify the NAPI state if they
		 * consume the entire weight.  In such cases this code
		 * still "owns" the NAPI instance and therefore can
		 * move the instance around on the list at-will.
		 */
		if (unlikely(work == weight)) {
			if (unlikely(napi_disable_pending(n))) {
				/* napi_complete() is called with
				 * interrupts enabled here.
				 */
				local_irq_enable();
				napi_complete(n);
				local_irq_disable();
			} else
				list_move_tail(&n->poll_list, list);
		}

		netpoll_poll_unlock(have);
	}
out:
	local_irq_enable();

#ifdef CONFIG_NET_DMA
	/*
	 * There may not be any more sk_buffs coming right now, so push
	 * any pending DMA copies to hardware
	 */
	dma_issue_pending_all();
#endif

	return;

softnet_break:
	/* Out of budget or time: count the squeeze and re-raise the
	 * softirq (interrupts are still disabled at this point).
	 */
	__get_cpu_var(netdev_rx_stat).time_squeeze++;
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	goto out;
}
3107
3108static gifconf_func_t *gifconf_list[NPROTO];
3109
3110/**
3111 *	register_gifconf	-	register a SIOCGIF handler
3112 *	@family: Address family
3113 *	@gifconf: Function handler
3114 *
3115 *	Register protocol dependent address dumping routines. The handler
3116 *	that is passed must not be freed or reused until it has been replaced
3117 *	by another handler.
3118 */
3119int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3120{
3121	if (family >= NPROTO)
3122		return -EINVAL;
3123	gifconf_list[family] = gifconf;
3124	return 0;
3125}
3126EXPORT_SYMBOL(register_gifconf);
3127
3128
3129/*
3130 *	Map an interface index to its name (SIOCGIFNAME)
3131 */
3132
3133/*
3134 *	We need this ioctl for efficient implementation of the
3135 *	if_indextoname() function required by the IPv6 API.  Without
3136 *	it, we would have to search all the interfaces to find a
3137 *	match.  --pb
3138 */
3139
3140static int dev_ifname(struct net *net, struct ifreq __user *arg)
3141{
3142	struct net_device *dev;
3143	struct ifreq ifr;
3144
3145	/*
3146	 *	Fetch the caller's info block.
3147	 */
3148
3149	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3150		return -EFAULT;
3151
3152	rcu_read_lock();
3153	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3154	if (!dev) {
3155		rcu_read_unlock();
3156		return -ENODEV;
3157	}
3158
3159	strcpy(ifr.ifr_name, dev->name);
3160	rcu_read_unlock();
3161
3162	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3163		return -EFAULT;
3164	return 0;
3165}
3166
3167/*
3168 *	Perform a SIOCGIFCONF call. This structure will change
3169 *	size eventually, and there is nothing I can do about it.
3170 *	Thus we will need a 'compatibility mode'.
3171 */
3172
3173static int dev_ifconf(struct net *net, char __user *arg)
3174{
3175	struct ifconf ifc;
3176	struct net_device *dev;
3177	char __user *pos;
3178	int len;
3179	int total;
3180	int i;
3181
3182	/*
3183	 *	Fetch the caller's info block.
3184	 */
3185
3186	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3187		return -EFAULT;
3188
3189	pos = ifc.ifc_buf;
3190	len = ifc.ifc_len;
3191
3192	/*
3193	 *	Loop over the interfaces, and write an info block for each.
3194	 */
3195
3196	total = 0;
3197	for_each_netdev(net, dev) {
3198		for (i = 0; i < NPROTO; i++) {
3199			if (gifconf_list[i]) {
3200				int done;
3201				if (!pos)
3202					done = gifconf_list[i](dev, NULL, 0);
3203				else
3204					done = gifconf_list[i](dev, pos + total,
3205							       len - total);
3206				if (done < 0)
3207					return -EFAULT;
3208				total += done;
3209			}
3210		}
3211	}
3212
3213	/*
3214	 *	All done.  Write the updated control block back to the caller.
3215	 */
3216	ifc.ifc_len = total;
3217
3218	/*
3219	 * 	Both BSD and Solaris return 0 here, so we do too.
3220	 */
3221	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3222}
3223
3224#ifdef CONFIG_PROC_FS
3225/*
3226 *	This is invoked by the /proc filesystem handler to display a device
3227 *	in detail.
3228 */
3229void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3230	__acquires(RCU)
3231{
3232	struct net *net = seq_file_net(seq);
3233	loff_t off;
3234	struct net_device *dev;
3235
3236	rcu_read_lock();
3237	if (!*pos)
3238		return SEQ_START_TOKEN;
3239
3240	off = 1;
3241	for_each_netdev_rcu(net, dev)
3242		if (off++ == *pos)
3243			return dev;
3244
3245	return NULL;
3246}
3247
3248void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3249{
3250	struct net_device *dev = (v == SEQ_START_TOKEN) ?
3251				  first_net_device(seq_file_net(seq)) :
3252				  next_net_device((struct net_device *)v);
3253
3254	++*pos;
3255	return rcu_dereference(dev);
3256}
3257
/* seq_file stop op: drops the RCU read lock taken in dev_seq_start(). */
void dev_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
3263
3264static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3265{
3266	const struct net_device_stats *stats = dev_get_stats(dev);
3267
3268	seq_printf(seq, "%6s: %7lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
3269		   "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
3270		   dev->name, stats->rx_bytes, stats->rx_packets,
3271		   stats->rx_errors,
3272		   stats->rx_dropped + stats->rx_missed_errors,
3273		   stats->rx_fifo_errors,
3274		   stats->rx_length_errors + stats->rx_over_errors +
3275		    stats->rx_crc_errors + stats->rx_frame_errors,
3276		   stats->rx_compressed, stats->multicast,
3277		   stats->tx_bytes, stats->tx_packets,
3278		   stats->tx_errors, stats->tx_dropped,
3279		   stats->tx_fifo_errors, stats->collisions,
3280		   stats->tx_carrier_errors +
3281		    stats->tx_aborted_errors +
3282		    stats->tx_window_errors +
3283		    stats->tx_heartbeat_errors,
3284		   stats->tx_compressed);
3285}
3286
3287/*
3288 *	Called from the PROCfs module. This now uses the new arbitrary sized
3289 *	/proc/net interface to create /proc/net/dev
3290 */
3291static int dev_seq_show(struct seq_file *seq, void *v)
3292{
3293	if (v == SEQ_START_TOKEN)
3294		seq_puts(seq, "Inter-|   Receive                            "
3295			      "                    |  Transmit\n"
3296			      " face |bytes    packets errs drop fifo frame "
3297			      "compressed multicast|bytes    packets errs "
3298			      "drop fifo colls carrier compressed\n");
3299	else
3300		dev_seq_printf_stats(seq, v);
3301	return 0;
3302}
3303
3304static struct netif_rx_stats *softnet_get_online(loff_t *pos)
3305{
3306	struct netif_rx_stats *rc = NULL;
3307
3308	while (*pos < nr_cpu_ids)
3309		if (cpu_online(*pos)) {
3310			rc = &per_cpu(netdev_rx_stat, *pos);
3311			break;
3312		} else
3313			++*pos;
3314	return rc;
3315}
3316
/* seq_file ->start handler: position on the first online CPU at or after *pos. */
static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
{
	return softnet_get_online(pos);
}
3321
/* seq_file ->next handler: move past the current CPU to the next online one. */
static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return softnet_get_online(pos);
}
3327
/* seq_file ->stop handler: nothing to release, the start handler takes no lock. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
3331
/*
 * seq_file ->show handler: print one line of nine hex columns for the
 * per-CPU netif_rx_stats in @v. Columns 4-8 are constant zero placeholders.
 */
static int softnet_seq_show(struct seq_file *seq, void *v)
{
	struct netif_rx_stats *s = v;

	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
		   s->total, s->dropped, s->time_squeeze, 0,
		   0, 0, 0, 0, /* was fastroute */
		   s->cpu_collision);
	return 0;
}
3342
/* seq_file iterator callbacks for the per-device statistics listing. */
static const struct seq_operations dev_seq_ops = {
	.start = dev_seq_start,
	.next  = dev_seq_next,
	.stop  = dev_seq_stop,
	.show  = dev_seq_show,
};
3349
/* ->open: set up a namespace-aware seq_file driven by dev_seq_ops. */
static int dev_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &dev_seq_ops,
			    sizeof(struct seq_net_private));
}
3355
/* File operations for the "dev" proc net entry (see dev_proc_net_init()). */
static const struct file_operations dev_seq_fops = {
	.owner	 = THIS_MODULE,
	.open    = dev_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net,
};
3363
/* seq_file iterator callbacks for the per-CPU softnet statistics listing. */
static const struct seq_operations softnet_seq_ops = {
	.start = softnet_seq_start,
	.next  = softnet_seq_next,
	.stop  = softnet_seq_stop,
	.show  = softnet_seq_show,
};
3370
/* ->open: plain (non namespace-aware) seq_file driven by softnet_seq_ops. */
static int softnet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &softnet_seq_ops);
}
3375
/* File operations for the "softnet_stat" proc net entry. */
static const struct file_operations softnet_seq_fops = {
	.owner	 = THIS_MODULE,
	.open    = softnet_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};
3383
3384static void *ptype_get_idx(loff_t pos)
3385{
3386	struct packet_type *pt = NULL;
3387	loff_t i = 0;
3388	int t;
3389
3390	list_for_each_entry_rcu(pt, &ptype_all, list) {
3391		if (i == pos)
3392			return pt;
3393		++i;
3394	}
3395
3396	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3397		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3398			if (i == pos)
3399				return pt;
3400			++i;
3401		}
3402	}
3403	return NULL;
3404}
3405
3406static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3407	__acquires(RCU)
3408{
3409	rcu_read_lock();
3410	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3411}
3412
/*
 * seq_file ->next handler: advance *pos and return the handler that follows
 * @v, or NULL once the last ptype_base[] bucket has been exhausted.
 */
static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct packet_type *pt;
	struct list_head *nxt;
	int hash;

	++*pos;
	/* Coming from the header line: start with the very first handler. */
	if (v == SEQ_START_TOKEN)
		return ptype_get_idx(0);

	pt = v;
	nxt = pt->list.next;
	/* ETH_P_ALL entries are treated as members of ptype_all. */
	if (pt->type == htons(ETH_P_ALL)) {
		if (nxt != &ptype_all)
			goto found;
		/* End of ptype_all: continue with the first hash bucket. */
		hash = 0;
		nxt = ptype_base[0].next;
	} else
		hash = ntohs(pt->type) & PTYPE_HASH_MASK;

	/* nxt pointing at a bucket head means that chain ended; try the next. */
	while (nxt == &ptype_base[hash]) {
		if (++hash >= PTYPE_HASH_SIZE)
			return NULL;
		nxt = ptype_base[hash].next;
	}
found:
	return list_entry(nxt, struct packet_type, list);
}
3441
/* seq_file ->stop handler: drop the RCU read lock taken in ptype_seq_start(). */
static void ptype_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
3447
3448static int ptype_seq_show(struct seq_file *seq, void *v)
3449{
3450	struct packet_type *pt = v;
3451
3452	if (v == SEQ_START_TOKEN)
3453		seq_puts(seq, "Type Device      Function\n");
3454	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3455		if (pt->type == htons(ETH_P_ALL))
3456			seq_puts(seq, "ALL ");
3457		else
3458			seq_printf(seq, "%04x", ntohs(pt->type));
3459
3460		seq_printf(seq, " %-8s %pF\n",
3461			   pt->dev ? pt->dev->name : "", pt->func);
3462	}
3463
3464	return 0;
3465}
3466
/* seq_file iterator callbacks for the packet-type handler listing. */
static const struct seq_operations ptype_seq_ops = {
	.start = ptype_seq_start,
	.next  = ptype_seq_next,
	.stop  = ptype_seq_stop,
	.show  = ptype_seq_show,
};
3473
/* ->open: set up a namespace-aware seq_file driven by ptype_seq_ops. */
static int ptype_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &ptype_seq_ops,
			sizeof(struct seq_net_private));
}
3479
/* File operations for the "ptype" proc net entry. */
static const struct file_operations ptype_seq_fops = {
	.owner	 = THIS_MODULE,
	.open    = ptype_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net,
};
3487
3488
3489static int __net_init dev_proc_net_init(struct net *net)
3490{
3491	int rc = -ENOMEM;
3492
3493	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3494		goto out;
3495	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3496		goto out_dev;
3497	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3498		goto out_softnet;
3499
3500	if (wext_proc_init(net))
3501		goto out_ptype;
3502	rc = 0;
3503out:
3504	return rc;
3505out_ptype:
3506	proc_net_remove(net, "ptype");
3507out_softnet:
3508	proc_net_remove(net, "softnet_stat");
3509out_dev:
3510	proc_net_remove(net, "dev");
3511	goto out;
3512}
3513
/*
 * Per-namespace teardown: undo dev_proc_net_init() in reverse order
 * (wext first, then the three proc net entries).
 */
static void __net_exit dev_proc_net_exit(struct net *net)
{
	wext_proc_exit(net);

	proc_net_remove(net, "ptype");
	proc_net_remove(net, "softnet_stat");
	proc_net_remove(net, "dev");
}
3522
/* Per-namespace hooks that create and remove this file's proc entries. */
static struct pernet_operations __net_initdata dev_proc_ops = {
	.init = dev_proc_net_init,
	.exit = dev_proc_net_exit,
};
3527
/*
 * Register dev_proc_ops as a per-namespace subsystem; returns the result of
 * register_pernet_subsys(). Without CONFIG_PROC_FS this is a macro
 * evaluating to 0.
 */
static int __init dev_proc_init(void)
{
	return register_pernet_subsys(&dev_proc_ops);
}
3532#else
3533#define dev_proc_init() 0
3534#endif	/* CONFIG_PROC_FS */
3535
3536
3537/**
3538 *	netdev_set_master	-	set up master/slave pair
3539 *	@slave: slave device
3540 *	@master: new master device
3541 *
3542 *	Changes the master device of the slave. Pass %NULL to break the
3543 *	bonding. The caller must hold the RTNL semaphore. On a failure
3544 *	a negative errno code is returned. On success the reference counts
3545 *	are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3546 *	function returns zero.
3547 */
3548int netdev_set_master(struct net_device *slave, struct net_device *master)
3549{
3550	struct net_device *old = slave->master;
3551
3552	ASSERT_RTNL();
3553
3554	if (master) {
3555		if (old)
3556			return -EBUSY;
3557		dev_hold(master);
3558	}
3559
3560	slave->master = master;
3561
3562	synchronize_net();
3563
3564	if (old)
3565		dev_put(old);
3566
3567	if (master)
3568		slave->flags |= IFF_SLAVE;
3569	else
3570		slave->flags &= ~IFF_SLAVE;
3571
3572	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
3573	return 0;
3574}
3575EXPORT_SYMBOL(netdev_set_master);
3576
3577static void dev_change_rx_flags(struct net_device *dev, int flags)
3578{
3579	const struct net_device_ops *ops = dev->netdev_ops;
3580
3581	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
3582		ops->ndo_change_rx_flags(dev, flags);
3583}
3584
3585static int __dev_set_promiscuity(struct net_device *dev, int inc)
3586{
3587	unsigned short old_flags = dev->flags;
3588	uid_t uid;
3589	gid_t gid;
3590
3591	ASSERT_RTNL();
3592
3593	dev->flags |= IFF_PROMISC;
3594	dev->promiscuity += inc;
3595	if (dev->promiscuity == 0) {
3596		/*
3597		 * Avoid overflow.
3598		 * If inc causes overflow, untouch promisc and return error.
3599		 */
3600		if (inc < 0)
3601			dev->flags &= ~IFF_PROMISC;
3602		else {
3603			dev->promiscuity -= inc;
3604			printk(KERN_WARNING "%s: promiscuity touches roof, "
3605				"set promiscuity failed, promiscuity feature "
3606				"of device might be broken.\n", dev->name);
3607			return -EOVERFLOW;
3608		}
3609	}
3610	if (dev->flags != old_flags) {
3611		printk(KERN_INFO "device %s %s promiscuous mode\n",
3612		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
3613							       "left");
3614		if (audit_enabled) {
3615			current_uid_gid(&uid, &gid);
3616			audit_log(current->audit_context, GFP_ATOMIC,
3617				AUDIT_ANOM_PROMISCUOUS,
3618				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
3619				dev->name, (dev->flags & IFF_PROMISC),
3620				(old_flags & IFF_PROMISC),
3621				audit_get_loginuid(current),
3622				uid, gid,
3623				audit_get_sessionid(current));
3624		}
3625
3626		dev_change_rx_flags(dev, IFF_PROMISC);
3627	}
3628	return 0;
3629}
3630
3631/**
3632 *	dev_set_promiscuity	- update promiscuity count on a device
3633 *	@dev: device
3634 *	@inc: modifier
3635 *
3636 *	Add or remove promiscuity from a device. While the count in the device
3637 *	remains above zero the interface remains promiscuous. Once it hits zero
3638 *	the device reverts back to normal filtering operation. A negative inc
3639 *	value is used to drop promiscuity on the device.
3640 *	Return 0 if successful or a negative errno code on error.
3641 */
3642int dev_set_promiscuity(struct net_device *dev, int inc)
3643{
3644	unsigned short old_flags = dev->flags;
3645	int err;
3646
3647	err = __dev_set_promiscuity(dev, inc);
3648	if (err < 0)
3649		return err;
3650	if (dev->flags != old_flags)
3651		dev_set_rx_mode(dev);
3652	return err;
3653}
3654EXPORT_SYMBOL(dev_set_promiscuity);
3655
3656/**
3657 *	dev_set_allmulti	- update allmulti count on a device
3658 *	@dev: device
3659 *	@inc: modifier
3660 *
3661 *	Add or remove reception of all multicast frames to a device. While the
3662 *	count in the device remains above zero the interface remains listening
3663 *	to all interfaces. Once it hits zero the device reverts back to normal
3664 *	filtering operation. A negative @inc value is used to drop the counter
3665 *	when releasing a resource needing all multicasts.
3666 *	Return 0 if successful or a negative errno code on error.
3667 */
3668
3669int dev_set_allmulti(struct net_device *dev, int inc)
3670{
3671	unsigned short old_flags = dev->flags;
3672
3673	ASSERT_RTNL();
3674
3675	dev->flags |= IFF_ALLMULTI;
3676	dev->allmulti += inc;
3677	if (dev->allmulti == 0) {
3678		/*
3679		 * Avoid overflow.
3680		 * If inc causes overflow, untouch allmulti and return error.
3681		 */
3682		if (inc < 0)
3683			dev->flags &= ~IFF_ALLMULTI;
3684		else {
3685			dev->allmulti -= inc;
3686			printk(KERN_WARNING "%s: allmulti touches roof, "
3687				"set allmulti failed, allmulti feature of "
3688				"device might be broken.\n", dev->name);
3689			return -EOVERFLOW;
3690		}
3691	}
3692	if (dev->flags ^ old_flags) {
3693		dev_change_rx_flags(dev, IFF_ALLMULTI);
3694		dev_set_rx_mode(dev);
3695	}
3696	return 0;
3697}
3698EXPORT_SYMBOL(dev_set_allmulti);
3699
3700/*
3701 *	Upload unicast and multicast address lists to device and
3702 *	configure RX filtering. When the device doesn't support unicast
3703 *	filtering it is put in promiscuous mode while unicast addresses
3704 *	are present.
3705 */
3706void __dev_set_rx_mode(struct net_device *dev)
3707{
3708	const struct net_device_ops *ops = dev->netdev_ops;
3709
3710	/* dev_open will call this function so the list will stay sane. */
3711	if (!(dev->flags&IFF_UP))
3712		return;
3713
3714	if (!netif_device_present(dev))
3715		return;
3716
3717	if (ops->ndo_set_rx_mode)
3718		ops->ndo_set_rx_mode(dev);
3719	else {
3720		/* Unicast addresses changes may only happen under the rtnl,
3721		 * therefore calling __dev_set_promiscuity here is safe.
3722		 */
3723		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
3724			__dev_set_promiscuity(dev, 1);
3725			dev->uc_promisc = 1;
3726		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
3727			__dev_set_promiscuity(dev, -1);
3728			dev->uc_promisc = 0;
3729		}
3730
3731		if (ops->ndo_set_multicast_list)
3732			ops->ndo_set_multicast_list(dev);
3733	}
3734}
3735
/* Locked wrapper: run __dev_set_rx_mode() under the device's address lock. */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
3742
3743/* hw addresses list handling functions */
3744
3745static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr,
3746			 int addr_len, unsigned char addr_type)
3747{
3748	struct netdev_hw_addr *ha;
3749	int alloc_size;
3750
3751	if (addr_len > MAX_ADDR_LEN)
3752		return -EINVAL;
3753
3754	list_for_each_entry(ha, &list->list, list) {
3755		if (!memcmp(ha->addr, addr, addr_len) &&
3756		    ha->type == addr_type) {
3757			ha->refcount++;
3758			return 0;
3759		}
3760	}
3761
3762
3763	alloc_size = sizeof(*ha);
3764	if (alloc_size < L1_CACHE_BYTES)
3765		alloc_size = L1_CACHE_BYTES;
3766	ha = kmalloc(alloc_size, GFP_ATOMIC);
3767	if (!ha)
3768		return -ENOMEM;
3769	memcpy(ha->addr, addr, addr_len);
3770	ha->type = addr_type;
3771	ha->refcount = 1;
3772	ha->synced = false;
3773	list_add_tail_rcu(&ha->list, &list->list);
3774	list->count++;
3775	return 0;
3776}
3777
3778static void ha_rcu_free(struct rcu_head *head)
3779{
3780	struct netdev_hw_addr *ha;
3781
3782	ha = container_of(head, struct netdev_hw_addr, rcu_head);
3783	kfree(ha);
3784}
3785
3786static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr,
3787			 int addr_len, unsigned char addr_type)
3788{
3789	struct netdev_hw_addr *ha;
3790
3791	list_for_each_entry(ha, &list->list, list) {
3792		if (!memcmp(ha->addr, addr, addr_len) &&
3793		    (ha->type == addr_type || !addr_type)) {
3794			if (--ha->refcount)
3795				return 0;
3796			list_del_rcu(&ha->list);
3797			call_rcu(&ha->rcu_head, ha_rcu_free);
3798			list->count--;
3799			return 0;
3800		}
3801	}
3802	return -ENOENT;
3803}
3804
3805static int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list,
3806				  struct netdev_hw_addr_list *from_list,
3807				  int addr_len,
3808				  unsigned char addr_type)
3809{
3810	int err;
3811	struct netdev_hw_addr *ha, *ha2;
3812	unsigned char type;
3813
3814	list_for_each_entry(ha, &from_list->list, list) {
3815		type = addr_type ? addr_type : ha->type;
3816		err = __hw_addr_add(to_list, ha->addr, addr_len, type);
3817		if (err)
3818			goto unroll;
3819	}
3820	return 0;
3821
3822unroll:
3823	list_for_each_entry(ha2, &from_list->list, list) {
3824		if (ha2 == ha)
3825			break;
3826		type = addr_type ? addr_type : ha2->type;
3827		__hw_addr_del(to_list, ha2->addr, addr_len, type);
3828	}
3829	return err;
3830}
3831
3832static void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list,
3833				   struct netdev_hw_addr_list *from_list,
3834				   int addr_len,
3835				   unsigned char addr_type)
3836{
3837	struct netdev_hw_addr *ha;
3838	unsigned char type;
3839
3840	list_for_each_entry(ha, &from_list->list, list) {
3841		type = addr_type ? addr_type : ha->type;
3842		__hw_addr_del(to_list, ha->addr, addr_len, addr_type);
3843	}
3844}
3845
3846static int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
3847			  struct netdev_hw_addr_list *from_list,
3848			  int addr_len)
3849{
3850	int err = 0;
3851	struct netdev_hw_addr *ha, *tmp;
3852
3853	list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
3854		if (!ha->synced) {
3855			err = __hw_addr_add(to_list, ha->addr,
3856					    addr_len, ha->type);
3857			if (err)
3858				break;
3859			ha->synced = true;
3860			ha->refcount++;
3861		} else if (ha->refcount == 1) {
3862			__hw_addr_del(to_list, ha->addr, addr_len, ha->type);
3863			__hw_addr_del(from_list, ha->addr, addr_len, ha->type);
3864		}
3865	}
3866	return err;
3867}
3868
3869static void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
3870			     struct netdev_hw_addr_list *from_list,
3871			     int addr_len)
3872{
3873	struct netdev_hw_addr *ha, *tmp;
3874
3875	list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
3876		if (ha->synced) {
3877			__hw_addr_del(to_list, ha->addr,
3878				      addr_len, ha->type);
3879			ha->synced = false;
3880			__hw_addr_del(from_list, ha->addr,
3881				      addr_len, ha->type);
3882		}
3883	}
3884}
3885
/*
 * Remove every entry from @list regardless of its reference count; entries
 * are freed via RCU callback. The count is reset to zero.
 */
static void __hw_addr_flush(struct netdev_hw_addr_list *list)
{
	struct netdev_hw_addr *ha, *tmp;

	list_for_each_entry_safe(ha, tmp, &list->list, list) {
		list_del_rcu(&ha->list);
		call_rcu(&ha->rcu_head, ha_rcu_free);
	}
	list->count = 0;
}
3896
/* Initialise @list as an empty hardware address list. */
static void __hw_addr_init(struct netdev_hw_addr_list *list)
{
	INIT_LIST_HEAD(&list->list);
	list->count = 0;
}
3902
3903/* Device addresses handling functions */
3904
/*
 * Drop all device addresses of @dev and clear dev->dev_addr, which pointed
 * into the first list entry (see dev_addr_init()).
 */
static void dev_addr_flush(struct net_device *dev)
{
	/* rtnl_mutex must be held here */

	__hw_addr_flush(&dev->dev_addrs);
	dev->dev_addr = NULL;
}
3912
3913static int dev_addr_init(struct net_device *dev)
3914{
3915	unsigned char addr[MAX_ADDR_LEN];
3916	struct netdev_hw_addr *ha;
3917	int err;
3918
3919	/* rtnl_mutex must be held here */
3920
3921	__hw_addr_init(&dev->dev_addrs);
3922	memset(addr, 0, sizeof(addr));
3923	err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr),
3924			    NETDEV_HW_ADDR_T_LAN);
3925	if (!err) {
3926		/*
3927		 * Get the first (previously created) address from the list
3928		 * and set dev_addr pointer to this location.
3929		 */
3930		ha = list_first_entry(&dev->dev_addrs.list,
3931				      struct netdev_hw_addr, list);
3932		dev->dev_addr = ha->addr;
3933	}
3934	return err;
3935}
3936
3937/**
3938 *	dev_addr_add	- Add a device address
3939 *	@dev: device
3940 *	@addr: address to add
3941 *	@addr_type: address type
3942 *
3943 *	Add a device address to the device or increase the reference count if
3944 *	it already exists.
3945 *
3946 *	The caller must hold the rtnl_mutex.
3947 */
3948int dev_addr_add(struct net_device *dev, unsigned char *addr,
3949		 unsigned char addr_type)
3950{
3951	int err;
3952
3953	ASSERT_RTNL();
3954
3955	err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type);
3956	if (!err)
3957		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3958	return err;
3959}
3960EXPORT_SYMBOL(dev_addr_add);
3961
3962/**
3963 *	dev_addr_del	- Release a device address.
3964 *	@dev: device
3965 *	@addr: address to delete
3966 *	@addr_type: address type
3967 *
3968 *	Release reference to a device address and remove it from the device
3969 *	if the reference count drops to zero.
3970 *
3971 *	The caller must hold the rtnl_mutex.
3972 */
3973int dev_addr_del(struct net_device *dev, unsigned char *addr,
3974		 unsigned char addr_type)
3975{
3976	int err;
3977	struct netdev_hw_addr *ha;
3978
3979	ASSERT_RTNL();
3980
3981	/*
3982	 * We can not remove the first address from the list because
3983	 * dev->dev_addr points to that.
3984	 */
3985	ha = list_first_entry(&dev->dev_addrs.list,
3986			      struct netdev_hw_addr, list);
3987	if (ha->addr == dev->dev_addr && ha->refcount == 1)
3988		return -ENOENT;
3989
3990	err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len,
3991			    addr_type);
3992	if (!err)
3993		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3994	return err;
3995}
3996EXPORT_SYMBOL(dev_addr_del);
3997
3998/**
3999 *	dev_addr_add_multiple	- Add device addresses from another device
4000 *	@to_dev: device to which addresses will be added
4001 *	@from_dev: device from which addresses will be added
4002 *	@addr_type: address type - 0 means type will be used from from_dev
4003 *
4004 *	Add device addresses of the one device to another.
4005 **
4006 *	The caller must hold the rtnl_mutex.
4007 */
4008int dev_addr_add_multiple(struct net_device *to_dev,
4009			  struct net_device *from_dev,
4010			  unsigned char addr_type)
4011{
4012	int err;
4013
4014	ASSERT_RTNL();
4015
4016	if (from_dev->addr_len != to_dev->addr_len)
4017		return -EINVAL;
4018	err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
4019				     to_dev->addr_len, addr_type);
4020	if (!err)
4021		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
4022	return err;
4023}
4024EXPORT_SYMBOL(dev_addr_add_multiple);
4025
4026/**
4027 *	dev_addr_del_multiple	- Delete device addresses by another device
4028 *	@to_dev: device where the addresses will be deleted
4029 *	@from_dev: device by which addresses the addresses will be deleted
4030 *	@addr_type: address type - 0 means type will used from from_dev
4031 *
4032 *	Deletes addresses in to device by the list of addresses in from device.
4033 *
4034 *	The caller must hold the rtnl_mutex.
4035 */
4036int dev_addr_del_multiple(struct net_device *to_dev,
4037			  struct net_device *from_dev,
4038			  unsigned char addr_type)
4039{
4040	ASSERT_RTNL();
4041
4042	if (from_dev->addr_len != to_dev->addr_len)
4043		return -EINVAL;
4044	__hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
4045			       to_dev->addr_len, addr_type);
4046	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
4047	return 0;
4048}
4049EXPORT_SYMBOL(dev_addr_del_multiple);
4050
4051/* multicast addresses handling functions */
4052
4053int __dev_addr_delete(struct dev_addr_list **list, int *count,
4054		      void *addr, int alen, int glbl)
4055{
4056	struct dev_addr_list *da;
4057
4058	for (; (da = *list) != NULL; list = &da->next) {
4059		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
4060		    alen == da->da_addrlen) {
4061			if (glbl) {
4062				int old_glbl = da->da_gusers;
4063				da->da_gusers = 0;
4064				if (old_glbl == 0)
4065					break;
4066			}
4067			if (--da->da_users)
4068				return 0;
4069
4070			*list = da->next;
4071			kfree(da);
4072			(*count)--;
4073			return 0;
4074		}
4075	}
4076	return -ENOENT;
4077}
4078
4079int __dev_addr_add(struct dev_addr_list **list, int *count,
4080		   void *addr, int alen, int glbl)
4081{
4082	struct dev_addr_list *da;
4083
4084	for (da = *list; da != NULL; da = da->next) {
4085		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
4086		    da->da_addrlen == alen) {
4087			if (glbl) {
4088				int old_glbl = da->da_gusers;
4089				da->da_gusers = 1;
4090				if (old_glbl)
4091					return 0;
4092			}
4093			da->da_users++;
4094			return 0;
4095		}
4096	}
4097
4098	da = kzalloc(sizeof(*da), GFP_ATOMIC);
4099	if (da == NULL)
4100		return -ENOMEM;
4101	memcpy(da->da_addr, addr, alen);
4102	da->da_addrlen = alen;
4103	da->da_users = 1;
4104	da->da_gusers = glbl ? 1 : 0;
4105	da->next = *list;
4106	*list = da;
4107	(*count)++;
4108	return 0;
4109}
4110
4111/**
4112 *	dev_unicast_delete	- Release secondary unicast address.
4113 *	@dev: device
4114 *	@addr: address to delete
4115 *
4116 *	Release reference to a secondary unicast address and remove it
4117 *	from the device if the reference count drops to zero.
4118 *
4119 * 	The caller must hold the rtnl_mutex.
4120 */
4121int dev_unicast_delete(struct net_device *dev, void *addr)
4122{
4123	int err;
4124
4125	ASSERT_RTNL();
4126
4127	netif_addr_lock_bh(dev);
4128	err = __hw_addr_del(&dev->uc, addr, dev->addr_len,
4129			    NETDEV_HW_ADDR_T_UNICAST);
4130	if (!err)
4131		__dev_set_rx_mode(dev);
4132	netif_addr_unlock_bh(dev);
4133	return err;
4134}
4135EXPORT_SYMBOL(dev_unicast_delete);
4136
4137/**
4138 *	dev_unicast_add		- add a secondary unicast address
4139 *	@dev: device
4140 *	@addr: address to add
4141 *
4142 *	Add a secondary unicast address to the device or increase
4143 *	the reference count if it already exists.
4144 *
4145 *	The caller must hold the rtnl_mutex.
4146 */
4147int dev_unicast_add(struct net_device *dev, void *addr)
4148{
4149	int err;
4150
4151	ASSERT_RTNL();
4152
4153	netif_addr_lock_bh(dev);
4154	err = __hw_addr_add(&dev->uc, addr, dev->addr_len,
4155			    NETDEV_HW_ADDR_T_UNICAST);
4156	if (!err)
4157		__dev_set_rx_mode(dev);
4158	netif_addr_unlock_bh(dev);
4159	return err;
4160}
4161EXPORT_SYMBOL(dev_unicast_add);
4162
4163int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
4164		    struct dev_addr_list **from, int *from_count)
4165{
4166	struct dev_addr_list *da, *next;
4167	int err = 0;
4168
4169	da = *from;
4170	while (da != NULL) {
4171		next = da->next;
4172		if (!da->da_synced) {
4173			err = __dev_addr_add(to, to_count,
4174					     da->da_addr, da->da_addrlen, 0);
4175			if (err < 0)
4176				break;
4177			da->da_synced = 1;
4178			da->da_users++;
4179		} else if (da->da_users == 1) {
4180			__dev_addr_delete(to, to_count,
4181					  da->da_addr, da->da_addrlen, 0);
4182			__dev_addr_delete(from, from_count,
4183					  da->da_addr, da->da_addrlen, 0);
4184		}
4185		da = next;
4186	}
4187	return err;
4188}
4189EXPORT_SYMBOL_GPL(__dev_addr_sync);
4190
4191void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
4192		       struct dev_addr_list **from, int *from_count)
4193{
4194	struct dev_addr_list *da, *next;
4195
4196	da = *from;
4197	while (da != NULL) {
4198		next = da->next;
4199		if (da->da_synced) {
4200			__dev_addr_delete(to, to_count,
4201					  da->da_addr, da->da_addrlen, 0);
4202			da->da_synced = 0;
4203			__dev_addr_delete(from, from_count,
4204					  da->da_addr, da->da_addrlen, 0);
4205		}
4206		da = next;
4207	}
4208}
4209EXPORT_SYMBOL_GPL(__dev_addr_unsync);
4210
4211/**
4212 *	dev_unicast_sync - Synchronize device's unicast list to another device
4213 *	@to: destination device
4214 *	@from: source device
4215 *
4216 *	Add newly added addresses to the destination device and release
4217 *	addresses that have no users left. The source device must be
4218 *	locked by netif_tx_lock_bh.
4219 *
4220 *	This function is intended to be called from the dev->set_rx_mode
4221 *	function of layered software devices.
4222 */
4223int dev_unicast_sync(struct net_device *to, struct net_device *from)
4224{
4225	int err = 0;
4226
4227	if (to->addr_len != from->addr_len)
4228		return -EINVAL;
4229
4230	netif_addr_lock_bh(to);
4231	err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len);
4232	if (!err)
4233		__dev_set_rx_mode(to);
4234	netif_addr_unlock_bh(to);
4235	return err;
4236}
4237EXPORT_SYMBOL(dev_unicast_sync);
4238
/**
 *	dev_unicast_unsync - Remove synchronized addresses from the destination device
 *	@to: destination device
 *	@from: source device
 *
 *	Remove all addresses that were added to the destination device by
 *	dev_unicast_sync(). This function is intended to be called from the
 *	dev->stop function of layered software devices.
 *
 *	Does nothing when the two devices use different address lengths.
 */
void dev_unicast_unsync(struct net_device *to, struct net_device *from)
{
	if (to->addr_len != from->addr_len)
		return;

	/* Lock order: source device first (BH disabled), then destination. */
	netif_addr_lock_bh(from);
	netif_addr_lock(to);
	__hw_addr_unsync(&to->uc, &from->uc, to->addr_len);
	__dev_set_rx_mode(to);
	netif_addr_unlock(to);
	netif_addr_unlock_bh(from);
}
EXPORT_SYMBOL(dev_unicast_unsync);
4261
/* Drop every secondary unicast address of @dev under its address lock. */
static void dev_unicast_flush(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__hw_addr_flush(&dev->uc);
	netif_addr_unlock_bh(dev);
}
4268
/* Initialise the (empty) secondary unicast address list of @dev. */
static void dev_unicast_init(struct net_device *dev)
{
	__hw_addr_init(&dev->uc);
}
4273
4274
4275static void __dev_addr_discard(struct dev_addr_list **list)
4276{
4277	struct dev_addr_list *tmp;
4278
4279	while (*list != NULL) {
4280		tmp = *list;
4281		*list = tmp->next;
4282		if (tmp->da_users > tmp->da_gusers)
4283			printk("__dev_addr_discard: address leakage! "
4284			       "da_users=%d\n", tmp->da_users);
4285		kfree(tmp);
4286	}
4287}
4288
/*
 * Throw away the whole multicast list of @dev and reset its multicast
 * count, under the device's address lock.
 */
static void dev_addr_discard(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_addr_discard(&dev->mc_list);
	netdev_mc_count(dev) = 0;

	netif_addr_unlock_bh(dev);
}
4298
4299/**
4300 *	dev_get_flags - get flags reported to userspace
4301 *	@dev: device
4302 *
4303 *	Get the combination of flag bits exported through APIs to userspace.
4304 */
4305unsigned dev_get_flags(const struct net_device *dev)
4306{
4307	unsigned flags;
4308
4309	flags = (dev->flags & ~(IFF_PROMISC |
4310				IFF_ALLMULTI |
4311				IFF_RUNNING |
4312				IFF_LOWER_UP |
4313				IFF_DORMANT)) |
4314		(dev->gflags & (IFF_PROMISC |
4315				IFF_ALLMULTI));
4316
4317	if (netif_running(dev)) {
4318		if (netif_oper_up(dev))
4319			flags |= IFF_RUNNING;
4320		if (netif_carrier_ok(dev))
4321			flags |= IFF_LOWER_UP;
4322		if (netif_dormant(dev))
4323			flags |= IFF_DORMANT;
4324	}
4325
4326	return flags;
4327}
4328EXPORT_SYMBOL(dev_get_flags);
4329
4330int __dev_change_flags(struct net_device *dev, unsigned int flags)
4331{
4332	int old_flags = dev->flags;
4333	int ret;
4334
4335	ASSERT_RTNL();
4336
4337	/*
4338	 *	Set the flags on our device.
4339	 */
4340
4341	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4342			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4343			       IFF_AUTOMEDIA)) |
4344		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4345				    IFF_ALLMULTI));
4346
4347	/*
4348	 *	Load in the correct multicast list now the flags have changed.
4349	 */
4350
4351	if ((old_flags ^ flags) & IFF_MULTICAST)
4352		dev_change_rx_flags(dev, IFF_MULTICAST);
4353
4354	dev_set_rx_mode(dev);
4355
4356	/*
4357	 *	Have we downed the interface. We handle IFF_UP ourselves
4358	 *	according to user attempts to set it, rather than blindly
4359	 *	setting it.
4360	 */
4361
4362	ret = 0;
4363	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4364		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4365
4366		if (!ret)
4367			dev_set_rx_mode(dev);
4368	}
4369
4370	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4371		int inc = (flags & IFF_PROMISC) ? 1 : -1;
4372
4373		dev->gflags ^= IFF_PROMISC;
4374		dev_set_promiscuity(dev, inc);
4375	}
4376
4377	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4378	   is important. Some (broken) drivers set IFF_PROMISC, when
4379	   IFF_ALLMULTI is requested not asking us and not reporting.
4380	 */
4381	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4382		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4383
4384		dev->gflags ^= IFF_ALLMULTI;
4385		dev_set_allmulti(dev, inc);
4386	}
4387
4388	return ret;
4389}
4390
4391void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4392{
4393	unsigned int changes = dev->flags ^ old_flags;
4394
4395	if (changes & IFF_UP) {
4396		if (dev->flags & IFF_UP)
4397			call_netdevice_notifiers(NETDEV_UP, dev);
4398		else
4399			call_netdevice_notifiers(NETDEV_DOWN, dev);
4400	}
4401
4402	if (dev->flags & IFF_UP &&
4403	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4404		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4405}
4406
4407/**
4408 *	dev_change_flags - change device settings
4409 *	@dev: device
4410 *	@flags: device state flags
4411 *
4412 *	Change settings on device based state flags. The flags are
4413 *	in the userspace exported format.
4414 */
4415int dev_change_flags(struct net_device *dev, unsigned flags)
4416{
4417	int ret, changes;
4418	int old_flags = dev->flags;
4419
4420	ret = __dev_change_flags(dev, flags);
4421	if (ret < 0)
4422		return ret;
4423
4424	changes = old_flags ^ dev->flags;
4425	if (changes)
4426		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4427
4428	__dev_notify_flags(dev, old_flags);
4429	return ret;
4430}
4431EXPORT_SYMBOL(dev_change_flags);
4432
4433/**
4434 *	dev_set_mtu - Change maximum transfer unit
4435 *	@dev: device
4436 *	@new_mtu: new transfer unit
4437 *
4438 *	Change the maximum transfer size of the network device.
4439 */
4440int dev_set_mtu(struct net_device *dev, int new_mtu)
4441{
4442	const struct net_device_ops *ops = dev->netdev_ops;
4443	int err;
4444
4445	if (new_mtu == dev->mtu)
4446		return 0;
4447
4448	/*	MTU must be positive.	 */
4449	if (new_mtu < 0)
4450		return -EINVAL;
4451
4452	if (!netif_device_present(dev))
4453		return -ENODEV;
4454
4455	err = 0;
4456	if (ops->ndo_change_mtu)
4457		err = ops->ndo_change_mtu(dev, new_mtu);
4458	else
4459		dev->mtu = new_mtu;
4460
4461	if (!err && dev->flags & IFF_UP)
4462		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4463	return err;
4464}
4465EXPORT_SYMBOL(dev_set_mtu);
4466
4467/**
4468 *	dev_set_mac_address - Change Media Access Control Address
4469 *	@dev: device
4470 *	@sa: new address
4471 *
4472 *	Change the hardware (MAC) address of the device
4473 */
4474int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4475{
4476	const struct net_device_ops *ops = dev->netdev_ops;
4477	int err;
4478
4479	if (!ops->ndo_set_mac_address)
4480		return -EOPNOTSUPP;
4481	if (sa->sa_family != dev->type)
4482		return -EINVAL;
4483	if (!netif_device_present(dev))
4484		return -ENODEV;
4485	err = ops->ndo_set_mac_address(dev, sa);
4486	if (!err)
4487		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4488	return err;
4489}
4490EXPORT_SYMBOL(dev_set_mac_address);
4491
4492/*
4493 *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4494 */
4495static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4496{
4497	int err;
4498	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4499
4500	if (!dev)
4501		return -ENODEV;
4502
4503	switch (cmd) {
4504	case SIOCGIFFLAGS:	/* Get interface flags */
4505		ifr->ifr_flags = (short) dev_get_flags(dev);
4506		return 0;
4507
4508	case SIOCGIFMETRIC:	/* Get the metric on the interface
4509				   (currently unused) */
4510		ifr->ifr_metric = 0;
4511		return 0;
4512
4513	case SIOCGIFMTU:	/* Get the MTU of a device */
4514		ifr->ifr_mtu = dev->mtu;
4515		return 0;
4516
4517	case SIOCGIFHWADDR:
4518		if (!dev->addr_len)
4519			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4520		else
4521			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4522			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4523		ifr->ifr_hwaddr.sa_family = dev->type;
4524		return 0;
4525
4526	case SIOCGIFSLAVE:
4527		err = -EINVAL;
4528		break;
4529
4530	case SIOCGIFMAP:
4531		ifr->ifr_map.mem_start = dev->mem_start;
4532		ifr->ifr_map.mem_end   = dev->mem_end;
4533		ifr->ifr_map.base_addr = dev->base_addr;
4534		ifr->ifr_map.irq       = dev->irq;
4535		ifr->ifr_map.dma       = dev->dma;
4536		ifr->ifr_map.port      = dev->if_port;
4537		return 0;
4538
4539	case SIOCGIFINDEX:
4540		ifr->ifr_ifindex = dev->ifindex;
4541		return 0;
4542
4543	case SIOCGIFTXQLEN:
4544		ifr->ifr_qlen = dev->tx_queue_len;
4545		return 0;
4546
4547	default:
4548		/* dev_ioctl() should ensure this case
4549		 * is never reached
4550		 */
4551		WARN_ON(1);
4552		err = -EINVAL;
4553		break;
4554
4555	}
4556	return err;
4557}
4558
4559/*
4560 *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
4561 */
4562static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4563{
4564	int err;
4565	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4566	const struct net_device_ops *ops;
4567
4568	if (!dev)
4569		return -ENODEV;
4570
4571	ops = dev->netdev_ops;
4572
4573	switch (cmd) {
4574	case SIOCSIFFLAGS:	/* Set interface flags */
4575		return dev_change_flags(dev, ifr->ifr_flags);
4576
4577	case SIOCSIFMETRIC:	/* Set the metric on the interface
4578				   (currently unused) */
4579		return -EOPNOTSUPP;
4580
4581	case SIOCSIFMTU:	/* Set the MTU of a device */
4582		return dev_set_mtu(dev, ifr->ifr_mtu);
4583
4584	case SIOCSIFHWADDR:
4585		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4586
4587	case SIOCSIFHWBROADCAST:
4588		if (ifr->ifr_hwaddr.sa_family != dev->type)
4589			return -EINVAL;
4590		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4591		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4592		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4593		return 0;
4594
4595	case SIOCSIFMAP:
4596		if (ops->ndo_set_config) {
4597			if (!netif_device_present(dev))
4598				return -ENODEV;
4599			return ops->ndo_set_config(dev, &ifr->ifr_map);
4600		}
4601		return -EOPNOTSUPP;
4602
4603	case SIOCADDMULTI:
4604		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4605		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4606			return -EINVAL;
4607		if (!netif_device_present(dev))
4608			return -ENODEV;
4609		return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
4610				  dev->addr_len, 1);
4611
4612	case SIOCDELMULTI:
4613		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4614		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4615			return -EINVAL;
4616		if (!netif_device_present(dev))
4617			return -ENODEV;
4618		return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
4619				     dev->addr_len, 1);
4620
4621	case SIOCSIFTXQLEN:
4622		if (ifr->ifr_qlen < 0)
4623			return -EINVAL;
4624		dev->tx_queue_len = ifr->ifr_qlen;
4625		return 0;
4626
4627	case SIOCSIFNAME:
4628		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4629		return dev_change_name(dev, ifr->ifr_newname);
4630
4631	/*
4632	 *	Unknown or private ioctl
4633	 */
4634	default:
4635		if ((cmd >= SIOCDEVPRIVATE &&
4636		    cmd <= SIOCDEVPRIVATE + 15) ||
4637		    cmd == SIOCBONDENSLAVE ||
4638		    cmd == SIOCBONDRELEASE ||
4639		    cmd == SIOCBONDSETHWADDR ||
4640		    cmd == SIOCBONDSLAVEINFOQUERY ||
4641		    cmd == SIOCBONDINFOQUERY ||
4642		    cmd == SIOCBONDCHANGEACTIVE ||
4643		    cmd == SIOCGMIIPHY ||
4644		    cmd == SIOCGMIIREG ||
4645		    cmd == SIOCSMIIREG ||
4646		    cmd == SIOCBRADDIF ||
4647		    cmd == SIOCBRDELIF ||
4648		    cmd == SIOCSHWTSTAMP ||
4649		    cmd == SIOCWANDEV) {
4650			err = -EOPNOTSUPP;
4651			if (ops->ndo_do_ioctl) {
4652				if (netif_device_present(dev))
4653					err = ops->ndo_do_ioctl(dev, ifr, cmd);
4654				else
4655					err = -ENODEV;
4656			}
4657		} else
4658			err = -EINVAL;
4659
4660	}
4661	return err;
4662}
4663
4664/*
4665 *	This function handles all "interface"-type I/O control requests. The actual
4666 *	'doing' part of this is dev_ifsioc above.
4667 */
4668
4669/**
4670 *	dev_ioctl	-	network device ioctl
4671 *	@net: the applicable net namespace
4672 *	@cmd: command to issue
4673 *	@arg: pointer to a struct ifreq in user space
4674 *
4675 *	Issue ioctl functions to devices. This is normally called by the
4676 *	user space syscall interfaces but can sometimes be useful for
4677 *	other purposes. The return value is the return from the syscall if
4678 *	positive or a negative errno code on error.
4679 */
4680
4681int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4682{
4683	struct ifreq ifr;
4684	int ret;
4685	char *colon;
4686
4687	/* One special case: SIOCGIFCONF takes ifconf argument
4688	   and requires shared lock, because it sleeps writing
4689	   to user space.
4690	 */
4691
4692	if (cmd == SIOCGIFCONF) {
4693		rtnl_lock();
4694		ret = dev_ifconf(net, (char __user *) arg);
4695		rtnl_unlock();
4696		return ret;
4697	}
4698	if (cmd == SIOCGIFNAME)
4699		return dev_ifname(net, (struct ifreq __user *)arg);
4700
4701	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4702		return -EFAULT;
4703
4704	ifr.ifr_name[IFNAMSIZ-1] = 0;
4705
4706	colon = strchr(ifr.ifr_name, ':');
4707	if (colon)
4708		*colon = 0;
4709
4710	/*
4711	 *	See which interface the caller is talking about.
4712	 */
4713
4714	switch (cmd) {
4715	/*
4716	 *	These ioctl calls:
4717	 *	- can be done by all.
4718	 *	- atomic and do not require locking.
4719	 *	- return a value
4720	 */
4721	case SIOCGIFFLAGS:
4722	case SIOCGIFMETRIC:
4723	case SIOCGIFMTU:
4724	case SIOCGIFHWADDR:
4725	case SIOCGIFSLAVE:
4726	case SIOCGIFMAP:
4727	case SIOCGIFINDEX:
4728	case SIOCGIFTXQLEN:
4729		dev_load(net, ifr.ifr_name);
4730		rcu_read_lock();
4731		ret = dev_ifsioc_locked(net, &ifr, cmd);
4732		rcu_read_unlock();
4733		if (!ret) {
4734			if (colon)
4735				*colon = ':';
4736			if (copy_to_user(arg, &ifr,
4737					 sizeof(struct ifreq)))
4738				ret = -EFAULT;
4739		}
4740		return ret;
4741
4742	case SIOCETHTOOL:
4743		dev_load(net, ifr.ifr_name);
4744		rtnl_lock();
4745		ret = dev_ethtool(net, &ifr);
4746		rtnl_unlock();
4747		if (!ret) {
4748			if (colon)
4749				*colon = ':';
4750			if (copy_to_user(arg, &ifr,
4751					 sizeof(struct ifreq)))
4752				ret = -EFAULT;
4753		}
4754		return ret;
4755
4756	/*
4757	 *	These ioctl calls:
4758	 *	- require superuser power.
4759	 *	- require strict serialization.
4760	 *	- return a value
4761	 */
4762	case SIOCGMIIPHY:
4763	case SIOCGMIIREG:
4764	case SIOCSIFNAME:
4765		if (!capable(CAP_NET_ADMIN))
4766			return -EPERM;
4767		dev_load(net, ifr.ifr_name);
4768		rtnl_lock();
4769		ret = dev_ifsioc(net, &ifr, cmd);
4770		rtnl_unlock();
4771		if (!ret) {
4772			if (colon)
4773				*colon = ':';
4774			if (copy_to_user(arg, &ifr,
4775					 sizeof(struct ifreq)))
4776				ret = -EFAULT;
4777		}
4778		return ret;
4779
4780	/*
4781	 *	These ioctl calls:
4782	 *	- require superuser power.
4783	 *	- require strict serialization.
4784	 *	- do not return a value
4785	 */
4786	case SIOCSIFFLAGS:
4787	case SIOCSIFMETRIC:
4788	case SIOCSIFMTU:
4789	case SIOCSIFMAP:
4790	case SIOCSIFHWADDR:
4791	case SIOCSIFSLAVE:
4792	case SIOCADDMULTI:
4793	case SIOCDELMULTI:
4794	case SIOCSIFHWBROADCAST:
4795	case SIOCSIFTXQLEN:
4796	case SIOCSMIIREG:
4797	case SIOCBONDENSLAVE:
4798	case SIOCBONDRELEASE:
4799	case SIOCBONDSETHWADDR:
4800	case SIOCBONDCHANGEACTIVE:
4801	case SIOCBRADDIF:
4802	case SIOCBRDELIF:
4803	case SIOCSHWTSTAMP:
4804		if (!capable(CAP_NET_ADMIN))
4805			return -EPERM;
4806		/* fall through */
4807	case SIOCBONDSLAVEINFOQUERY:
4808	case SIOCBONDINFOQUERY:
4809		dev_load(net, ifr.ifr_name);
4810		rtnl_lock();
4811		ret = dev_ifsioc(net, &ifr, cmd);
4812		rtnl_unlock();
4813		return ret;
4814
4815	case SIOCGIFMEM:
4816		/* Get the per device memory space. We can add this but
4817		 * currently do not support it */
4818	case SIOCSIFMEM:
4819		/* Set the per device memory buffer space.
4820		 * Not applicable in our case */
4821	case SIOCSIFLINK:
4822		return -EINVAL;
4823
4824	/*
4825	 *	Unknown or private ioctl.
4826	 */
4827	default:
4828		if (cmd == SIOCWANDEV ||
4829		    (cmd >= SIOCDEVPRIVATE &&
4830		     cmd <= SIOCDEVPRIVATE + 15)) {
4831			dev_load(net, ifr.ifr_name);
4832			rtnl_lock();
4833			ret = dev_ifsioc(net, &ifr, cmd);
4834			rtnl_unlock();
4835			if (!ret && copy_to_user(arg, &ifr,
4836						 sizeof(struct ifreq)))
4837				ret = -EFAULT;
4838			return ret;
4839		}
4840		/* Take care of Wireless Extensions */
4841		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4842			return wext_handle_ioctl(net, &ifr, cmd, arg);
4843		return -EINVAL;
4844	}
4845}
4846
4847
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns a suitable unique value for a new device interface
 *	number.  The caller must hold the rtnl semaphore or the
 *	dev_base_lock to be sure it remains unique.
 *
 *	A function-static counter is advanced until a value is found
 *	that no device in @net uses yet; it restarts at 1 whenever the
 *	increment leaves it non-positive.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	do {
		ifindex++;
		if (ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
4866
/*
 * Delayed registration/unregistration.
 *
 * Devices are queued on net_todo_list here; netdev_run_todo() later takes
 * the list over and processes every entry.
 */
static LIST_HEAD(net_todo_list);

/* Append @dev (through its todo_list member) to the end of net_todo_list. */
static void net_set_todo(struct net_device *dev)
{
	list_add_tail(&dev->todo_list, &net_todo_list);
}
4874
4875static void rollback_registered_many(struct list_head *head)
4876{
4877	struct net_device *dev, *tmp;
4878
4879	BUG_ON(dev_boot_phase);
4880	ASSERT_RTNL();
4881
4882	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4883		/* Some devices call without registering
4884		 * for initialization unwind. Remove those
4885		 * devices and proceed with the remaining.
4886		 */
4887		if (dev->reg_state == NETREG_UNINITIALIZED) {
4888			pr_debug("unregister_netdevice: device %s/%p never "
4889				 "was registered\n", dev->name, dev);
4890
4891			WARN_ON(1);
4892			list_del(&dev->unreg_list);
4893			continue;
4894		}
4895
4896		BUG_ON(dev->reg_state != NETREG_REGISTERED);
4897
4898		/* If device is running, close it first. */
4899		dev_close(dev);
4900
4901		/* And unlink it from device chain. */
4902		unlist_netdevice(dev);
4903
4904		dev->reg_state = NETREG_UNREGISTERING;
4905	}
4906
4907	synchronize_net();
4908
4909	list_for_each_entry(dev, head, unreg_list) {
4910		/* Shutdown queueing discipline. */
4911		dev_shutdown(dev);
4912
4913
4914		/* Notify protocols, that we are about to destroy
4915		   this device. They should clean all the things.
4916		*/
4917		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4918
4919		if (!dev->rtnl_link_ops ||
4920		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4921			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
4922
4923		/*
4924		 *	Flush the unicast and multicast chains
4925		 */
4926		dev_unicast_flush(dev);
4927		dev_addr_discard(dev);
4928
4929		if (dev->netdev_ops->ndo_uninit)
4930			dev->netdev_ops->ndo_uninit(dev);
4931
4932		/* Notifier chain MUST detach us from master device. */
4933		WARN_ON(dev->master);
4934
4935		/* Remove entries from kobject tree */
4936		netdev_unregister_kobject(dev);
4937	}
4938
4939	/* Process any work delayed until the end of the batch */
4940	dev = list_first_entry(head, struct net_device, unreg_list);
4941	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
4942
4943	synchronize_net();
4944
4945	list_for_each_entry(dev, head, unreg_list)
4946		dev_put(dev);
4947}
4948
4949static void rollback_registered(struct net_device *dev)
4950{
4951	LIST_HEAD(single);
4952
4953	list_add(&dev->unreg_list, &single);
4954	rollback_registered_many(&single);
4955}
4956
4957static void __netdev_init_queue_locks_one(struct net_device *dev,
4958					  struct netdev_queue *dev_queue,
4959					  void *_unused)
4960{
4961	spin_lock_init(&dev_queue->_xmit_lock);
4962	netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4963	dev_queue->xmit_lock_owner = -1;
4964}
4965
/*
 * Initialise the xmit lock of every TX queue of @dev and of its single
 * rx_queue, using __netdev_init_queue_locks_one() for each.
 */
static void netdev_init_queue_locks(struct net_device *dev)
{
	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
	__netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
}
4971
4972unsigned long netdev_fix_features(unsigned long features, const char *name)
4973{
4974	/* Fix illegal SG+CSUM combinations. */
4975	if ((features & NETIF_F_SG) &&
4976	    !(features & NETIF_F_ALL_CSUM)) {
4977		if (name)
4978			printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4979			       "checksum feature.\n", name);
4980		features &= ~NETIF_F_SG;
4981	}
4982
4983	/* TSO requires that SG is present as well. */
4984	if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4985		if (name)
4986			printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4987			       "SG feature.\n", name);
4988		features &= ~NETIF_F_TSO;
4989	}
4990
4991	if (features & NETIF_F_UFO) {
4992		if (!(features & NETIF_F_GEN_CSUM)) {
4993			if (name)
4994				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4995				       "since no NETIF_F_HW_CSUM feature.\n",
4996				       name);
4997			features &= ~NETIF_F_UFO;
4998		}
4999
5000		if (!(features & NETIF_F_SG)) {
5001			if (name)
5002				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
5003				       "since no NETIF_F_SG feature.\n", name);
5004			features &= ~NETIF_F_UFO;
5005		}
5006	}
5007
5008	return features;
5009}
5010EXPORT_SYMBOL(netdev_fix_features);
5011
5012/**
5013 *	netif_stacked_transfer_operstate -	transfer operstate
5014 *	@rootdev: the root or lower level device to transfer state from
5015 *	@dev: the device to transfer operstate to
5016 *
5017 *	Transfer operational state from root to device. This is normally
5018 *	called when a stacking relationship exists between the root
5019 *	device and the device(a leaf device).
5020 */
5021void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5022					struct net_device *dev)
5023{
5024	if (rootdev->operstate == IF_OPER_DORMANT)
5025		netif_dormant_on(dev);
5026	else
5027		netif_dormant_off(dev);
5028
5029	if (netif_carrier_ok(rootdev)) {
5030		if (!netif_carrier_ok(dev))
5031			netif_carrier_on(dev);
5032	} else {
5033		if (netif_carrier_ok(dev))
5034			netif_carrier_off(dev);
5035	}
5036}
5037EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5038
5039/**
5040 *	register_netdevice	- register a network device
5041 *	@dev: device to register
5042 *
5043 *	Take a completed network device structure and add it to the kernel
5044 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5045 *	chain. 0 is returned on success. A negative errno code is returned
5046 *	on a failure to set up the device, or if the name is a duplicate.
5047 *
5048 *	Callers must hold the rtnl semaphore. You may want
5049 *	register_netdev() instead of this.
5050 *
5051 *	BUGS:
5052 *	The locking appears insufficient to guarantee two parallel registers
5053 *	will not get the same name.
5054 */
5055
5056int register_netdevice(struct net_device *dev)
5057{
5058	int ret;
5059	struct net *net = dev_net(dev);
5060
5061	BUG_ON(dev_boot_phase);
5062	ASSERT_RTNL();
5063
5064	might_sleep();
5065
5066	/* When net_device's are persistent, this will be fatal. */
5067	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5068	BUG_ON(!net);
5069
5070	spin_lock_init(&dev->addr_list_lock);
5071	netdev_set_addr_lockdep_class(dev);
5072	netdev_init_queue_locks(dev);
5073
5074	dev->iflink = -1;
5075
5076	/* Init, if this function is available */
5077	if (dev->netdev_ops->ndo_init) {
5078		ret = dev->netdev_ops->ndo_init(dev);
5079		if (ret) {
5080			if (ret > 0)
5081				ret = -EIO;
5082			goto out;
5083		}
5084	}
5085
5086	ret = dev_get_valid_name(net, dev->name, dev->name, 0);
5087	if (ret)
5088		goto err_uninit;
5089
5090	dev->ifindex = dev_new_index(net);
5091	if (dev->iflink == -1)
5092		dev->iflink = dev->ifindex;
5093
5094	/* Fix illegal checksum combinations */
5095	if ((dev->features & NETIF_F_HW_CSUM) &&
5096	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5097		printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
5098		       dev->name);
5099		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5100	}
5101
5102	if ((dev->features & NETIF_F_NO_CSUM) &&
5103	    (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5104		printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
5105		       dev->name);
5106		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5107	}
5108
5109	dev->features = netdev_fix_features(dev->features, dev->name);
5110
5111	/* Enable software GSO if SG is supported. */
5112	if (dev->features & NETIF_F_SG)
5113		dev->features |= NETIF_F_GSO;
5114
5115	netdev_initialize_kobject(dev);
5116
5117	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5118	ret = notifier_to_errno(ret);
5119	if (ret)
5120		goto err_uninit;
5121
5122	ret = netdev_register_kobject(dev);
5123	if (ret)
5124		goto err_uninit;
5125	dev->reg_state = NETREG_REGISTERED;
5126
5127	/*
5128	 *	Default initial state at registry is that the
5129	 *	device is present.
5130	 */
5131
5132	set_bit(__LINK_STATE_PRESENT, &dev->state);
5133
5134	dev_init_scheduler(dev);
5135	dev_hold(dev);
5136	list_netdevice(dev);
5137
5138	/* Notify protocols, that a new device appeared. */
5139	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5140	ret = notifier_to_errno(ret);
5141	if (ret) {
5142		rollback_registered(dev);
5143		dev->reg_state = NETREG_UNREGISTERED;
5144	}
5145	/*
5146	 *	Prevent userspace races by waiting until the network
5147	 *	device is fully setup before sending notifications.
5148	 */
5149	if (!dev->rtnl_link_ops ||
5150	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5151		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5152
5153out:
5154	return ret;
5155
5156err_uninit:
5157	if (dev->netdev_ops->ndo_uninit)
5158		dev->netdev_ops->ndo_uninit(dev);
5159	goto out;
5160}
5161EXPORT_SYMBOL(register_netdevice);
5162
5163/**
5164 *	init_dummy_netdev	- init a dummy network device for NAPI
5165 *	@dev: device to init
5166 *
5167 *	This takes a network device structure and initialize the minimum
5168 *	amount of fields so it can be used to schedule NAPI polls without
5169 *	registering a full blown interface. This is to be used by drivers
5170 *	that need to tie several hardware interfaces to a single NAPI
5171 *	poll scheduler due to HW limitations.
5172 */
5173int init_dummy_netdev(struct net_device *dev)
5174{
5175	/* Clear everything. Note we don't initialize spinlocks
5176	 * are they aren't supposed to be taken by any of the
5177	 * NAPI code and this dummy netdev is supposed to be
5178	 * only ever used for NAPI polls
5179	 */
5180	memset(dev, 0, sizeof(struct net_device));
5181
5182	/* make sure we BUG if trying to hit standard
5183	 * register/unregister code path
5184	 */
5185	dev->reg_state = NETREG_DUMMY;
5186
5187	/* initialize the ref count */
5188	atomic_set(&dev->refcnt, 1);
5189
5190	/* NAPI wants this */
5191	INIT_LIST_HEAD(&dev->napi_list);
5192
5193	/* a dummy interface is started by default */
5194	set_bit(__LINK_STATE_PRESENT, &dev->state);
5195	set_bit(__LINK_STATE_START, &dev->state);
5196
5197	return 0;
5198}
5199EXPORT_SYMBOL_GPL(init_dummy_netdev);
5200
5201
5202/**
5203 *	register_netdev	- register a network device
5204 *	@dev: device to register
5205 *
5206 *	Take a completed network device structure and add it to the kernel
5207 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5208 *	chain. 0 is returned on success. A negative errno code is returned
5209 *	on a failure to set up the device, or if the name is a duplicate.
5210 *
5211 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
5212 *	and expands the device name if you passed a format string to
5213 *	alloc_netdev.
5214 */
5215int register_netdev(struct net_device *dev)
5216{
5217	int err;
5218
5219	rtnl_lock();
5220
5221	/*
5222	 * If the name is a format string the caller wants us to do a
5223	 * name allocation.
5224	 */
5225	if (strchr(dev->name, '%')) {
5226		err = dev_alloc_name(dev, dev->name);
5227		if (err < 0)
5228			goto out;
5229	}
5230
5231	err = register_netdevice(dev);
5232out:
5233	rtnl_unlock();
5234	return err;
5235}
5236EXPORT_SYMBOL(register_netdev);
5237
/*
 * netdev_wait_allrefs - wait until all references are gone.
 *
 * This is called when unregistering network devices.
 *
 * Any protocol or device that holds a reference should register
 * for netdevice notification, and cleanup and put back the
 * reference if they receive an UNREGISTER event.
 * We can get stuck here if buggy protocols don't correctly
 * call dev_put.
 *
 * The function polls dev->refcnt, sleeping 250 ms per iteration. Once
 * more than a second has passed since the last rebroadcast it re-sends
 * NETDEV_UNREGISTER under the RTNL, and once more than ten seconds have
 * passed since the last warning it logs the remaining usage count.
 */
static void netdev_wait_allrefs(struct net_device *dev)
{
	unsigned long rebroadcast_time, warning_time;

	linkwatch_forget_dev(dev);

	rebroadcast_time = warning_time = jiffies;
	while (atomic_read(&dev->refcnt) != 0) {
		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
			rtnl_lock();

			/* Rebroadcast unregister notification */
			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
			 * should have already handled it the first time */

			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
				     &dev->state)) {
				/* We must not have linkwatch events
				 * pending on unregister. If this
				 * happens, we simply run the queue
				 * unscheduled, resulting in a noop
				 * for this device.
				 */
				linkwatch_run_queue();
			}

			/* Plain unlock: the todo list is not processed here. */
			__rtnl_unlock();

			rebroadcast_time = jiffies;
		}

		msleep(250);

		if (time_after(jiffies, warning_time + 10 * HZ)) {
			printk(KERN_EMERG "unregister_netdevice: "
			       "waiting for %s to become free. Usage "
			       "count = %d\n",
			       dev->name, atomic_read(&dev->refcnt));
			warning_time = jiffies;
		}
	}
}
5292
/* The sequence is:
 *
 *	rtnl_lock();
 *	...
 *	register_netdevice(x1);
 *	register_netdevice(x2);
 *	...
 *	unregister_netdevice(y1);
 *	unregister_netdevice(y2);
 *      ...
 *	rtnl_unlock();
 *	free_netdev(y1);
 *	free_netdev(y2);
 *
 * We are invoked by rtnl_unlock().
 * This allows us to deal with problems:
 * 1) We can delete sysfs objects which invoke hotplug
 *    without deadlocking with linkwatch via keventd.
 * 2) Since we run with the RTNL semaphore not held, we can sleep
 *    safely in order to wait for the netdev refcnt to drop to zero.
 *
 * We must not return until all unregister events added during
 * the interval the lock was held have been completed.
 *
 * For every queued device in state NETREG_UNREGISTERING this marks the
 * device NETREG_UNREGISTERED, flushes the per-cpu backlogs, waits for
 * the reference count to reach zero, runs the device destructor (if
 * any) and drops the kobject reference. Entries in any other state are
 * reported and skipped.
 */
void netdev_run_todo(void)
{
	struct list_head list;

	/* Snapshot list, allow later requests */
	list_replace_init(&net_todo_list, &list);

	/* Release the RTNL before doing the work that may sleep. */
	__rtnl_unlock();

	while (!list_empty(&list)) {
		struct net_device *dev
			= list_first_entry(&list, struct net_device, todo_list);
		list_del(&dev->todo_list);

		/* Only devices that went through rollback belong here. */
		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
			printk(KERN_ERR "network todo '%s' but state %d\n",
			       dev->name, dev->reg_state);
			dump_stack();
			continue;
		}

		dev->reg_state = NETREG_UNREGISTERED;

		/* Run flush_backlog() for this device on every CPU. */
		on_each_cpu(flush_backlog, dev, 1);

		netdev_wait_allrefs(dev);

		/* paranoia */
		BUG_ON(atomic_read(&dev->refcnt));
		WARN_ON(dev->ip_ptr);
		WARN_ON(dev->ip6_ptr);
		WARN_ON(dev->dn_ptr);

		if (dev->destructor)
			dev->destructor(dev);

		/* Free network device */
		kobject_put(&dev->dev.kobj);
	}
}
5357
5358/**
5359 *	dev_txq_stats_fold - fold tx_queues stats
5360 *	@dev: device to get statistics from
5361 *	@stats: struct net_device_stats to hold results
5362 */
5363void dev_txq_stats_fold(const struct net_device *dev,
5364			struct net_device_stats *stats)
5365{
5366	unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5367	unsigned int i;
5368	struct netdev_queue *txq;
5369
5370	for (i = 0; i < dev->num_tx_queues; i++) {
5371		txq = netdev_get_tx_queue(dev, i);
5372		tx_bytes   += txq->tx_bytes;
5373		tx_packets += txq->tx_packets;
5374		tx_dropped += txq->tx_dropped;
5375	}
5376	if (tx_bytes || tx_packets || tx_dropped) {
5377		stats->tx_bytes   = tx_bytes;
5378		stats->tx_packets = tx_packets;
5379		stats->tx_dropped = tx_dropped;
5380	}
5381}
5382EXPORT_SYMBOL(dev_txq_stats_fold);
5383
5384/**
5385 *	dev_get_stats	- get network device statistics
5386 *	@dev: device to get statistics from
5387 *
5388 *	Get network statistics from device. The device driver may provide
5389 *	its own method by setting dev->netdev_ops->get_stats; otherwise
5390 *	the internal statistics structure is used.
5391 */
5392const struct net_device_stats *dev_get_stats(struct net_device *dev)
5393{
5394	const struct net_device_ops *ops = dev->netdev_ops;
5395
5396	if (ops->ndo_get_stats)
5397		return ops->ndo_get_stats(dev);
5398
5399	dev_txq_stats_fold(dev, &dev->stats);
5400	return &dev->stats;
5401}
5402EXPORT_SYMBOL(dev_get_stats);
5403
/*
 * Point @queue back at the device that owns it. Has the callback
 * signature used with netdev_for_each_tx_queue(); the last argument is
 * not used.
 */
static void netdev_init_one_queue(struct net_device *dev,
				  struct netdev_queue *queue,
				  void *_unused)
{
	queue->dev = dev;
}
5410
/*
 * Set the owner of the rx_queue and of every TX queue of @dev, and
 * initialise the device's tx_global_lock.
 */
static void netdev_init_queues(struct net_device *dev)
{
	netdev_init_one_queue(dev, &dev->rx_queue, NULL);
	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
	spin_lock_init(&dev->tx_global_lock);
}
5417
5418/**
5419 *	alloc_netdev_mq - allocate network device
5420 *	@sizeof_priv:	size of private data to allocate space for
5421 *	@name:		device name format string
5422 *	@setup:		callback to initialize device
5423 *	@queue_count:	the number of subqueues to allocate
5424 *
5425 *	Allocates a struct net_device with private data area for driver use
5426 *	and performs basic initialization.  Also allocates subquue structs
5427 *	for each queue on the device at the end of the netdevice.
5428 */
5429struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5430		void (*setup)(struct net_device *), unsigned int queue_count)
5431{
5432	struct netdev_queue *tx;
5433	struct net_device *dev;
5434	size_t alloc_size;
5435	struct net_device *p;
5436
5437	BUG_ON(strlen(name) >= sizeof(dev->name));
5438
5439	alloc_size = sizeof(struct net_device);
5440	if (sizeof_priv) {
5441		/* ensure 32-byte alignment of private area */
5442		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5443		alloc_size += sizeof_priv;
5444	}
5445	/* ensure 32-byte alignment of whole construct */
5446	alloc_size += NETDEV_ALIGN - 1;
5447
5448	p = kzalloc(alloc_size, GFP_KERNEL);
5449	if (!p) {
5450		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5451		return NULL;
5452	}
5453
5454	tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
5455	if (!tx) {
5456		printk(KERN_ERR "alloc_netdev: Unable to allocate "
5457		       "tx qdiscs.\n");
5458		goto free_p;
5459	}
5460
5461	dev = PTR_ALIGN(p, NETDEV_ALIGN);
5462	dev->padded = (char *)dev - (char *)p;
5463
5464	if (dev_addr_init(dev))
5465		goto free_tx;
5466
5467	dev_unicast_init(dev);
5468
5469	dev_net_set(dev, &init_net);
5470
5471	dev->_tx = tx;
5472	dev->num_tx_queues = queue_count;
5473	dev->real_num_tx_queues = queue_count;
5474
5475	dev->gso_max_size = GSO_MAX_SIZE;
5476
5477	netdev_init_queues(dev);
5478
5479	INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5480	dev->ethtool_ntuple_list.count = 0;
5481	INIT_LIST_HEAD(&dev->napi_list);
5482	INIT_LIST_HEAD(&dev->unreg_list);
5483	INIT_LIST_HEAD(&dev->link_watch_list);
5484	dev->priv_flags = IFF_XMIT_DST_RELEASE;
5485	setup(dev);
5486	strcpy(dev->name, name);
5487	return dev;
5488
5489free_tx:
5490	kfree(tx);
5491
5492free_p:
5493	kfree(p);
5494	return NULL;
5495}
5496EXPORT_SYMBOL(alloc_netdev_mq);
5497
5498/**
5499 *	free_netdev - free network device
5500 *	@dev: device
5501 *
5502 *	This function does the last stage of destroying an allocated device
5503 * 	interface. The reference to the device object is released.
5504 *	If this is the last reference then it will be freed.
5505 */
5506void free_netdev(struct net_device *dev)
5507{
5508	struct napi_struct *p, *n;
5509
5510	release_net(dev_net(dev));
5511
5512	kfree(dev->_tx);
5513
5514	/* Flush device addresses */
5515	dev_addr_flush(dev);
5516
5517	/* Clear ethtool n-tuple list */
5518	ethtool_ntuple_flush(dev);
5519
5520	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5521		netif_napi_del(p);
5522
5523	/*  Compatibility with error handling in drivers */
5524	if (dev->reg_state == NETREG_UNINITIALIZED) {
5525		kfree((char *)dev - dev->padded);
5526		return;
5527	}
5528
5529	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5530	dev->reg_state = NETREG_RELEASED;
5531
5532	/* will free via device release */
5533	put_device(&dev->dev);
5534}
5535EXPORT_SYMBOL(free_netdev);
5536
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	Implemented as synchronize_rcu(); annotated with might_sleep(),
 *	so it must only be called from a context that may sleep.
 */
void synchronize_net(void)
{
	might_sleep();
	synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
5549
5550/**
5551 *	unregister_netdevice_queue - remove device from the kernel
5552 *	@dev: device
5553 *	@head: list
5554 *
5555 *	This function shuts down a device interface and removes it
5556 *	from the kernel tables.
5557 *	If head not NULL, device is queued to be unregistered later.
5558 *
5559 *	Callers must hold the rtnl semaphore.  You may want
5560 *	unregister_netdev() instead of this.
5561 */
5562
5563void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5564{
5565	ASSERT_RTNL();
5566
5567	if (head) {
5568		list_move_tail(&dev->unreg_list, head);
5569	} else {
5570		rollback_registered(dev);
5571		/* Finish processing unregister after unlock */
5572		net_set_todo(dev);
5573	}
5574}
5575EXPORT_SYMBOL(unregister_netdevice_queue);
5576
5577/**
5578 *	unregister_netdevice_many - unregister many devices
5579 *	@head: list of devices
5580 */
5581void unregister_netdevice_many(struct list_head *head)
5582{
5583	struct net_device *dev;
5584
5585	if (!list_empty(head)) {
5586		rollback_registered_many(head);
5587		list_for_each_entry(dev, head, unreg_list)
5588			net_set_todo(dev);
5589	}
5590}
5591EXPORT_SYMBOL(unregister_netdevice_many);
5592
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shuts down a device interface and removes it from the kernel
 *	tables.
 *
 *	This is a wrapper around unregister_netdevice() that takes the
 *	rtnl semaphore for the duration of the call.  In general this is
 *	the function to use, rather than unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
5611
5612/**
5613 *	dev_change_net_namespace - move device to different nethost namespace
5614 *	@dev: device
5615 *	@net: network namespace
5616 *	@pat: If not NULL name pattern to try if the current device name
5617 *	      is already taken in the destination network namespace.
5618 *
5619 *	This function shuts down a device interface and moves it
5620 *	to a new network namespace. On success 0 is returned, on
5621 *	a failure a netagive errno code is returned.
5622 *
5623 *	Callers must hold the rtnl semaphore.
5624 */
5625
5626int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5627{
5628	int err;
5629
5630	ASSERT_RTNL();
5631
5632	/* Don't allow namespace local devices to be moved. */
5633	err = -EINVAL;
5634	if (dev->features & NETIF_F_NETNS_LOCAL)
5635		goto out;
5636
5637#ifdef CONFIG_SYSFS
5638	/* Don't allow real devices to be moved when sysfs
5639	 * is enabled.
5640	 */
5641	err = -EINVAL;
5642	if (dev->dev.parent)
5643		goto out;
5644#endif
5645
5646	/* Ensure the device has been registrered */
5647	err = -EINVAL;
5648	if (dev->reg_state != NETREG_REGISTERED)
5649		goto out;
5650
5651	/* Get out if there is nothing todo */
5652	err = 0;
5653	if (net_eq(dev_net(dev), net))
5654		goto out;
5655
5656	/* Pick the destination device name, and ensure
5657	 * we can use it in the destination network namespace.
5658	 */
5659	err = -EEXIST;
5660	if (__dev_get_by_name(net, dev->name)) {
5661		/* We get here if we can't use the current device name */
5662		if (!pat)
5663			goto out;
5664		if (dev_get_valid_name(net, pat, dev->name, 1))
5665			goto out;
5666	}
5667
5668	/*
5669	 * And now a mini version of register_netdevice unregister_netdevice.
5670	 */
5671
5672	/* If device is running close it first. */
5673	dev_close(dev);
5674
5675	/* And unlink it from device chain */
5676	err = -ENODEV;
5677	unlist_netdevice(dev);
5678
5679	synchronize_net();
5680
5681	/* Shutdown queueing discipline. */
5682	dev_shutdown(dev);
5683
5684	/* Notify protocols, that we are about to destroy
5685	   this device. They should clean all the things.
5686	*/
5687	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5688	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5689
5690	/*
5691	 *	Flush the unicast and multicast chains
5692	 */
5693	dev_unicast_flush(dev);
5694	dev_addr_discard(dev);
5695
5696	netdev_unregister_kobject(dev);
5697
5698	/* Actually switch the network namespace */
5699	dev_net_set(dev, net);
5700
5701	/* If there is an ifindex conflict assign a new one */
5702	if (__dev_get_by_index(net, dev->ifindex)) {
5703		int iflink = (dev->iflink == dev->ifindex);
5704		dev->ifindex = dev_new_index(net);
5705		if (iflink)
5706			dev->iflink = dev->ifindex;
5707	}
5708
5709	/* Fixup kobjects */
5710	err = netdev_register_kobject(dev);
5711	WARN_ON(err);
5712
5713	/* Add the device back in the hashes */
5714	list_netdevice(dev);
5715
5716	/* Notify protocols, that a new device appeared. */
5717	call_netdevice_notifiers(NETDEV_REGISTER, dev);
5718
5719	/*
5720	 *	Prevent userspace races by waiting until the network
5721	 *	device is fully setup before sending notifications.
5722	 */
5723	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5724
5725	synchronize_net();
5726	err = 0;
5727out:
5728	return err;
5729}
5730EXPORT_SYMBOL_GPL(dev_change_net_namespace);
5731
5732static int dev_cpu_callback(struct notifier_block *nfb,
5733			    unsigned long action,
5734			    void *ocpu)
5735{
5736	struct sk_buff **list_skb;
5737	struct Qdisc **list_net;
5738	struct sk_buff *skb;
5739	unsigned int cpu, oldcpu = (unsigned long)ocpu;
5740	struct softnet_data *sd, *oldsd;
5741
5742	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5743		return NOTIFY_OK;
5744
5745	local_irq_disable();
5746	cpu = smp_processor_id();
5747	sd = &per_cpu(softnet_data, cpu);
5748	oldsd = &per_cpu(softnet_data, oldcpu);
5749
5750	/* Find end of our completion_queue. */
5751	list_skb = &sd->completion_queue;
5752	while (*list_skb)
5753		list_skb = &(*list_skb)->next;
5754	/* Append completion queue from offline CPU. */
5755	*list_skb = oldsd->completion_queue;
5756	oldsd->completion_queue = NULL;
5757
5758	/* Find end of our output_queue. */
5759	list_net = &sd->output_queue;
5760	while (*list_net)
5761		list_net = &(*list_net)->next_sched;
5762	/* Append output queue from offline CPU. */
5763	*list_net = oldsd->output_queue;
5764	oldsd->output_queue = NULL;
5765
5766	raise_softirq_irqoff(NET_TX_SOFTIRQ);
5767	local_irq_enable();
5768
5769	/* Process offline CPU's input_pkt_queue */
5770	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
5771		netif_rx(skb);
5772
5773	return NOTIFY_OK;
5774}
5775
5776
5777/**
5778 *	netdev_increment_features - increment feature set by one
5779 *	@all: current feature set
5780 *	@one: new feature set
5781 *	@mask: mask feature set
5782 *
5783 *	Computes a new feature set after adding a device with feature set
5784 *	@one to the master device with current feature set @all.  Will not
5785 *	enable anything that is off in @mask. Returns the new feature set.
5786 */
5787unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5788					unsigned long mask)
5789{
5790	/* If device needs checksumming, downgrade to it. */
5791	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5792		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5793	else if (mask & NETIF_F_ALL_CSUM) {
5794		/* If one device supports v4/v6 checksumming, set for all. */
5795		if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5796		    !(all & NETIF_F_GEN_CSUM)) {
5797			all &= ~NETIF_F_ALL_CSUM;
5798			all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5799		}
5800
5801		/* If one device supports hw checksumming, set for all. */
5802		if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5803			all &= ~NETIF_F_ALL_CSUM;
5804			all |= NETIF_F_HW_CSUM;
5805		}
5806	}
5807
5808	one |= NETIF_F_ALL_CSUM;
5809
5810	one |= all & NETIF_F_ONE_FOR_ALL;
5811	all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
5812	all |= one & mask & NETIF_F_ONE_FOR_ALL;
5813
5814	return all;
5815}
5816EXPORT_SYMBOL(netdev_increment_features);
5817
5818static struct hlist_head *netdev_create_hash(void)
5819{
5820	int i;
5821	struct hlist_head *hash;
5822
5823	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5824	if (hash != NULL)
5825		for (i = 0; i < NETDEV_HASHENTRIES; i++)
5826			INIT_HLIST_HEAD(&hash[i]);
5827
5828	return hash;
5829}
5830
5831/* Initialize per network namespace state */
5832static int __net_init netdev_init(struct net *net)
5833{
5834	INIT_LIST_HEAD(&net->dev_base_head);
5835
5836	net->dev_name_head = netdev_create_hash();
5837	if (net->dev_name_head == NULL)
5838		goto err_name;
5839
5840	net->dev_index_head = netdev_create_hash();
5841	if (net->dev_index_head == NULL)
5842		goto err_idx;
5843
5844	return 0;
5845
5846err_idx:
5847	kfree(net->dev_name_head);
5848err_name:
5849	return -ENOMEM;
5850}
5851
5852/**
5853 *	netdev_drivername - network driver for the device
5854 *	@dev: network device
5855 *	@buffer: buffer for resulting name
5856 *	@len: size of buffer
5857 *
5858 *	Determine network driver for device.
5859 */
5860char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5861{
5862	const struct device_driver *driver;
5863	const struct device *parent;
5864
5865	if (len <= 0 || !buffer)
5866		return buffer;
5867	buffer[0] = 0;
5868
5869	parent = dev->dev.parent;
5870
5871	if (!parent)
5872		return buffer;
5873
5874	driver = parent->driver;
5875	if (driver && driver->name)
5876		strlcpy(buffer, driver->name, len);
5877	return buffer;
5878}
5879
5880static void __net_exit netdev_exit(struct net *net)
5881{
5882	kfree(net->dev_name_head);
5883	kfree(net->dev_index_head);
5884}
5885
5886static struct pernet_operations __net_initdata netdev_net_ops = {
5887	.init = netdev_init,
5888	.exit = netdev_exit,
5889};
5890
5891static void __net_exit default_device_exit(struct net *net)
5892{
5893	struct net_device *dev, *aux;
5894	/*
5895	 * Push all migratable network devices back to the
5896	 * initial network namespace
5897	 */
5898	rtnl_lock();
5899	for_each_netdev_safe(net, dev, aux) {
5900		int err;
5901		char fb_name[IFNAMSIZ];
5902
5903		/* Ignore unmoveable devices (i.e. loopback) */
5904		if (dev->features & NETIF_F_NETNS_LOCAL)
5905			continue;
5906
5907		/* Leave virtual devices for the generic cleanup */
5908		if (dev->rtnl_link_ops)
5909			continue;
5910
5911		/* Push remaing network devices to init_net */
5912		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5913		err = dev_change_net_namespace(dev, &init_net, fb_name);
5914		if (err) {
5915			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
5916				__func__, dev->name, err);
5917			BUG();
5918		}
5919	}
5920	rtnl_unlock();
5921}
5922
5923static void __net_exit default_device_exit_batch(struct list_head *net_list)
5924{
5925	/* At exit all network devices most be removed from a network
5926	 * namespace.  Do this in the reverse order of registeration.
5927	 * Do this across as many network namespaces as possible to
5928	 * improve batching efficiency.
5929	 */
5930	struct net_device *dev;
5931	struct net *net;
5932	LIST_HEAD(dev_kill_list);
5933
5934	rtnl_lock();
5935	list_for_each_entry(net, net_list, exit_list) {
5936		for_each_netdev_reverse(net, dev) {
5937			if (dev->rtnl_link_ops)
5938				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
5939			else
5940				unregister_netdevice_queue(dev, &dev_kill_list);
5941		}
5942	}
5943	unregister_netdevice_many(&dev_kill_list);
5944	rtnl_unlock();
5945}
5946
5947static struct pernet_operations __net_initdata default_device_ops = {
5948	.exit = default_device_exit,
5949	.exit_batch = default_device_exit_batch,
5950};
5951
5952/*
5953 *	Initialize the DEV module. At boot time this walks the device list and
5954 *	unhooks any devices that fail to initialise (normally hardware not
5955 *	present) and leaves us with a valid list of present and active devices.
5956 *
5957 */
5958
5959/*
5960 *       This is called single threaded during boot, so no need
5961 *       to take the rtnl semaphore.
5962 */
5963static int __init net_dev_init(void)
5964{
5965	int i, rc = -ENOMEM;
5966
5967	BUG_ON(!dev_boot_phase);
5968
5969	if (dev_proc_init())
5970		goto out;
5971
5972	if (netdev_kobject_init())
5973		goto out;
5974
5975	INIT_LIST_HEAD(&ptype_all);
5976	for (i = 0; i < PTYPE_HASH_SIZE; i++)
5977		INIT_LIST_HEAD(&ptype_base[i]);
5978
5979	if (register_pernet_subsys(&netdev_net_ops))
5980		goto out;
5981
5982	/*
5983	 *	Initialise the packet receive queues.
5984	 */
5985
5986	for_each_possible_cpu(i) {
5987		struct softnet_data *queue;
5988
5989		queue = &per_cpu(softnet_data, i);
5990		skb_queue_head_init(&queue->input_pkt_queue);
5991		queue->completion_queue = NULL;
5992		INIT_LIST_HEAD(&queue->poll_list);
5993
5994		queue->backlog.poll = process_backlog;
5995		queue->backlog.weight = weight_p;
5996		queue->backlog.gro_list = NULL;
5997		queue->backlog.gro_count = 0;
5998	}
5999
6000	dev_boot_phase = 0;
6001
6002	/* The loopback device is special if any other network devices
6003	 * is present in a network namespace the loopback device must
6004	 * be present. Since we now dynamically allocate and free the
6005	 * loopback device ensure this invariant is maintained by
6006	 * keeping the loopback device as the first device on the
6007	 * list of network devices.  Ensuring the loopback devices
6008	 * is the first device that appears and the last network device
6009	 * that disappears.
6010	 */
6011	if (register_pernet_device(&loopback_net_ops))
6012		goto out;
6013
6014	if (register_pernet_device(&default_device_ops))
6015		goto out;
6016
6017	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6018	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6019
6020	hotcpu_notifier(dev_cpu_callback, 0);
6021	dst_init();
6022	dev_mcast_init();
6023	rc = 0;
6024out:
6025	return rc;
6026}
6027
6028subsys_initcall(net_dev_init);
6029
6030static int __init initialize_hashrnd(void)
6031{
6032	get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd));
6033	return 0;
6034}
6035
6036late_initcall_sync(initialize_hashrnd);
6037