net/core/dev.c at v3.2-rc1 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / core / dev.c
at v3.2-rc1 166 kB view raw
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <asm/system.h>
  77#include <linux/bitops.h>
  78#include <linux/capability.h>
  79#include <linux/cpu.h>
  80#include <linux/types.h>
  81#include <linux/kernel.h>
  82#include <linux/hash.h>
  83#include <linux/slab.h>
  84#include <linux/sched.h>
  85#include <linux/mutex.h>
  86#include <linux/string.h>
  87#include <linux/mm.h>
  88#include <linux/socket.h>
  89#include <linux/sockios.h>
  90#include <linux/errno.h>
  91#include <linux/interrupt.h>
  92#include <linux/if_ether.h>
  93#include <linux/netdevice.h>
  94#include <linux/etherdevice.h>
  95#include <linux/ethtool.h>
  96#include <linux/notifier.h>
  97#include <linux/skbuff.h>
  98#include <net/net_namespace.h>
  99#include <net/sock.h>
 100#include <linux/rtnetlink.h>
 101#include <linux/proc_fs.h>
 102#include <linux/seq_file.h>
 103#include <linux/stat.h>
 104#include <net/dst.h>
 105#include <net/pkt_sched.h>
 106#include <net/checksum.h>
 107#include <net/xfrm.h>
 108#include <linux/highmem.h>
 109#include <linux/init.h>
 110#include <linux/kmod.h>
 111#include <linux/module.h>
 112#include <linux/netpoll.h>
 113#include <linux/rcupdate.h>
 114#include <linux/delay.h>
 115#include <net/wext.h>
 116#include <net/iw_handler.h>
 117#include <asm/current.h>
 118#include <linux/audit.h>
 119#include <linux/dmaengine.h>
 120#include <linux/err.h>
 121#include <linux/ctype.h>
 122#include <linux/if_arp.h>
 123#include <linux/if_vlan.h>
 124#include <linux/ip.h>
 125#include <net/ip.h>
 126#include <linux/ipv6.h>
 127#include <linux/in.h>
 128#include <linux/jhash.h>
 129#include <linux/random.h>
 130#include <trace/events/napi.h>
 131#include <trace/events/net.h>
 132#include <trace/events/skb.h>
 133#include <linux/pci.h>
 134#include <linux/inetdevice.h>
 135#include <linux/cpu_rmap.h>
 136#include <linux/if_tunnel.h>
 137#include <linux/if_pppox.h>
 138#include <linux/ppp_defs.h>
 139#include <linux/net_tstamp.h>
 140
 141#include "net-sysfs.h"
 142
 143/* Instead of increasing this, you should create a hash table. */
 144#define MAX_GRO_SKBS 8
 145
 146/* This should be increased if a protocol with a bigger head is added. */
 147#define GRO_MAX_HEAD (MAX_HEADER + 128)
 148
 149/*
 150 *	The list of packet types we will receive (as opposed to discard)
 151 *	and the routines to invoke.
 152 *
 *	Why 16? Because with 16 buckets the only overlap we get when hashing
 *	on the low nibble of the protocol value is RARP/SNAP/X.25.
 155 *
 156 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 157 *             sure which should go first, but I bet it won't make much
 158 *             difference if we are running VLANs.  The good news is that
 159 *             this protocol won't be in the list unless compiled in, so
 160 *             the average user (w/out VLANs) will not be adversely affected.
 161 *             --BLG
 162 *
 163 *		0800	IP
 164 *		8100    802.1Q VLAN
 165 *		0001	802.3
 166 *		0002	AX.25
 167 *		0004	802.2
 168 *		8035	RARP
 169 *		0005	SNAP
 170 *		0805	X.25
 171 *		0806	ARP
 172 *		8137	IPX
 173 *		0009	Localtalk
 174 *		86DD	IPv6
 175 */
 176
 177#define PTYPE_HASH_SIZE	(16)
 178#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
 179
 180static DEFINE_SPINLOCK(ptype_lock);
 181static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 182static struct list_head ptype_all __read_mostly;	/* Taps */
 183
 184/*
 185 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 186 * semaphore.
 187 *
 188 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 189 *
 190 * Writers must hold the rtnl semaphore while they loop through the
 191 * dev_base_head list, and hold dev_base_lock for writing when they do the
 192 * actual updates.  This allows pure readers to access the list even
 193 * while a writer is preparing to update it.
 194 *
 195 * To put it another way, dev_base_lock is held for writing only to
 196 * protect against pure readers; the rtnl semaphore provides the
 197 * protection against other writers.
 198 *
 199 * See, for example usages, register_netdevice() and
 200 * unregister_netdevice(), which must be called with the rtnl
 201 * semaphore held.
 202 */
 203DEFINE_RWLOCK(dev_base_lock);
 204EXPORT_SYMBOL(dev_base_lock);
 205
 206static inline void dev_base_seq_inc(struct net *net)
 207{
 208	while (++net->dev_base_seq == 0);
 209}
 210
 211static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 212{
 213	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 214	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 215}
 216
 217static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 218{
 219	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 220}
 221
 222static inline void rps_lock(struct softnet_data *sd)
 223{
 224#ifdef CONFIG_RPS
 225	spin_lock(&sd->input_pkt_queue.lock);
 226#endif
 227}
 228
 229static inline void rps_unlock(struct softnet_data *sd)
 230{
 231#ifdef CONFIG_RPS
 232	spin_unlock(&sd->input_pkt_queue.lock);
 233#endif
 234}
 235
 236/* Device list insertion */
 237static int list_netdevice(struct net_device *dev)
 238{
 239	struct net *net = dev_net(dev);
 240
 241	ASSERT_RTNL();
 242
 243	write_lock_bh(&dev_base_lock);
 244	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 245	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 246	hlist_add_head_rcu(&dev->index_hlist,
 247			   dev_index_hash(net, dev->ifindex));
 248	write_unlock_bh(&dev_base_lock);
 249
 250	dev_base_seq_inc(net);
 251
 252	return 0;
 253}
 254
 255/* Device list removal
 256 * caller must respect a RCU grace period before freeing/reusing dev
 257 */
 258static void unlist_netdevice(struct net_device *dev)
 259{
 260	ASSERT_RTNL();
 261
 262	/* Unlink dev from the device chain */
 263	write_lock_bh(&dev_base_lock);
 264	list_del_rcu(&dev->dev_list);
 265	hlist_del_rcu(&dev->name_hlist);
 266	hlist_del_rcu(&dev->index_hlist);
 267	write_unlock_bh(&dev_base_lock);
 268
 269	dev_base_seq_inc(dev_net(dev));
 270}
 271
 272/*
 273 *	Our notifier list
 274 */
 275
 276static RAW_NOTIFIER_HEAD(netdev_chain);
 277
 278/*
 279 *	Device drivers call our routines to queue packets here. We empty the
 280 *	queue in the local softnet handler.
 281 */
 282
 283DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 284EXPORT_PER_CPU_SYMBOL(softnet_data);
 285
 286#ifdef CONFIG_LOCKDEP
 287/*
 288 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 289 * according to dev->type
 290 */
 291static const unsigned short netdev_lock_type[] =
 292	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 293	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 294	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 295	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 296	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 297	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 298	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 299	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 300	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 301	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 302	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 303	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 304	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 305	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 306	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
 307	 ARPHRD_VOID, ARPHRD_NONE};
 308
 309static const char *const netdev_lock_name[] =
 310	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 311	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 312	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 313	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 314	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 315	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 316	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 317	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 318	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 319	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 320	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 321	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 322	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 323	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 324	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
 325	 "_xmit_VOID", "_xmit_NONE"};
 326
 327static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 328static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 329
 330static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 331{
 332	int i;
 333
 334	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 335		if (netdev_lock_type[i] == dev_type)
 336			return i;
 337	/* the last key is used by default */
 338	return ARRAY_SIZE(netdev_lock_type) - 1;
 339}
 340
 341static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 342						 unsigned short dev_type)
 343{
 344	int i;
 345
 346	i = netdev_lock_pos(dev_type);
 347	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 348				   netdev_lock_name[i]);
 349}
 350
 351static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 352{
 353	int i;
 354
 355	i = netdev_lock_pos(dev->type);
 356	lockdep_set_class_and_name(&dev->addr_list_lock,
 357				   &netdev_addr_lock_key[i],
 358				   netdev_lock_name[i]);
 359}
 360#else
 361static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 362						 unsigned short dev_type)
 363{
 364}
 365static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 366{
 367}
 368#endif
 369
 370/*******************************************************************************
 371
 372		Protocol management and registration routines
 373
 374*******************************************************************************/
 375
 376/*
 377 *	Add a protocol ID to the list. Now that the input handler is
 378 *	smarter we can dispense with all the messy stuff that used to be
 379 *	here.
 380 *
 *	BEWARE!!! Protocol handlers that mangle input packets
 *	MUST BE last in their hash buckets, and the walk over protocol
 *	handlers MUST start from the promiscuous ptype_all chain in net_bh.
 *	This is true now, do not change it.
 *	Explanation: if a protocol handler that mangles the packet were
 *	first on the list, it could not sense that the packet is cloned
 *	and should be copied-on-write, so it would change it in place and
 *	subsequent readers would get a broken packet.
 *							--ANK (980803)
 390 */
 391
 392static inline struct list_head *ptype_head(const struct packet_type *pt)
 393{
 394	if (pt->type == htons(ETH_P_ALL))
 395		return &ptype_all;
 396	else
 397		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 398}
 399
 400/**
 401 *	dev_add_pack - add packet handler
 402 *	@pt: packet type declaration
 403 *
 404 *	Add a protocol handler to the networking stack. The passed &packet_type
 405 *	is linked into kernel lists and may not be freed until it has been
 406 *	removed from the kernel lists.
 407 *
 408 *	This call does not sleep therefore it can not
 409 *	guarantee all CPU's that are in middle of receiving packets
 410 *	will see the new packet type (until the next received packet).
 411 */
 412
 413void dev_add_pack(struct packet_type *pt)
 414{
 415	struct list_head *head = ptype_head(pt);
 416
 417	spin_lock(&ptype_lock);
 418	list_add_rcu(&pt->list, head);
 419	spin_unlock(&ptype_lock);
 420}
 421EXPORT_SYMBOL(dev_add_pack);
 422
 423/**
 424 *	__dev_remove_pack	 - remove packet handler
 425 *	@pt: packet type declaration
 426 *
 427 *	Remove a protocol handler that was previously added to the kernel
 428 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 429 *	from the kernel lists and can be freed or reused once this function
 430 *	returns.
 431 *
 432 *      The packet type might still be in use by receivers
 433 *	and must not be freed until after all the CPU's have gone
 434 *	through a quiescent state.
 435 */
 436void __dev_remove_pack(struct packet_type *pt)
 437{
 438	struct list_head *head = ptype_head(pt);
 439	struct packet_type *pt1;
 440
 441	spin_lock(&ptype_lock);
 442
 443	list_for_each_entry(pt1, head, list) {
 444		if (pt == pt1) {
 445			list_del_rcu(&pt->list);
 446			goto out;
 447		}
 448	}
 449
 450	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 451out:
 452	spin_unlock(&ptype_lock);
 453}
 454EXPORT_SYMBOL(__dev_remove_pack);
 455
 456/**
 457 *	dev_remove_pack	 - remove packet handler
 458 *	@pt: packet type declaration
 459 *
 460 *	Remove a protocol handler that was previously added to the kernel
 461 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 462 *	from the kernel lists and can be freed or reused once this function
 463 *	returns.
 464 *
 465 *	This call sleeps to guarantee that no CPU is looking at the packet
 466 *	type after return.
 467 */
 468void dev_remove_pack(struct packet_type *pt)
 469{
 470	__dev_remove_pack(pt);
 471
 472	synchronize_net();
 473}
 474EXPORT_SYMBOL(dev_remove_pack);
 475
 476/******************************************************************************
 477
 478		      Device Boot-time Settings Routines
 479
 480*******************************************************************************/
 481
 482/* Boot time configuration table */
 483static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 484
 485/**
 486 *	netdev_boot_setup_add	- add new setup entry
 487 *	@name: name of the device
 488 *	@map: configured settings for the device
 489 *
 490 *	Adds new setup entry to the dev_boot_setup list.  The function
 491 *	returns 0 on error and 1 on success.  This is a generic routine to
 492 *	all netdevices.
 493 */
 494static int netdev_boot_setup_add(char *name, struct ifmap *map)
 495{
 496	struct netdev_boot_setup *s;
 497	int i;
 498
 499	s = dev_boot_setup;
 500	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 501		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 502			memset(s[i].name, 0, sizeof(s[i].name));
 503			strlcpy(s[i].name, name, IFNAMSIZ);
 504			memcpy(&s[i].map, map, sizeof(s[i].map));
 505			break;
 506		}
 507	}
 508
 509	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 510}
 511
 512/**
 513 *	netdev_boot_setup_check	- check boot time settings
 514 *	@dev: the netdevice
 515 *
 516 * 	Check boot time settings for the device.
 517 *	The found settings are set for the device to be used
 518 *	later in the device probing.
 519 *	Returns 0 if no settings found, 1 if they are.
 520 */
 521int netdev_boot_setup_check(struct net_device *dev)
 522{
 523	struct netdev_boot_setup *s = dev_boot_setup;
 524	int i;
 525
 526	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 527		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 528		    !strcmp(dev->name, s[i].name)) {
 529			dev->irq 	= s[i].map.irq;
 530			dev->base_addr 	= s[i].map.base_addr;
 531			dev->mem_start 	= s[i].map.mem_start;
 532			dev->mem_end 	= s[i].map.mem_end;
 533			return 1;
 534		}
 535	}
 536	return 0;
 537}
 538EXPORT_SYMBOL(netdev_boot_setup_check);
 539
 540
 541/**
 542 *	netdev_boot_base	- get address from boot time settings
 543 *	@prefix: prefix for network device
 544 *	@unit: id for network device
 545 *
 546 * 	Check boot time settings for the base address of device.
 547 *	The found settings are set for the device to be used
 548 *	later in the device probing.
 549 *	Returns 0 if no settings found.
 550 */
 551unsigned long netdev_boot_base(const char *prefix, int unit)
 552{
 553	const struct netdev_boot_setup *s = dev_boot_setup;
 554	char name[IFNAMSIZ];
 555	int i;
 556
 557	sprintf(name, "%s%d", prefix, unit);
 558
 559	/*
 560	 * If device already registered then return base of 1
 561	 * to indicate not to probe for this interface
 562	 */
 563	if (__dev_get_by_name(&init_net, name))
 564		return 1;
 565
 566	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 567		if (!strcmp(name, s[i].name))
 568			return s[i].map.base_addr;
 569	return 0;
 570}
 571
 572/*
 573 * Saves at boot time configured settings for any netdevice.
 574 */
 575int __init netdev_boot_setup(char *str)
 576{
 577	int ints[5];
 578	struct ifmap map;
 579
 580	str = get_options(str, ARRAY_SIZE(ints), ints);
 581	if (!str || !*str)
 582		return 0;
 583
 584	/* Save settings */
 585	memset(&map, 0, sizeof(map));
 586	if (ints[0] > 0)
 587		map.irq = ints[1];
 588	if (ints[0] > 1)
 589		map.base_addr = ints[2];
 590	if (ints[0] > 2)
 591		map.mem_start = ints[3];
 592	if (ints[0] > 3)
 593		map.mem_end = ints[4];
 594
 595	/* Add new entry to the list */
 596	return netdev_boot_setup_add(str, &map);
 597}
 598
 599__setup("netdev=", netdev_boot_setup);
 600
 601/*******************************************************************************
 602
 603			    Device Interface Subroutines
 604
 605*******************************************************************************/
 606
 607/**
 608 *	__dev_get_by_name	- find a device by its name
 609 *	@net: the applicable net namespace
 610 *	@name: name to find
 611 *
 612 *	Find an interface by name. Must be called under RTNL semaphore
 613 *	or @dev_base_lock. If the name is found a pointer to the device
 614 *	is returned. If the name is not found then %NULL is returned. The
 615 *	reference counters are not incremented so the caller must be
 616 *	careful with locks.
 617 */
 618
 619struct net_device *__dev_get_by_name(struct net *net, const char *name)
 620{
 621	struct hlist_node *p;
 622	struct net_device *dev;
 623	struct hlist_head *head = dev_name_hash(net, name);
 624
 625	hlist_for_each_entry(dev, p, head, name_hlist)
 626		if (!strncmp(dev->name, name, IFNAMSIZ))
 627			return dev;
 628
 629	return NULL;
 630}
 631EXPORT_SYMBOL(__dev_get_by_name);
 632
 633/**
 634 *	dev_get_by_name_rcu	- find a device by its name
 635 *	@net: the applicable net namespace
 636 *	@name: name to find
 637 *
 638 *	Find an interface by name.
 639 *	If the name is found a pointer to the device is returned.
 640 * 	If the name is not found then %NULL is returned.
 641 *	The reference counters are not incremented so the caller must be
 642 *	careful with locks. The caller must hold RCU lock.
 643 */
 644
 645struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 646{
 647	struct hlist_node *p;
 648	struct net_device *dev;
 649	struct hlist_head *head = dev_name_hash(net, name);
 650
 651	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 652		if (!strncmp(dev->name, name, IFNAMSIZ))
 653			return dev;
 654
 655	return NULL;
 656}
 657EXPORT_SYMBOL(dev_get_by_name_rcu);
 658
 659/**
 660 *	dev_get_by_name		- find a device by its name
 661 *	@net: the applicable net namespace
 662 *	@name: name to find
 663 *
 664 *	Find an interface by name. This can be called from any
 665 *	context and does its own locking. The returned handle has
 666 *	the usage count incremented and the caller must use dev_put() to
 667 *	release it when it is no longer needed. %NULL is returned if no
 668 *	matching device is found.
 669 */
 670
 671struct net_device *dev_get_by_name(struct net *net, const char *name)
 672{
 673	struct net_device *dev;
 674
 675	rcu_read_lock();
 676	dev = dev_get_by_name_rcu(net, name);
 677	if (dev)
 678		dev_hold(dev);
 679	rcu_read_unlock();
 680	return dev;
 681}
 682EXPORT_SYMBOL(dev_get_by_name);
 683
 684/**
 685 *	__dev_get_by_index - find a device by its ifindex
 686 *	@net: the applicable net namespace
 687 *	@ifindex: index of device
 688 *
 689 *	Search for an interface by index. Returns %NULL if the device
 690 *	is not found or a pointer to the device. The device has not
 691 *	had its reference counter increased so the caller must be careful
 692 *	about locking. The caller must hold either the RTNL semaphore
 693 *	or @dev_base_lock.
 694 */
 695
 696struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 697{
 698	struct hlist_node *p;
 699	struct net_device *dev;
 700	struct hlist_head *head = dev_index_hash(net, ifindex);
 701
 702	hlist_for_each_entry(dev, p, head, index_hlist)
 703		if (dev->ifindex == ifindex)
 704			return dev;
 705
 706	return NULL;
 707}
 708EXPORT_SYMBOL(__dev_get_by_index);
 709
 710/**
 711 *	dev_get_by_index_rcu - find a device by its ifindex
 712 *	@net: the applicable net namespace
 713 *	@ifindex: index of device
 714 *
 715 *	Search for an interface by index. Returns %NULL if the device
 716 *	is not found or a pointer to the device. The device has not
 717 *	had its reference counter increased so the caller must be careful
 718 *	about locking. The caller must hold RCU lock.
 719 */
 720
 721struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 722{
 723	struct hlist_node *p;
 724	struct net_device *dev;
 725	struct hlist_head *head = dev_index_hash(net, ifindex);
 726
 727	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 728		if (dev->ifindex == ifindex)
 729			return dev;
 730
 731	return NULL;
 732}
 733EXPORT_SYMBOL(dev_get_by_index_rcu);
 734
 735
 736/**
 737 *	dev_get_by_index - find a device by its ifindex
 738 *	@net: the applicable net namespace
 739 *	@ifindex: index of device
 740 *
 741 *	Search for an interface by index. Returns NULL if the device
 742 *	is not found or a pointer to the device. The device returned has
 743 *	had a reference added and the pointer is safe until the user calls
 744 *	dev_put to indicate they have finished with it.
 745 */
 746
 747struct net_device *dev_get_by_index(struct net *net, int ifindex)
 748{
 749	struct net_device *dev;
 750
 751	rcu_read_lock();
 752	dev = dev_get_by_index_rcu(net, ifindex);
 753	if (dev)
 754		dev_hold(dev);
 755	rcu_read_unlock();
 756	return dev;
 757}
 758EXPORT_SYMBOL(dev_get_by_index);
 759
 760/**
 761 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 762 *	@net: the applicable net namespace
 763 *	@type: media type of device
 764 *	@ha: hardware address
 765 *
 766 *	Search for an interface by MAC address. Returns NULL if the device
 767 *	is not found or a pointer to the device.
 768 *	The caller must hold RCU or RTNL.
 769 *	The returned device has not had its ref count increased
 770 *	and the caller must therefore be careful about locking
 771 *
 772 */
 773
 774struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 775				       const char *ha)
 776{
 777	struct net_device *dev;
 778
 779	for_each_netdev_rcu(net, dev)
 780		if (dev->type == type &&
 781		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 782			return dev;
 783
 784	return NULL;
 785}
 786EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 787
 788struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 789{
 790	struct net_device *dev;
 791
 792	ASSERT_RTNL();
 793	for_each_netdev(net, dev)
 794		if (dev->type == type)
 795			return dev;
 796
 797	return NULL;
 798}
 799EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 800
 801struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 802{
 803	struct net_device *dev, *ret = NULL;
 804
 805	rcu_read_lock();
 806	for_each_netdev_rcu(net, dev)
 807		if (dev->type == type) {
 808			dev_hold(dev);
 809			ret = dev;
 810			break;
 811		}
 812	rcu_read_unlock();
 813	return ret;
 814}
 815EXPORT_SYMBOL(dev_getfirstbyhwtype);
 816
 817/**
 818 *	dev_get_by_flags_rcu - find any device with given flags
 819 *	@net: the applicable net namespace
 820 *	@if_flags: IFF_* values
 821 *	@mask: bitmask of bits in if_flags to check
 822 *
 823 *	Search for any interface with the given flags. Returns NULL if a device
 824 *	is not found or a pointer to the device. Must be called inside
 825 *	rcu_read_lock(), and result refcount is unchanged.
 826 */
 827
 828struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 829				    unsigned short mask)
 830{
 831	struct net_device *dev, *ret;
 832
 833	ret = NULL;
 834	for_each_netdev_rcu(net, dev) {
 835		if (((dev->flags ^ if_flags) & mask) == 0) {
 836			ret = dev;
 837			break;
 838		}
 839	}
 840	return ret;
 841}
 842EXPORT_SYMBOL(dev_get_by_flags_rcu);
 843
 844/**
 845 *	dev_valid_name - check if name is okay for network device
 846 *	@name: name string
 847 *
 848 *	Network device names need to be valid file names to
 849 *	to allow sysfs to work.  We also disallow any kind of
 850 *	whitespace.
 851 */
 852int dev_valid_name(const char *name)
 853{
 854	if (*name == '\0')
 855		return 0;
 856	if (strlen(name) >= IFNAMSIZ)
 857		return 0;
 858	if (!strcmp(name, ".") || !strcmp(name, ".."))
 859		return 0;
 860
 861	while (*name) {
 862		if (*name == '/' || isspace(*name))
 863			return 0;
 864		name++;
 865	}
 866	return 1;
 867}
 868EXPORT_SYMBOL(dev_valid_name);
 869
 870/**
 871 *	__dev_alloc_name - allocate a name for a device
 872 *	@net: network namespace to allocate the device name in
 873 *	@name: name format string
 874 *	@buf:  scratch buffer and result name string
 875 *
 876 *	Passed a format string - eg "lt%d" it will try and find a suitable
 877 *	id. It scans list of devices to build up a free map, then chooses
 878 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 879 *	while allocating the name and adding the device in order to avoid
 880 *	duplicates.
 881 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 882 *	Returns the number of the unit assigned or a negative errno code.
 883 */
 884
 885static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 886{
 887	int i = 0;
 888	const char *p;
 889	const int max_netdevices = 8*PAGE_SIZE;
 890	unsigned long *inuse;
 891	struct net_device *d;
 892
 893	p = strnchr(name, IFNAMSIZ-1, '%');
 894	if (p) {
 895		/*
 896		 * Verify the string as this thing may have come from
 897		 * the user.  There must be either one "%d" and no other "%"
 898		 * characters.
 899		 */
 900		if (p[1] != 'd' || strchr(p + 2, '%'))
 901			return -EINVAL;
 902
 903		/* Use one page as a bit array of possible slots */
 904		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 905		if (!inuse)
 906			return -ENOMEM;
 907
 908		for_each_netdev(net, d) {
 909			if (!sscanf(d->name, name, &i))
 910				continue;
 911			if (i < 0 || i >= max_netdevices)
 912				continue;
 913
 914			/*  avoid cases where sscanf is not exact inverse of printf */
 915			snprintf(buf, IFNAMSIZ, name, i);
 916			if (!strncmp(buf, d->name, IFNAMSIZ))
 917				set_bit(i, inuse);
 918		}
 919
 920		i = find_first_zero_bit(inuse, max_netdevices);
 921		free_page((unsigned long) inuse);
 922	}
 923
 924	if (buf != name)
 925		snprintf(buf, IFNAMSIZ, name, i);
 926	if (!__dev_get_by_name(net, buf))
 927		return i;
 928
 929	/* It is possible to run out of possible slots
 930	 * when the name is long and there isn't enough space left
 931	 * for the digits, or if all bits are used.
 932	 */
 933	return -ENFILE;
 934}
 935
 936/**
 937 *	dev_alloc_name - allocate a name for a device
 938 *	@dev: device
 939 *	@name: name format string
 940 *
 941 *	Passed a format string - eg "lt%d" it will try and find a suitable
 942 *	id. It scans list of devices to build up a free map, then chooses
 943 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 944 *	while allocating the name and adding the device in order to avoid
 945 *	duplicates.
 946 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 947 *	Returns the number of the unit assigned or a negative errno code.
 948 */
 949
 950int dev_alloc_name(struct net_device *dev, const char *name)
 951{
 952	char buf[IFNAMSIZ];
 953	struct net *net;
 954	int ret;
 955
 956	BUG_ON(!dev_net(dev));
 957	net = dev_net(dev);
 958	ret = __dev_alloc_name(net, name, buf);
 959	if (ret >= 0)
 960		strlcpy(dev->name, buf, IFNAMSIZ);
 961	return ret;
 962}
 963EXPORT_SYMBOL(dev_alloc_name);
 964
 965static int dev_get_valid_name(struct net_device *dev, const char *name)
 966{
 967	struct net *net;
 968
 969	BUG_ON(!dev_net(dev));
 970	net = dev_net(dev);
 971
 972	if (!dev_valid_name(name))
 973		return -EINVAL;
 974
 975	if (strchr(name, '%'))
 976		return dev_alloc_name(dev, name);
 977	else if (__dev_get_by_name(net, name))
 978		return -EEXIST;
 979	else if (dev->name != name)
 980		strlcpy(dev->name, name, IFNAMSIZ);
 981
 982	return 0;
 983}
 984
 985/**
 986 *	dev_change_name - change name of a device
 987 *	@dev: device
 988 *	@newname: name (or format string) must be at least IFNAMSIZ
 989 *
 *	Change name of a device. A format string such as "eth%d" can be
 *	passed for wildcarding.
 992 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	/* Renaming is refused while the interface is up. */
	if (dev->flags & IFF_UP)
		return -EBUSY;

	/* Same name requested: nothing to do. */
	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
		return 0;

	/* Keep the current name so it can be restored if a later step fails. */
	memcpy(oldname, dev->name, IFNAMSIZ);

	/* Validates newname and, on success, stores the result in dev->name. */
	err = dev_get_valid_name(dev, newname);
	if (err < 0)
		return err;

	/*
	 * This label is reached a second time, with dev->name restored from
	 * oldname and err < 0, when a notifier rejects the change below.
	 */
rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		return ret;
	}

	/*
	 * Move the device to the hash chain for its current name: unhash,
	 * wait for an RCU grace period, then hash it again.
	 */
	write_lock_bh(&dev_base_lock);
	hlist_del_rcu(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			/* First failure: record it and redo the steps with the old name. */
			err = ret;
			memcpy(dev->name, oldname, IFNAMSIZ);
			goto rollback;
		} else {
			/* The rollback itself was rejected: only report it. */
			printk(KERN_ERR
			       "%s: name change rollback failed: %d.\n",
			       dev->name, ret);
		}
	}

	return err;
}
1051
1052/**
1053 *	dev_set_alias - change ifalias of a device
1054 *	@dev: device
1055 *	@alias: name up to IFALIASZ
1056 *	@len: limit of bytes to copy from info
1057 *
1058 *	Set ifalias for a device,
1059 */
1060int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1061{
1062	ASSERT_RTNL();
1063
1064	if (len >= IFALIASZ)
1065		return -EINVAL;
1066
1067	if (!len) {
1068		if (dev->ifalias) {
1069			kfree(dev->ifalias);
1070			dev->ifalias = NULL;
1071		}
1072		return 0;
1073	}
1074
1075	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1076	if (!dev->ifalias)
1077		return -ENOMEM;
1078
1079	strlcpy(dev->ifalias, alias, len+1);
1080	return len;
1081}
1082
1083
1084/**
1085 *	netdev_features_change - device changes features
1086 *	@dev: device to cause notification
1087 *
1088 *	Called to indicate a device has changed features.
1089 */
void netdev_features_change(struct net_device *dev)
{
	/* Send NETDEV_FEAT_CHANGE for @dev down the netdev notifier chain. */
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);
1095
1096/**
1097 *	netdev_state_change - device changes state
1098 *	@dev: device to cause notification
1099 *
1100 *	Called to indicate a device has changed state. This function calls
1101 *	the notifier chains for netdev_chain and sends a NEWLINK message
1102 *	to the routing socket.
1103 */
1104void netdev_state_change(struct net_device *dev)
1105{
1106	if (dev->flags & IFF_UP) {
1107		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1108		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1109	}
1110}
1111EXPORT_SYMBOL(netdev_state_change);
1112
/**
 *	netdev_bonding_change - send an event down the netdev notifier chain
 *	@dev: device the event refers to
 *	@event: notifier event value
 *
 *	Returns whatever call_netdevice_notifiers() returns for @event.
 */
int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	return call_netdevice_notifiers(event, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);
1118
1119/**
1120 *	dev_load 	- load a network module
1121 *	@net: the applicable net namespace
1122 *	@name: name of interface
1123 *
1124 *	If a network interface is not present and the process has suitable
1125 *	privileges this function loads the module. If module loading is not
1126 *	available in this kernel then it becomes a nop.
1127 */
1128
1129void dev_load(struct net *net, const char *name)
1130{
1131	struct net_device *dev;
1132	int no_module;
1133
1134	rcu_read_lock();
1135	dev = dev_get_by_name_rcu(net, name);
1136	rcu_read_unlock();
1137
1138	no_module = !dev;
1139	if (no_module && capable(CAP_NET_ADMIN))
1140		no_module = request_module("netdev-%s", name);
1141	if (no_module && capable(CAP_SYS_MODULE)) {
1142		if (!request_module("%s", name))
1143			pr_err("Loading kernel module for a network device "
1144"with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
1145"instead\n", name);
1146	}
1147}
1148EXPORT_SYMBOL(dev_load);
1149
1150static int __dev_open(struct net_device *dev)
1151{
1152	const struct net_device_ops *ops = dev->netdev_ops;
1153	int ret;
1154
1155	ASSERT_RTNL();
1156
1157	if (!netif_device_present(dev))
1158		return -ENODEV;
1159
1160	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1161	ret = notifier_to_errno(ret);
1162	if (ret)
1163		return ret;
1164
1165	set_bit(__LINK_STATE_START, &dev->state);
1166
1167	if (ops->ndo_validate_addr)
1168		ret = ops->ndo_validate_addr(dev);
1169
1170	if (!ret && ops->ndo_open)
1171		ret = ops->ndo_open(dev);
1172
1173	if (ret)
1174		clear_bit(__LINK_STATE_START, &dev->state);
1175	else {
1176		dev->flags |= IFF_UP;
1177		net_dmaengine_get();
1178		dev_set_rx_mode(dev);
1179		dev_activate(dev);
1180	}
1181
1182	return ret;
1183}
1184
1185/**
1186 *	dev_open	- prepare an interface for use.
1187 *	@dev:	device to open
1188 *
1189 *	Takes a device from down to up state. The device's private open
1190 *	function is invoked and then the multicast lists are loaded. Finally
1191 *	the device is moved into the up state and a %NETDEV_UP message is
1192 *	sent to the netdev notifier chain.
1193 *
1194 *	Calling this function on an active interface is a nop. On a failure
1195 *	a negative errno code is returned.
1196 */
1197int dev_open(struct net_device *dev)
1198{
1199	int ret;
1200
1201	if (dev->flags & IFF_UP)
1202		return 0;
1203
1204	ret = __dev_open(dev);
1205	if (ret < 0)
1206		return ret;
1207
1208	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1209	call_netdevice_notifiers(NETDEV_UP, dev);
1210
1211	return ret;
1212}
1213EXPORT_SYMBOL(dev_open);
1214
/*
 * Take every device linked on @head (through unreg_list) down, in three
 * passes: send NETDEV_GOING_DOWN and clear __LINK_STATE_START, deactivate
 * all of them with dev_deactivate_many(), then call each driver's
 * ndo_stop and clear IFF_UP.  NETDEV_DOWN is not sent here.
 * Always returns 0.
 */
static int __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	ASSERT_RTNL();
	might_sleep();

	list_for_each_entry(dev, head, unreg_list) {
		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * can be even on different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of its
		 * napi_struct instances on this device.
		 */
		smp_mb__after_clear_bit(); /* Commit netif_running(). */
	}

	dev_deactivate_many(head);

	list_for_each_entry(dev, head, unreg_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 *	Call the device specific close. This cannot fail.
		 *	Only if device is UP
		 *
		 *	We allow it to be called even after a DETACH hot-plug
		 *	event.
		 */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		dev->flags &= ~IFF_UP;
		/* Pairs with the net_dmaengine_get() done in __dev_open(). */
		net_dmaengine_put();
	}

	return 0;
}
1257
/*
 * Close a single device by wrapping it in a temporary one-entry list and
 * handing that list to __dev_close_many().  Returns its result.
 */
static int __dev_close(struct net_device *dev)
{
	int retval;
	LIST_HEAD(single);

	list_add(&dev->unreg_list, &single);
	retval = __dev_close_many(&single);
	/* Unlink the on-stack list head before it goes out of scope. */
	list_del(&single);
	return retval;
}
1268
1269static int dev_close_many(struct list_head *head)
1270{
1271	struct net_device *dev, *tmp;
1272	LIST_HEAD(tmp_list);
1273
1274	list_for_each_entry_safe(dev, tmp, head, unreg_list)
1275		if (!(dev->flags & IFF_UP))
1276			list_move(&dev->unreg_list, &tmp_list);
1277
1278	__dev_close_many(head);
1279
1280	list_for_each_entry(dev, head, unreg_list) {
1281		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1282		call_netdevice_notifiers(NETDEV_DOWN, dev);
1283	}
1284
1285	/* rollback_registered_many needs the complete original list */
1286	list_splice(&tmp_list, head);
1287	return 0;
1288}
1289
1290/**
1291 *	dev_close - shutdown an interface.
1292 *	@dev: device to shutdown
1293 *
1294 *	This function moves an active device into down state. A
1295 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1296 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1297 *	chain.
1298 */
1299int dev_close(struct net_device *dev)
1300{
1301	if (dev->flags & IFF_UP) {
1302		LIST_HEAD(single);
1303
1304		list_add(&dev->unreg_list, &single);
1305		dev_close_many(&single);
1306		list_del(&single);
1307	}
1308	return 0;
1309}
1310EXPORT_SYMBOL(dev_close);
1311
1312
1313/**
1314 *	dev_disable_lro - disable Large Receive Offload on a device
1315 *	@dev: device
1316 *
1317 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1318 *	called under RTNL.  This is needed if received packets may be
1319 *	forwarded to another interface.
1320 */
1321void dev_disable_lro(struct net_device *dev)
1322{
1323	u32 flags;
1324
1325	/*
1326	 * If we're trying to disable lro on a vlan device
1327	 * use the underlying physical device instead
1328	 */
1329	if (is_vlan_dev(dev))
1330		dev = vlan_dev_real_dev(dev);
1331
1332	if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
1333		flags = dev->ethtool_ops->get_flags(dev);
1334	else
1335		flags = ethtool_op_get_flags(dev);
1336
1337	if (!(flags & ETH_FLAG_LRO))
1338		return;
1339
1340	__ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
1341	if (unlikely(dev->features & NETIF_F_LRO))
1342		netdev_WARN(dev, "failed to disable LRO!\n");
1343}
1344EXPORT_SYMBOL(dev_disable_lro);
1345
1346
/*
 * While non-zero, register_netdevice_notifier() does not replay
 * REGISTER/UP events to a newly added notifier.
 * NOTE(review): where this is cleared is outside this part of the file.
 */
static int dev_boot_phase = 1;
1348
1349/**
1350 *	register_netdevice_notifier - register a network notifier block
1351 *	@nb: notifier
1352 *
1353 *	Register a notifier to be called when network device events occur.
1354 *	The notifier passed is linked into the kernel structures and must
1355 *	not be reused until it has been unregistered. A negative errno code
1356 *	is returned on a failure.
1357 *
1358 * 	When registered all registration and up events are replayed
1359 *	to the new notifier to allow device to have a race free
1360 *	view of the network device list.
1361 */
1362
1363int register_netdevice_notifier(struct notifier_block *nb)
1364{
1365	struct net_device *dev;
1366	struct net_device *last;
1367	struct net *net;
1368	int err;
1369
1370	rtnl_lock();
1371	err = raw_notifier_chain_register(&netdev_chain, nb);
1372	if (err)
1373		goto unlock;
1374	if (dev_boot_phase)
1375		goto unlock;
1376	for_each_net(net) {
1377		for_each_netdev(net, dev) {
1378			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1379			err = notifier_to_errno(err);
1380			if (err)
1381				goto rollback;
1382
1383			if (!(dev->flags & IFF_UP))
1384				continue;
1385
1386			nb->notifier_call(nb, NETDEV_UP, dev);
1387		}
1388	}
1389
1390unlock:
1391	rtnl_unlock();
1392	return err;
1393
1394rollback:
1395	last = dev;
1396	for_each_net(net) {
1397		for_each_netdev(net, dev) {
1398			if (dev == last)
1399				break;
1400
1401			if (dev->flags & IFF_UP) {
1402				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1403				nb->notifier_call(nb, NETDEV_DOWN, dev);
1404			}
1405			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1406			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1407		}
1408	}
1409
1410	raw_notifier_chain_unregister(&netdev_chain, nb);
1411	goto unlock;
1412}
1413EXPORT_SYMBOL(register_netdevice_notifier);
1414
1415/**
1416 *	unregister_netdevice_notifier - unregister a network notifier block
1417 *	@nb: notifier
1418 *
1419 *	Unregister a notifier previously registered by
 *	register_netdevice_notifier(). The notifier is unlinked from the
1421 *	kernel structures and may then be reused. A negative errno code
1422 *	is returned on a failure.
1423 */
1424
int unregister_netdevice_notifier(struct notifier_block *nb)
{
	int err;

	/* The chain is only modified with the RTNL lock held. */
	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);
1435
1436/**
1437 *	call_netdevice_notifiers - call all network notifier blocks
1438 *      @val: value passed unmodified to notifier function
1439 *      @dev: net_device pointer passed unmodified to notifier function
1440 *
1441 *	Call all network notifier blocks.  Parameters and return value
1442 *	are as for raw_notifier_call_chain().
1443 */
1444
int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	/* Callers must hold the RTNL lock. */
	ASSERT_RTNL();
	return raw_notifier_call_chain(&netdev_chain, val, dev);
}
EXPORT_SYMBOL(call_netdevice_notifiers);
1451
/*
 * When > 0 there are consumers of rx skb time stamps.  Incremented by
 * net_enable_timestamp() and decremented by net_disable_timestamp().
 */
static atomic_t netstamp_needed = ATOMIC_INIT(0);
1454
/* Register one more consumer of rx skb time stamps. */
void net_enable_timestamp(void)
{
	atomic_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);
1460
/* Drop one consumer of rx skb time stamps. */
void net_disable_timestamp(void)
{
	atomic_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);
1466
1467static inline void net_timestamp_set(struct sk_buff *skb)
1468{
1469	if (atomic_read(&netstamp_needed))
1470		__net_timestamp(skb);
1471	else
1472		skb->tstamp.tv64 = 0;
1473}
1474
1475static inline void net_timestamp_check(struct sk_buff *skb)
1476{
1477	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1478		__net_timestamp(skb);
1479}
1480
1481static int net_hwtstamp_validate(struct ifreq *ifr)
1482{
1483	struct hwtstamp_config cfg;
1484	enum hwtstamp_tx_types tx_type;
1485	enum hwtstamp_rx_filters rx_filter;
1486	int tx_type_valid = 0;
1487	int rx_filter_valid = 0;
1488
1489	if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1490		return -EFAULT;
1491
1492	if (cfg.flags) /* reserved for future extensions */
1493		return -EINVAL;
1494
1495	tx_type = cfg.tx_type;
1496	rx_filter = cfg.rx_filter;
1497
1498	switch (tx_type) {
1499	case HWTSTAMP_TX_OFF:
1500	case HWTSTAMP_TX_ON:
1501	case HWTSTAMP_TX_ONESTEP_SYNC:
1502		tx_type_valid = 1;
1503		break;
1504	}
1505
1506	switch (rx_filter) {
1507	case HWTSTAMP_FILTER_NONE:
1508	case HWTSTAMP_FILTER_ALL:
1509	case HWTSTAMP_FILTER_SOME:
1510	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1511	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1512	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1513	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1514	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1515	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1516	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1517	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1518	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1519	case HWTSTAMP_FILTER_PTP_V2_EVENT:
1520	case HWTSTAMP_FILTER_PTP_V2_SYNC:
1521	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1522		rx_filter_valid = 1;
1523		break;
1524	}
1525
1526	if (!tx_type_valid || !rx_filter_valid)
1527		return -ERANGE;
1528
1529	return 0;
1530}
1531
1532static inline bool is_skb_forwardable(struct net_device *dev,
1533				      struct sk_buff *skb)
1534{
1535	unsigned int len;
1536
1537	if (!(dev->flags & IFF_UP))
1538		return false;
1539
1540	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1541	if (skb->len <= len)
1542		return true;
1543
1544	/* if TSO is enabled, we don't care about the length as the packet
1545	 * could be forwarded without being segmented before
1546	 */
1547	if (skb_is_gso(skb))
1548		return true;
1549
1550	return false;
1551}
1552
1553/**
1554 * dev_forward_skb - loopback an skb to another netif
1555 *
1556 * @dev: destination network device
1557 * @skb: buffer to forward
1558 *
1559 * return values:
1560 *	NET_RX_SUCCESS	(no congestion)
1561 *	NET_RX_DROP     (packet was dropped, but freed)
1562 *
1563 * dev_forward_skb can be used for injecting an skb from the
1564 * start_xmit function of one device into the receive queue
1565 * of another device.
1566 *
1567 * The receiving device may be in another namespace, so
1568 * we have to clear all information in the skb that could
1569 * impact namespace isolation.
1570 */
1571int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1572{
1573	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1574		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1575			atomic_long_inc(&dev->rx_dropped);
1576			kfree_skb(skb);
1577			return NET_RX_DROP;
1578		}
1579	}
1580
1581	skb_orphan(skb);
1582	nf_reset(skb);
1583
1584	if (unlikely(!is_skb_forwardable(dev, skb))) {
1585		atomic_long_inc(&dev->rx_dropped);
1586		kfree_skb(skb);
1587		return NET_RX_DROP;
1588	}
1589	skb_set_dev(skb, dev);
1590	skb->tstamp.tv64 = 0;
1591	skb->pkt_type = PACKET_HOST;
1592	skb->protocol = eth_type_trans(skb, dev);
1593	return netif_rx(skb);
1594}
1595EXPORT_SYMBOL_GPL(dev_forward_skb);
1596
/*
 * Bump skb->users, then pass @skb to the handler of @pt_prev together
 * with skb->dev and @orig_dev.  Returns the handler's result.
 */
static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	atomic_inc(&skb->users);
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
1604
1605/*
1606 *	Support routine. Sends outgoing frames to any network
1607 *	taps currently in use.
1608 */
1609
/*
 * Walk ptype_all under rcu_read_lock() and give every matching tap a
 * look at @skb.  A single clone (skb2) is made lazily at the first
 * match; each matching tap except the last receives it through
 * deliver_skb(), and the last one gets it via a direct ->func() call
 * after the loop.
 */
static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;
	struct sk_buff *skb2 = NULL;
	struct packet_type *pt_prev = NULL;

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if ((ptype->dev == dev || !ptype->dev) &&
		    (ptype->af_packet_priv == NULL ||
		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
			/* Clone already exists: deliver to the previous tap. */
			if (pt_prev) {
				deliver_skb(skb2, pt_prev, skb->dev);
				pt_prev = ptype;
				continue;
			}

			/* First match: pt_prev is still NULL, so a failed
			 * clone means nothing gets delivered.
			 */
			skb2 = skb_clone(skb, GFP_ATOMIC);
			if (!skb2)
				break;

			net_timestamp_set(skb2);

			/* skb->nh should be correctly
			   set by sender, so that the second statement is
			   just protection against buggy protocols.
			 */
			skb_reset_mac_header(skb2);

			if (skb_network_header(skb2) < skb2->data ||
			    skb2->network_header > skb2->tail) {
				if (net_ratelimit())
					printk(KERN_CRIT "protocol %04x is "
					       "buggy, dev %s\n",
					       ntohs(skb2->protocol),
					       dev->name);
				skb_reset_network_header(skb2);
			}

			skb2->transport_header = skb2->network_header;
			skb2->pkt_type = PACKET_OUTGOING;
			pt_prev = ptype;
		}
	}
	/* The last matching tap is called without the extra users bump. */
	if (pt_prev)
		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
	rcu_read_unlock();
}
1661
1662/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1663 * @dev: Network device
1664 * @txq: number of queues available
1665 *
1666 * If real_num_tx_queues is changed the tc mappings may no longer be
1667 * valid. To resolve this verify the tc mapping remains valid and if
1668 * not NULL the mapping. With no priorities mapping to this
 * offset/count pair it will no longer be used. In the worst case TC0
 * is invalid and nothing can be done, so priority mappings are disabled.
 * It is expected that drivers will fix this mapping if they can before
1672 * calling netif_set_real_num_tx_queues.
1673 */
1674static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1675{
1676	int i;
1677	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1678
1679	/* If TC0 is invalidated disable TC mapping */
1680	if (tc->offset + tc->count > txq) {
1681		pr_warning("Number of in use tx queues changed "
1682			   "invalidating tc mappings. Priority "
1683			   "traffic classification disabled!\n");
1684		dev->num_tc = 0;
1685		return;
1686	}
1687
1688	/* Invalidated prio to tc mappings set to TC0 */
1689	for (i = 1; i < TC_BITMASK + 1; i++) {
1690		int q = netdev_get_prio_tc_map(dev, i);
1691
1692		tc = &dev->tc_to_txq[q];
1693		if (tc->offset + tc->count > txq) {
1694			pr_warning("Number of in use tx queues "
1695				   "changed. Priority %i to tc "
1696				   "mapping %i is no longer valid "
1697				   "setting map to 0\n",
1698				   i, q);
1699			netdev_set_prio_tc_map(dev, i, 0);
1700		}
1701	}
1702}
1703
1704/*
 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
1707 */
1708int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1709{
1710	int rc;
1711
1712	if (txq < 1 || txq > dev->num_tx_queues)
1713		return -EINVAL;
1714
1715	if (dev->reg_state == NETREG_REGISTERED ||
1716	    dev->reg_state == NETREG_UNREGISTERING) {
1717		ASSERT_RTNL();
1718
1719		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1720						  txq);
1721		if (rc)
1722			return rc;
1723
1724		if (dev->num_tc)
1725			netif_setup_tc(dev, txq);
1726
1727		if (txq < dev->real_num_tx_queues)
1728			qdisc_reset_all_tx_gt(dev, txq);
1729	}
1730
1731	dev->real_num_tx_queues = txq;
1732	return 0;
1733}
1734EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1735
1736#ifdef CONFIG_RPS
1737/**
1738 *	netif_set_real_num_rx_queues - set actual number of RX queues used
1739 *	@dev: Network device
1740 *	@rxq: Actual number of RX queues
1741 *
1742 *	This must be called either with the rtnl_lock held or before
1743 *	registration of the net device.  Returns 0 on success, or a
1744 *	negative error code.  If called before registration, it always
1745 *	succeeds.
1746 */
1747int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1748{
1749	int rc;
1750
1751	if (rxq < 1 || rxq > dev->num_rx_queues)
1752		return -EINVAL;
1753
1754	if (dev->reg_state == NETREG_REGISTERED) {
1755		ASSERT_RTNL();
1756
1757		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1758						  rxq);
1759		if (rc)
1760			return rc;
1761	}
1762
1763	dev->real_num_rx_queues = rxq;
1764	return 0;
1765}
1766EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1767#endif
1768
/*
 * Append @q to the tail of this CPU's softnet_data output_queue and raise
 * NET_TX_SOFTIRQ.  Runs with local interrupts disabled for the duration.
 */
static inline void __netif_reschedule(struct Qdisc *q)
{
	struct softnet_data *sd;
	unsigned long flags;

	local_irq_save(flags);
	sd = &__get_cpu_var(softnet_data);
	/* @q becomes the new last element of the singly linked queue. */
	q->next_sched = NULL;
	*sd->output_queue_tailp = q;
	sd->output_queue_tailp = &q->next_sched;
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}
1782
1783void __netif_schedule(struct Qdisc *q)
1784{
1785	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1786		__netif_reschedule(q);
1787}
1788EXPORT_SYMBOL(__netif_schedule);
1789
1790void dev_kfree_skb_irq(struct sk_buff *skb)
1791{
1792	if (atomic_dec_and_test(&skb->users)) {
1793		struct softnet_data *sd;
1794		unsigned long flags;
1795
1796		local_irq_save(flags);
1797		sd = &__get_cpu_var(softnet_data);
1798		skb->next = sd->completion_queue;
1799		sd->completion_queue = skb;
1800		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1801		local_irq_restore(flags);
1802	}
1803}
1804EXPORT_SYMBOL(dev_kfree_skb_irq);
1805
/*
 * Free @skb from whatever context the caller is in: use
 * dev_kfree_skb_irq() when in_irq() or irqs_disabled() is true,
 * dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (in_irq() || irqs_disabled())
		dev_kfree_skb_irq(skb);
	else
		dev_kfree_skb(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1814
1815
1816/**
1817 * netif_device_detach - mark device as removed
1818 * @dev: network device
1819 *
1820 * Mark device as removed from system and therefore no longer available.
1821 */
1822void netif_device_detach(struct net_device *dev)
1823{
1824	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1825	    netif_running(dev)) {
1826		netif_tx_stop_all_queues(dev);
1827	}
1828}
1829EXPORT_SYMBOL(netif_device_detach);
1830
1831/**
1832 * netif_device_attach - mark device as attached
1833 * @dev: network device
1834 *
 * Mark device as attached to the system and restart it if needed.
1836 */
1837void netif_device_attach(struct net_device *dev)
1838{
1839	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1840	    netif_running(dev)) {
1841		netif_tx_wake_all_queues(dev);
1842		__netdev_watchdog_up(dev);
1843	}
1844}
1845EXPORT_SYMBOL(netif_device_attach);
1846
1847/**
 * skb_set_dev - assign a new device to a buffer
1849 * @skb: buffer for the new device
1850 * @dev: network device
1851 *
1852 * If an skb is owned by a device already, we have to reset
1853 * all data private to the namespace a device belongs to
1854 * before assigning it a new device.
1855 */
1856#ifdef CONFIG_NET_NS
1857void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1858{
1859	skb_dst_drop(skb);
1860	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1861		secpath_reset(skb);
1862		nf_reset(skb);
1863		skb_init_secmark(skb);
1864		skb->mark = 0;
1865		skb->priority = 0;
1866		skb->nf_trace = 0;
1867		skb->ipvs_property = 0;
1868#ifdef CONFIG_NET_SCHED
1869		skb->tc_index = 0;
1870#endif
1871	}
1872	skb->dev = dev;
1873}
1874EXPORT_SYMBOL(skb_set_dev);
1875#endif /* CONFIG_NET_NS */
1876
1877/*
1878 * Invalidate hardware checksum when packet is to be mangled, and
1879 * complete checksum manually on outgoing path.
1880 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	/* Nothing to compute here: just downgrade to CHECKSUM_NONE. */
	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	if (unlikely(skb_shinfo(skb)->gso_size)) {
		/* Let GSO fix up the checksum. */
		goto out_set_summed;
	}

	/* Checksum everything from the checksum start to the end of the skb. */
	offset = skb_checksum_start_offset(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	/* From here on, offset is where the folded checksum gets stored. */
	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	/* A cloned skb whose checksum field is not writable needs a
	 * private copy of its head before the store below.
	 */
	if (skb_cloned(skb) &&
	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	/* 0 on success, or the pskb_expand_head() error. */
	return ret;
}
EXPORT_SYMBOL(skb_checksum_help);
1915
1916/**
1917 *	skb_gso_segment - Perform segmentation on skb.
1918 *	@skb: buffer to segment
1919 *	@features: features for the output path (see dev->features)
1920 *
1921 *	This function segments the given skb and returns a list of segments.
1922 *
1923 *	It may return NULL if the skb requires no segmentation.  This is
1924 *	only possible when GSO is used for verifying header integrity.
1925 */
struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
{
	/* Returned unchanged if no packet_type below handles the protocol. */
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_type *ptype;
	__be16 type = skb->protocol;
	int vlan_depth = ETH_HLEN;
	int err;

	/* Step through stacked 802.1Q headers to the encapsulated protocol. */
	while (type == htons(ETH_P_8021Q)) {
		struct vlan_hdr *vh;

		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
			return ERR_PTR(-EINVAL);

		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
		type = vh->h_vlan_encapsulated_proto;
		vlan_depth += VLAN_HLEN;
	}

	/* Record the MAC header length and advance skb->data past it. */
	skb_reset_mac_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;
	__skb_pull(skb, skb->mac_len);

	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
		struct net_device *dev = skb->dev;
		struct ethtool_drvinfo info = {};

		/* Fetch the driver name, when available, for the warning. */
		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
			dev->ethtool_ops->get_drvinfo(dev, &info);

		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
		     info.driver, dev ? dev->features : 0L,
		     skb->sk ? skb->sk->sk_route_caps : 0L,
		     skb->len, skb->data_len, skb->ip_summed);

		if (skb_header_cloned(skb) &&
		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
			return ERR_PTR(err);
	}

	rcu_read_lock();
	/* Find the device-independent handler for this protocol that
	 * provides a gso_segment callback; only the first match is used.
	 */
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
				/* On success segs becomes ERR_PTR(0), i.e. NULL. */
				err = ptype->gso_send_check(skb);
				segs = ERR_PTR(err);
				if (err || skb_gso_ok(skb, features))
					break;
				__skb_push(skb, (skb->data -
						 skb_network_header(skb)));
			}
			segs = ptype->gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	/* Move skb->data back to the MAC header. */
	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}
EXPORT_SYMBOL(skb_gso_segment);
1989
1990/* Take action when hardware reception checksum errors are detected. */
1991#ifdef CONFIG_BUG
1992void netdev_rx_csum_fault(struct net_device *dev)
1993{
1994	if (net_ratelimit()) {
1995		printk(KERN_ERR "%s: hw csum failure.\n",
1996			dev ? dev->name : "<unknown>");
1997		dump_stack();
1998	}
1999}
2000EXPORT_SYMBOL(netdev_rx_csum_fault);
2001#endif
2002
2003/* Actually, we should eliminate this check as soon as we know, that:
2004 * 1. IOMMU is present and allows to map all the memory.
2005 * 2. No high memory really exists on this machine.
2006 */
2007
/*
 * Returns 1 if @skb has a page fragment that @dev cannot DMA from,
 * 0 otherwise.  Always 0 when CONFIG_HIGHMEM is not set.
 */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct device *parent;
	int idx;

	/* Without NETIF_F_HIGHDMA any fragment in highmem is illegal. */
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[idx];

			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	if (!PCI_DMA_BUS_IS_PHYS)
		return 0;

	parent = dev->dev.parent;
	if (!parent)
		return 0;

	/* Every fragment page must lie within the parent's DMA mask. */
	for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[idx];
		dma_addr_t addr = page_to_phys(skb_frag_page(frag));

		if (!parent->dma_mask ||
		    addr + PAGE_SIZE - 1 > *parent->dma_mask)
			return 1;
	}
#endif
	return 0;
}
2035
/*
 * Private data kept in skb->cb: holds the skb's original destructor
 * while dev_gso_skb_destructor is installed in its place.
 */
struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

/* View the control buffer of @skb as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2041
/*
 * Destructor installed by dev_gso_segment(): frees every segment still
 * chained on skb->next, then calls the destructor saved in the control
 * buffer, if there is one.  The do/while dereferences skb->next on
 * entry, so at least one chained segment is expected.
 */
static void dev_gso_skb_destructor(struct sk_buff *skb)
{
	struct dev_gso_cb *cb;

	do {
		struct sk_buff *nskb = skb->next;

		/* Unlink the first segment from the chain, then free it. */
		skb->next = nskb->next;
		nskb->next = NULL;
		kfree_skb(nskb);
	} while (skb->next);

	cb = DEV_GSO_CB(skb);
	if (cb->destructor)
		cb->destructor(skb);
}
2058
2059/**
2060 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
2061 *	@skb: buffer to segment
2062 *	@features: device features as applicable to this skb
2063 *
2064 *	This function segments the given skb and stores the list of segments
2065 *	in skb->next.
2066 */
2067static int dev_gso_segment(struct sk_buff *skb, int features)
2068{
2069	struct sk_buff *segs;
2070
2071	segs = skb_gso_segment(skb, features);
2072
2073	/* Verifying header integrity only. */
2074	if (!segs)
2075		return 0;
2076
2077	if (IS_ERR(segs))
2078		return PTR_ERR(segs);
2079
2080	skb->next = segs;
2081	DEV_GSO_CB(skb)->destructor = skb->destructor;
2082	skb->destructor = dev_gso_skb_destructor;
2083
2084	return 0;
2085}
2086
2087/*
2088 * Try to orphan skb early, right before transmission by the device.
2089 * We cannot orphan skb if tx timestamp is requested or the sk-reference
2090 * is needed on driver level for other reasons, e.g. see net/can/raw.c
2091 */
2092static inline void skb_orphan_try(struct sk_buff *skb)
2093{
2094	struct sock *sk = skb->sk;
2095
2096	if (sk && !skb_shinfo(skb)->tx_flags) {
2097		/* skb_tx_hash() wont be able to get sk.
2098		 * We copy sk_hash into skb->rxhash
2099		 */
2100		if (!skb->rxhash)
2101			skb->rxhash = sk->sk_hash;
2102		skb_orphan(skb);
2103	}
2104}
2105
2106static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2107{
2108	return ((features & NETIF_F_GEN_CSUM) ||
2109		((features & NETIF_F_V4_CSUM) &&
2110		 protocol == htons(ETH_P_IP)) ||
2111		((features & NETIF_F_V6_CSUM) &&
2112		 protocol == htons(ETH_P_IPV6)) ||
2113		((features & NETIF_F_FCOE_CRC) &&
2114		 protocol == htons(ETH_P_FCOE)));
2115}
2116
2117static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2118{
2119	if (!can_checksum_protocol(features, protocol)) {
2120		features &= ~NETIF_F_ALL_CSUM;
2121		features &= ~NETIF_F_SG;
2122	} else if (illegal_highdma(skb->dev, skb)) {
2123		features &= ~NETIF_F_SG;
2124	}
2125
2126	return features;
2127}
2128
2129u32 netif_skb_features(struct sk_buff *skb)
2130{
2131	__be16 protocol = skb->protocol;
2132	u32 features = skb->dev->features;
2133
2134	if (protocol == htons(ETH_P_8021Q)) {
2135		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2136		protocol = veh->h_vlan_encapsulated_proto;
2137	} else if (!vlan_tx_tag_present(skb)) {
2138		return harmonize_features(skb, protocol, features);
2139	}
2140
2141	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2142
2143	if (protocol != htons(ETH_P_8021Q)) {
2144		return harmonize_features(skb, protocol, features);
2145	} else {
2146		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2147				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2148		return harmonize_features(skb, protocol, features);
2149	}
2150}
2151EXPORT_SYMBOL(netif_skb_features);
2152
2153/*
2154 * Returns true if either:
2155 *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2156 *	2. skb is fragmented and the device does not support SG, or if
2157 *	   at least one of fragments is in highmem and device does not
2158 *	   support DMA from it.
2159 */
2160static inline int skb_needs_linearize(struct sk_buff *skb,
2161				      int features)
2162{
2163	return skb_is_nonlinear(skb) &&
2164			((skb_has_frag_list(skb) &&
2165				!(features & NETIF_F_FRAGLIST)) ||
2166			(skb_shinfo(skb)->nr_frags &&
2167				!(features & NETIF_F_SG)));
2168}
2169
/*
 * Hand an skb (or a list of GSO segments hanging off skb->next) to the
 * driver's ndo_start_xmit().
 *
 * @skb: packet to send; if skb->next is already set, the pending
 *       segments on that list are sent instead of skb itself
 * @dev: device whose netdev_ops are used
 * @txq: transmit queue whose trans timestamp / stopped state is used
 *
 * Returns the driver's return code, NETDEV_TX_BUSY when the queue was
 * stopped while segments remain, or NETDEV_TX_OK (the initial value of
 * rc) on the local failure paths that free or lose the skb.
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
			struct netdev_queue *txq)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int rc = NETDEV_TX_OK;
	unsigned int skb_len;

	/* No segment list attached yet: prepare and send a single skb. */
	if (likely(!skb->next)) {
		u32 features;

		/*
		 * If device doesn't need skb->dst, release it right now while
		 * its hot in this cpu cache
		 */
		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
			skb_dst_drop(skb);

		/* Give ptype_all listeners a look at the outgoing packet. */
		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(skb, dev);

		skb_orphan_try(skb);

		features = netif_skb_features(skb);

		/* Insert the VLAN tag in software if the device cannot. */
		if (vlan_tx_tag_present(skb) &&
		    !(features & NETIF_F_HW_VLAN_TX)) {
			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
			/* skb is gone here; rc is still NETDEV_TX_OK. */
			if (unlikely(!skb))
				goto out;

			skb->vlan_tci = 0;
		}

		if (netif_needs_gso(skb, features)) {
			if (unlikely(dev_gso_segment(skb, features)))
				goto out_kfree_skb;
			/* Segments were produced: send them one by one. */
			if (skb->next)
				goto gso;
		} else {
			if (skb_needs_linearize(skb, features) &&
			    __skb_linearize(skb))
				goto out_kfree_skb;

			/* If packet is not checksummed and device does not
			 * support checksumming for this protocol, complete
			 * checksumming here.
			 */
			if (skb->ip_summed == CHECKSUM_PARTIAL) {
				skb_set_transport_header(skb,
					skb_checksum_start_offset(skb));
				if (!(features & NETIF_F_ALL_CSUM) &&
				     skb_checksum_help(skb))
					goto out_kfree_skb;
			}
		}

		/* Sample the length first; skb is not read again after xmit. */
		skb_len = skb->len;
		rc = ops->ndo_start_xmit(skb, dev);
		trace_net_dev_xmit(skb, rc, dev, skb_len);
		if (rc == NETDEV_TX_OK)
			txq_trans_update(txq);
		return rc;
	}

gso:
	do {
		/* Pop the first pending segment off the list. */
		struct sk_buff *nskb = skb->next;

		skb->next = nskb->next;
		nskb->next = NULL;

		/*
		 * If device doesn't need nskb->dst, release it right now while
		 * its hot in this cpu cache
		 */
		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
			skb_dst_drop(nskb);

		skb_len = nskb->len;
		rc = ops->ndo_start_xmit(nskb, dev);
		trace_net_dev_xmit(nskb, rc, dev, skb_len);
		if (unlikely(rc != NETDEV_TX_OK)) {
			/* Codes outside NETDEV_TX_MASK: give up and free. */
			if (rc & ~NETDEV_TX_MASK)
				goto out_kfree_gso_skb;
			/* Otherwise push the segment back for a later retry. */
			nskb->next = skb->next;
			skb->next = nskb;
			return rc;
		}
		txq_trans_update(txq);
		/* Queue stopped with segments left: report busy to caller. */
		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
			return NETDEV_TX_BUSY;
	} while (skb->next);

out_kfree_gso_skb:
	/*
	 * With every segment consumed, restore the saved destructor.  If
	 * segments remain, dev_gso_skb_destructor stays installed and
	 * frees them when skb is released below.
	 */
	if (likely(skb->next == NULL))
		skb->destructor = DEV_GSO_CB(skb)->destructor;
out_kfree_skb:
	kfree_skb(skb);
out:
	return rc;
}
2271
/* Seed passed to the jhash-based Tx queue and Rx flow hashes in this file. */
static u32 hashrnd __read_mostly;
2273
2274/*
2275 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2276 * to be used as a distribution range.
2277 */
2278u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2279		  unsigned int num_tx_queues)
2280{
2281	u32 hash;
2282	u16 qoffset = 0;
2283	u16 qcount = num_tx_queues;
2284
2285	if (skb_rx_queue_recorded(skb)) {
2286		hash = skb_get_rx_queue(skb);
2287		while (unlikely(hash >= num_tx_queues))
2288			hash -= num_tx_queues;
2289		return hash;
2290	}
2291
2292	if (dev->num_tc) {
2293		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2294		qoffset = dev->tc_to_txq[tc].offset;
2295		qcount = dev->tc_to_txq[tc].count;
2296	}
2297
2298	if (skb->sk && skb->sk->sk_hash)
2299		hash = skb->sk->sk_hash;
2300	else
2301		hash = (__force u16) skb->protocol ^ skb->rxhash;
2302	hash = jhash_1word(hash, hashrnd);
2303
2304	return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2305}
2306EXPORT_SYMBOL(__skb_tx_hash);
2307
2308static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2309{
2310	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2311		if (net_ratelimit()) {
2312			pr_warning("%s selects TX queue %d, but "
2313				"real number of TX queues is %d\n",
2314				dev->name, queue_index, dev->real_num_tx_queues);
2315		}
2316		return 0;
2317	}
2318	return queue_index;
2319}
2320
/*
 * Look up a Tx queue for @skb in the device's XPS map of the current
 * CPU.  Returns the queue index, or -1 when XPS is not compiled in, no
 * map exists for this CPU, or the mapped queue is not below
 * dev->real_num_tx_queues.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *maps;
	struct xps_map *cpu_map;
	int queue_index = -1;

	rcu_read_lock();

	maps = rcu_dereference(dev->xps_maps);
	if (!maps)
		goto unlock;

	cpu_map = rcu_dereference(maps->cpu_map[raw_smp_processor_id()]);
	if (!cpu_map)
		goto unlock;

	if (cpu_map->len == 1) {
		queue_index = cpu_map->queues[0];
	} else {
		u32 hash;

		/* Several candidates: spread flows over them by hash. */
		if (skb->sk && skb->sk->sk_hash)
			hash = skb->sk->sk_hash;
		else
			hash = (__force u16) skb->protocol ^
			    skb->rxhash;
		hash = jhash_1word(hash, hashrnd);
		queue_index = cpu_map->queues[((u64)hash * cpu_map->len) >> 32];
	}

	if (unlikely(queue_index >= dev->real_num_tx_queues))
		queue_index = -1;

unlock:
	rcu_read_unlock();

	return queue_index;
#else
	return -1;
#endif
}
2358
2359static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2360					struct sk_buff *skb)
2361{
2362	int queue_index;
2363	const struct net_device_ops *ops = dev->netdev_ops;
2364
2365	if (dev->real_num_tx_queues == 1)
2366		queue_index = 0;
2367	else if (ops->ndo_select_queue) {
2368		queue_index = ops->ndo_select_queue(dev, skb);
2369		queue_index = dev_cap_txqueue(dev, queue_index);
2370	} else {
2371		struct sock *sk = skb->sk;
2372		queue_index = sk_tx_queue_get(sk);
2373
2374		if (queue_index < 0 || skb->ooo_okay ||
2375		    queue_index >= dev->real_num_tx_queues) {
2376			int old_index = queue_index;
2377
2378			queue_index = get_xps_queue(dev, skb);
2379			if (queue_index < 0)
2380				queue_index = skb_tx_hash(dev, skb);
2381
2382			if (queue_index != old_index && sk) {
2383				struct dst_entry *dst =
2384				    rcu_dereference_check(sk->sk_dst_cache, 1);
2385
2386				if (dst && skb_dst(skb) == dst)
2387					sk_tx_queue_set(sk, queue_index);
2388			}
2389		}
2390	}
2391
2392	skb_set_queue_mapping(skb, queue_index);
2393	return netdev_get_tx_queue(dev, queue_index);
2394}
2395
/*
 * Send @skb through qdisc @q, under the qdisc root lock.
 *
 * Returns NET_XMIT_DROP if the qdisc is deactivated (the skb is freed),
 * NET_XMIT_SUCCESS when the packet bypassed the queue and was handed to
 * sch_direct_xmit(), or the result of q->enqueue() masked with
 * NET_XMIT_MASK otherwise.
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	bool contended;
	int rc;

	qdisc_skb_cb(skb)->pkt_len = skb->len;
	qdisc_calculate_pkt_len(skb, q);
	/*
	 * Heuristic to force contended enqueues to serialize on a
	 * separate lock before trying to get qdisc main lock.
	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
	 * and dequeue packets faster.
	 */
	contended = qdisc_is_running(q);
	if (unlikely(contended))
		spin_lock(&q->busylock);

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		kfree_skb(skb);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   qdisc_run_begin(q)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */
		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
			skb_dst_force(skb);

		qdisc_bstats_update(q, skb);

		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
			/* Drop busylock before the possibly long qdisc run. */
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		} else
			qdisc_run_end(q);

		rc = NET_XMIT_SUCCESS;
	} else {
		/* Regular path: enqueue, then run the qdisc if it is idle. */
		skb_dst_force(skb);
		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
		if (qdisc_run_begin(q)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		}
	}
	spin_unlock(root_lock);
	/* Still holding busylock if no branch above released it. */
	if (unlikely(contended))
		spin_unlock(&q->busylock);
	return rc;
}
2458
/*
 * Per-cpu nesting depth of dev_hard_start_xmit() calls made from
 * dev_queue_xmit() on queueless devices; a depth above RECURSION_LIMIT
 * is treated there as a dead loop and the packet is dropped.
 */
static DEFINE_PER_CPU(int, xmit_recursion);
#define RECURSION_LIMIT 10
2461
/**
 *	dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 */
int dev_queue_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	txq = dev_pick_tx(dev, skb);
	q = rcu_dereference_bh(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
	trace_net_dev_queue(skb);
	/* A qdisc with an enqueue op: go through the queueing path. */
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all the sorts of tunnels...

	   Really, it is unlikely that netif_tx_lock protection is necessary
	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	   counters.)
	   However, it is possible, that they rely on protection
	   made by us here.

	   Check this and shot the lock. It is not prone from deadlocks.
	   Either shot noqueue qdisc, it is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		/* This cpu already owning the tx lock means we recursed. */
		if (txq->xmit_lock_owner != cpu) {

			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
				goto recursion_alert;

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_tx_queue_stopped(txq)) {
				/* Track nesting depth around the driver call. */
				__this_cpu_inc(xmit_recursion);
				rc = dev_hard_start_xmit(skb, dev, txq);
				__this_cpu_dec(xmit_recursion);
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			if (net_ratelimit())
				printk(KERN_CRIT "Virtual device %s asks to "
				       "queue packet!\n", dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
recursion_alert:
			if (net_ratelimit())
				printk(KERN_CRIT "Dead loop on virtual device "
				       "%s, fix it urgently!\n", dev->name);
		}
	}

	/*
	 * Reached when the device is down, the queue was stopped, the
	 * transmit did not complete, or recursion was detected: drop the
	 * packet and report -ENETDOWN.
	 */
	rc = -ENETDOWN;
	rcu_read_unlock_bh();

	kfree_skb(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
EXPORT_SYMBOL(dev_queue_xmit);
2567
2568
2569/*=======================================================================
2570			Receiver routines
2571  =======================================================================*/
2572
/*
 * Limit on the per-cpu input_pkt_queue length: enqueue_to_backlog()
 * drops the packet once the queue length exceeds this value.
 */
int netdev_max_backlog __read_mostly = 1000;
/*
 * When non-zero, netif_rx() calls net_timestamp_check() before queueing;
 * when zero, __netif_receive_skb() does it instead.
 */
int netdev_tstamp_prequeue __read_mostly = 1;
/* NOTE(review): not referenced in this part of the file; presumably the
 * per-softirq NAPI packet budget - confirm against net_rx_action().
 */
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */
2577
2578/* Called with irq disabled */
2579static inline void ____napi_schedule(struct softnet_data *sd,
2580				     struct napi_struct *napi)
2581{
2582	list_add_tail(&napi->poll_list, &sd->poll_list);
2583	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2584}
2585
2586/*
2587 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2588 * and src/dst port numbers.  Sets rxhash in skb to non-zero hash value
2589 * on success, zero indicates no valid hash.  Also, sets l4_rxhash in skb
2590 * if hash is a canonical 4-tuple hash over transport ports.
2591 */
2592void __skb_get_rxhash(struct sk_buff *skb)
2593{
2594	int nhoff, hash = 0, poff;
2595	const struct ipv6hdr *ip6;
2596	const struct iphdr *ip;
2597	const struct vlan_hdr *vlan;
2598	u8 ip_proto;
2599	u32 addr1, addr2;
2600	u16 proto;
2601	union {
2602		u32 v32;
2603		u16 v16[2];
2604	} ports;
2605
2606	nhoff = skb_network_offset(skb);
2607	proto = skb->protocol;
2608
2609again:
2610	switch (proto) {
2611	case __constant_htons(ETH_P_IP):
2612ip:
2613		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2614			goto done;
2615
2616		ip = (const struct iphdr *) (skb->data + nhoff);
2617		if (ip_is_fragment(ip))
2618			ip_proto = 0;
2619		else
2620			ip_proto = ip->protocol;
2621		addr1 = (__force u32) ip->saddr;
2622		addr2 = (__force u32) ip->daddr;
2623		nhoff += ip->ihl * 4;
2624		break;
2625	case __constant_htons(ETH_P_IPV6):
2626ipv6:
2627		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2628			goto done;
2629
2630		ip6 = (const struct ipv6hdr *) (skb->data + nhoff);
2631		ip_proto = ip6->nexthdr;
2632		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2633		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2634		nhoff += 40;
2635		break;
2636	case __constant_htons(ETH_P_8021Q):
2637		if (!pskb_may_pull(skb, sizeof(*vlan) + nhoff))
2638			goto done;
2639		vlan = (const struct vlan_hdr *) (skb->data + nhoff);
2640		proto = vlan->h_vlan_encapsulated_proto;
2641		nhoff += sizeof(*vlan);
2642		goto again;
2643	case __constant_htons(ETH_P_PPP_SES):
2644		if (!pskb_may_pull(skb, PPPOE_SES_HLEN + nhoff))
2645			goto done;
2646		proto = *((__be16 *) (skb->data + nhoff +
2647				      sizeof(struct pppoe_hdr)));
2648		nhoff += PPPOE_SES_HLEN;
2649		switch (proto) {
2650		case __constant_htons(PPP_IP):
2651			goto ip;
2652		case __constant_htons(PPP_IPV6):
2653			goto ipv6;
2654		default:
2655			goto done;
2656		}
2657	default:
2658		goto done;
2659	}
2660
2661	switch (ip_proto) {
2662	case IPPROTO_GRE:
2663		if (pskb_may_pull(skb, nhoff + 16)) {
2664			u8 *h = skb->data + nhoff;
2665			__be16 flags = *(__be16 *)h;
2666
2667			/*
2668			 * Only look inside GRE if version zero and no
2669			 * routing
2670			 */
2671			if (!(flags & (GRE_VERSION|GRE_ROUTING))) {
2672				proto = *(__be16 *)(h + 2);
2673				nhoff += 4;
2674				if (flags & GRE_CSUM)
2675					nhoff += 4;
2676				if (flags & GRE_KEY)
2677					nhoff += 4;
2678				if (flags & GRE_SEQ)
2679					nhoff += 4;
2680				goto again;
2681			}
2682		}
2683		break;
2684	case IPPROTO_IPIP:
2685		goto again;
2686	default:
2687		break;
2688	}
2689
2690	ports.v32 = 0;
2691	poff = proto_ports_offset(ip_proto);
2692	if (poff >= 0) {
2693		nhoff += poff;
2694		if (pskb_may_pull(skb, nhoff + 4)) {
2695			ports.v32 = * (__force u32 *) (skb->data + nhoff);
2696			if (ports.v16[1] < ports.v16[0])
2697				swap(ports.v16[0], ports.v16[1]);
2698			skb->l4_rxhash = 1;
2699		}
2700	}
2701
2702	/* get a consistent hash (same value on both flow directions) */
2703	if (addr2 < addr1)
2704		swap(addr1, addr2);
2705
2706	hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2707	if (!hash)
2708		hash = 1;
2709
2710done:
2711	skb->rxhash = hash;
2712}
2713EXPORT_SYMBOL(__skb_get_rxhash);
2714
2715#ifdef CONFIG_RPS
2716
/*
 * One global table that all flow-based protocols share.
 * get_rps_cpu() reads it under RCU and indexes it with
 * skb->rxhash & mask to obtain the desired CPU for a flow.
 */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
2720
/*
 * Point a device flow entry at @next_cpu and return the entry that now
 * describes the flow.
 *
 * With CONFIG_RFS_ACCEL, and when the device supports it, the driver is
 * first asked through ndo_rx_flow_steer() to steer the flow to the RX
 * queue that cpu_rmap_lookup_index() gives for @next_cpu; on success
 * the flow entry of that queue's table is used instead of @rflow and
 * the returned filter id is stored in it.
 *
 * For a real CPU (not RPS_NO_CPU) last_qtail is reset to that CPU's
 * current input_queue_head.
 */
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
	    struct rps_dev_flow *rflow, u16 next_cpu)
{
	if (next_cpu != RPS_NO_CPU) {
#ifdef CONFIG_RFS_ACCEL
		struct netdev_rx_queue *rxqueue;
		struct rps_dev_flow_table *flow_table;
		struct rps_dev_flow *old_rflow;
		u32 flow_id;
		u16 rxq_index;
		int rc;

		/* Should we steer this flow to a different hardware queue? */
		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
		    !(dev->features & NETIF_F_NTUPLE))
			goto out;
		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
		/* Already arriving on the wanted queue: nothing to steer. */
		if (rxq_index == skb_get_rx_queue(skb))
			goto out;

		rxqueue = dev->_rx + rxq_index;
		flow_table = rcu_dereference(rxqueue->rps_flow_table);
		if (!flow_table)
			goto out;
		flow_id = skb->rxhash & flow_table->mask;
		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
							rxq_index, flow_id);
		if (rc < 0)
			goto out;
		/* Switch to the entry in the new queue's flow table. */
		old_rflow = rflow;
		rflow = &flow_table->flows[flow_id];
		rflow->filter = rc;
		/* The old entry no longer owns this filter id. */
		if (old_rflow->filter == rflow->filter)
			old_rflow->filter = RPS_NO_FILTER;
	out:
#endif
		rflow->last_qtail =
			per_cpu(softnet_data, next_cpu).input_queue_head;
	}

	rflow->cpu = next_cpu;
	return rflow;
}
2765
/*
 * get_rps_cpu is called from netif_receive_skb and returns the target
 * CPU from the RPS map of the receiving queue for a given skb.
 * rcu_read_lock must be held on entry.
 *
 * Returns the chosen CPU, or -1 when no online target could be
 * selected.  *rflowp is only written when the CPU was taken from the
 * device flow table.
 */
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
		       struct rps_dev_flow **rflowp)
{
	struct netdev_rx_queue *rxqueue;
	struct rps_map *map;
	struct rps_dev_flow_table *flow_table;
	struct rps_sock_flow_table *sock_flow_table;
	int cpu = -1;
	u16 tcpu;

	/* Find the RX queue the packet arrived on (queue 0 if unknown). */
	if (skb_rx_queue_recorded(skb)) {
		u16 index = skb_get_rx_queue(skb);
		if (unlikely(index >= dev->real_num_rx_queues)) {
			WARN_ONCE(dev->real_num_rx_queues > 1,
				  "%s received packet on queue %u, but number "
				  "of RX queues is %u\n",
				  dev->name, index, dev->real_num_rx_queues);
			goto done;
		}
		rxqueue = dev->_rx + index;
	} else
		rxqueue = dev->_rx;

	map = rcu_dereference(rxqueue->rps_map);
	if (map) {
		/* Single-CPU map and no flow table: no hash is needed. */
		if (map->len == 1 &&
		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
			tcpu = map->cpus[0];
			if (cpu_online(tcpu))
				cpu = tcpu;
			goto done;
		}
	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
		/* Neither a map nor a flow table is configured. */
		goto done;
	}

	skb_reset_network_header(skb);
	if (!skb_get_rxhash(skb))
		goto done;

	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	sock_flow_table = rcu_dereference(rps_sock_flow_table);
	if (flow_table && sock_flow_table) {
		u16 next_cpu;
		struct rps_dev_flow *rflow;

		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
		tcpu = rflow->cpu;

		next_cpu = sock_flow_table->ents[skb->rxhash &
		    sock_flow_table->mask];

		/*
		 * If the desired CPU (where last recvmsg was done) is
		 * different from current CPU (one in the rx-queue flow
		 * table entry), switch if one of the following holds:
		 *   - Current CPU is unset (equal to RPS_NO_CPU).
		 *   - Current CPU is offline.
		 *   - The current CPU's queue tail has advanced beyond the
		 *     last packet that was enqueued using this table entry.
		 *     This guarantees that all previous packets for the flow
		 *     have been dequeued, thus preserving in order delivery.
		 */
		if (unlikely(tcpu != next_cpu) &&
		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
		      rflow->last_qtail)) >= 0))
			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);

		/*
		 * NOTE(review): tcpu still holds the CPU read before
		 * set_rps_cpu() ran, so a just-switched flow is tested
		 * against the old CPU here - confirm this is intended.
		 */
		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
			*rflowp = rflow;
			cpu = tcpu;
			goto done;
		}
	}

	/* Fall back to spreading by hash over the queue's RPS map. */
	if (map) {
		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];

		if (cpu_online(tcpu)) {
			cpu = tcpu;
			goto done;
		}
	}

done:
	return cpu;
}
2859
2860#ifdef CONFIG_RFS_ACCEL
2861
2862/**
2863 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2864 * @dev: Device on which the filter was set
2865 * @rxq_index: RX queue index
2866 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2867 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2868 *
2869 * Drivers that implement ndo_rx_flow_steer() should periodically call
2870 * this function for each installed filter and remove the filters for
2871 * which it returns %true.
2872 */
2873bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2874			 u32 flow_id, u16 filter_id)
2875{
2876	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2877	struct rps_dev_flow_table *flow_table;
2878	struct rps_dev_flow *rflow;
2879	bool expire = true;
2880	int cpu;
2881
2882	rcu_read_lock();
2883	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2884	if (flow_table && flow_id <= flow_table->mask) {
2885		rflow = &flow_table->flows[flow_id];
2886		cpu = ACCESS_ONCE(rflow->cpu);
2887		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2888		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2889			   rflow->last_qtail) <
2890		     (int)(10 * flow_table->mask)))
2891			expire = false;
2892	}
2893	rcu_read_unlock();
2894	return expire;
2895}
2896EXPORT_SYMBOL(rps_may_expire_flow);
2897
2898#endif /* CONFIG_RFS_ACCEL */
2899
2900/* Called from hardirq (IPI) context */
2901static void rps_trigger_softirq(void *data)
2902{
2903	struct softnet_data *sd = data;
2904
2905	____napi_schedule(sd, &sd->backlog);
2906	sd->received_rps++;
2907}
2908
2909#endif /* CONFIG_RPS */
2910
/*
 * Check whether @sd belongs to another cpu.
 * If it does, chain it onto this cpu's rps_ipi_list, raise
 * NET_RX_SOFTIRQ and return 1.  Return 0 if @sd is this cpu's own
 * softnet_data, or when RPS is not compiled in.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local_sd = &__get_cpu_var(softnet_data);

	if (sd == local_sd)
		return 0;

	/* Push sd on the front of our pending-IPI list. */
	sd->rps_ipi_next = local_sd->rps_ipi_list;
	local_sd->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
2931
2932/*
2933 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2934 * queue (may be a remote CPU queue).
2935 */
2936static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2937			      unsigned int *qtail)
2938{
2939	struct softnet_data *sd;
2940	unsigned long flags;
2941
2942	sd = &per_cpu(softnet_data, cpu);
2943
2944	local_irq_save(flags);
2945
2946	rps_lock(sd);
2947	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2948		if (skb_queue_len(&sd->input_pkt_queue)) {
2949enqueue:
2950			__skb_queue_tail(&sd->input_pkt_queue, skb);
2951			input_queue_tail_incr_save(sd, qtail);
2952			rps_unlock(sd);
2953			local_irq_restore(flags);
2954			return NET_RX_SUCCESS;
2955		}
2956
2957		/* Schedule NAPI for backlog device
2958		 * We can use non atomic operation since we own the queue lock
2959		 */
2960		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2961			if (!rps_ipi_queued(sd))
2962				____napi_schedule(sd, &sd->backlog);
2963		}
2964		goto enqueue;
2965	}
2966
2967	sd->dropped++;
2968	rps_unlock(sd);
2969
2970	local_irq_restore(flags);
2971
2972	atomic_long_inc(&skb->dev->rx_dropped);
2973	kfree_skb(skb);
2974	return NET_RX_DROP;
2975}
2976
2977/**
2978 *	netif_rx	-	post buffer to the network code
2979 *	@skb: buffer to post
2980 *
2981 *	This function receives a packet from a device driver and queues it for
2982 *	the upper (protocol) levels to process.  It always succeeds. The buffer
2983 *	may be dropped during processing for congestion control or by the
2984 *	protocol layers.
2985 *
2986 *	return values:
2987 *	NET_RX_SUCCESS	(no congestion)
2988 *	NET_RX_DROP     (packet was dropped)
2989 *
2990 */
2991
2992int netif_rx(struct sk_buff *skb)
2993{
2994	int ret;
2995
2996	/* if netpoll wants it, pretend we never saw it */
2997	if (netpoll_rx(skb))
2998		return NET_RX_DROP;
2999
3000	if (netdev_tstamp_prequeue)
3001		net_timestamp_check(skb);
3002
3003	trace_netif_rx(skb);
3004#ifdef CONFIG_RPS
3005	{
3006		struct rps_dev_flow voidflow, *rflow = &voidflow;
3007		int cpu;
3008
3009		preempt_disable();
3010		rcu_read_lock();
3011
3012		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3013		if (cpu < 0)
3014			cpu = smp_processor_id();
3015
3016		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3017
3018		rcu_read_unlock();
3019		preempt_enable();
3020	}
3021#else
3022	{
3023		unsigned int qtail;
3024		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3025		put_cpu();
3026	}
3027#endif
3028	return ret;
3029}
3030EXPORT_SYMBOL(netif_rx);
3031
/*
 * Variant of netif_rx() that, with preemption disabled, runs any
 * softirq left pending after queueing the skb.  Returns netif_rx()'s
 * result.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	preempt_disable();

	ret = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
3045
/*
 * Softirq handler for the transmit side.  It does two things for the
 * current cpu's softnet_data:
 *  1. frees every skb parked on the completion queue;
 *  2. runs every qdisc parked on the output queue, or reschedules it
 *     when its root lock cannot be taken right now.
 * Both lists are detached with local interrupts disabled and then
 * processed with interrupts enabled.
 */
static void net_tx_action(struct softirq_action *h)
{
	struct softnet_data *sd = &__get_cpu_var(softnet_data);

	if (sd->completion_queue) {
		struct sk_buff *clist;

		/* Take the whole list in one go. */
		local_irq_disable();
		clist = sd->completion_queue;
		sd->completion_queue = NULL;
		local_irq_enable();

		while (clist) {
			struct sk_buff *skb = clist;
			clist = clist->next;

			/* skbs queued here are expected to have no users left. */
			WARN_ON(atomic_read(&skb->users));
			trace_kfree_skb(skb, net_tx_action);
			__kfree_skb(skb);
		}
	}

	if (sd->output_queue) {
		struct Qdisc *head;

		/* Detach the list and reset the tail pointer to empty. */
		local_irq_disable();
		head = sd->output_queue;
		sd->output_queue = NULL;
		sd->output_queue_tailp = &sd->output_queue;
		local_irq_enable();

		while (head) {
			struct Qdisc *q = head;
			spinlock_t *root_lock;

			head = head->next_sched;

			root_lock = qdisc_lock(q);
			if (spin_trylock(root_lock)) {
				/* Clear SCHED before running the qdisc. */
				smp_mb__before_clear_bit();
				clear_bit(__QDISC_STATE_SCHED,
					  &q->state);
				qdisc_run(q);
				spin_unlock(root_lock);
			} else {
				/*
				 * Lock is busy: try again later, unless the
				 * qdisc was deactivated, in which case only
				 * the SCHED bit is cleared.
				 */
				if (!test_bit(__QDISC_STATE_DEACTIVATED,
					      &q->state)) {
					__netif_reschedule(q);
				} else {
					smp_mb__before_clear_bit();
					clear_bit(__QDISC_STATE_SCHED,
						  &q->state);
				}
			}
		}
	}
}
3103
#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/*
 * This hook is defined here for ATM LANE.  It is only built when both
 * the bridge and ATM LANE are enabled, built in or as modules.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
3111
3112#ifdef CONFIG_NET_CLS_ACT
/* TODO: Maybe we should just force sch_ingress to be compiled in
 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
 * instructions (a compare and 2 stores extra right now) if we don't
 * have it on but do have CONFIG_NET_CLS_ACT.
 * NOTE: This doesn't stop any functionality; if you don't have
 * the ingress scheduler, you just can't add policies on ingress.
 *
 */
3121static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3122{
3123	struct net_device *dev = skb->dev;
3124	u32 ttl = G_TC_RTTL(skb->tc_verd);
3125	int result = TC_ACT_OK;
3126	struct Qdisc *q;
3127
3128	if (unlikely(MAX_RED_LOOP < ttl++)) {
3129		if (net_ratelimit())
3130			pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
3131			       skb->skb_iif, dev->ifindex);
3132		return TC_ACT_SHOT;
3133	}
3134
3135	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3136	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3137
3138	q = rxq->qdisc;
3139	if (q != &noop_qdisc) {
3140		spin_lock(qdisc_lock(q));
3141		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3142			result = qdisc_enqueue_root(skb, q);
3143		spin_unlock(qdisc_lock(q));
3144	}
3145
3146	return result;
3147}
3148
3149static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3150					 struct packet_type **pt_prev,
3151					 int *ret, struct net_device *orig_dev)
3152{
3153	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3154
3155	if (!rxq || rxq->qdisc == &noop_qdisc)
3156		goto out;
3157
3158	if (*pt_prev) {
3159		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3160		*pt_prev = NULL;
3161	}
3162
3163	switch (ing_filter(skb, rxq)) {
3164	case TC_ACT_SHOT:
3165	case TC_ACT_STOLEN:
3166		kfree_skb(skb);
3167		return NULL;
3168	}
3169
3170out:
3171	skb->tc_verd = 0;
3172	return skb;
3173}
3174#endif
3175
3176/**
3177 *	netdev_rx_handler_register - register receive handler
3178 *	@dev: device to register a handler for
3179 *	@rx_handler: receive handler to register
3180 *	@rx_handler_data: data pointer that is used by rx handler
3181 *
3182 *	Register a receive hander for a device. This handler will then be
3183 *	called from __netif_receive_skb. A negative errno code is returned
3184 *	on a failure.
3185 *
3186 *	The caller must hold the rtnl_mutex.
3187 *
3188 *	For a general description of rx_handler, see enum rx_handler_result.
3189 */
3190int netdev_rx_handler_register(struct net_device *dev,
3191			       rx_handler_func_t *rx_handler,
3192			       void *rx_handler_data)
3193{
3194	ASSERT_RTNL();
3195
3196	if (dev->rx_handler)
3197		return -EBUSY;
3198
3199	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3200	rcu_assign_pointer(dev->rx_handler, rx_handler);
3201
3202	return 0;
3203}
3204EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3205
3206/**
3207 *	netdev_rx_handler_unregister - unregister receive handler
3208 *	@dev: device to unregister a handler from
3209 *
3210 *	Unregister a receive hander from a device.
3211 *
3212 *	The caller must hold the rtnl_mutex.
3213 */
3214void netdev_rx_handler_unregister(struct net_device *dev)
3215{
3216
3217	ASSERT_RTNL();
3218	RCU_INIT_POINTER(dev->rx_handler, NULL);
3219	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3220}
3221EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3222
/*
 * Core receive routine: hand one skb to every interested party.
 *
 * In order, the skb is offered to the ptype_all handlers, the ingress
 * classifier (CONFIG_NET_CLS_ACT), the VLAN receive code, the device's
 * rx_handler and finally the ptype_base handlers matching skb->protocol.
 * All of this runs inside one rcu_read_lock() section.
 *
 * Delivery is deferred by one match: each time a new handler matches, the
 * previously remembered one (pt_prev) is passed to deliver_skb(), and the
 * last match is invoked directly through pt_prev->func at the end.
 *
 * Returns the value produced by the last delivery, or NET_RX_DROP when
 * netpoll claimed the skb or no handler matched (the skb is then freed
 * and the device's rx_dropped counter is incremented).
 */
static int __netif_receive_skb(struct sk_buff *skb)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct net_device *orig_dev;
	struct net_device *null_or_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;

	/* netif_receive_skb() does this instead when prequeue stamping is on */
	if (!netdev_tstamp_prequeue)
		net_timestamp_check(skb);

	trace_netif_receive_skb(skb);

	/* if we've gotten here through NAPI, check netpoll */
	if (netpoll_receive_skb(skb))
		return NET_RX_DROP;

	/* record the arrival interface only if nobody set it already */
	if (!skb->skb_iif)
		skb->skb_iif = skb->dev->ifindex;
	/* skb->dev may be changed below; keep the device we started with */
	orig_dev = skb->dev;

	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb_reset_mac_len(skb);

	pt_prev = NULL;

	rcu_read_lock();

	/* re-entered after VLAN processing or RX_HANDLER_ANOTHER */
another_round:

	__this_cpu_inc(softnet_data.processed);

	if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
		skb = vlan_untag(skb);
		if (unlikely(!skb))
			goto out;
	}

#ifdef CONFIG_NET_CLS_ACT
	/* TC_NCLS set: clear it and jump past the taps and the classifier */
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	/* handlers registered for every protocol (wildcard or this device) */
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (!ptype->dev || ptype->dev == skb->dev) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

#ifdef CONFIG_NET_CLS_ACT
	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
ncls:
#endif

	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (vlan_tx_tag_present(skb)) {
		/* flush the deferred handler before the VLAN code runs */
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		if (vlan_do_receive(&skb, !rx_handler))
			goto another_round;
		else if (unlikely(!skb))
			goto out;
	}

	if (rx_handler) {
		/* flush the deferred handler before the rx_handler runs */
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		switch (rx_handler(&skb)) {
		case RX_HANDLER_CONSUMED:
			goto out;
		case RX_HANDLER_ANOTHER:
			goto another_round;
		case RX_HANDLER_EXACT:
			deliver_exact = true;
			/* fall through */
		case RX_HANDLER_PASS:
			break;
		default:
			BUG();
		}
	}

	/* deliver only exact match when indicated */
	null_or_dev = deliver_exact ? skb->dev : NULL;

	/*
	 * Protocol handlers: a handler matches if its type equals the skb
	 * protocol and its device is null_or_dev, skb->dev or orig_dev.
	 */
	type = skb->protocol;
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type &&
		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
		     ptype->dev == orig_dev)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	if (pt_prev) {
		/* last match: called directly rather than via deliver_skb() */
		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
		atomic_long_inc(&skb->dev->rx_dropped);
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

out:
	rcu_read_unlock();
	return ret;
}
3347
3348/**
3349 *	netif_receive_skb - process receive buffer from network
3350 *	@skb: buffer to process
3351 *
3352 *	netif_receive_skb() is the main receive data processing function.
3353 *	It always succeeds. The buffer may be dropped during processing
3354 *	for congestion control or by the protocol layers.
3355 *
3356 *	This function may only be called from softirq context and interrupts
3357 *	should be enabled.
3358 *
3359 *	Return values (usually ignored):
3360 *	NET_RX_SUCCESS: no congestion
3361 *	NET_RX_DROP: packet was dropped
3362 */
3363int netif_receive_skb(struct sk_buff *skb)
3364{
3365	if (netdev_tstamp_prequeue)
3366		net_timestamp_check(skb);
3367
3368	if (skb_defer_rx_timestamp(skb))
3369		return NET_RX_SUCCESS;
3370
3371#ifdef CONFIG_RPS
3372	{
3373		struct rps_dev_flow voidflow, *rflow = &voidflow;
3374		int cpu, ret;
3375
3376		rcu_read_lock();
3377
3378		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3379
3380		if (cpu >= 0) {
3381			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3382			rcu_read_unlock();
3383		} else {
3384			rcu_read_unlock();
3385			ret = __netif_receive_skb(skb);
3386		}
3387
3388		return ret;
3389	}
3390#else
3391	return __netif_receive_skb(skb);
3392#endif
3393}
3394EXPORT_SYMBOL(netif_receive_skb);
3395
3396/* Network device is going away, flush any packets still pending
3397 * Called with irqs disabled.
3398 */
3399static void flush_backlog(void *arg)
3400{
3401	struct net_device *dev = arg;
3402	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3403	struct sk_buff *skb, *tmp;
3404
3405	rps_lock(sd);
3406	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3407		if (skb->dev == dev) {
3408			__skb_unlink(skb, &sd->input_pkt_queue);
3409			kfree_skb(skb);
3410			input_queue_head_incr(sd);
3411		}
3412	}
3413	rps_unlock(sd);
3414
3415	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3416		if (skb->dev == dev) {
3417			__skb_unlink(skb, &sd->process_queue);
3418			kfree_skb(skb);
3419			input_queue_head_incr(sd);
3420		}
3421	}
3422}
3423
3424static int napi_gro_complete(struct sk_buff *skb)
3425{
3426	struct packet_type *ptype;
3427	__be16 type = skb->protocol;
3428	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3429	int err = -ENOENT;
3430
3431	if (NAPI_GRO_CB(skb)->count == 1) {
3432		skb_shinfo(skb)->gso_size = 0;
3433		goto out;
3434	}
3435
3436	rcu_read_lock();
3437	list_for_each_entry_rcu(ptype, head, list) {
3438		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3439			continue;
3440
3441		err = ptype->gro_complete(skb);
3442		break;
3443	}
3444	rcu_read_unlock();
3445
3446	if (err) {
3447		WARN_ON(&ptype->list == head);
3448		kfree_skb(skb);
3449		return NET_RX_SUCCESS;
3450	}
3451
3452out:
3453	return netif_receive_skb(skb);
3454}
3455
3456inline void napi_gro_flush(struct napi_struct *napi)
3457{
3458	struct sk_buff *skb, *next;
3459
3460	for (skb = napi->gro_list; skb; skb = next) {
3461		next = skb->next;
3462		skb->next = NULL;
3463		napi_gro_complete(skb);
3464	}
3465
3466	napi->gro_count = 0;
3467	napi->gro_list = NULL;
3468}
3469EXPORT_SYMBOL(napi_gro_flush);
3470
/*
 * Try to merge @skb into one of the skbs held on @napi's GRO list.
 *
 * The first device-less packet type matching skb->protocol that provides a
 * gro_receive callback is given the skb.  Possible results:
 *   GRO_NORMAL      - GRO not applicable; caller processes the skb normally
 *   GRO_MERGED      - skb was merged into a held skb
 *   GRO_MERGED_FREE - as above and the callback set the "free" flag
 *   GRO_HELD        - skb was put on the GRO list as a new flow
 *
 * Except when the skb matched an existing flow, the "pull" step below makes
 * sure the bytes up to the GRO offset are in the linear area.
 */
enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct packet_type *ptype;
	__be16 type = skb->protocol;
	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
	int same_flow;
	int mac_len;
	enum gro_result ret;

	/* GRO disabled on the device, or netpoll is receiving */
	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
		goto normal;

	/* already-GSO skbs and skbs with a frag list are not aggregated */
	if (skb_is_gso(skb) || skb_has_frag_list(skb))
		goto normal;

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, head, list) {
		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
			continue;

		skb_set_network_header(skb, skb_gro_offset(skb));
		mac_len = skb->network_header - skb->mac_header;
		skb->mac_len = mac_len;
		NAPI_GRO_CB(skb)->same_flow = 0;
		NAPI_GRO_CB(skb)->flush = 0;
		NAPI_GRO_CB(skb)->free = 0;

		pp = ptype->gro_receive(&napi->gro_list, skb);
		break;
	}
	rcu_read_unlock();

	/* the walk reached the list head: no handler took the skb */
	if (&ptype->list == head)
		goto normal;

	same_flow = NAPI_GRO_CB(skb)->same_flow;
	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

	/* the callback pointed at a held skb: unlink and complete it now */
	if (pp) {
		struct sk_buff *nskb = *pp;

		*pp = nskb->next;
		nskb->next = NULL;
		napi_gro_complete(nskb);
		napi->gro_count--;
	}

	if (same_flow)
		goto ok;

	/* flush requested or the list is full: do not hold this skb */
	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
		goto normal;

	/* start a new flow with this skb at the head of the list */
	napi->gro_count++;
	NAPI_GRO_CB(skb)->count = 1;
	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
	skb->next = napi->gro_list;
	napi->gro_list = skb;
	ret = GRO_HELD;

pull:
	/*
	 * Headers were parsed in place in frag0; copy the part that is not
	 * yet in the linear area and shrink the first fragment accordingly.
	 */
	if (skb_headlen(skb) < skb_gro_offset(skb)) {
		int grow = skb_gro_offset(skb) - skb_headlen(skb);

		BUG_ON(skb->end - skb->tail < grow);

		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);

		skb->tail += grow;
		skb->data_len -= grow;

		skb_shinfo(skb)->frags[0].page_offset += grow;
		skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);

		/* first fragment fully consumed: drop it and shift the rest */
		if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
			skb_frag_unref(skb, 0);
			memmove(skb_shinfo(skb)->frags,
				skb_shinfo(skb)->frags + 1,
				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
		}
	}

ok:
	return ret;

normal:
	ret = GRO_NORMAL;
	goto pull;
}
EXPORT_SYMBOL(dev_gro_receive);
3562
3563static inline gro_result_t
3564__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3565{
3566	struct sk_buff *p;
3567
3568	for (p = napi->gro_list; p; p = p->next) {
3569		unsigned long diffs;
3570
3571		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3572		diffs |= p->vlan_tci ^ skb->vlan_tci;
3573		diffs |= compare_ether_header(skb_mac_header(p),
3574					      skb_gro_mac_header(skb));
3575		NAPI_GRO_CB(p)->same_flow = !diffs;
3576		NAPI_GRO_CB(p)->flush = 0;
3577	}
3578
3579	return dev_gro_receive(napi, skb);
3580}
3581
3582gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3583{
3584	switch (ret) {
3585	case GRO_NORMAL:
3586		if (netif_receive_skb(skb))
3587			ret = GRO_DROP;
3588		break;
3589
3590	case GRO_DROP:
3591	case GRO_MERGED_FREE:
3592		kfree_skb(skb);
3593		break;
3594
3595	case GRO_HELD:
3596	case GRO_MERGED:
3597		break;
3598	}
3599
3600	return ret;
3601}
3602EXPORT_SYMBOL(napi_skb_finish);
3603
3604void skb_gro_reset_offset(struct sk_buff *skb)
3605{
3606	NAPI_GRO_CB(skb)->data_offset = 0;
3607	NAPI_GRO_CB(skb)->frag0 = NULL;
3608	NAPI_GRO_CB(skb)->frag0_len = 0;
3609
3610	if (skb->mac_header == skb->tail &&
3611	    !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
3612		NAPI_GRO_CB(skb)->frag0 =
3613			skb_frag_address(&skb_shinfo(skb)->frags[0]);
3614		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]);
3615	}
3616}
3617EXPORT_SYMBOL(skb_gro_reset_offset);
3618
3619gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3620{
3621	skb_gro_reset_offset(skb);
3622
3623	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3624}
3625EXPORT_SYMBOL(napi_gro_receive);
3626
3627static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3628{
3629	__skb_pull(skb, skb_headlen(skb));
3630	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3631	skb->vlan_tci = 0;
3632	skb->dev = napi->dev;
3633	skb->skb_iif = 0;
3634
3635	napi->skb = skb;
3636}
3637
3638struct sk_buff *napi_get_frags(struct napi_struct *napi)
3639{
3640	struct sk_buff *skb = napi->skb;
3641
3642	if (!skb) {
3643		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3644		if (skb)
3645			napi->skb = skb;
3646	}
3647	return skb;
3648}
3649EXPORT_SYMBOL(napi_get_frags);
3650
3651gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3652			       gro_result_t ret)
3653{
3654	switch (ret) {
3655	case GRO_NORMAL:
3656	case GRO_HELD:
3657		skb->protocol = eth_type_trans(skb, skb->dev);
3658
3659		if (ret == GRO_HELD)
3660			skb_gro_pull(skb, -ETH_HLEN);
3661		else if (netif_receive_skb(skb))
3662			ret = GRO_DROP;
3663		break;
3664
3665	case GRO_DROP:
3666	case GRO_MERGED_FREE:
3667		napi_reuse_skb(napi, skb);
3668		break;
3669
3670	case GRO_MERGED:
3671		break;
3672	}
3673
3674	return ret;
3675}
3676EXPORT_SYMBOL(napi_frags_finish);
3677
3678struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3679{
3680	struct sk_buff *skb = napi->skb;
3681	struct ethhdr *eth;
3682	unsigned int hlen;
3683	unsigned int off;
3684
3685	napi->skb = NULL;
3686
3687	skb_reset_mac_header(skb);
3688	skb_gro_reset_offset(skb);
3689
3690	off = skb_gro_offset(skb);
3691	hlen = off + sizeof(*eth);
3692	eth = skb_gro_header_fast(skb, off);
3693	if (skb_gro_header_hard(skb, hlen)) {
3694		eth = skb_gro_header_slow(skb, hlen, off);
3695		if (unlikely(!eth)) {
3696			napi_reuse_skb(napi, skb);
3697			skb = NULL;
3698			goto out;
3699		}
3700	}
3701
3702	skb_gro_pull(skb, sizeof(*eth));
3703
3704	/*
3705	 * This works because the only protocols we care about don't require
3706	 * special handling.  We'll fix it up properly at the end.
3707	 */
3708	skb->protocol = eth->h_proto;
3709
3710out:
3711	return skb;
3712}
3713EXPORT_SYMBOL(napi_frags_skb);
3714
3715gro_result_t napi_gro_frags(struct napi_struct *napi)
3716{
3717	struct sk_buff *skb = napi_frags_skb(napi);
3718
3719	if (!skb)
3720		return GRO_DROP;
3721
3722	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3723}
3724EXPORT_SYMBOL(napi_gro_frags);
3725
/*
 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 * Note: called with local irq disabled, but exits with local irq enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remsd = sd->rps_ipi_list;

	if (remsd) {
		/* take the whole list while irqs are still off */
		sd->rps_ipi_list = NULL;

		local_irq_enable();

		/* Send pending IPI's to kick RPS processing on remote cpus. */
		do {
			struct softnet_data *next = remsd->rps_ipi_next;

			if (cpu_online(remsd->cpu))
				__smp_call_function_single(remsd->cpu,
							   &remsd->csd, 0);
			remsd = next;
		} while (remsd);

		return;
	}
#endif
	local_irq_enable();
}
3753
/*
 * NAPI poll routine of the per-CPU backlog device.
 *
 * Packets are taken from sd->process_queue and fed to
 * __netif_receive_skb() with interrupts enabled.  When the process queue
 * runs dry, whatever has accumulated in sd->input_pkt_queue is spliced
 * over under the rps lock.  If fewer packets were spliced than the
 * remaining quota allows, the backlog NAPI is taken off the poll list.
 *
 * Returns the number of packets processed, at most @quota.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
	int work = 0;
	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);

#ifdef CONFIG_RPS
	/* Check if we have pending ipi, it's better to send them now,
	 * not waiting net_rx_action() end.
	 */
	if (sd->rps_ipi_list) {
		local_irq_disable();
		net_rps_action_and_irq_enable(sd);
	}
#endif
	napi->weight = weight_p;
	local_irq_disable();
	while (work < quota) {
		struct sk_buff *skb;
		unsigned int qlen;

		/* irqs are off while dequeueing, on while the skb is handled */
		while ((skb = __skb_dequeue(&sd->process_queue))) {
			local_irq_enable();
			__netif_receive_skb(skb);
			local_irq_disable();
			input_queue_head_incr(sd);
			if (++work >= quota) {
				local_irq_enable();
				return work;
			}
		}

		rps_lock(sd);
		qlen = skb_queue_len(&sd->input_pkt_queue);
		if (qlen)
			skb_queue_splice_tail_init(&sd->input_pkt_queue,
						   &sd->process_queue);

		if (qlen < quota - work) {
			/*
			 * Inline a custom version of __napi_complete().
			 * only current cpu owns and manipulates this napi,
			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
			 * we can use a plain write instead of clear_bit(),
			 * and we don't need an smp_mb() memory barrier.
			 */
			list_del(&napi->poll_list);
			napi->state = 0;

			/* stop the outer loop once the spliced packets are done */
			quota = work + qlen;
		}
		rps_unlock(sd);
	}
	local_irq_enable();

	return work;
}
3810
/**
 * __napi_schedule - schedule for receive
 * @n: entry to schedule
 *
 * The entry's receive function will be scheduled to run.
 * ____napi_schedule() is called on the current CPU's softnet_data with
 * local interrupts disabled; the previous interrupt state is restored
 * afterwards.
 */
void __napi_schedule(struct napi_struct *n)
{
	unsigned long flags;

	local_irq_save(flags);
	____napi_schedule(&__get_cpu_var(softnet_data), n);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(__napi_schedule);
3826
/*
 * Mark polling of @n as finished: remove it from the poll list and clear
 * NAPI_STATE_SCHED.  It is a bug to call this for an instance that is not
 * scheduled or that still holds skbs on its GRO list.
 */
void __napi_complete(struct napi_struct *n)
{
	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
	BUG_ON(n->gro_list);

	list_del(&n->poll_list);
	/* order the list removal before the SCHED bit is cleared */
	smp_mb__before_clear_bit();
	clear_bit(NAPI_STATE_SCHED, &n->state);
}
EXPORT_SYMBOL(__napi_complete);
3837
3838void napi_complete(struct napi_struct *n)
3839{
3840	unsigned long flags;
3841
3842	/*
3843	 * don't let napi dequeue from the cpu poll list
3844	 * just in case its running on a different cpu
3845	 */
3846	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3847		return;
3848
3849	napi_gro_flush(n);
3850	local_irq_save(flags);
3851	__napi_complete(n);
3852	local_irq_restore(flags);
3853}
3854EXPORT_SYMBOL(napi_complete);
3855
3856void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3857		    int (*poll)(struct napi_struct *, int), int weight)
3858{
3859	INIT_LIST_HEAD(&napi->poll_list);
3860	napi->gro_count = 0;
3861	napi->gro_list = NULL;
3862	napi->skb = NULL;
3863	napi->poll = poll;
3864	napi->weight = weight;
3865	list_add(&napi->dev_list, &dev->napi_list);
3866	napi->dev = dev;
3867#ifdef CONFIG_NETPOLL
3868	spin_lock_init(&napi->poll_lock);
3869	napi->poll_owner = -1;
3870#endif
3871	set_bit(NAPI_STATE_SCHED, &napi->state);
3872}
3873EXPORT_SYMBOL(netif_napi_add);
3874
3875void netif_napi_del(struct napi_struct *napi)
3876{
3877	struct sk_buff *skb, *next;
3878
3879	list_del_init(&napi->dev_list);
3880	napi_free_frags(napi);
3881
3882	for (skb = napi->gro_list; skb; skb = next) {
3883		next = skb->next;
3884		skb->next = NULL;
3885		kfree_skb(skb);
3886	}
3887
3888	napi->gro_list = NULL;
3889	napi->gro_count = 0;
3890}
3891EXPORT_SYMBOL(netif_napi_del);
3892
/*
 * NET_RX softirq handler: poll the NAPI instances queued on this CPU's
 * poll_list until the list is empty, the packet budget (netdev_budget) is
 * used up, or more than two jiffies have passed.  In the latter two cases
 * time_squeeze is incremented and the softirq is raised again.
 */
static void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = &__get_cpu_var(softnet_data);
	unsigned long time_limit = jiffies + 2;
	int budget = netdev_budget;
	void *have;

	local_irq_disable();

	while (!list_empty(&sd->poll_list)) {
		struct napi_struct *n;
		int work, weight;

		/* If softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies since which will allow
		 * an average latency of 1.5/HZ.
		 */
		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
			goto softnet_break;

		local_irq_enable();

		/* Even though interrupts have been re-enabled, this
		 * access is safe because interrupts can only add new
		 * entries to the tail of this list, and only ->poll()
		 * calls can remove this head entry from the list.
		 */
		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);

		have = netpoll_poll_lock(n);

		weight = n->weight;

		/* This NAPI_STATE_SCHED test is for avoiding a race
		 * with netpoll's poll_napi().  Only the entity which
		 * obtains the lock and sees NAPI_STATE_SCHED set will
		 * actually make the ->poll() call.  Therefore we avoid
		 * accidentally calling ->poll() when NAPI is not scheduled.
		 */
		work = 0;
		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
			work = n->poll(n, weight);
			trace_napi_poll(n);
		}

		/* a poll routine must not report more work than its weight */
		WARN_ON_ONCE(work > weight);

		budget -= work;

		local_irq_disable();

		/* Drivers must not modify the NAPI state if they
		 * consume the entire weight.  In such cases this code
		 * still "owns" the NAPI instance and therefore can
		 * move the instance around on the list at-will.
		 */
		if (unlikely(work == weight)) {
			if (unlikely(napi_disable_pending(n))) {
				/* napi_complete() is called with irqs enabled */
				local_irq_enable();
				napi_complete(n);
				local_irq_disable();
			} else
				list_move_tail(&n->poll_list, &sd->poll_list);
		}

		netpoll_poll_unlock(have);
	}
out:
	/* sends pending RPS IPIs, if any, and re-enables local irqs */
	net_rps_action_and_irq_enable(sd);

#ifdef CONFIG_NET_DMA
	/*
	 * There may not be any more sk_buffs coming right now, so push
	 * any pending DMA copies to hardware
	 */
	dma_issue_pending_all();
#endif

	return;

softnet_break:
	sd->time_squeeze++;
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	goto out;
}
3978
3979static gifconf_func_t *gifconf_list[NPROTO];
3980
3981/**
3982 *	register_gifconf	-	register a SIOCGIF handler
3983 *	@family: Address family
3984 *	@gifconf: Function handler
3985 *
3986 *	Register protocol dependent address dumping routines. The handler
3987 *	that is passed must not be freed or reused until it has been replaced
3988 *	by another handler.
3989 */
3990int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3991{
3992	if (family >= NPROTO)
3993		return -EINVAL;
3994	gifconf_list[family] = gifconf;
3995	return 0;
3996}
3997EXPORT_SYMBOL(register_gifconf);
3998
3999
4000/*
4001 *	Map an interface index to its name (SIOCGIFNAME)
4002 */
4003
4004/*
4005 *	We need this ioctl for efficient implementation of the
4006 *	if_indextoname() function required by the IPv6 API.  Without
4007 *	it, we would have to search all the interfaces to find a
4008 *	match.  --pb
4009 */
4010
4011static int dev_ifname(struct net *net, struct ifreq __user *arg)
4012{
4013	struct net_device *dev;
4014	struct ifreq ifr;
4015
4016	/*
4017	 *	Fetch the caller's info block.
4018	 */
4019
4020	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4021		return -EFAULT;
4022
4023	rcu_read_lock();
4024	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
4025	if (!dev) {
4026		rcu_read_unlock();
4027		return -ENODEV;
4028	}
4029
4030	strcpy(ifr.ifr_name, dev->name);
4031	rcu_read_unlock();
4032
4033	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
4034		return -EFAULT;
4035	return 0;
4036}
4037
4038/*
4039 *	Perform a SIOCGIFCONF call. This structure will change
4040 *	size eventually, and there is nothing I can do about it.
4041 *	Thus we will need a 'compatibility mode'.
4042 */
4043
4044static int dev_ifconf(struct net *net, char __user *arg)
4045{
4046	struct ifconf ifc;
4047	struct net_device *dev;
4048	char __user *pos;
4049	int len;
4050	int total;
4051	int i;
4052
4053	/*
4054	 *	Fetch the caller's info block.
4055	 */
4056
4057	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
4058		return -EFAULT;
4059
4060	pos = ifc.ifc_buf;
4061	len = ifc.ifc_len;
4062
4063	/*
4064	 *	Loop over the interfaces, and write an info block for each.
4065	 */
4066
4067	total = 0;
4068	for_each_netdev(net, dev) {
4069		for (i = 0; i < NPROTO; i++) {
4070			if (gifconf_list[i]) {
4071				int done;
4072				if (!pos)
4073					done = gifconf_list[i](dev, NULL, 0);
4074				else
4075					done = gifconf_list[i](dev, pos + total,
4076							       len - total);
4077				if (done < 0)
4078					return -EFAULT;
4079				total += done;
4080			}
4081		}
4082	}
4083
4084	/*
4085	 *	All done.  Write the updated control block back to the caller.
4086	 */
4087	ifc.ifc_len = total;
4088
4089	/*
4090	 * 	Both BSD and Solaris return 0 here, so we do too.
4091	 */
4092	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4093}
4094
4095#ifdef CONFIG_PROC_FS
4096
/*
 * The /proc/net/dev iterator keeps its position as one unsigned int: the
 * low BUCKET_SPACE bits hold an offset inside a hash bucket, the bits above
 * them hold the bucket index.
 */
#define BUCKET_SPACE (32 - NETDEV_HASHBITS)

/* Private iterator state of the /proc/net/dev seq_file. */
struct dev_iter_state {
	struct seq_net_private p;
	unsigned int pos; /* bucket << BUCKET_SPACE + offset */
};

/* Split a packed position into bucket index and in-bucket offset ... */
#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
/* ... and build a packed position from bucket @b and offset @o. */
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4107
4108static inline struct net_device *dev_from_same_bucket(struct seq_file *seq)
4109{
4110	struct dev_iter_state *state = seq->private;
4111	struct net *net = seq_file_net(seq);
4112	struct net_device *dev;
4113	struct hlist_node *p;
4114	struct hlist_head *h;
4115	unsigned int count, bucket, offset;
4116
4117	bucket = get_bucket(state->pos);
4118	offset = get_offset(state->pos);
4119	h = &net->dev_name_head[bucket];
4120	count = 0;
4121	hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4122		if (count++ == offset) {
4123			state->pos = set_bucket_offset(bucket, count);
4124			return dev;
4125		}
4126	}
4127
4128	return NULL;
4129}
4130
4131static inline struct net_device *dev_from_new_bucket(struct seq_file *seq)
4132{
4133	struct dev_iter_state *state = seq->private;
4134	struct net_device *dev;
4135	unsigned int bucket;
4136
4137	bucket = get_bucket(state->pos);
4138	do {
4139		dev = dev_from_same_bucket(seq);
4140		if (dev)
4141			return dev;
4142
4143		bucket++;
4144		state->pos = set_bucket_offset(bucket, 0);
4145	} while (bucket < NETDEV_HASHENTRIES);
4146
4147	return NULL;
4148}
4149
4150/*
4151 *	This is invoked by the /proc filesystem handler to display a device
4152 *	in detail.
4153 */
4154void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4155	__acquires(RCU)
4156{
4157	struct dev_iter_state *state = seq->private;
4158
4159	rcu_read_lock();
4160	if (!*pos)
4161		return SEQ_START_TOKEN;
4162
4163	/* check for end of the hash */
4164	if (state->pos == 0 && *pos > 1)
4165		return NULL;
4166
4167	return dev_from_new_bucket(seq);
4168}
4169
4170void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4171{
4172	struct net_device *dev;
4173
4174	++*pos;
4175
4176	if (v == SEQ_START_TOKEN)
4177		return dev_from_new_bucket(seq);
4178
4179	dev = dev_from_same_bucket(seq);
4180	if (dev)
4181		return dev;
4182
4183	return dev_from_new_bucket(seq);
4184}
4185
/* seq_file ->stop: leave the RCU read-side section entered by ->start. */
void dev_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
4191
4192static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4193{
4194	struct rtnl_link_stats64 temp;
4195	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4196
4197	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4198		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4199		   dev->name, stats->rx_bytes, stats->rx_packets,
4200		   stats->rx_errors,
4201		   stats->rx_dropped + stats->rx_missed_errors,
4202		   stats->rx_fifo_errors,
4203		   stats->rx_length_errors + stats->rx_over_errors +
4204		    stats->rx_crc_errors + stats->rx_frame_errors,
4205		   stats->rx_compressed, stats->multicast,
4206		   stats->tx_bytes, stats->tx_packets,
4207		   stats->tx_errors, stats->tx_dropped,
4208		   stats->tx_fifo_errors, stats->collisions,
4209		   stats->tx_carrier_errors +
4210		    stats->tx_aborted_errors +
4211		    stats->tx_window_errors +
4212		    stats->tx_heartbeat_errors,
4213		   stats->tx_compressed);
4214}
4215
4216/*
4217 *	Called from the PROCfs module. This now uses the new arbitrary sized
4218 *	/proc/net interface to create /proc/net/dev
4219 */
4220static int dev_seq_show(struct seq_file *seq, void *v)
4221{
4222	if (v == SEQ_START_TOKEN)
4223		seq_puts(seq, "Inter-|   Receive                            "
4224			      "                    |  Transmit\n"
4225			      " face |bytes    packets errs drop fifo frame "
4226			      "compressed multicast|bytes    packets errs "
4227			      "drop fifo colls carrier compressed\n");
4228	else
4229		dev_seq_printf_stats(seq, v);
4230	return 0;
4231}
4232
4233static struct softnet_data *softnet_get_online(loff_t *pos)
4234{
4235	struct softnet_data *sd = NULL;
4236
4237	while (*pos < nr_cpu_ids)
4238		if (cpu_online(*pos)) {
4239			sd = &per_cpu(softnet_data, *pos);
4240			break;
4241		} else
4242			++*pos;
4243	return sd;
4244}
4245
/* seq_file ->start: softnet_data of the first online CPU at or after *@pos. */
static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
{
	return softnet_get_online(pos);
}
4250
/* seq_file ->next: step past the current CPU and find the next online one. */
static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return softnet_get_online(pos);
}
4256
/* seq_file ->stop: nothing was acquired in ->start, so nothing to release. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
4260
/*
 * Print one /proc/net/softnet_stat line for the softnet_data in @v: ten
 * hexadecimal columns holding processed, dropped, time_squeeze, five
 * zero placeholders, cpu_collision and received_rps.
 */
static int softnet_seq_show(struct seq_file *seq, void *v)
{
	struct softnet_data *sd = v;

	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
		   sd->processed, sd->dropped, sd->time_squeeze, 0,
		   0, 0, 0, 0, /* was fastroute */
		   sd->cpu_collision, sd->received_rps);
	return 0;
}
4271
/* Iterator callbacks for the "dev" seq_file. */
static const struct seq_operations dev_seq_ops = {
	.start = dev_seq_start,
	.next  = dev_seq_next,
	.stop  = dev_seq_stop,
	.show  = dev_seq_show,
};
4278
/*
 * Open handler for the "dev" proc file: a per-namespace seq_file whose
 * private area is a struct dev_iter_state.
 */
static int dev_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &dev_seq_ops,
			    sizeof(struct dev_iter_state));
}
4284
/* File operations of the "dev" proc file. */
static const struct file_operations dev_seq_fops = {
	.owner	 = THIS_MODULE,
	.open    = dev_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net,
};
4292
/* Iterator callbacks for the "softnet_stat" seq_file. */
static const struct seq_operations softnet_seq_ops = {
	.start = softnet_seq_start,
	.next  = softnet_seq_next,
	.stop  = softnet_seq_stop,
	.show  = softnet_seq_show,
};
4299
/* Open handler for the "softnet_stat" proc file (plain seq_file, no private state). */
static int softnet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &softnet_seq_ops);
}
4304
/* File operations of the "softnet_stat" proc file. */
static const struct file_operations softnet_seq_fops = {
	.owner	 = THIS_MODULE,
	.open    = softnet_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};
4312
4313static void *ptype_get_idx(loff_t pos)
4314{
4315	struct packet_type *pt = NULL;
4316	loff_t i = 0;
4317	int t;
4318
4319	list_for_each_entry_rcu(pt, &ptype_all, list) {
4320		if (i == pos)
4321			return pt;
4322		++i;
4323	}
4324
4325	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4326		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4327			if (i == pos)
4328				return pt;
4329			++i;
4330		}
4331	}
4332	return NULL;
4333}
4334
4335static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4336	__acquires(RCU)
4337{
4338	rcu_read_lock();
4339	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4340}
4341
/*
 * seq_file ->next for the "ptype" file: bump *@pos and return the packet
 * type following @v.  The order is the ptype_all list first, then the
 * ptype_base buckets in ascending order.  Returns NULL after the last
 * bucket.
 */
static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct packet_type *pt;
	struct list_head *nxt;
	int hash;

	++*pos;
	if (v == SEQ_START_TOKEN)
		return ptype_get_idx(0);

	pt = v;
	nxt = pt->list.next;
	if (pt->type == htons(ETH_P_ALL)) {
		/* still inside ptype_all unless we are back at its head */
		if (nxt != &ptype_all)
			goto found;
		hash = 0;
		nxt = ptype_base[0].next;
	} else
		hash = ntohs(pt->type) & PTYPE_HASH_MASK;

	/* nxt is the bucket head: bucket exhausted, try the next one */
	while (nxt == &ptype_base[hash]) {
		if (++hash >= PTYPE_HASH_SIZE)
			return NULL;
		nxt = ptype_base[hash].next;
	}
found:
	return list_entry(nxt, struct packet_type, list);
}
4370
/* seq_file ->stop: leave the RCU read-side section entered by ->start. */
static void ptype_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
4376
4377static int ptype_seq_show(struct seq_file *seq, void *v)
4378{
4379	struct packet_type *pt = v;
4380
4381	if (v == SEQ_START_TOKEN)
4382		seq_puts(seq, "Type Device      Function\n");
4383	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4384		if (pt->type == htons(ETH_P_ALL))
4385			seq_puts(seq, "ALL ");
4386		else
4387			seq_printf(seq, "%04x", ntohs(pt->type));
4388
4389		seq_printf(seq, " %-8s %pF\n",
4390			   pt->dev ? pt->dev->name : "", pt->func);
4391	}
4392
4393	return 0;
4394}
4395
/* Iterator callbacks for the "ptype" seq_file. */
static const struct seq_operations ptype_seq_ops = {
	.start = ptype_seq_start,
	.next  = ptype_seq_next,
	.stop  = ptype_seq_stop,
	.show  = ptype_seq_show,
};
4402
/*
 * Open handler for the "ptype" proc file: a per-namespace seq_file with
 * only the generic seq_net_private as private state.
 */
static int ptype_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &ptype_seq_ops,
			sizeof(struct seq_net_private));
}
4408
/* File operations of the "ptype" proc file. */
static const struct file_operations ptype_seq_fops = {
	.owner	 = THIS_MODULE,
	.open    = ptype_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net,
};
4416
4417
4418static int __net_init dev_proc_net_init(struct net *net)
4419{
4420	int rc = -ENOMEM;
4421
4422	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4423		goto out;
4424	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4425		goto out_dev;
4426	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4427		goto out_softnet;
4428
4429	if (wext_proc_init(net))
4430		goto out_ptype;
4431	rc = 0;
4432out:
4433	return rc;
4434out_ptype:
4435	proc_net_remove(net, "ptype");
4436out_softnet:
4437	proc_net_remove(net, "softnet_stat");
4438out_dev:
4439	proc_net_remove(net, "dev");
4440	goto out;
4441}
4442
/*
 * Per-namespace exit: undo dev_proc_net_init().  The wireless-extensions
 * proc state goes first, then the "ptype", "softnet_stat" and "dev"
 * entries, i.e. the reverse of the order in which they were created.
 */
static void __net_exit dev_proc_net_exit(struct net *net)
{
	wext_proc_exit(net);

	proc_net_remove(net, "ptype");
	proc_net_remove(net, "softnet_stat");
	proc_net_remove(net, "dev");
}
4451
/* Per-namespace hooks that create and remove the proc entries above. */
static struct pernet_operations __net_initdata dev_proc_ops = {
	.init = dev_proc_net_init,
	.exit = dev_proc_net_exit,
};
4456
/*
 * Register dev_proc_ops as a pernet subsystem; returns the result of
 * register_pernet_subsystem registration (register_pernet_subsys()).
 */
static int __init dev_proc_init(void)
{
	return register_pernet_subsys(&dev_proc_ops);
}
4461#else
4462#define dev_proc_init() 0
4463#endif	/* CONFIG_PROC_FS */
4464
4465
4466/**
4467 *	netdev_set_master	-	set up master pointer
4468 *	@slave: slave device
4469 *	@master: new master device
4470 *
4471 *	Changes the master device of the slave. Pass %NULL to break the
4472 *	bonding. The caller must hold the RTNL semaphore. On a failure
4473 *	a negative errno code is returned. On success the reference counts
4474 *	are adjusted and the function returns zero.
4475 */
4476int netdev_set_master(struct net_device *slave, struct net_device *master)
4477{
4478	struct net_device *old = slave->master;
4479
4480	ASSERT_RTNL();
4481
4482	if (master) {
4483		if (old)
4484			return -EBUSY;
4485		dev_hold(master);
4486	}
4487
4488	slave->master = master;
4489
4490	if (old)
4491		dev_put(old);
4492	return 0;
4493}
4494EXPORT_SYMBOL(netdev_set_master);
4495
4496/**
4497 *	netdev_set_bond_master	-	set up bonding master/slave pair
4498 *	@slave: slave device
4499 *	@master: new master device
4500 *
4501 *	Changes the master device of the slave. Pass %NULL to break the
4502 *	bonding. The caller must hold the RTNL semaphore. On a failure
4503 *	a negative errno code is returned. On success %RTM_NEWLINK is sent
4504 *	to the routing socket and the function returns zero.
4505 */
4506int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4507{
4508	int err;
4509
4510	ASSERT_RTNL();
4511
4512	err = netdev_set_master(slave, master);
4513	if (err)
4514		return err;
4515	if (master)
4516		slave->flags |= IFF_SLAVE;
4517	else
4518		slave->flags &= ~IFF_SLAVE;
4519
4520	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4521	return 0;
4522}
4523EXPORT_SYMBOL(netdev_set_bond_master);
4524
4525static void dev_change_rx_flags(struct net_device *dev, int flags)
4526{
4527	const struct net_device_ops *ops = dev->netdev_ops;
4528
4529	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4530		ops->ndo_change_rx_flags(dev, flags);
4531}
4532
4533static int __dev_set_promiscuity(struct net_device *dev, int inc)
4534{
4535	unsigned short old_flags = dev->flags;
4536	uid_t uid;
4537	gid_t gid;
4538
4539	ASSERT_RTNL();
4540
4541	dev->flags |= IFF_PROMISC;
4542	dev->promiscuity += inc;
4543	if (dev->promiscuity == 0) {
4544		/*
4545		 * Avoid overflow.
4546		 * If inc causes overflow, untouch promisc and return error.
4547		 */
4548		if (inc < 0)
4549			dev->flags &= ~IFF_PROMISC;
4550		else {
4551			dev->promiscuity -= inc;
4552			printk(KERN_WARNING "%s: promiscuity touches roof, "
4553				"set promiscuity failed, promiscuity feature "
4554				"of device might be broken.\n", dev->name);
4555			return -EOVERFLOW;
4556		}
4557	}
4558	if (dev->flags != old_flags) {
4559		printk(KERN_INFO "device %s %s promiscuous mode\n",
4560		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4561							       "left");
4562		if (audit_enabled) {
4563			current_uid_gid(&uid, &gid);
4564			audit_log(current->audit_context, GFP_ATOMIC,
4565				AUDIT_ANOM_PROMISCUOUS,
4566				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4567				dev->name, (dev->flags & IFF_PROMISC),
4568				(old_flags & IFF_PROMISC),
4569				audit_get_loginuid(current),
4570				uid, gid,
4571				audit_get_sessionid(current));
4572		}
4573
4574		dev_change_rx_flags(dev, IFF_PROMISC);
4575	}
4576	return 0;
4577}
4578
4579/**
4580 *	dev_set_promiscuity	- update promiscuity count on a device
4581 *	@dev: device
4582 *	@inc: modifier
4583 *
4584 *	Add or remove promiscuity from a device. While the count in the device
4585 *	remains above zero the interface remains promiscuous. Once it hits zero
4586 *	the device reverts back to normal filtering operation. A negative inc
4587 *	value is used to drop promiscuity on the device.
4588 *	Return 0 if successful or a negative errno code on error.
4589 */
4590int dev_set_promiscuity(struct net_device *dev, int inc)
4591{
4592	unsigned short old_flags = dev->flags;
4593	int err;
4594
4595	err = __dev_set_promiscuity(dev, inc);
4596	if (err < 0)
4597		return err;
4598	if (dev->flags != old_flags)
4599		dev_set_rx_mode(dev);
4600	return err;
4601}
4602EXPORT_SYMBOL(dev_set_promiscuity);
4603
4604/**
4605 *	dev_set_allmulti	- update allmulti count on a device
4606 *	@dev: device
4607 *	@inc: modifier
4608 *
4609 *	Add or remove reception of all multicast frames to a device. While the
4610 *	count in the device remains above zero the interface remains listening
4611 *	to all interfaces. Once it hits zero the device reverts back to normal
4612 *	filtering operation. A negative @inc value is used to drop the counter
4613 *	when releasing a resource needing all multicasts.
4614 *	Return 0 if successful or a negative errno code on error.
4615 */
4616
4617int dev_set_allmulti(struct net_device *dev, int inc)
4618{
4619	unsigned short old_flags = dev->flags;
4620
4621	ASSERT_RTNL();
4622
4623	dev->flags |= IFF_ALLMULTI;
4624	dev->allmulti += inc;
4625	if (dev->allmulti == 0) {
4626		/*
4627		 * Avoid overflow.
4628		 * If inc causes overflow, untouch allmulti and return error.
4629		 */
4630		if (inc < 0)
4631			dev->flags &= ~IFF_ALLMULTI;
4632		else {
4633			dev->allmulti -= inc;
4634			printk(KERN_WARNING "%s: allmulti touches roof, "
4635				"set allmulti failed, allmulti feature of "
4636				"device might be broken.\n", dev->name);
4637			return -EOVERFLOW;
4638		}
4639	}
4640	if (dev->flags ^ old_flags) {
4641		dev_change_rx_flags(dev, IFF_ALLMULTI);
4642		dev_set_rx_mode(dev);
4643	}
4644	return 0;
4645}
4646EXPORT_SYMBOL(dev_set_allmulti);
4647
4648/*
4649 *	Upload unicast and multicast address lists to device and
4650 *	configure RX filtering. When the device doesn't support unicast
4651 *	filtering it is put in promiscuous mode while unicast addresses
4652 *	are present.
4653 */
4654void __dev_set_rx_mode(struct net_device *dev)
4655{
4656	const struct net_device_ops *ops = dev->netdev_ops;
4657
4658	/* dev_open will call this function so the list will stay sane. */
4659	if (!(dev->flags&IFF_UP))
4660		return;
4661
4662	if (!netif_device_present(dev))
4663		return;
4664
4665	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4666		/* Unicast addresses changes may only happen under the rtnl,
4667		 * therefore calling __dev_set_promiscuity here is safe.
4668		 */
4669		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4670			__dev_set_promiscuity(dev, 1);
4671			dev->uc_promisc = true;
4672		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4673			__dev_set_promiscuity(dev, -1);
4674			dev->uc_promisc = false;
4675		}
4676	}
4677
4678	if (ops->ndo_set_rx_mode)
4679		ops->ndo_set_rx_mode(dev);
4680}
4681
/*
 * Locked wrapper around __dev_set_rx_mode(): the RX filter update runs
 * between netif_addr_lock_bh() and netif_addr_unlock_bh() on @dev.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
4688
4689/**
4690 *	dev_get_flags - get flags reported to userspace
4691 *	@dev: device
4692 *
4693 *	Get the combination of flag bits exported through APIs to userspace.
4694 */
4695unsigned dev_get_flags(const struct net_device *dev)
4696{
4697	unsigned flags;
4698
4699	flags = (dev->flags & ~(IFF_PROMISC |
4700				IFF_ALLMULTI |
4701				IFF_RUNNING |
4702				IFF_LOWER_UP |
4703				IFF_DORMANT)) |
4704		(dev->gflags & (IFF_PROMISC |
4705				IFF_ALLMULTI));
4706
4707	if (netif_running(dev)) {
4708		if (netif_oper_up(dev))
4709			flags |= IFF_RUNNING;
4710		if (netif_carrier_ok(dev))
4711			flags |= IFF_LOWER_UP;
4712		if (netif_dormant(dev))
4713			flags |= IFF_DORMANT;
4714	}
4715
4716	return flags;
4717}
4718EXPORT_SYMBOL(dev_get_flags);
4719
/*
 *	Apply a userspace-format flag word to @dev without sending any
 *	notification.  Caller must hold RTNL.
 *
 *	Returns 0, or the result of __dev_close()/__dev_open() when the
 *	IFF_UP bit was toggled.  The return values of dev_set_promiscuity()
 *	and dev_set_allmulti() are not propagated.
 */
int __dev_change_flags(struct net_device *dev, unsigned int flags)
{
	int old_flags = dev->flags;
	int ret;

	ASSERT_RTNL();

	/*
	 *	Set the flags on our device.
	 *	Only the bits listed in the first mask are taken from the
	 *	request; IFF_UP, IFF_VOLATILE, IFF_PROMISC and IFF_ALLMULTI
	 *	keep their current value here and are handled below.
	 */

	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
			       IFF_AUTOMEDIA)) |
		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
				    IFF_ALLMULTI));

	/*
	 *	Load in the correct multicast list now the flags have changed.
	 */

	if ((old_flags ^ flags) & IFF_MULTICAST)
		dev_change_rx_flags(dev, IFF_MULTICAST);

	dev_set_rx_mode(dev);

	/*
	 *	Have we downed the interface. We handle IFF_UP ourselves
	 *	according to user attempts to set it, rather than blindly
	 *	setting it.
	 */

	ret = 0;
	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
		/* Was up -> close it; was down -> open it. */
		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);

		if (!ret)
			dev_set_rx_mode(dev);
	}

	/*
	 * dev->gflags tracks what userspace asked for; only when the request
	 * differs from it is the promiscuity count adjusted by one.
	 */
	if ((flags ^ dev->gflags) & IFF_PROMISC) {
		int inc = (flags & IFF_PROMISC) ? 1 : -1;

		dev->gflags ^= IFF_PROMISC;
		dev_set_promiscuity(dev, inc);
	}

	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
	   is important. Some (broken) drivers set IFF_PROMISC, when
	   IFF_ALLMULTI is requested not asking us and not reporting.
	 */
	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;

		dev->gflags ^= IFF_ALLMULTI;
		dev_set_allmulti(dev, inc);
	}

	return ret;
}
4780
4781void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4782{
4783	unsigned int changes = dev->flags ^ old_flags;
4784
4785	if (changes & IFF_UP) {
4786		if (dev->flags & IFF_UP)
4787			call_netdevice_notifiers(NETDEV_UP, dev);
4788		else
4789			call_netdevice_notifiers(NETDEV_DOWN, dev);
4790	}
4791
4792	if (dev->flags & IFF_UP &&
4793	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4794		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4795}
4796
4797/**
4798 *	dev_change_flags - change device settings
4799 *	@dev: device
4800 *	@flags: device state flags
4801 *
4802 *	Change settings on device based state flags. The flags are
4803 *	in the userspace exported format.
4804 */
4805int dev_change_flags(struct net_device *dev, unsigned flags)
4806{
4807	int ret, changes;
4808	int old_flags = dev->flags;
4809
4810	ret = __dev_change_flags(dev, flags);
4811	if (ret < 0)
4812		return ret;
4813
4814	changes = old_flags ^ dev->flags;
4815	if (changes)
4816		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4817
4818	__dev_notify_flags(dev, old_flags);
4819	return ret;
4820}
4821EXPORT_SYMBOL(dev_change_flags);
4822
4823/**
4824 *	dev_set_mtu - Change maximum transfer unit
4825 *	@dev: device
4826 *	@new_mtu: new transfer unit
4827 *
4828 *	Change the maximum transfer size of the network device.
4829 */
4830int dev_set_mtu(struct net_device *dev, int new_mtu)
4831{
4832	const struct net_device_ops *ops = dev->netdev_ops;
4833	int err;
4834
4835	if (new_mtu == dev->mtu)
4836		return 0;
4837
4838	/*	MTU must be positive.	 */
4839	if (new_mtu < 0)
4840		return -EINVAL;
4841
4842	if (!netif_device_present(dev))
4843		return -ENODEV;
4844
4845	err = 0;
4846	if (ops->ndo_change_mtu)
4847		err = ops->ndo_change_mtu(dev, new_mtu);
4848	else
4849		dev->mtu = new_mtu;
4850
4851	if (!err && dev->flags & IFF_UP)
4852		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4853	return err;
4854}
4855EXPORT_SYMBOL(dev_set_mtu);
4856
/**
 *	dev_set_group - Change group this device belongs to
 *	@dev: device
 *	@new_group: group this device should belong to
 *
 *	Stores @new_group in @dev->group. No validation is done and no
 *	notification is sent.
 */
void dev_set_group(struct net_device *dev, int new_group)
{
	dev->group = new_group;
}
EXPORT_SYMBOL(dev_set_group);
4867
4868/**
4869 *	dev_set_mac_address - Change Media Access Control Address
4870 *	@dev: device
4871 *	@sa: new address
4872 *
4873 *	Change the hardware (MAC) address of the device
4874 */
4875int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4876{
4877	const struct net_device_ops *ops = dev->netdev_ops;
4878	int err;
4879
4880	if (!ops->ndo_set_mac_address)
4881		return -EOPNOTSUPP;
4882	if (sa->sa_family != dev->type)
4883		return -EINVAL;
4884	if (!netif_device_present(dev))
4885		return -ENODEV;
4886	err = ops->ndo_set_mac_address(dev, sa);
4887	if (!err)
4888		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4889	return err;
4890}
4891EXPORT_SYMBOL(dev_set_mac_address);
4892
4893/*
4894 *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4895 */
4896static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4897{
4898	int err;
4899	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4900
4901	if (!dev)
4902		return -ENODEV;
4903
4904	switch (cmd) {
4905	case SIOCGIFFLAGS:	/* Get interface flags */
4906		ifr->ifr_flags = (short) dev_get_flags(dev);
4907		return 0;
4908
4909	case SIOCGIFMETRIC:	/* Get the metric on the interface
4910				   (currently unused) */
4911		ifr->ifr_metric = 0;
4912		return 0;
4913
4914	case SIOCGIFMTU:	/* Get the MTU of a device */
4915		ifr->ifr_mtu = dev->mtu;
4916		return 0;
4917
4918	case SIOCGIFHWADDR:
4919		if (!dev->addr_len)
4920			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4921		else
4922			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4923			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4924		ifr->ifr_hwaddr.sa_family = dev->type;
4925		return 0;
4926
4927	case SIOCGIFSLAVE:
4928		err = -EINVAL;
4929		break;
4930
4931	case SIOCGIFMAP:
4932		ifr->ifr_map.mem_start = dev->mem_start;
4933		ifr->ifr_map.mem_end   = dev->mem_end;
4934		ifr->ifr_map.base_addr = dev->base_addr;
4935		ifr->ifr_map.irq       = dev->irq;
4936		ifr->ifr_map.dma       = dev->dma;
4937		ifr->ifr_map.port      = dev->if_port;
4938		return 0;
4939
4940	case SIOCGIFINDEX:
4941		ifr->ifr_ifindex = dev->ifindex;
4942		return 0;
4943
4944	case SIOCGIFTXQLEN:
4945		ifr->ifr_qlen = dev->tx_queue_len;
4946		return 0;
4947
4948	default:
4949		/* dev_ioctl() should ensure this case
4950		 * is never reached
4951		 */
4952		WARN_ON(1);
4953		err = -ENOTTY;
4954		break;
4955
4956	}
4957	return err;
4958}
4959
/*
 *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
 *
 *	The device is looked up by ifr->ifr_name; -ENODEV is returned if it
 *	does not exist.  Commands not handled explicitly are forwarded to the
 *	driver's ndo_do_ioctl if they are on the list of known pass-through
 *	commands, otherwise -EINVAL is returned.
 */
static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
{
	int err;
	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
	const struct net_device_ops *ops;

	if (!dev)
		return -ENODEV;

	ops = dev->netdev_ops;

	switch (cmd) {
	case SIOCSIFFLAGS:	/* Set interface flags */
		return dev_change_flags(dev, ifr->ifr_flags);

	case SIOCSIFMETRIC:	/* Set the metric on the interface
				   (currently unused) */
		return -EOPNOTSUPP;

	case SIOCSIFMTU:	/* Set the MTU of a device */
		return dev_set_mtu(dev, ifr->ifr_mtu);

	case SIOCSIFHWADDR:	/* Set the hardware address */
		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);

	case SIOCSIFHWBROADCAST:
		/* Address family must match the device type. */
		if (ifr->ifr_hwaddr.sa_family != dev->type)
			return -EINVAL;
		/* Never copy more than sa_data can hold. */
		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
		return 0;

	case SIOCSIFMAP:
		/* Only meaningful if the driver implements ndo_set_config. */
		if (ops->ndo_set_config) {
			if (!netif_device_present(dev))
				return -ENODEV;
			return ops->ndo_set_config(dev, &ifr->ifr_map);
		}
		return -EOPNOTSUPP;

	case SIOCADDMULTI:
		/* Needs a driver RX-mode hook and an AF_UNSPEC address. */
		if (!ops->ndo_set_rx_mode ||
		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
			return -EINVAL;
		if (!netif_device_present(dev))
			return -ENODEV;
		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);

	case SIOCDELMULTI:
		/* Same preconditions as SIOCADDMULTI. */
		if (!ops->ndo_set_rx_mode ||
		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
			return -EINVAL;
		if (!netif_device_present(dev))
			return -ENODEV;
		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);

	case SIOCSIFTXQLEN:
		/* A negative queue length is rejected. */
		if (ifr->ifr_qlen < 0)
			return -EINVAL;
		dev->tx_queue_len = ifr->ifr_qlen;
		return 0;

	case SIOCSIFNAME:
		/* Force NUL termination before using the new name. */
		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
		return dev_change_name(dev, ifr->ifr_newname);

	case SIOCSHWTSTAMP:
		/* Validate the request here, then let the driver handle it. */
		err = net_hwtstamp_validate(ifr);
		if (err)
			return err;
		/* fall through */

	/*
	 *	Unknown or private ioctl
	 */
	default:
		if ((cmd >= SIOCDEVPRIVATE &&
		    cmd <= SIOCDEVPRIVATE + 15) ||
		    cmd == SIOCBONDENSLAVE ||
		    cmd == SIOCBONDRELEASE ||
		    cmd == SIOCBONDSETHWADDR ||
		    cmd == SIOCBONDSLAVEINFOQUERY ||
		    cmd == SIOCBONDINFOQUERY ||
		    cmd == SIOCBONDCHANGEACTIVE ||
		    cmd == SIOCGMIIPHY ||
		    cmd == SIOCGMIIREG ||
		    cmd == SIOCSMIIREG ||
		    cmd == SIOCBRADDIF ||
		    cmd == SIOCBRDELIF ||
		    cmd == SIOCSHWTSTAMP ||
		    cmd == SIOCWANDEV) {
			/*
			 * Pass-through command: -EOPNOTSUPP without a driver
			 * hook, -ENODEV if the device is not present.
			 */
			err = -EOPNOTSUPP;
			if (ops->ndo_do_ioctl) {
				if (netif_device_present(dev))
					err = ops->ndo_do_ioctl(dev, ifr, cmd);
				else
					err = -ENODEV;
			}
		} else
			err = -EINVAL;

	}
	return err;
}
5068
5069/*
5070 *	This function handles all "interface"-type I/O control requests. The actual
5071 *	'doing' part of this is dev_ifsioc above.
5072 */
5073
/**
 *	dev_ioctl	-	network device ioctl
 *	@net: the applicable net namespace
 *	@cmd: command to issue
 *	@arg: pointer to a struct ifreq in user space
 *
 *	Issue ioctl functions to devices. This is normally called by the
 *	user space syscall interfaces but can sometimes be useful for
 *	other purposes. The return value is the return from the syscall if
 *	positive or a negative errno code on error.
 *
 *	Commands are grouped by the locking and privilege they need: simple
 *	getters run under rcu_read_lock(), everything that goes through
 *	dev_ifsioc() or dev_ethtool() runs under rtnl_lock(), and the
 *	setters additionally require %CAP_NET_ADMIN.
 */

int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct ifreq ifr;
	int ret;
	char *colon;

	/* One special case: SIOCGIFCONF takes ifconf argument
	   and requires shared lock, because it sleeps writing
	   to user space.
	 */

	if (cmd == SIOCGIFCONF) {
		rtnl_lock();
		ret = dev_ifconf(net, (char __user *) arg);
		rtnl_unlock();
		return ret;
	}
	/* SIOCGIFNAME works on the user buffer directly, no copy-in here. */
	if (cmd == SIOCGIFNAME)
		return dev_ifname(net, (struct ifreq __user *)arg);

	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
		return -EFAULT;

	/* The name comes from userspace: force NUL termination. */
	ifr.ifr_name[IFNAMSIZ-1] = 0;

	/*
	 * Cut the name at the first ':' so the lookup uses the part before
	 * it; the ':' is put back before the ifreq is copied to userspace
	 * in the branches below that restore it.
	 */
	colon = strchr(ifr.ifr_name, ':');
	if (colon)
		*colon = 0;

	/*
	 *	See which interface the caller is talking about.
	 */

	switch (cmd) {
	/*
	 *	These ioctl calls:
	 *	- can be done by all.
	 *	- atomic and do not require locking.
	 *	- return a value
	 */
	case SIOCGIFFLAGS:
	case SIOCGIFMETRIC:
	case SIOCGIFMTU:
	case SIOCGIFHWADDR:
	case SIOCGIFSLAVE:
	case SIOCGIFMAP:
	case SIOCGIFINDEX:
	case SIOCGIFTXQLEN:
		dev_load(net, ifr.ifr_name);
		rcu_read_lock();
		ret = dev_ifsioc_locked(net, &ifr, cmd);
		rcu_read_unlock();
		if (!ret) {
			if (colon)
				*colon = ':';
			if (copy_to_user(arg, &ifr,
					 sizeof(struct ifreq)))
				ret = -EFAULT;
		}
		return ret;

	/* ethtool requests: handled by dev_ethtool() under rtnl_lock(). */
	case SIOCETHTOOL:
		dev_load(net, ifr.ifr_name);
		rtnl_lock();
		ret = dev_ethtool(net, &ifr);
		rtnl_unlock();
		if (!ret) {
			if (colon)
				*colon = ':';
			if (copy_to_user(arg, &ifr,
					 sizeof(struct ifreq)))
				ret = -EFAULT;
		}
		return ret;

	/*
	 *	These ioctl calls:
	 *	- require superuser power.
	 *	- require strict serialization.
	 *	- return a value
	 */
	case SIOCGMIIPHY:
	case SIOCGMIIREG:
	case SIOCSIFNAME:
		if (!capable(CAP_NET_ADMIN))
			return -EPERM;
		dev_load(net, ifr.ifr_name);
		rtnl_lock();
		ret = dev_ifsioc(net, &ifr, cmd);
		rtnl_unlock();
		if (!ret) {
			if (colon)
				*colon = ':';
			if (copy_to_user(arg, &ifr,
					 sizeof(struct ifreq)))
				ret = -EFAULT;
		}
		return ret;

	/*
	 *	These ioctl calls:
	 *	- require superuser power.
	 *	- require strict serialization.
	 *	- do not return a value
	 */
	case SIOCSIFFLAGS:
	case SIOCSIFMETRIC:
	case SIOCSIFMTU:
	case SIOCSIFMAP:
	case SIOCSIFHWADDR:
	case SIOCSIFSLAVE:
	case SIOCADDMULTI:
	case SIOCDELMULTI:
	case SIOCSIFHWBROADCAST:
	case SIOCSIFTXQLEN:
	case SIOCSMIIREG:
	case SIOCBONDENSLAVE:
	case SIOCBONDRELEASE:
	case SIOCBONDSETHWADDR:
	case SIOCBONDCHANGEACTIVE:
	case SIOCBRADDIF:
	case SIOCBRDELIF:
	case SIOCSHWTSTAMP:
		if (!capable(CAP_NET_ADMIN))
			return -EPERM;
		/* fall through */
	/* The two bonding queries share the path but skip the capability check. */
	case SIOCBONDSLAVEINFOQUERY:
	case SIOCBONDINFOQUERY:
		dev_load(net, ifr.ifr_name);
		rtnl_lock();
		ret = dev_ifsioc(net, &ifr, cmd);
		rtnl_unlock();
		return ret;

	case SIOCGIFMEM:
		/* Get the per device memory space. We can add this but
		 * currently do not support it */
	case SIOCSIFMEM:
		/* Set the per device memory buffer space.
		 * Not applicable in our case */
	case SIOCSIFLINK:
		return -ENOTTY;

	/*
	 *	Unknown or private ioctl.
	 */
	default:
		if (cmd == SIOCWANDEV ||
		    (cmd >= SIOCDEVPRIVATE &&
		     cmd <= SIOCDEVPRIVATE + 15)) {
			dev_load(net, ifr.ifr_name);
			rtnl_lock();
			ret = dev_ifsioc(net, &ifr, cmd);
			rtnl_unlock();
			/* Note: the ':' is not restored on this path. */
			if (!ret && copy_to_user(arg, &ifr,
						 sizeof(struct ifreq)))
				ret = -EFAULT;
			return ret;
		}
		/* Take care of Wireless Extensions */
		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
			return wext_handle_ioctl(net, &ifr, cmd, arg);
		return -ENOTTY;
	}
}
5251
5252
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns a positive interface index that is not currently used by any
 *	device in @net. Candidates are taken from a counter shared by all
 *	namespaces, which restarts at 1 once it stops being positive.
 *	The caller must hold the rtnl semaphore or the dev_base_lock to be
 *	sure the result remains unique.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	do {
		if (++ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
5271
/* Delayed registration/unregistration */
static LIST_HEAD(net_todo_list);

/* Append @dev (through its todo_list member) to the tail of net_todo_list. */
static void net_set_todo(struct net_device *dev)
{
	list_add_tail(&dev->todo_list, &net_todo_list);
}
5279
5280static void rollback_registered_many(struct list_head *head)
5281{
5282	struct net_device *dev, *tmp;
5283
5284	BUG_ON(dev_boot_phase);
5285	ASSERT_RTNL();
5286
5287	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5288		/* Some devices call without registering
5289		 * for initialization unwind. Remove those
5290		 * devices and proceed with the remaining.
5291		 */
5292		if (dev->reg_state == NETREG_UNINITIALIZED) {
5293			pr_debug("unregister_netdevice: device %s/%p never "
5294				 "was registered\n", dev->name, dev);
5295
5296			WARN_ON(1);
5297			list_del(&dev->unreg_list);
5298			continue;
5299		}
5300		dev->dismantle = true;
5301		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5302	}
5303
5304	/* If device is running, close it first. */
5305	dev_close_many(head);
5306
5307	list_for_each_entry(dev, head, unreg_list) {
5308		/* And unlink it from device chain. */
5309		unlist_netdevice(dev);
5310
5311		dev->reg_state = NETREG_UNREGISTERING;
5312	}
5313
5314	synchronize_net();
5315
5316	list_for_each_entry(dev, head, unreg_list) {
5317		/* Shutdown queueing discipline. */
5318		dev_shutdown(dev);
5319
5320
5321		/* Notify protocols, that we are about to destroy
5322		   this device. They should clean all the things.
5323		*/
5324		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5325
5326		if (!dev->rtnl_link_ops ||
5327		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5328			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5329
5330		/*
5331		 *	Flush the unicast and multicast chains
5332		 */
5333		dev_uc_flush(dev);
5334		dev_mc_flush(dev);
5335
5336		if (dev->netdev_ops->ndo_uninit)
5337			dev->netdev_ops->ndo_uninit(dev);
5338
5339		/* Notifier chain MUST detach us from master device. */
5340		WARN_ON(dev->master);
5341
5342		/* Remove entries from kobject tree */
5343		netdev_unregister_kobject(dev);
5344	}
5345
5346	/* Process any work delayed until the end of the batch */
5347	dev = list_first_entry(head, struct net_device, unreg_list);
5348	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5349
5350	synchronize_net();
5351
5352	list_for_each_entry(dev, head, unreg_list)
5353		dev_put(dev);
5354}
5355
/*
 * Single-device convenience wrapper: puts @dev on a temporary on-stack
 * list, runs rollback_registered_many() on it and removes the temporary
 * list head again.
 */
static void rollback_registered(struct net_device *dev)
{
	LIST_HEAD(single);

	list_add(&dev->unreg_list, &single);
	rollback_registered_many(&single);
	list_del(&single);
}
5364
5365static u32 netdev_fix_features(struct net_device *dev, u32 features)
5366{
5367	/* Fix illegal checksum combinations */
5368	if ((features & NETIF_F_HW_CSUM) &&
5369	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5370		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5371		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5372	}
5373
5374	if ((features & NETIF_F_NO_CSUM) &&
5375	    (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5376		netdev_warn(dev, "mixed no checksumming and other settings.\n");
5377		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5378	}
5379
5380	/* Fix illegal SG+CSUM combinations. */
5381	if ((features & NETIF_F_SG) &&
5382	    !(features & NETIF_F_ALL_CSUM)) {
5383		netdev_dbg(dev,
5384			"Dropping NETIF_F_SG since no checksum feature.\n");
5385		features &= ~NETIF_F_SG;
5386	}
5387
5388	/* TSO requires that SG is present as well. */
5389	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5390		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5391		features &= ~NETIF_F_ALL_TSO;
5392	}
5393
5394	/* TSO ECN requires that TSO is present as well. */
5395	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5396		features &= ~NETIF_F_TSO_ECN;
5397
5398	/* Software GSO depends on SG. */
5399	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5400		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5401		features &= ~NETIF_F_GSO;
5402	}
5403
5404	/* UFO needs SG and checksumming */
5405	if (features & NETIF_F_UFO) {
5406		/* maybe split UFO into V4 and V6? */
5407		if (!((features & NETIF_F_GEN_CSUM) ||
5408		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5409			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5410			netdev_dbg(dev,
5411				"Dropping NETIF_F_UFO since no checksum offload features.\n");
5412			features &= ~NETIF_F_UFO;
5413		}
5414
5415		if (!(features & NETIF_F_SG)) {
5416			netdev_dbg(dev,
5417				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5418			features &= ~NETIF_F_UFO;
5419		}
5420	}
5421
5422	return features;
5423}
5424
5425int __netdev_update_features(struct net_device *dev)
5426{
5427	u32 features;
5428	int err = 0;
5429
5430	ASSERT_RTNL();
5431
5432	features = netdev_get_wanted_features(dev);
5433
5434	if (dev->netdev_ops->ndo_fix_features)
5435		features = dev->netdev_ops->ndo_fix_features(dev, features);
5436
5437	/* driver might be less strict about feature dependencies */
5438	features = netdev_fix_features(dev, features);
5439
5440	if (dev->features == features)
5441		return 0;
5442
5443	netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n",
5444		dev->features, features);
5445
5446	if (dev->netdev_ops->ndo_set_features)
5447		err = dev->netdev_ops->ndo_set_features(dev, features);
5448
5449	if (unlikely(err < 0)) {
5450		netdev_err(dev,
5451			"set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
5452			err, features, dev->features);
5453		return -1;
5454	}
5455
5456	if (!err)
5457		dev->features = features;
5458
5459	return 1;
5460}
5461
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications if it
 *	has changed. Should be called after driver or hardware dependent
 *	conditions might have changed that influence the features.
 *
 *	Note that the notification is sent for any non-zero result of
 *	__netdev_update_features(), which includes its -1 failure return.
 */
void netdev_update_features(struct net_device *dev)
{
	if (__netdev_update_features(dev))
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5476
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications even
 *	if they have not changed. Should be called instead of
 *	netdev_update_features() if also dev->vlan_features might
 *	have changed to allow the changes to be propagated to stacked
 *	VLAN devices.
 *
 *	The result of __netdev_update_features() is ignored; the
 *	notification is sent unconditionally.
 */
void netdev_change_features(struct net_device *dev)
{
	__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5493
5494/**
5495 *	netif_stacked_transfer_operstate -	transfer operstate
5496 *	@rootdev: the root or lower level device to transfer state from
5497 *	@dev: the device to transfer operstate to
5498 *
5499 *	Transfer operational state from root to device. This is normally
5500 *	called when a stacking relationship exists between the root
5501 *	device and the device(a leaf device).
5502 */
5503void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5504					struct net_device *dev)
5505{
5506	if (rootdev->operstate == IF_OPER_DORMANT)
5507		netif_dormant_on(dev);
5508	else
5509		netif_dormant_off(dev);
5510
5511	if (netif_carrier_ok(rootdev)) {
5512		if (!netif_carrier_ok(dev))
5513			netif_carrier_on(dev);
5514	} else {
5515		if (netif_carrier_ok(dev))
5516			netif_carrier_off(dev);
5517	}
5518}
5519EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5520
#ifdef CONFIG_RPS
/*
 * Allocate the zeroed array of dev->num_rx_queues receive queues, store
 * it in dev->_rx and point every queue back at @dev.
 * Returns 0 on success or -ENOMEM if the allocation fails.
 * It is a bug to call this with num_rx_queues == 0.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int count = dev->num_rx_queues;
	struct netdev_rx_queue *rx, *q;

	BUG_ON(count < 1);

	rx = kcalloc(count, sizeof(*rx), GFP_KERNEL);
	if (rx == NULL) {
		pr_err("netdev: Unable to allocate %u rx queues.\n", count);
		return -ENOMEM;
	}
	dev->_rx = rx;

	for (q = rx; q < rx + count; q++)
		q->dev = dev;
	return 0;
}
#endif
5541
/*
 * Per-queue initialiser, used as the callback passed to
 * netdev_for_each_tx_queue() in netif_alloc_netdev_queues().
 * @_unused is ignored.
 */
static void netdev_init_one_queue(struct net_device *dev,
				  struct netdev_queue *queue, void *_unused)
{
	/* Initialize queue lock */
	spin_lock_init(&queue->_xmit_lock);
	/* Lockdep class is selected by the device type. */
	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
	/* -1: presumably "no CPU owns the xmit lock" -- TODO confirm */
	queue->xmit_lock_owner = -1;
	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
	queue->dev = dev;
}
5552
5553static int netif_alloc_netdev_queues(struct net_device *dev)
5554{
5555	unsigned int count = dev->num_tx_queues;
5556	struct netdev_queue *tx;
5557
5558	BUG_ON(count < 1);
5559
5560	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5561	if (!tx) {
5562		pr_err("netdev: Unable to allocate %u tx queues.\n",
5563		       count);
5564		return -ENOMEM;
5565	}
5566	dev->_tx = tx;
5567
5568	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5569	spin_lock_init(&dev->tx_global_lock);
5570
5571	return 0;
5572}
5573
5574/**
5575 *	register_netdevice	- register a network device
5576 *	@dev: device to register
5577 *
5578 *	Take a completed network device structure and add it to the kernel
5579 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5580 *	chain. 0 is returned on success. A negative errno code is returned
5581 *	on a failure to set up the device, or if the name is a duplicate.
5582 *
5583 *	Callers must hold the rtnl semaphore. You may want
5584 *	register_netdev() instead of this.
5585 *
5586 *	BUGS:
5587 *	The locking appears insufficient to guarantee two parallel registers
5588 *	will not get the same name.
5589 */
5590
5591int register_netdevice(struct net_device *dev)
5592{
5593	int ret;
5594	struct net *net = dev_net(dev);
5595
5596	BUG_ON(dev_boot_phase);
5597	ASSERT_RTNL();
5598
5599	might_sleep();
5600
5601	/* When net_device's are persistent, this will be fatal. */
5602	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5603	BUG_ON(!net);
5604
5605	spin_lock_init(&dev->addr_list_lock);
5606	netdev_set_addr_lockdep_class(dev);
5607
5608	dev->iflink = -1;
5609
5610	ret = dev_get_valid_name(dev, dev->name);
5611	if (ret < 0)
5612		goto out;
5613
5614	/* Init, if this function is available */
5615	if (dev->netdev_ops->ndo_init) {
5616		ret = dev->netdev_ops->ndo_init(dev);
5617		if (ret) {
5618			if (ret > 0)
5619				ret = -EIO;
5620			goto out;
5621		}
5622	}
5623
5624	dev->ifindex = dev_new_index(net);
5625	if (dev->iflink == -1)
5626		dev->iflink = dev->ifindex;
5627
5628	/* Transfer changeable features to wanted_features and enable
5629	 * software offloads (GSO and GRO).
5630	 */
5631	dev->hw_features |= NETIF_F_SOFT_FEATURES;
5632	dev->features |= NETIF_F_SOFT_FEATURES;
5633	dev->wanted_features = dev->features & dev->hw_features;
5634
5635	/* Turn on no cache copy if HW is doing checksum */
5636	dev->hw_features |= NETIF_F_NOCACHE_COPY;
5637	if ((dev->features & NETIF_F_ALL_CSUM) &&
5638	    !(dev->features & NETIF_F_NO_CSUM)) {
5639		dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5640		dev->features |= NETIF_F_NOCACHE_COPY;
5641	}
5642
5643	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5644	 */
5645	dev->vlan_features |= NETIF_F_HIGHDMA;
5646
5647	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5648	ret = notifier_to_errno(ret);
5649	if (ret)
5650		goto err_uninit;
5651
5652	ret = netdev_register_kobject(dev);
5653	if (ret)
5654		goto err_uninit;
5655	dev->reg_state = NETREG_REGISTERED;
5656
5657	__netdev_update_features(dev);
5658
5659	/*
5660	 *	Default initial state at registry is that the
5661	 *	device is present.
5662	 */
5663
5664	set_bit(__LINK_STATE_PRESENT, &dev->state);
5665
5666	dev_init_scheduler(dev);
5667	dev_hold(dev);
5668	list_netdevice(dev);
5669
5670	/* Notify protocols, that a new device appeared. */
5671	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5672	ret = notifier_to_errno(ret);
5673	if (ret) {
5674		rollback_registered(dev);
5675		dev->reg_state = NETREG_UNREGISTERED;
5676	}
5677	/*
5678	 *	Prevent userspace races by waiting until the network
5679	 *	device is fully setup before sending notifications.
5680	 */
5681	if (!dev->rtnl_link_ops ||
5682	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5683		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5684
5685out:
5686	return ret;
5687
5688err_uninit:
5689	if (dev->netdev_ops->ndo_uninit)
5690		dev->netdev_ops->ndo_uninit(dev);
5691	goto out;
5692}
5693EXPORT_SYMBOL(register_netdevice);
5694
5695/**
5696 *	init_dummy_netdev	- init a dummy network device for NAPI
5697 *	@dev: device to init
5698 *
5699 *	This takes a network device structure and initialize the minimum
5700 *	amount of fields so it can be used to schedule NAPI polls without
5701 *	registering a full blown interface. This is to be used by drivers
5702 *	that need to tie several hardware interfaces to a single NAPI
5703 *	poll scheduler due to HW limitations.
5704 */
5705int init_dummy_netdev(struct net_device *dev)
5706{
5707	/* Clear everything. Note we don't initialize spinlocks
5708	 * are they aren't supposed to be taken by any of the
5709	 * NAPI code and this dummy netdev is supposed to be
5710	 * only ever used for NAPI polls
5711	 */
5712	memset(dev, 0, sizeof(struct net_device));
5713
5714	/* make sure we BUG if trying to hit standard
5715	 * register/unregister code path
5716	 */
5717	dev->reg_state = NETREG_DUMMY;
5718
5719	/* NAPI wants this */
5720	INIT_LIST_HEAD(&dev->napi_list);
5721
5722	/* a dummy interface is started by default */
5723	set_bit(__LINK_STATE_PRESENT, &dev->state);
5724	set_bit(__LINK_STATE_START, &dev->state);
5725
5726	/* Note : We dont allocate pcpu_refcnt for dummy devices,
5727	 * because users of this 'device' dont need to change
5728	 * its refcount.
5729	 */
5730
5731	return 0;
5732}
5733EXPORT_SYMBOL_GPL(init_dummy_netdev);
5734
5735
5736/**
5737 *	register_netdev	- register a network device
5738 *	@dev: device to register
5739 *
5740 *	Take a completed network device structure and add it to the kernel
5741 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5742 *	chain. 0 is returned on success. A negative errno code is returned
5743 *	on a failure to set up the device, or if the name is a duplicate.
5744 *
5745 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
5746 *	and expands the device name if you passed a format string to
5747 *	alloc_netdev.
5748 */
int register_netdev(struct net_device *dev)
{
	int ret;

	/* register_netdevice() asserts that the RTNL is held. */
	rtnl_lock();
	ret = register_netdevice(dev);
	rtnl_unlock();

	return ret;
}
EXPORT_SYMBOL(register_netdev);
5759
5760int netdev_refcnt_read(const struct net_device *dev)
5761{
5762	int i, refcnt = 0;
5763
5764	for_each_possible_cpu(i)
5765		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5766	return refcnt;
5767}
5768EXPORT_SYMBOL(netdev_refcnt_read);
5769
5770/*
5771 * netdev_wait_allrefs - wait until all references are gone.
5772 *
5773 * This is called when unregistering network devices.
5774 *
5775 * Any protocol or device that holds a reference should register
5776 * for netdevice notification, and cleanup and put back the
5777 * reference if they receive an UNREGISTER event.
5778 * We can get stuck here if buggy protocols don't correctly
5779 * call dev_put.
5780 */
5781static void netdev_wait_allrefs(struct net_device *dev)
5782{
5783	unsigned long rebroadcast_time, warning_time;
5784	int refcnt;
5785
5786	linkwatch_forget_dev(dev);
5787
5788	rebroadcast_time = warning_time = jiffies;
5789	refcnt = netdev_refcnt_read(dev);
5790
5791	while (refcnt != 0) {
5792		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5793			rtnl_lock();
5794
5795			/* Rebroadcast unregister notification */
5796			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5797			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5798			 * should have already handle it the first time */
5799
5800			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5801				     &dev->state)) {
5802				/* We must not have linkwatch events
5803				 * pending on unregister. If this
5804				 * happens, we simply run the queue
5805				 * unscheduled, resulting in a noop
5806				 * for this device.
5807				 */
5808				linkwatch_run_queue();
5809			}
5810
5811			__rtnl_unlock();
5812
5813			rebroadcast_time = jiffies;
5814		}
5815
5816		msleep(250);
5817
5818		refcnt = netdev_refcnt_read(dev);
5819
5820		if (time_after(jiffies, warning_time + 10 * HZ)) {
5821			printk(KERN_EMERG "unregister_netdevice: "
5822			       "waiting for %s to become free. Usage "
5823			       "count = %d\n",
5824			       dev->name, refcnt);
5825			warning_time = jiffies;
5826		}
5827	}
5828}
5829
5830/* The sequence is:
5831 *
5832 *	rtnl_lock();
5833 *	...
5834 *	register_netdevice(x1);
5835 *	register_netdevice(x2);
5836 *	...
5837 *	unregister_netdevice(y1);
5838 *	unregister_netdevice(y2);
5839 *      ...
5840 *	rtnl_unlock();
5841 *	free_netdev(y1);
5842 *	free_netdev(y2);
5843 *
5844 * We are invoked by rtnl_unlock().
5845 * This allows us to deal with problems:
5846 * 1) We can delete sysfs objects which invoke hotplug
5847 *    without deadlocking with linkwatch via keventd.
5848 * 2) Since we run with the RTNL semaphore not held, we can sleep
5849 *    safely in order to wait for the netdev refcnt to drop to zero.
5850 *
5851 * We must not return until all unregister events added during
5852 * the interval the lock was held have been completed.
5853 */
5854void netdev_run_todo(void)
5855{
5856	struct list_head list;
5857
5858	/* Snapshot list, allow later requests */
5859	list_replace_init(&net_todo_list, &list);
5860
5861	__rtnl_unlock();
5862
5863	/* Wait for rcu callbacks to finish before attempting to drain
5864	 * the device list.  This usually avoids a 250ms wait.
5865	 */
5866	if (!list_empty(&list))
5867		rcu_barrier();
5868
5869	while (!list_empty(&list)) {
5870		struct net_device *dev
5871			= list_first_entry(&list, struct net_device, todo_list);
5872		list_del(&dev->todo_list);
5873
5874		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5875			printk(KERN_ERR "network todo '%s' but state %d\n",
5876			       dev->name, dev->reg_state);
5877			dump_stack();
5878			continue;
5879		}
5880
5881		dev->reg_state = NETREG_UNREGISTERED;
5882
5883		on_each_cpu(flush_backlog, dev, 1);
5884
5885		netdev_wait_allrefs(dev);
5886
5887		/* paranoia */
5888		BUG_ON(netdev_refcnt_read(dev));
5889		WARN_ON(rcu_access_pointer(dev->ip_ptr));
5890		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5891		WARN_ON(dev->dn_ptr);
5892
5893		if (dev->destructor)
5894			dev->destructor(dev);
5895
5896		/* Free network device */
5897		kobject_put(&dev->dev.kobj);
5898	}
5899}
5900
5901/* Convert net_device_stats to rtnl_link_stats64.  They have the same
5902 * fields in the same order, with only the type differing.
5903 */
5904static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5905				    const struct net_device_stats *netdev_stats)
5906{
5907#if BITS_PER_LONG == 64
5908        BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5909        memcpy(stats64, netdev_stats, sizeof(*stats64));
5910#else
5911	size_t i, n = sizeof(*stats64) / sizeof(u64);
5912	const unsigned long *src = (const unsigned long *)netdev_stats;
5913	u64 *dst = (u64 *)stats64;
5914
5915	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5916		     sizeof(*stats64) / sizeof(u64));
5917	for (i = 0; i < n; i++)
5918		dst[i] = src[i];
5919#endif
5920}
5921
5922/**
5923 *	dev_get_stats	- get network device statistics
5924 *	@dev: device to get statistics from
5925 *	@storage: place to store stats
5926 *
5927 *	Get network statistics from device. Return @storage.
5928 *	The device driver may provide its own method by setting
 *	dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
5930 *	otherwise the internal statistics structure is used.
5931 */
5932struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5933					struct rtnl_link_stats64 *storage)
5934{
5935	const struct net_device_ops *ops = dev->netdev_ops;
5936
5937	if (ops->ndo_get_stats64) {
5938		memset(storage, 0, sizeof(*storage));
5939		ops->ndo_get_stats64(dev, storage);
5940	} else if (ops->ndo_get_stats) {
5941		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5942	} else {
5943		netdev_stats_to_stats64(storage, &dev->stats);
5944	}
5945	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5946	return storage;
5947}
5948EXPORT_SYMBOL(dev_get_stats);
5949
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	/*
	 * Return the device's ingress queue.  With CONFIG_NET_CLS_ACT a
	 * missing queue is allocated, attached to noop_qdisc and published
	 * in dev->ingress_queue; NULL is returned if that allocation fails.
	 */
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (ingress == NULL) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress != NULL) {
			netdev_init_one_queue(dev, ingress, NULL);
			ingress->qdisc = &noop_qdisc;
			ingress->qdisc_sleeping = &noop_qdisc;
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
5967
5968/**
5969 *	alloc_netdev_mqs - allocate network device
5970 *	@sizeof_priv:	size of private data to allocate space for
5971 *	@name:		device name format string
5972 *	@setup:		callback to initialize device
5973 *	@txqs:		the number of TX subqueues to allocate
5974 *	@rxqs:		the number of RX subqueues to allocate
5975 *
5976 *	Allocates a struct net_device with private data area for driver use
 *	and performs basic initialization.  Also allocates subqueue structs
5978 *	for each queue on the device.
5979 */
5980struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5981		void (*setup)(struct net_device *),
5982		unsigned int txqs, unsigned int rxqs)
5983{
5984	struct net_device *dev;
5985	size_t alloc_size;
5986	struct net_device *p;
5987
5988	BUG_ON(strlen(name) >= sizeof(dev->name));
5989
5990	if (txqs < 1) {
5991		pr_err("alloc_netdev: Unable to allocate device "
5992		       "with zero queues.\n");
5993		return NULL;
5994	}
5995
5996#ifdef CONFIG_RPS
5997	if (rxqs < 1) {
5998		pr_err("alloc_netdev: Unable to allocate device "
5999		       "with zero RX queues.\n");
6000		return NULL;
6001	}
6002#endif
6003
6004	alloc_size = sizeof(struct net_device);
6005	if (sizeof_priv) {
6006		/* ensure 32-byte alignment of private area */
6007		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6008		alloc_size += sizeof_priv;
6009	}
6010	/* ensure 32-byte alignment of whole construct */
6011	alloc_size += NETDEV_ALIGN - 1;
6012
6013	p = kzalloc(alloc_size, GFP_KERNEL);
6014	if (!p) {
6015		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
6016		return NULL;
6017	}
6018
6019	dev = PTR_ALIGN(p, NETDEV_ALIGN);
6020	dev->padded = (char *)dev - (char *)p;
6021
6022	dev->pcpu_refcnt = alloc_percpu(int);
6023	if (!dev->pcpu_refcnt)
6024		goto free_p;
6025
6026	if (dev_addr_init(dev))
6027		goto free_pcpu;
6028
6029	dev_mc_init(dev);
6030	dev_uc_init(dev);
6031
6032	dev_net_set(dev, &init_net);
6033
6034	dev->gso_max_size = GSO_MAX_SIZE;
6035
6036	INIT_LIST_HEAD(&dev->napi_list);
6037	INIT_LIST_HEAD(&dev->unreg_list);
6038	INIT_LIST_HEAD(&dev->link_watch_list);
6039	dev->priv_flags = IFF_XMIT_DST_RELEASE;
6040	setup(dev);
6041
6042	dev->num_tx_queues = txqs;
6043	dev->real_num_tx_queues = txqs;
6044	if (netif_alloc_netdev_queues(dev))
6045		goto free_all;
6046
6047#ifdef CONFIG_RPS
6048	dev->num_rx_queues = rxqs;
6049	dev->real_num_rx_queues = rxqs;
6050	if (netif_alloc_rx_queues(dev))
6051		goto free_all;
6052#endif
6053
6054	strcpy(dev->name, name);
6055	dev->group = INIT_NETDEV_GROUP;
6056	return dev;
6057
6058free_all:
6059	free_netdev(dev);
6060	return NULL;
6061
6062free_pcpu:
6063	free_percpu(dev->pcpu_refcnt);
6064	kfree(dev->_tx);
6065#ifdef CONFIG_RPS
6066	kfree(dev->_rx);
6067#endif
6068
6069free_p:
6070	kfree(p);
6071	return NULL;
6072}
6073EXPORT_SYMBOL(alloc_netdev_mqs);
6074
6075/**
6076 *	free_netdev - free network device
6077 *	@dev: device
6078 *
6079 *	This function does the last stage of destroying an allocated device
6080 * 	interface. The reference to the device object is released.
6081 *	If this is the last reference then it will be freed.
6082 */
6083void free_netdev(struct net_device *dev)
6084{
6085	struct napi_struct *p, *n;
6086
6087	release_net(dev_net(dev));
6088
6089	kfree(dev->_tx);
6090#ifdef CONFIG_RPS
6091	kfree(dev->_rx);
6092#endif
6093
6094	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6095
6096	/* Flush device addresses */
6097	dev_addr_flush(dev);
6098
6099	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6100		netif_napi_del(p);
6101
6102	free_percpu(dev->pcpu_refcnt);
6103	dev->pcpu_refcnt = NULL;
6104
6105	/*  Compatibility with error handling in drivers */
6106	if (dev->reg_state == NETREG_UNINITIALIZED) {
6107		kfree((char *)dev - dev->padded);
6108		return;
6109	}
6110
6111	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6112	dev->reg_state = NETREG_RELEASED;
6113
6114	/* will free via device release */
6115	put_device(&dev->dev);
6116}
6117EXPORT_SYMBOL(free_netdev);
6118
6119/**
6120 *	synchronize_net -  Synchronize with packet receive processing
6121 *
6122 *	Wait for packets currently being received to be done.
6123 *	Does not block later packets from starting.
6124 */
void synchronize_net(void)
{
	might_sleep();

	/* Without the RTNL held, use the ordinary grace period. */
	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	/* The RTNL is held: use the expedited variant. */
	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6134
6135/**
6136 *	unregister_netdevice_queue - remove device from the kernel
6137 *	@dev: device
6138 *	@head: list
6139 *
6140 *	This function shuts down a device interface and removes it
6141 *	from the kernel tables.
6142 *	If head not NULL, device is queued to be unregistered later.
6143 *
6144 *	Callers must hold the rtnl semaphore.  You may want
6145 *	unregister_netdev() instead of this.
6146 */
6147
6148void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6149{
6150	ASSERT_RTNL();
6151
6152	if (head) {
6153		list_move_tail(&dev->unreg_list, head);
6154	} else {
6155		rollback_registered(dev);
6156		/* Finish processing unregister after unlock */
6157		net_set_todo(dev);
6158	}
6159}
6160EXPORT_SYMBOL(unregister_netdevice_queue);
6161
6162/**
6163 *	unregister_netdevice_many - unregister many devices
6164 *	@head: list of devices
6165 */
6166void unregister_netdevice_many(struct list_head *head)
6167{
6168	struct net_device *dev;
6169
6170	if (!list_empty(head)) {
6171		rollback_registered_many(head);
6172		list_for_each_entry(dev, head, unreg_list)
6173			net_set_todo(dev);
6174	}
6175}
6176EXPORT_SYMBOL(unregister_netdevice_many);
6177
6178/**
6179 *	unregister_netdev - remove device from the kernel
6180 *	@dev: device
6181 *
6182 *	This function shuts down a device interface and removes it
6183 *	from the kernel tables.
6184 *
6185 *	This is just a wrapper for unregister_netdevice that takes
6186 *	the rtnl semaphore.  In general you want to use this and not
6187 *	unregister_netdevice.
6188 */
void unregister_netdev(struct net_device *dev)
{
	/* Bracket the unregistration with the RTNL. */
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6196
6197/**
 *	dev_change_net_namespace - move device to different network namespace
6199 *	@dev: device
6200 *	@net: network namespace
6201 *	@pat: If not NULL name pattern to try if the current device name
6202 *	      is already taken in the destination network namespace.
6203 *
6204 *	This function shuts down a device interface and moves it
6205 *	to a new network namespace. On success 0 is returned, on
 *	a failure a negative errno code is returned.
6207 *
6208 *	Callers must hold the rtnl semaphore.
6209 */
6210
6211int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6212{
6213	int err;
6214
6215	ASSERT_RTNL();
6216
6217	/* Don't allow namespace local devices to be moved. */
6218	err = -EINVAL;
6219	if (dev->features & NETIF_F_NETNS_LOCAL)
6220		goto out;
6221
6222	/* Ensure the device has been registrered */
6223	err = -EINVAL;
6224	if (dev->reg_state != NETREG_REGISTERED)
6225		goto out;
6226
6227	/* Get out if there is nothing todo */
6228	err = 0;
6229	if (net_eq(dev_net(dev), net))
6230		goto out;
6231
6232	/* Pick the destination device name, and ensure
6233	 * we can use it in the destination network namespace.
6234	 */
6235	err = -EEXIST;
6236	if (__dev_get_by_name(net, dev->name)) {
6237		/* We get here if we can't use the current device name */
6238		if (!pat)
6239			goto out;
6240		if (dev_get_valid_name(dev, pat) < 0)
6241			goto out;
6242	}
6243
6244	/*
6245	 * And now a mini version of register_netdevice unregister_netdevice.
6246	 */
6247
6248	/* If device is running close it first. */
6249	dev_close(dev);
6250
6251	/* And unlink it from device chain */
6252	err = -ENODEV;
6253	unlist_netdevice(dev);
6254
6255	synchronize_net();
6256
6257	/* Shutdown queueing discipline. */
6258	dev_shutdown(dev);
6259
6260	/* Notify protocols, that we are about to destroy
6261	   this device. They should clean all the things.
6262
6263	   Note that dev->reg_state stays at NETREG_REGISTERED.
6264	   This is wanted because this way 8021q and macvlan know
6265	   the device is just moving and can keep their slaves up.
6266	*/
6267	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6268	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6269	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6270
6271	/*
6272	 *	Flush the unicast and multicast chains
6273	 */
6274	dev_uc_flush(dev);
6275	dev_mc_flush(dev);
6276
6277	/* Actually switch the network namespace */
6278	dev_net_set(dev, net);
6279
6280	/* If there is an ifindex conflict assign a new one */
6281	if (__dev_get_by_index(net, dev->ifindex)) {
6282		int iflink = (dev->iflink == dev->ifindex);
6283		dev->ifindex = dev_new_index(net);
6284		if (iflink)
6285			dev->iflink = dev->ifindex;
6286	}
6287
6288	/* Fixup kobjects */
6289	err = device_rename(&dev->dev, dev->name);
6290	WARN_ON(err);
6291
6292	/* Add the device back in the hashes */
6293	list_netdevice(dev);
6294
6295	/* Notify protocols, that a new device appeared. */
6296	call_netdevice_notifiers(NETDEV_REGISTER, dev);
6297
6298	/*
6299	 *	Prevent userspace races by waiting until the network
6300	 *	device is fully setup before sending notifications.
6301	 */
6302	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6303
6304	synchronize_net();
6305	err = 0;
6306out:
6307	return err;
6308}
6309EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6310
6311static int dev_cpu_callback(struct notifier_block *nfb,
6312			    unsigned long action,
6313			    void *ocpu)
6314{
6315	struct sk_buff **list_skb;
6316	struct sk_buff *skb;
6317	unsigned int cpu, oldcpu = (unsigned long)ocpu;
6318	struct softnet_data *sd, *oldsd;
6319
6320	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6321		return NOTIFY_OK;
6322
6323	local_irq_disable();
6324	cpu = smp_processor_id();
6325	sd = &per_cpu(softnet_data, cpu);
6326	oldsd = &per_cpu(softnet_data, oldcpu);
6327
6328	/* Find end of our completion_queue. */
6329	list_skb = &sd->completion_queue;
6330	while (*list_skb)
6331		list_skb = &(*list_skb)->next;
6332	/* Append completion queue from offline CPU. */
6333	*list_skb = oldsd->completion_queue;
6334	oldsd->completion_queue = NULL;
6335
6336	/* Append output queue from offline CPU. */
6337	if (oldsd->output_queue) {
6338		*sd->output_queue_tailp = oldsd->output_queue;
6339		sd->output_queue_tailp = oldsd->output_queue_tailp;
6340		oldsd->output_queue = NULL;
6341		oldsd->output_queue_tailp = &oldsd->output_queue;
6342	}
6343	/* Append NAPI poll list from offline CPU. */
6344	if (!list_empty(&oldsd->poll_list)) {
6345		list_splice_init(&oldsd->poll_list, &sd->poll_list);
6346		raise_softirq_irqoff(NET_RX_SOFTIRQ);
6347	}
6348
6349	raise_softirq_irqoff(NET_TX_SOFTIRQ);
6350	local_irq_enable();
6351
6352	/* Process offline CPU's input_pkt_queue */
6353	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6354		netif_rx(skb);
6355		input_queue_head_incr(oldsd);
6356	}
6357	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6358		netif_rx(skb);
6359		input_queue_head_incr(oldsd);
6360	}
6361
6362	return NOTIFY_OK;
6363}
6364
6365
6366/**
6367 *	netdev_increment_features - increment feature set by one
6368 *	@all: current feature set
6369 *	@one: new feature set
6370 *	@mask: mask feature set
6371 *
6372 *	Computes a new feature set after adding a device with feature set
6373 *	@one to the master device with current feature set @all.  Will not
6374 *	enable anything that is off in @mask. Returns the new feature set.
6375 */
6376u32 netdev_increment_features(u32 all, u32 one, u32 mask)
6377{
6378	if (mask & NETIF_F_GEN_CSUM)
6379		mask |= NETIF_F_ALL_CSUM;
6380	mask |= NETIF_F_VLAN_CHALLENGED;
6381
6382	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6383	all &= one | ~NETIF_F_ALL_FOR_ALL;
6384
6385	/* If device needs checksumming, downgrade to it. */
6386	if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM))
6387		all &= ~NETIF_F_NO_CSUM;
6388
6389	/* If one device supports hw checksumming, set for all. */
6390	if (all & NETIF_F_GEN_CSUM)
6391		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6392
6393	return all;
6394}
6395EXPORT_SYMBOL(netdev_increment_features);
6396
6397static struct hlist_head *netdev_create_hash(void)
6398{
6399	int i;
6400	struct hlist_head *hash;
6401
6402	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6403	if (hash != NULL)
6404		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6405			INIT_HLIST_HEAD(&hash[i]);
6406
6407	return hash;
6408}
6409
6410/* Initialize per network namespace state */
6411static int __net_init netdev_init(struct net *net)
6412{
6413	INIT_LIST_HEAD(&net->dev_base_head);
6414
6415	net->dev_name_head = netdev_create_hash();
6416	if (net->dev_name_head == NULL)
6417		goto err_name;
6418
6419	net->dev_index_head = netdev_create_hash();
6420	if (net->dev_index_head == NULL)
6421		goto err_idx;
6422
6423	return 0;
6424
6425err_idx:
6426	kfree(net->dev_name_head);
6427err_name:
6428	return -ENOMEM;
6429}
6430
6431/**
6432 *	netdev_drivername - network driver for the device
6433 *	@dev: network device
6434 *
6435 *	Determine network driver for device.
6436 */
6437const char *netdev_drivername(const struct net_device *dev)
6438{
6439	const struct device_driver *driver;
6440	const struct device *parent;
6441	const char *empty = "";
6442
6443	parent = dev->dev.parent;
6444	if (!parent)
6445		return empty;
6446
6447	driver = parent->driver;
6448	if (driver && driver->name)
6449		return driver->name;
6450	return empty;
6451}
6452
6453int __netdev_printk(const char *level, const struct net_device *dev,
6454			   struct va_format *vaf)
6455{
6456	int r;
6457
6458	if (dev && dev->dev.parent)
6459		r = dev_printk(level, dev->dev.parent, "%s: %pV",
6460			       netdev_name(dev), vaf);
6461	else if (dev)
6462		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6463	else
6464		r = printk("%s(NULL net_device): %pV", level, vaf);
6465
6466	return r;
6467}
6468EXPORT_SYMBOL(__netdev_printk);
6469
6470int netdev_printk(const char *level, const struct net_device *dev,
6471		  const char *format, ...)
6472{
6473	struct va_format vaf;
6474	va_list args;
6475	int r;
6476
6477	va_start(args, format);
6478
6479	vaf.fmt = format;
6480	vaf.va = &args;
6481
6482	r = __netdev_printk(level, dev, &vaf);
6483	va_end(args);
6484
6485	return r;
6486}
6487EXPORT_SYMBOL(netdev_printk);
6488
6489#define define_netdev_printk_level(func, level)			\
6490int func(const struct net_device *dev, const char *fmt, ...)	\
6491{								\
6492	int r;							\
6493	struct va_format vaf;					\
6494	va_list args;						\
6495								\
6496	va_start(args, fmt);					\
6497								\
6498	vaf.fmt = fmt;						\
6499	vaf.va = &args;						\
6500								\
6501	r = __netdev_printk(level, dev, &vaf);			\
6502	va_end(args);						\
6503								\
6504	return r;						\
6505}								\
6506EXPORT_SYMBOL(func);
6507
6508define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6509define_netdev_printk_level(netdev_alert, KERN_ALERT);
6510define_netdev_printk_level(netdev_crit, KERN_CRIT);
6511define_netdev_printk_level(netdev_err, KERN_ERR);
6512define_netdev_printk_level(netdev_warn, KERN_WARNING);
6513define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6514define_netdev_printk_level(netdev_info, KERN_INFO);
6515
6516static void __net_exit netdev_exit(struct net *net)
6517{
6518	kfree(net->dev_name_head);
6519	kfree(net->dev_index_head);
6520}
6521
6522static struct pernet_operations __net_initdata netdev_net_ops = {
6523	.init = netdev_init,
6524	.exit = netdev_exit,
6525};
6526
6527static void __net_exit default_device_exit(struct net *net)
6528{
6529	struct net_device *dev, *aux;
6530	/*
6531	 * Push all migratable network devices back to the
6532	 * initial network namespace
6533	 */
6534	rtnl_lock();
6535	for_each_netdev_safe(net, dev, aux) {
6536		int err;
6537		char fb_name[IFNAMSIZ];
6538
6539		/* Ignore unmoveable devices (i.e. loopback) */
6540		if (dev->features & NETIF_F_NETNS_LOCAL)
6541			continue;
6542
6543		/* Leave virtual devices for the generic cleanup */
6544		if (dev->rtnl_link_ops)
6545			continue;
6546
6547		/* Push remaining network devices to init_net */
6548		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6549		err = dev_change_net_namespace(dev, &init_net, fb_name);
6550		if (err) {
6551			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6552				__func__, dev->name, err);
6553			BUG();
6554		}
6555	}
6556	rtnl_unlock();
6557}
6558
6559static void __net_exit default_device_exit_batch(struct list_head *net_list)
6560{
6561	/* At exit all network devices most be removed from a network
6562	 * namespace.  Do this in the reverse order of registration.
6563	 * Do this across as many network namespaces as possible to
6564	 * improve batching efficiency.
6565	 */
6566	struct net_device *dev;
6567	struct net *net;
6568	LIST_HEAD(dev_kill_list);
6569
6570	rtnl_lock();
6571	list_for_each_entry(net, net_list, exit_list) {
6572		for_each_netdev_reverse(net, dev) {
6573			if (dev->rtnl_link_ops)
6574				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6575			else
6576				unregister_netdevice_queue(dev, &dev_kill_list);
6577		}
6578	}
6579	unregister_netdevice_many(&dev_kill_list);
6580	list_del(&dev_kill_list);
6581	rtnl_unlock();
6582}
6583
6584static struct pernet_operations __net_initdata default_device_ops = {
6585	.exit = default_device_exit,
6586	.exit_batch = default_device_exit_batch,
6587};
6588
6589/*
6590 *	Initialize the DEV module. At boot time this walks the device list and
6591 *	unhooks any devices that fail to initialise (normally hardware not
6592 *	present) and leaves us with a valid list of present and active devices.
6593 *
6594 */
6595
6596/*
6597 *       This is called single threaded during boot, so no need
6598 *       to take the rtnl semaphore.
6599 */
6600static int __init net_dev_init(void)
6601{
6602	int i, rc = -ENOMEM;
6603
6604	BUG_ON(!dev_boot_phase);
6605
6606	if (dev_proc_init())
6607		goto out;
6608
6609	if (netdev_kobject_init())
6610		goto out;
6611
6612	INIT_LIST_HEAD(&ptype_all);
6613	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6614		INIT_LIST_HEAD(&ptype_base[i]);
6615
6616	if (register_pernet_subsys(&netdev_net_ops))
6617		goto out;
6618
6619	/*
6620	 *	Initialise the packet receive queues.
6621	 */
6622
6623	for_each_possible_cpu(i) {
6624		struct softnet_data *sd = &per_cpu(softnet_data, i);
6625
6626		memset(sd, 0, sizeof(*sd));
6627		skb_queue_head_init(&sd->input_pkt_queue);
6628		skb_queue_head_init(&sd->process_queue);
6629		sd->completion_queue = NULL;
6630		INIT_LIST_HEAD(&sd->poll_list);
6631		sd->output_queue = NULL;
6632		sd->output_queue_tailp = &sd->output_queue;
6633#ifdef CONFIG_RPS
6634		sd->csd.func = rps_trigger_softirq;
6635		sd->csd.info = sd;
6636		sd->csd.flags = 0;
6637		sd->cpu = i;
6638#endif
6639
6640		sd->backlog.poll = process_backlog;
6641		sd->backlog.weight = weight_p;
6642		sd->backlog.gro_list = NULL;
6643		sd->backlog.gro_count = 0;
6644	}
6645
6646	dev_boot_phase = 0;
6647
6648	/* The loopback device is special if any other network devices
6649	 * is present in a network namespace the loopback device must
6650	 * be present. Since we now dynamically allocate and free the
6651	 * loopback device ensure this invariant is maintained by
6652	 * keeping the loopback device as the first device on the
6653	 * list of network devices.  Ensuring the loopback devices
6654	 * is the first device that appears and the last network device
6655	 * that disappears.
6656	 */
6657	if (register_pernet_device(&loopback_net_ops))
6658		goto out;
6659
6660	if (register_pernet_device(&default_device_ops))
6661		goto out;
6662
6663	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6664	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6665
6666	hotcpu_notifier(dev_cpu_callback, 0);
6667	dst_init();
6668	dev_mcast_init();
6669	rc = 0;
6670out:
6671	return rc;
6672}
6673
6674subsys_initcall(net_dev_init);
6675
6676static int __init initialize_hashrnd(void)
6677{
6678	get_random_bytes(&hashrnd, sizeof(hashrnd));
6679	return 0;
6680}
6681
6682late_initcall_sync(initialize_hashrnd);
6683