net/core/dev.c at v3.2 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / core / dev.c
at v3.2 166 kB view raw
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <asm/system.h>
  77#include <linux/bitops.h>
  78#include <linux/capability.h>
  79#include <linux/cpu.h>
  80#include <linux/types.h>
  81#include <linux/kernel.h>
  82#include <linux/hash.h>
  83#include <linux/slab.h>
  84#include <linux/sched.h>
  85#include <linux/mutex.h>
  86#include <linux/string.h>
  87#include <linux/mm.h>
  88#include <linux/socket.h>
  89#include <linux/sockios.h>
  90#include <linux/errno.h>
  91#include <linux/interrupt.h>
  92#include <linux/if_ether.h>
  93#include <linux/netdevice.h>
  94#include <linux/etherdevice.h>
  95#include <linux/ethtool.h>
  96#include <linux/notifier.h>
  97#include <linux/skbuff.h>
  98#include <net/net_namespace.h>
  99#include <net/sock.h>
 100#include <linux/rtnetlink.h>
 101#include <linux/proc_fs.h>
 102#include <linux/seq_file.h>
 103#include <linux/stat.h>
 104#include <net/dst.h>
 105#include <net/pkt_sched.h>
 106#include <net/checksum.h>
 107#include <net/xfrm.h>
 108#include <linux/highmem.h>
 109#include <linux/init.h>
 110#include <linux/kmod.h>
 111#include <linux/module.h>
 112#include <linux/netpoll.h>
 113#include <linux/rcupdate.h>
 114#include <linux/delay.h>
 115#include <net/wext.h>
 116#include <net/iw_handler.h>
 117#include <asm/current.h>
 118#include <linux/audit.h>
 119#include <linux/dmaengine.h>
 120#include <linux/err.h>
 121#include <linux/ctype.h>
 122#include <linux/if_arp.h>
 123#include <linux/if_vlan.h>
 124#include <linux/ip.h>
 125#include <net/ip.h>
 126#include <linux/ipv6.h>
 127#include <linux/in.h>
 128#include <linux/jhash.h>
 129#include <linux/random.h>
 130#include <trace/events/napi.h>
 131#include <trace/events/net.h>
 132#include <trace/events/skb.h>
 133#include <linux/pci.h>
 134#include <linux/inetdevice.h>
 135#include <linux/cpu_rmap.h>
 136#include <linux/if_tunnel.h>
 137#include <linux/if_pppox.h>
 138#include <linux/ppp_defs.h>
 139#include <linux/net_tstamp.h>
 140
 141#include "net-sysfs.h"
 142
 143/* Instead of increasing this, you should create a hash table. */
 144#define MAX_GRO_SKBS 8
 145
 146/* This should be increased if a protocol with a bigger head is added. */
 147#define GRO_MAX_HEAD (MAX_HEADER + 128)
 148
 149/*
 150 *	The list of packet types we will receive (as opposed to discard)
 151 *	and the routines to invoke.
 152 *
 153 *	Why 16. Because with 16 the only overlap we get on a hash of the
 154 *	low nibble of the protocol value is RARP/SNAP/X.25.
 155 *
 156 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 157 *             sure which should go first, but I bet it won't make much
 158 *             difference if we are running VLANs.  The good news is that
 159 *             this protocol won't be in the list unless compiled in, so
 160 *             the average user (w/out VLANs) will not be adversely affected.
 161 *             --BLG
 162 *
 163 *		0800	IP
 164 *		8100    802.1Q VLAN
 165 *		0001	802.3
 166 *		0002	AX.25
 167 *		0004	802.2
 168 *		8035	RARP
 169 *		0005	SNAP
 170 *		0805	X.25
 171 *		0806	ARP
 172 *		8137	IPX
 173 *		0009	Localtalk
 174 *		86DD	IPv6
 175 */
 176
 177#define PTYPE_HASH_SIZE	(16)
 178#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
 179
 180static DEFINE_SPINLOCK(ptype_lock);
 181static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 182static struct list_head ptype_all __read_mostly;	/* Taps */
 183
 184/*
 185 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 186 * semaphore.
 187 *
 188 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 189 *
 190 * Writers must hold the rtnl semaphore while they loop through the
 191 * dev_base_head list, and hold dev_base_lock for writing when they do the
 192 * actual updates.  This allows pure readers to access the list even
 193 * while a writer is preparing to update it.
 194 *
 195 * To put it another way, dev_base_lock is held for writing only to
 196 * protect against pure readers; the rtnl semaphore provides the
 197 * protection against other writers.
 198 *
 199 * See, for example usages, register_netdevice() and
 200 * unregister_netdevice(), which must be called with the rtnl
 201 * semaphore held.
 202 */
/* Reader/writer lock for the device lists; exported for use outside this file. */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
 205
 206static inline void dev_base_seq_inc(struct net *net)
 207{
 208	while (++net->dev_base_seq == 0);
 209}
 210
 211static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 212{
 213	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 214	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 215}
 216
 217static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 218{
 219	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 220}
 221
 222static inline void rps_lock(struct softnet_data *sd)
 223{
 224#ifdef CONFIG_RPS
 225	spin_lock(&sd->input_pkt_queue.lock);
 226#endif
 227}
 228
 229static inline void rps_unlock(struct softnet_data *sd)
 230{
 231#ifdef CONFIG_RPS
 232	spin_unlock(&sd->input_pkt_queue.lock);
 233#endif
 234}
 235
 236/* Device list insertion */
 237static int list_netdevice(struct net_device *dev)
 238{
 239	struct net *net = dev_net(dev);
 240
 241	ASSERT_RTNL();
 242
 243	write_lock_bh(&dev_base_lock);
 244	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 245	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 246	hlist_add_head_rcu(&dev->index_hlist,
 247			   dev_index_hash(net, dev->ifindex));
 248	write_unlock_bh(&dev_base_lock);
 249
 250	dev_base_seq_inc(net);
 251
 252	return 0;
 253}
 254
 255/* Device list removal
 256 * caller must respect a RCU grace period before freeing/reusing dev
 257 */
 258static void unlist_netdevice(struct net_device *dev)
 259{
 260	ASSERT_RTNL();
 261
 262	/* Unlink dev from the device chain */
 263	write_lock_bh(&dev_base_lock);
 264	list_del_rcu(&dev->dev_list);
 265	hlist_del_rcu(&dev->name_hlist);
 266	hlist_del_rcu(&dev->index_hlist);
 267	write_unlock_bh(&dev_base_lock);
 268
 269	dev_base_seq_inc(dev_net(dev));
 270}
 271
/*
 *	Our notifier list.
 *
 *	Declared as a raw notifier head.
 *	NOTE(review): no user of this chain is visible in this part of the
 *	file; presumably updates are serialised by RTNL - confirm.
 */

static RAW_NOTIFIER_HEAD(netdev_chain);
 277
/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 *
 *	There is one softnet_data instance per CPU, and the per-CPU symbol
 *	is exported.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
 285
 286#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 *
 * netdev_lock_type[] and netdev_lock_name[] are parallel tables: entry i
 * of the name table is the lockdep class name used for device type i of
 * the type table, so the two must be kept in the same order and length.
 * The last entry is the fallback used for device types not listed here.
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
	 ARPHRD_VOID, ARPHRD_NONE};

/* Lockdep class names, index-aligned with netdev_lock_type[]. */
static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
	 "_xmit_VOID", "_xmit_NONE"};

/* One lockdep key per table entry, separately for xmit locks and address-list locks. */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 329
 330static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 331{
 332	int i;
 333
 334	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 335		if (netdev_lock_type[i] == dev_type)
 336			return i;
 337	/* the last key is used by default */
 338	return ARRAY_SIZE(netdev_lock_type) - 1;
 339}
 340
 341static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 342						 unsigned short dev_type)
 343{
 344	int i;
 345
 346	i = netdev_lock_pos(dev_type);
 347	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 348				   netdev_lock_name[i]);
 349}
 350
 351static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 352{
 353	int i;
 354
 355	i = netdev_lock_pos(dev->type);
 356	lockdep_set_class_and_name(&dev->addr_list_lock,
 357				   &netdev_addr_lock_key[i],
 358				   netdev_lock_name[i]);
 359}
 360#else
 361static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 362						 unsigned short dev_type)
 363{
 364}
 365static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 366{
 367}
 368#endif
 369
 370/*******************************************************************************
 371
 372		Protocol management and registration routines
 373
 374*******************************************************************************/
 375
 376/*
 377 *	Add a protocol ID to the list. Now that the input handler is
 378 *	smarter we can dispense with all the messy stuff that used to be
 379 *	here.
 380 *
 381 *	BEWARE!!! Protocol handlers, mangling input packets,
 382 *	MUST BE last in hash buckets and checking protocol handlers
 383 *	MUST start from promiscuous ptype_all chain in net_bh.
 384 *	It is true now, do not change it.
 385 *	Explanation follows: if protocol handler, mangling packet, will
 386 *	be the first on list, it is not able to sense, that packet
 387 *	is cloned and should be copied-on-write, so that it will
 388 *	change it and subsequent readers will get broken packet.
 389 *							--ANK (980803)
 390 */
 391
 392static inline struct list_head *ptype_head(const struct packet_type *pt)
 393{
 394	if (pt->type == htons(ETH_P_ALL))
 395		return &ptype_all;
 396	else
 397		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 398}
 399
 400/**
 401 *	dev_add_pack - add packet handler
 402 *	@pt: packet type declaration
 403 *
 404 *	Add a protocol handler to the networking stack. The passed &packet_type
 405 *	is linked into kernel lists and may not be freed until it has been
 406 *	removed from the kernel lists.
 407 *
 408 *	This call does not sleep therefore it can not
 409 *	guarantee all CPU's that are in middle of receiving packets
 410 *	will see the new packet type (until the next received packet).
 411 */
 412
 413void dev_add_pack(struct packet_type *pt)
 414{
 415	struct list_head *head = ptype_head(pt);
 416
 417	spin_lock(&ptype_lock);
 418	list_add_rcu(&pt->list, head);
 419	spin_unlock(&ptype_lock);
 420}
 421EXPORT_SYMBOL(dev_add_pack);
 422
 423/**
 424 *	__dev_remove_pack	 - remove packet handler
 425 *	@pt: packet type declaration
 426 *
 427 *	Remove a protocol handler that was previously added to the kernel
 428 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 429 *	from the kernel lists and can be freed or reused once this function
 430 *	returns.
 431 *
 432 *      The packet type might still be in use by receivers
 433 *	and must not be freed until after all the CPU's have gone
 434 *	through a quiescent state.
 435 */
 436void __dev_remove_pack(struct packet_type *pt)
 437{
 438	struct list_head *head = ptype_head(pt);
 439	struct packet_type *pt1;
 440
 441	spin_lock(&ptype_lock);
 442
 443	list_for_each_entry(pt1, head, list) {
 444		if (pt == pt1) {
 445			list_del_rcu(&pt->list);
 446			goto out;
 447		}
 448	}
 449
 450	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 451out:
 452	spin_unlock(&ptype_lock);
 453}
 454EXPORT_SYMBOL(__dev_remove_pack);
 455
 456/**
 457 *	dev_remove_pack	 - remove packet handler
 458 *	@pt: packet type declaration
 459 *
 460 *	Remove a protocol handler that was previously added to the kernel
 461 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 462 *	from the kernel lists and can be freed or reused once this function
 463 *	returns.
 464 *
 465 *	This call sleeps to guarantee that no CPU is looking at the packet
 466 *	type after return.
 467 */
 468void dev_remove_pack(struct packet_type *pt)
 469{
 470	__dev_remove_pack(pt);
 471
 472	synchronize_net();
 473}
 474EXPORT_SYMBOL(dev_remove_pack);
 475
 476/******************************************************************************
 477
 478		      Device Boot-time Settings Routines
 479
 480*******************************************************************************/
 481
/*
 * Boot time configuration table.
 * A slot whose name starts with '\0' or ' ' is treated as free.
 */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 484
 485/**
 486 *	netdev_boot_setup_add	- add new setup entry
 487 *	@name: name of the device
 488 *	@map: configured settings for the device
 489 *
 490 *	Adds new setup entry to the dev_boot_setup list.  The function
 491 *	returns 0 on error and 1 on success.  This is a generic routine to
 492 *	all netdevices.
 493 */
 494static int netdev_boot_setup_add(char *name, struct ifmap *map)
 495{
 496	struct netdev_boot_setup *s;
 497	int i;
 498
 499	s = dev_boot_setup;
 500	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 501		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 502			memset(s[i].name, 0, sizeof(s[i].name));
 503			strlcpy(s[i].name, name, IFNAMSIZ);
 504			memcpy(&s[i].map, map, sizeof(s[i].map));
 505			break;
 506		}
 507	}
 508
 509	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 510}
 511
 512/**
 513 *	netdev_boot_setup_check	- check boot time settings
 514 *	@dev: the netdevice
 515 *
 516 * 	Check boot time settings for the device.
 517 *	The found settings are set for the device to be used
 518 *	later in the device probing.
 519 *	Returns 0 if no settings found, 1 if they are.
 520 */
 521int netdev_boot_setup_check(struct net_device *dev)
 522{
 523	struct netdev_boot_setup *s = dev_boot_setup;
 524	int i;
 525
 526	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 527		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 528		    !strcmp(dev->name, s[i].name)) {
 529			dev->irq 	= s[i].map.irq;
 530			dev->base_addr 	= s[i].map.base_addr;
 531			dev->mem_start 	= s[i].map.mem_start;
 532			dev->mem_end 	= s[i].map.mem_end;
 533			return 1;
 534		}
 535	}
 536	return 0;
 537}
 538EXPORT_SYMBOL(netdev_boot_setup_check);
 539
 540
 541/**
 542 *	netdev_boot_base	- get address from boot time settings
 543 *	@prefix: prefix for network device
 544 *	@unit: id for network device
 545 *
 546 * 	Check boot time settings for the base address of device.
 547 *	The found settings are set for the device to be used
 548 *	later in the device probing.
 549 *	Returns 0 if no settings found.
 550 */
 551unsigned long netdev_boot_base(const char *prefix, int unit)
 552{
 553	const struct netdev_boot_setup *s = dev_boot_setup;
 554	char name[IFNAMSIZ];
 555	int i;
 556
 557	sprintf(name, "%s%d", prefix, unit);
 558
 559	/*
 560	 * If device already registered then return base of 1
 561	 * to indicate not to probe for this interface
 562	 */
 563	if (__dev_get_by_name(&init_net, name))
 564		return 1;
 565
 566	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 567		if (!strcmp(name, s[i].name))
 568			return s[i].map.base_addr;
 569	return 0;
 570}
 571
 572/*
 573 * Saves at boot time configured settings for any netdevice.
 574 */
 575int __init netdev_boot_setup(char *str)
 576{
 577	int ints[5];
 578	struct ifmap map;
 579
 580	str = get_options(str, ARRAY_SIZE(ints), ints);
 581	if (!str || !*str)
 582		return 0;
 583
 584	/* Save settings */
 585	memset(&map, 0, sizeof(map));
 586	if (ints[0] > 0)
 587		map.irq = ints[1];
 588	if (ints[0] > 1)
 589		map.base_addr = ints[2];
 590	if (ints[0] > 2)
 591		map.mem_start = ints[3];
 592	if (ints[0] > 3)
 593		map.mem_end = ints[4];
 594
 595	/* Add new entry to the list */
 596	return netdev_boot_setup_add(str, &map);
 597}
 598
 599__setup("netdev=", netdev_boot_setup);
 600
 601/*******************************************************************************
 602
 603			    Device Interface Subroutines
 604
 605*******************************************************************************/
 606
 607/**
 608 *	__dev_get_by_name	- find a device by its name
 609 *	@net: the applicable net namespace
 610 *	@name: name to find
 611 *
 612 *	Find an interface by name. Must be called under RTNL semaphore
 613 *	or @dev_base_lock. If the name is found a pointer to the device
 614 *	is returned. If the name is not found then %NULL is returned. The
 615 *	reference counters are not incremented so the caller must be
 616 *	careful with locks.
 617 */
 618
 619struct net_device *__dev_get_by_name(struct net *net, const char *name)
 620{
 621	struct hlist_node *p;
 622	struct net_device *dev;
 623	struct hlist_head *head = dev_name_hash(net, name);
 624
 625	hlist_for_each_entry(dev, p, head, name_hlist)
 626		if (!strncmp(dev->name, name, IFNAMSIZ))
 627			return dev;
 628
 629	return NULL;
 630}
 631EXPORT_SYMBOL(__dev_get_by_name);
 632
 633/**
 634 *	dev_get_by_name_rcu	- find a device by its name
 635 *	@net: the applicable net namespace
 636 *	@name: name to find
 637 *
 638 *	Find an interface by name.
 639 *	If the name is found a pointer to the device is returned.
 640 * 	If the name is not found then %NULL is returned.
 641 *	The reference counters are not incremented so the caller must be
 642 *	careful with locks. The caller must hold RCU lock.
 643 */
 644
 645struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 646{
 647	struct hlist_node *p;
 648	struct net_device *dev;
 649	struct hlist_head *head = dev_name_hash(net, name);
 650
 651	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 652		if (!strncmp(dev->name, name, IFNAMSIZ))
 653			return dev;
 654
 655	return NULL;
 656}
 657EXPORT_SYMBOL(dev_get_by_name_rcu);
 658
 659/**
 660 *	dev_get_by_name		- find a device by its name
 661 *	@net: the applicable net namespace
 662 *	@name: name to find
 663 *
 664 *	Find an interface by name. This can be called from any
 665 *	context and does its own locking. The returned handle has
 666 *	the usage count incremented and the caller must use dev_put() to
 667 *	release it when it is no longer needed. %NULL is returned if no
 668 *	matching device is found.
 669 */
 670
 671struct net_device *dev_get_by_name(struct net *net, const char *name)
 672{
 673	struct net_device *dev;
 674
 675	rcu_read_lock();
 676	dev = dev_get_by_name_rcu(net, name);
 677	if (dev)
 678		dev_hold(dev);
 679	rcu_read_unlock();
 680	return dev;
 681}
 682EXPORT_SYMBOL(dev_get_by_name);
 683
 684/**
 685 *	__dev_get_by_index - find a device by its ifindex
 686 *	@net: the applicable net namespace
 687 *	@ifindex: index of device
 688 *
 689 *	Search for an interface by index. Returns %NULL if the device
 690 *	is not found or a pointer to the device. The device has not
 691 *	had its reference counter increased so the caller must be careful
 692 *	about locking. The caller must hold either the RTNL semaphore
 693 *	or @dev_base_lock.
 694 */
 695
 696struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 697{
 698	struct hlist_node *p;
 699	struct net_device *dev;
 700	struct hlist_head *head = dev_index_hash(net, ifindex);
 701
 702	hlist_for_each_entry(dev, p, head, index_hlist)
 703		if (dev->ifindex == ifindex)
 704			return dev;
 705
 706	return NULL;
 707}
 708EXPORT_SYMBOL(__dev_get_by_index);
 709
 710/**
 711 *	dev_get_by_index_rcu - find a device by its ifindex
 712 *	@net: the applicable net namespace
 713 *	@ifindex: index of device
 714 *
 715 *	Search for an interface by index. Returns %NULL if the device
 716 *	is not found or a pointer to the device. The device has not
 717 *	had its reference counter increased so the caller must be careful
 718 *	about locking. The caller must hold RCU lock.
 719 */
 720
 721struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 722{
 723	struct hlist_node *p;
 724	struct net_device *dev;
 725	struct hlist_head *head = dev_index_hash(net, ifindex);
 726
 727	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 728		if (dev->ifindex == ifindex)
 729			return dev;
 730
 731	return NULL;
 732}
 733EXPORT_SYMBOL(dev_get_by_index_rcu);
 734
 735
 736/**
 737 *	dev_get_by_index - find a device by its ifindex
 738 *	@net: the applicable net namespace
 739 *	@ifindex: index of device
 740 *
 741 *	Search for an interface by index. Returns NULL if the device
 742 *	is not found or a pointer to the device. The device returned has
 743 *	had a reference added and the pointer is safe until the user calls
 744 *	dev_put to indicate they have finished with it.
 745 */
 746
 747struct net_device *dev_get_by_index(struct net *net, int ifindex)
 748{
 749	struct net_device *dev;
 750
 751	rcu_read_lock();
 752	dev = dev_get_by_index_rcu(net, ifindex);
 753	if (dev)
 754		dev_hold(dev);
 755	rcu_read_unlock();
 756	return dev;
 757}
 758EXPORT_SYMBOL(dev_get_by_index);
 759
 760/**
 761 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 762 *	@net: the applicable net namespace
 763 *	@type: media type of device
 764 *	@ha: hardware address
 765 *
 766 *	Search for an interface by MAC address. Returns NULL if the device
 767 *	is not found or a pointer to the device.
 768 *	The caller must hold RCU or RTNL.
 769 *	The returned device has not had its ref count increased
 770 *	and the caller must therefore be careful about locking
 771 *
 772 */
 773
 774struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 775				       const char *ha)
 776{
 777	struct net_device *dev;
 778
 779	for_each_netdev_rcu(net, dev)
 780		if (dev->type == type &&
 781		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 782			return dev;
 783
 784	return NULL;
 785}
 786EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 787
 788struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 789{
 790	struct net_device *dev;
 791
 792	ASSERT_RTNL();
 793	for_each_netdev(net, dev)
 794		if (dev->type == type)
 795			return dev;
 796
 797	return NULL;
 798}
 799EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 800
 801struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 802{
 803	struct net_device *dev, *ret = NULL;
 804
 805	rcu_read_lock();
 806	for_each_netdev_rcu(net, dev)
 807		if (dev->type == type) {
 808			dev_hold(dev);
 809			ret = dev;
 810			break;
 811		}
 812	rcu_read_unlock();
 813	return ret;
 814}
 815EXPORT_SYMBOL(dev_getfirstbyhwtype);
 816
 817/**
 818 *	dev_get_by_flags_rcu - find any device with given flags
 819 *	@net: the applicable net namespace
 820 *	@if_flags: IFF_* values
 821 *	@mask: bitmask of bits in if_flags to check
 822 *
 823 *	Search for any interface with the given flags. Returns NULL if a device
 824 *	is not found or a pointer to the device. Must be called inside
 825 *	rcu_read_lock(), and result refcount is unchanged.
 826 */
 827
 828struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 829				    unsigned short mask)
 830{
 831	struct net_device *dev, *ret;
 832
 833	ret = NULL;
 834	for_each_netdev_rcu(net, dev) {
 835		if (((dev->flags ^ if_flags) & mask) == 0) {
 836			ret = dev;
 837			break;
 838		}
 839	}
 840	return ret;
 841}
 842EXPORT_SYMBOL(dev_get_by_flags_rcu);
 843
 844/**
 845 *	dev_valid_name - check if name is okay for network device
 846 *	@name: name string
 847 *
 848 *	Network device names need to be valid file names to
 849 *	to allow sysfs to work.  We also disallow any kind of
 850 *	whitespace.
 851 */
 852int dev_valid_name(const char *name)
 853{
 854	if (*name == '\0')
 855		return 0;
 856	if (strlen(name) >= IFNAMSIZ)
 857		return 0;
 858	if (!strcmp(name, ".") || !strcmp(name, ".."))
 859		return 0;
 860
 861	while (*name) {
 862		if (*name == '/' || isspace(*name))
 863			return 0;
 864		name++;
 865	}
 866	return 1;
 867}
 868EXPORT_SYMBOL(dev_valid_name);
 869
 870/**
 871 *	__dev_alloc_name - allocate a name for a device
 872 *	@net: network namespace to allocate the device name in
 873 *	@name: name format string
 874 *	@buf:  scratch buffer and result name string
 875 *
 876 *	Passed a format string - eg "lt%d" it will try and find a suitable
 877 *	id. It scans list of devices to build up a free map, then chooses
 878 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 879 *	while allocating the name and adding the device in order to avoid
 880 *	duplicates.
 881 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 882 *	Returns the number of the unit assigned or a negative errno code.
 883 */
 884
 885static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 886{
 887	int i = 0;
 888	const char *p;
 889	const int max_netdevices = 8*PAGE_SIZE;
 890	unsigned long *inuse;
 891	struct net_device *d;
 892
 893	p = strnchr(name, IFNAMSIZ-1, '%');
 894	if (p) {
 895		/*
 896		 * Verify the string as this thing may have come from
 897		 * the user.  There must be either one "%d" and no other "%"
 898		 * characters.
 899		 */
 900		if (p[1] != 'd' || strchr(p + 2, '%'))
 901			return -EINVAL;
 902
 903		/* Use one page as a bit array of possible slots */
 904		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 905		if (!inuse)
 906			return -ENOMEM;
 907
 908		for_each_netdev(net, d) {
 909			if (!sscanf(d->name, name, &i))
 910				continue;
 911			if (i < 0 || i >= max_netdevices)
 912				continue;
 913
 914			/*  avoid cases where sscanf is not exact inverse of printf */
 915			snprintf(buf, IFNAMSIZ, name, i);
 916			if (!strncmp(buf, d->name, IFNAMSIZ))
 917				set_bit(i, inuse);
 918		}
 919
 920		i = find_first_zero_bit(inuse, max_netdevices);
 921		free_page((unsigned long) inuse);
 922	}
 923
 924	if (buf != name)
 925		snprintf(buf, IFNAMSIZ, name, i);
 926	if (!__dev_get_by_name(net, buf))
 927		return i;
 928
 929	/* It is possible to run out of possible slots
 930	 * when the name is long and there isn't enough space left
 931	 * for the digits, or if all bits are used.
 932	 */
 933	return -ENFILE;
 934}
 935
 936/**
 937 *	dev_alloc_name - allocate a name for a device
 938 *	@dev: device
 939 *	@name: name format string
 940 *
 941 *	Passed a format string - eg "lt%d" it will try and find a suitable
 942 *	id. It scans list of devices to build up a free map, then chooses
 943 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 944 *	while allocating the name and adding the device in order to avoid
 945 *	duplicates.
 946 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 947 *	Returns the number of the unit assigned or a negative errno code.
 948 */
 949
 950int dev_alloc_name(struct net_device *dev, const char *name)
 951{
 952	char buf[IFNAMSIZ];
 953	struct net *net;
 954	int ret;
 955
 956	BUG_ON(!dev_net(dev));
 957	net = dev_net(dev);
 958	ret = __dev_alloc_name(net, name, buf);
 959	if (ret >= 0)
 960		strlcpy(dev->name, buf, IFNAMSIZ);
 961	return ret;
 962}
 963EXPORT_SYMBOL(dev_alloc_name);
 964
 965static int dev_get_valid_name(struct net_device *dev, const char *name)
 966{
 967	struct net *net;
 968
 969	BUG_ON(!dev_net(dev));
 970	net = dev_net(dev);
 971
 972	if (!dev_valid_name(name))
 973		return -EINVAL;
 974
 975	if (strchr(name, '%'))
 976		return dev_alloc_name(dev, name);
 977	else if (__dev_get_by_name(net, name))
 978		return -EEXIST;
 979	else if (dev->name != name)
 980		strlcpy(dev->name, name, IFNAMSIZ);
 981
 982	return 0;
 983}
 984
 985/**
 986 *	dev_change_name - change name of a device
 987 *	@dev: device
 988 *	@newname: name (or format string) must be at least IFNAMSIZ
 989 *
 990 *	Change name of a device, can pass format strings "eth%d".
 991 *	for wildcarding.
 992 */
 993int dev_change_name(struct net_device *dev, const char *newname)
 994{
 995	char oldname[IFNAMSIZ];
 996	int err = 0;
 997	int ret;
 998	struct net *net;
 999
1000	ASSERT_RTNL();
1001	BUG_ON(!dev_net(dev));
1002
1003	net = dev_net(dev);
1004	if (dev->flags & IFF_UP)
1005		return -EBUSY;
1006
1007	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1008		return 0;
1009
1010	memcpy(oldname, dev->name, IFNAMSIZ);
1011
1012	err = dev_get_valid_name(dev, newname);
1013	if (err < 0)
1014		return err;
1015
1016rollback:
1017	ret = device_rename(&dev->dev, dev->name);
1018	if (ret) {
1019		memcpy(dev->name, oldname, IFNAMSIZ);
1020		return ret;
1021	}
1022
1023	write_lock_bh(&dev_base_lock);
1024	hlist_del_rcu(&dev->name_hlist);
1025	write_unlock_bh(&dev_base_lock);
1026
1027	synchronize_rcu();
1028
1029	write_lock_bh(&dev_base_lock);
1030	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1031	write_unlock_bh(&dev_base_lock);
1032
1033	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1034	ret = notifier_to_errno(ret);
1035
1036	if (ret) {
1037		/* err >= 0 after dev_alloc_name() or stores the first errno */
1038		if (err >= 0) {
1039			err = ret;
1040			memcpy(dev->name, oldname, IFNAMSIZ);
1041			goto rollback;
1042		} else {
1043			printk(KERN_ERR
1044			       "%s: name change rollback failed: %d.\n",
1045			       dev->name, ret);
1046		}
1047	}
1048
1049	return err;
1050}
1051
1052/**
1053 *	dev_set_alias - change ifalias of a device
1054 *	@dev: device
1055 *	@alias: name up to IFALIASZ
1056 *	@len: limit of bytes to copy from info
1057 *
1058 *	Set ifalias for a device,
1059 */
1060int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1061{
1062	ASSERT_RTNL();
1063
1064	if (len >= IFALIASZ)
1065		return -EINVAL;
1066
1067	if (!len) {
1068		if (dev->ifalias) {
1069			kfree(dev->ifalias);
1070			dev->ifalias = NULL;
1071		}
1072		return 0;
1073	}
1074
1075	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1076	if (!dev->ifalias)
1077		return -ENOMEM;
1078
1079	strlcpy(dev->ifalias, alias, len+1);
1080	return len;
1081}
1082
1083
1084/**
1085 *	netdev_features_change - device changes features
1086 *	@dev: device to cause notification
1087 *
1088 *	Called to indicate a device has changed features.
1089 */
1090void netdev_features_change(struct net_device *dev)
1091{
1092	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1093}
1094EXPORT_SYMBOL(netdev_features_change);
1095
1096/**
1097 *	netdev_state_change - device changes state
1098 *	@dev: device to cause notification
1099 *
1100 *	Called to indicate a device has changed state. This function calls
1101 *	the notifier chains for netdev_chain and sends a NEWLINK message
1102 *	to the routing socket.
1103 */
1104void netdev_state_change(struct net_device *dev)
1105{
1106	if (dev->flags & IFF_UP) {
1107		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1108		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1109	}
1110}
1111EXPORT_SYMBOL(netdev_state_change);
1112
/**
 *	netdev_bonding_change - pass an event to the netdev notifier chain
 *	@dev: device the event refers to
 *	@event: notifier event value, passed through unchanged
 *
 *	Thin wrapper around call_netdevice_notifiers(); returns whatever
 *	the notifier chain returned.
 */
int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	return call_netdevice_notifiers(event, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);
1118
1119/**
1120 *	dev_load 	- load a network module
1121 *	@net: the applicable net namespace
1122 *	@name: name of interface
1123 *
1124 *	If a network interface is not present and the process has suitable
1125 *	privileges this function loads the module. If module loading is not
1126 *	available in this kernel then it becomes a nop.
1127 */
1128
1129void dev_load(struct net *net, const char *name)
1130{
1131	struct net_device *dev;
1132	int no_module;
1133
1134	rcu_read_lock();
1135	dev = dev_get_by_name_rcu(net, name);
1136	rcu_read_unlock();
1137
1138	no_module = !dev;
1139	if (no_module && capable(CAP_NET_ADMIN))
1140		no_module = request_module("netdev-%s", name);
1141	if (no_module && capable(CAP_SYS_MODULE)) {
1142		if (!request_module("%s", name))
1143			pr_err("Loading kernel module for a network device "
1144"with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
1145"instead\n", name);
1146	}
1147}
1148EXPORT_SYMBOL(dev_load);
1149
1150static int __dev_open(struct net_device *dev)
1151{
1152	const struct net_device_ops *ops = dev->netdev_ops;
1153	int ret;
1154
1155	ASSERT_RTNL();
1156
1157	if (!netif_device_present(dev))
1158		return -ENODEV;
1159
1160	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1161	ret = notifier_to_errno(ret);
1162	if (ret)
1163		return ret;
1164
1165	set_bit(__LINK_STATE_START, &dev->state);
1166
1167	if (ops->ndo_validate_addr)
1168		ret = ops->ndo_validate_addr(dev);
1169
1170	if (!ret && ops->ndo_open)
1171		ret = ops->ndo_open(dev);
1172
1173	if (ret)
1174		clear_bit(__LINK_STATE_START, &dev->state);
1175	else {
1176		dev->flags |= IFF_UP;
1177		net_dmaengine_get();
1178		dev_set_rx_mode(dev);
1179		dev_activate(dev);
1180	}
1181
1182	return ret;
1183}
1184
1185/**
1186 *	dev_open	- prepare an interface for use.
1187 *	@dev:	device to open
1188 *
1189 *	Takes a device from down to up state. The device's private open
1190 *	function is invoked and then the multicast lists are loaded. Finally
1191 *	the device is moved into the up state and a %NETDEV_UP message is
1192 *	sent to the netdev notifier chain.
1193 *
1194 *	Calling this function on an active interface is a nop. On a failure
1195 *	a negative errno code is returned.
1196 */
1197int dev_open(struct net_device *dev)
1198{
1199	int ret;
1200
1201	if (dev->flags & IFF_UP)
1202		return 0;
1203
1204	ret = __dev_open(dev);
1205	if (ret < 0)
1206		return ret;
1207
1208	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1209	call_netdevice_notifiers(NETDEV_UP, dev);
1210
1211	return ret;
1212}
1213EXPORT_SYMBOL(dev_open);
1214
1215static int __dev_close_many(struct list_head *head)
1216{
1217	struct net_device *dev;
1218
1219	ASSERT_RTNL();
1220	might_sleep();
1221
1222	list_for_each_entry(dev, head, unreg_list) {
1223		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1224
1225		clear_bit(__LINK_STATE_START, &dev->state);
1226
1227		/* Synchronize to scheduled poll. We cannot touch poll list, it
1228		 * can be even on different cpu. So just clear netif_running().
1229		 *
1230		 * dev->stop() will invoke napi_disable() on all of it's
1231		 * napi_struct instances on this device.
1232		 */
1233		smp_mb__after_clear_bit(); /* Commit netif_running(). */
1234	}
1235
1236	dev_deactivate_many(head);
1237
1238	list_for_each_entry(dev, head, unreg_list) {
1239		const struct net_device_ops *ops = dev->netdev_ops;
1240
1241		/*
1242		 *	Call the device specific close. This cannot fail.
1243		 *	Only if device is UP
1244		 *
1245		 *	We allow it to be called even after a DETACH hot-plug
1246		 *	event.
1247		 */
1248		if (ops->ndo_stop)
1249			ops->ndo_stop(dev);
1250
1251		dev->flags &= ~IFF_UP;
1252		net_dmaengine_put();
1253	}
1254
1255	return 0;
1256}
1257
1258static int __dev_close(struct net_device *dev)
1259{
1260	int retval;
1261	LIST_HEAD(single);
1262
1263	list_add(&dev->unreg_list, &single);
1264	retval = __dev_close_many(&single);
1265	list_del(&single);
1266	return retval;
1267}
1268
1269static int dev_close_many(struct list_head *head)
1270{
1271	struct net_device *dev, *tmp;
1272	LIST_HEAD(tmp_list);
1273
1274	list_for_each_entry_safe(dev, tmp, head, unreg_list)
1275		if (!(dev->flags & IFF_UP))
1276			list_move(&dev->unreg_list, &tmp_list);
1277
1278	__dev_close_many(head);
1279
1280	list_for_each_entry(dev, head, unreg_list) {
1281		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1282		call_netdevice_notifiers(NETDEV_DOWN, dev);
1283	}
1284
1285	/* rollback_registered_many needs the complete original list */
1286	list_splice(&tmp_list, head);
1287	return 0;
1288}
1289
1290/**
1291 *	dev_close - shutdown an interface.
1292 *	@dev: device to shutdown
1293 *
1294 *	This function moves an active device into down state. A
1295 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1296 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1297 *	chain.
1298 */
1299int dev_close(struct net_device *dev)
1300{
1301	if (dev->flags & IFF_UP) {
1302		LIST_HEAD(single);
1303
1304		list_add(&dev->unreg_list, &single);
1305		dev_close_many(&single);
1306		list_del(&single);
1307	}
1308	return 0;
1309}
1310EXPORT_SYMBOL(dev_close);
1311
1312
1313/**
1314 *	dev_disable_lro - disable Large Receive Offload on a device
1315 *	@dev: device
1316 *
1317 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1318 *	called under RTNL.  This is needed if received packets may be
1319 *	forwarded to another interface.
1320 */
1321void dev_disable_lro(struct net_device *dev)
1322{
1323	u32 flags;
1324
1325	/*
1326	 * If we're trying to disable lro on a vlan device
1327	 * use the underlying physical device instead
1328	 */
1329	if (is_vlan_dev(dev))
1330		dev = vlan_dev_real_dev(dev);
1331
1332	if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
1333		flags = dev->ethtool_ops->get_flags(dev);
1334	else
1335		flags = ethtool_op_get_flags(dev);
1336
1337	if (!(flags & ETH_FLAG_LRO))
1338		return;
1339
1340	__ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
1341	if (unlikely(dev->features & NETIF_F_LRO))
1342		netdev_WARN(dev, "failed to disable LRO!\n");
1343}
1344EXPORT_SYMBOL(dev_disable_lro);
1345
1346
1347static int dev_boot_phase = 1;
1348
1349/**
1350 *	register_netdevice_notifier - register a network notifier block
1351 *	@nb: notifier
1352 *
1353 *	Register a notifier to be called when network device events occur.
1354 *	The notifier passed is linked into the kernel structures and must
1355 *	not be reused until it has been unregistered. A negative errno code
1356 *	is returned on a failure.
1357 *
1358 * 	When registered all registration and up events are replayed
1359 *	to the new notifier to allow device to have a race free
1360 *	view of the network device list.
1361 */
1362
1363int register_netdevice_notifier(struct notifier_block *nb)
1364{
1365	struct net_device *dev;
1366	struct net_device *last;
1367	struct net *net;
1368	int err;
1369
1370	rtnl_lock();
1371	err = raw_notifier_chain_register(&netdev_chain, nb);
1372	if (err)
1373		goto unlock;
1374	if (dev_boot_phase)
1375		goto unlock;
1376	for_each_net(net) {
1377		for_each_netdev(net, dev) {
1378			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1379			err = notifier_to_errno(err);
1380			if (err)
1381				goto rollback;
1382
1383			if (!(dev->flags & IFF_UP))
1384				continue;
1385
1386			nb->notifier_call(nb, NETDEV_UP, dev);
1387		}
1388	}
1389
1390unlock:
1391	rtnl_unlock();
1392	return err;
1393
1394rollback:
1395	last = dev;
1396	for_each_net(net) {
1397		for_each_netdev(net, dev) {
1398			if (dev == last)
1399				goto outroll;
1400
1401			if (dev->flags & IFF_UP) {
1402				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1403				nb->notifier_call(nb, NETDEV_DOWN, dev);
1404			}
1405			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1406			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1407		}
1408	}
1409
1410outroll:
1411	raw_notifier_chain_unregister(&netdev_chain, nb);
1412	goto unlock;
1413}
1414EXPORT_SYMBOL(register_netdevice_notifier);
1415
1416/**
1417 *	unregister_netdevice_notifier - unregister a network notifier block
1418 *	@nb: notifier
1419 *
1420 *	Unregister a notifier previously registered by
1421 *	register_netdevice_notifier(). The notifier is unlinked into the
1422 *	kernel structures and may then be reused. A negative errno code
1423 *	is returned on a failure.
1424 */
1425
1426int unregister_netdevice_notifier(struct notifier_block *nb)
1427{
1428	int err;
1429
1430	rtnl_lock();
1431	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1432	rtnl_unlock();
1433	return err;
1434}
1435EXPORT_SYMBOL(unregister_netdevice_notifier);
1436
1437/**
1438 *	call_netdevice_notifiers - call all network notifier blocks
1439 *      @val: value passed unmodified to notifier function
1440 *      @dev: net_device pointer passed unmodified to notifier function
1441 *
1442 *	Call all network notifier blocks.  Parameters and return value
1443 *	are as for raw_notifier_call_chain().
1444 */
1445
1446int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1447{
1448	ASSERT_RTNL();
1449	return raw_notifier_call_chain(&netdev_chain, val, dev);
1450}
1451EXPORT_SYMBOL(call_netdevice_notifiers);
1452
1453/* When > 0 there are consumers of rx skb time stamps */
1454static atomic_t netstamp_needed = ATOMIC_INIT(0);
1455
1456void net_enable_timestamp(void)
1457{
1458	atomic_inc(&netstamp_needed);
1459}
1460EXPORT_SYMBOL(net_enable_timestamp);
1461
1462void net_disable_timestamp(void)
1463{
1464	atomic_dec(&netstamp_needed);
1465}
1466EXPORT_SYMBOL(net_disable_timestamp);
1467
1468static inline void net_timestamp_set(struct sk_buff *skb)
1469{
1470	if (atomic_read(&netstamp_needed))
1471		__net_timestamp(skb);
1472	else
1473		skb->tstamp.tv64 = 0;
1474}
1475
1476static inline void net_timestamp_check(struct sk_buff *skb)
1477{
1478	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1479		__net_timestamp(skb);
1480}
1481
1482static int net_hwtstamp_validate(struct ifreq *ifr)
1483{
1484	struct hwtstamp_config cfg;
1485	enum hwtstamp_tx_types tx_type;
1486	enum hwtstamp_rx_filters rx_filter;
1487	int tx_type_valid = 0;
1488	int rx_filter_valid = 0;
1489
1490	if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1491		return -EFAULT;
1492
1493	if (cfg.flags) /* reserved for future extensions */
1494		return -EINVAL;
1495
1496	tx_type = cfg.tx_type;
1497	rx_filter = cfg.rx_filter;
1498
1499	switch (tx_type) {
1500	case HWTSTAMP_TX_OFF:
1501	case HWTSTAMP_TX_ON:
1502	case HWTSTAMP_TX_ONESTEP_SYNC:
1503		tx_type_valid = 1;
1504		break;
1505	}
1506
1507	switch (rx_filter) {
1508	case HWTSTAMP_FILTER_NONE:
1509	case HWTSTAMP_FILTER_ALL:
1510	case HWTSTAMP_FILTER_SOME:
1511	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1512	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1513	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1514	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1515	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1516	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1517	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1518	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1519	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1520	case HWTSTAMP_FILTER_PTP_V2_EVENT:
1521	case HWTSTAMP_FILTER_PTP_V2_SYNC:
1522	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1523		rx_filter_valid = 1;
1524		break;
1525	}
1526
1527	if (!tx_type_valid || !rx_filter_valid)
1528		return -ERANGE;
1529
1530	return 0;
1531}
1532
1533static inline bool is_skb_forwardable(struct net_device *dev,
1534				      struct sk_buff *skb)
1535{
1536	unsigned int len;
1537
1538	if (!(dev->flags & IFF_UP))
1539		return false;
1540
1541	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1542	if (skb->len <= len)
1543		return true;
1544
1545	/* if TSO is enabled, we don't care about the length as the packet
1546	 * could be forwarded without being segmented before
1547	 */
1548	if (skb_is_gso(skb))
1549		return true;
1550
1551	return false;
1552}
1553
1554/**
1555 * dev_forward_skb - loopback an skb to another netif
1556 *
1557 * @dev: destination network device
1558 * @skb: buffer to forward
1559 *
1560 * return values:
1561 *	NET_RX_SUCCESS	(no congestion)
1562 *	NET_RX_DROP     (packet was dropped, but freed)
1563 *
1564 * dev_forward_skb can be used for injecting an skb from the
1565 * start_xmit function of one device into the receive queue
1566 * of another device.
1567 *
1568 * The receiving device may be in another namespace, so
1569 * we have to clear all information in the skb that could
1570 * impact namespace isolation.
1571 */
1572int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1573{
1574	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1575		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1576			atomic_long_inc(&dev->rx_dropped);
1577			kfree_skb(skb);
1578			return NET_RX_DROP;
1579		}
1580	}
1581
1582	skb_orphan(skb);
1583	nf_reset(skb);
1584
1585	if (unlikely(!is_skb_forwardable(dev, skb))) {
1586		atomic_long_inc(&dev->rx_dropped);
1587		kfree_skb(skb);
1588		return NET_RX_DROP;
1589	}
1590	skb_set_dev(skb, dev);
1591	skb->tstamp.tv64 = 0;
1592	skb->pkt_type = PACKET_HOST;
1593	skb->protocol = eth_type_trans(skb, dev);
1594	return netif_rx(skb);
1595}
1596EXPORT_SYMBOL_GPL(dev_forward_skb);
1597
1598static inline int deliver_skb(struct sk_buff *skb,
1599			      struct packet_type *pt_prev,
1600			      struct net_device *orig_dev)
1601{
1602	atomic_inc(&skb->users);
1603	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1604}
1605
1606/*
1607 *	Support routine. Sends outgoing frames to any network
1608 *	taps currently in use.
1609 */
1610
1611static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1612{
1613	struct packet_type *ptype;
1614	struct sk_buff *skb2 = NULL;
1615	struct packet_type *pt_prev = NULL;
1616
1617	rcu_read_lock();
1618	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1619		/* Never send packets back to the socket
1620		 * they originated from - MvS (miquels@drinkel.ow.org)
1621		 */
1622		if ((ptype->dev == dev || !ptype->dev) &&
1623		    (ptype->af_packet_priv == NULL ||
1624		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1625			if (pt_prev) {
1626				deliver_skb(skb2, pt_prev, skb->dev);
1627				pt_prev = ptype;
1628				continue;
1629			}
1630
1631			skb2 = skb_clone(skb, GFP_ATOMIC);
1632			if (!skb2)
1633				break;
1634
1635			net_timestamp_set(skb2);
1636
1637			/* skb->nh should be correctly
1638			   set by sender, so that the second statement is
1639			   just protection against buggy protocols.
1640			 */
1641			skb_reset_mac_header(skb2);
1642
1643			if (skb_network_header(skb2) < skb2->data ||
1644			    skb2->network_header > skb2->tail) {
1645				if (net_ratelimit())
1646					printk(KERN_CRIT "protocol %04x is "
1647					       "buggy, dev %s\n",
1648					       ntohs(skb2->protocol),
1649					       dev->name);
1650				skb_reset_network_header(skb2);
1651			}
1652
1653			skb2->transport_header = skb2->network_header;
1654			skb2->pkt_type = PACKET_OUTGOING;
1655			pt_prev = ptype;
1656		}
1657	}
1658	if (pt_prev)
1659		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1660	rcu_read_unlock();
1661}
1662
1663/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1664 * @dev: Network device
1665 * @txq: number of queues available
1666 *
1667 * If real_num_tx_queues is changed the tc mappings may no longer be
1668 * valid. To resolve this verify the tc mapping remains valid and if
1669 * not NULL the mapping. With no priorities mapping to this
1670 * offset/count pair it will no longer be used. In the worst case TC0
1671 * is invalid nothing can be done so disable priority mappings. If is
1672 * expected that drivers will fix this mapping if they can before
1673 * calling netif_set_real_num_tx_queues.
1674 */
1675static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1676{
1677	int i;
1678	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1679
1680	/* If TC0 is invalidated disable TC mapping */
1681	if (tc->offset + tc->count > txq) {
1682		pr_warning("Number of in use tx queues changed "
1683			   "invalidating tc mappings. Priority "
1684			   "traffic classification disabled!\n");
1685		dev->num_tc = 0;
1686		return;
1687	}
1688
1689	/* Invalidated prio to tc mappings set to TC0 */
1690	for (i = 1; i < TC_BITMASK + 1; i++) {
1691		int q = netdev_get_prio_tc_map(dev, i);
1692
1693		tc = &dev->tc_to_txq[q];
1694		if (tc->offset + tc->count > txq) {
1695			pr_warning("Number of in use tx queues "
1696				   "changed. Priority %i to tc "
1697				   "mapping %i is no longer valid "
1698				   "setting map to 0\n",
1699				   i, q);
1700			netdev_set_prio_tc_map(dev, i, 0);
1701		}
1702	}
1703}
1704
1705/*
1706 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1707 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1708 */
1709int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1710{
1711	int rc;
1712
1713	if (txq < 1 || txq > dev->num_tx_queues)
1714		return -EINVAL;
1715
1716	if (dev->reg_state == NETREG_REGISTERED ||
1717	    dev->reg_state == NETREG_UNREGISTERING) {
1718		ASSERT_RTNL();
1719
1720		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1721						  txq);
1722		if (rc)
1723			return rc;
1724
1725		if (dev->num_tc)
1726			netif_setup_tc(dev, txq);
1727
1728		if (txq < dev->real_num_tx_queues)
1729			qdisc_reset_all_tx_gt(dev, txq);
1730	}
1731
1732	dev->real_num_tx_queues = txq;
1733	return 0;
1734}
1735EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1736
#ifdef CONFIG_RPS
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	This must be called either with the rtnl_lock held or before
 *	registration of the net device.  Returns 0 on success, or a
 *	negative error code.  If called before registration, it always
 *	succeeds provided @rxq is within 1..@dev->num_rx_queues; values
 *	outside that range yield -EINVAL.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	if (rxq == 0 || rxq > dev->num_rx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED) {
		int rc;

		ASSERT_RTNL();

		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
						  rxq);
		if (rc)
			return rc;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
1769
1770static inline void __netif_reschedule(struct Qdisc *q)
1771{
1772	struct softnet_data *sd;
1773	unsigned long flags;
1774
1775	local_irq_save(flags);
1776	sd = &__get_cpu_var(softnet_data);
1777	q->next_sched = NULL;
1778	*sd->output_queue_tailp = q;
1779	sd->output_queue_tailp = &q->next_sched;
1780	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1781	local_irq_restore(flags);
1782}
1783
1784void __netif_schedule(struct Qdisc *q)
1785{
1786	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1787		__netif_reschedule(q);
1788}
1789EXPORT_SYMBOL(__netif_schedule);
1790
1791void dev_kfree_skb_irq(struct sk_buff *skb)
1792{
1793	if (atomic_dec_and_test(&skb->users)) {
1794		struct softnet_data *sd;
1795		unsigned long flags;
1796
1797		local_irq_save(flags);
1798		sd = &__get_cpu_var(softnet_data);
1799		skb->next = sd->completion_queue;
1800		sd->completion_queue = skb;
1801		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1802		local_irq_restore(flags);
1803	}
1804}
1805EXPORT_SYMBOL(dev_kfree_skb_irq);
1806
/*
 * Free @skb from any context: uses dev_kfree_skb_irq() when running in
 * hard-irq context or with interrupts disabled, dev_kfree_skb()
 * otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (in_irq() || irqs_disabled()) {
		dev_kfree_skb_irq(skb);
		return;
	}

	dev_kfree_skb(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1815
1816
1817/**
1818 * netif_device_detach - mark device as removed
1819 * @dev: network device
1820 *
1821 * Mark device as removed from system and therefore no longer available.
1822 */
1823void netif_device_detach(struct net_device *dev)
1824{
1825	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1826	    netif_running(dev)) {
1827		netif_tx_stop_all_queues(dev);
1828	}
1829}
1830EXPORT_SYMBOL(netif_device_detach);
1831
1832/**
1833 * netif_device_attach - mark device as attached
1834 * @dev: network device
1835 *
1836 * Mark device as attached from system and restart if needed.
1837 */
1838void netif_device_attach(struct net_device *dev)
1839{
1840	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1841	    netif_running(dev)) {
1842		netif_tx_wake_all_queues(dev);
1843		__netdev_watchdog_up(dev);
1844	}
1845}
1846EXPORT_SYMBOL(netif_device_attach);
1847
/**
 * skb_set_dev -- assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * If an skb is owned by a device already, we have to reset
 * all data private to the namespace a device belongs to
 * before assigning it a new device. The dst reference is dropped
 * unconditionally.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
	bool crosses_netns;

	skb_dst_drop(skb);

	crosses_netns = skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev));
	if (crosses_netns) {
		/* Scrub state that must not leak into the other namespace. */
		secpath_reset(skb);
		nf_reset(skb);
		skb_init_secmark(skb);
		skb->mark = 0;
		skb->priority = 0;
		skb->nf_trace = 0;
		skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
		skb->tc_index = 0;
#endif
	}

	skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */
1877
1878/*
1879 * Invalidate hardware checksum when packet is to be mangled, and
1880 * complete checksum manually on outgoing path.
1881 */
1882int skb_checksum_help(struct sk_buff *skb)
1883{
1884	__wsum csum;
1885	int ret = 0, offset;
1886
1887	if (skb->ip_summed == CHECKSUM_COMPLETE)
1888		goto out_set_summed;
1889
1890	if (unlikely(skb_shinfo(skb)->gso_size)) {
1891		/* Let GSO fix up the checksum. */
1892		goto out_set_summed;
1893	}
1894
1895	offset = skb_checksum_start_offset(skb);
1896	BUG_ON(offset >= skb_headlen(skb));
1897	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1898
1899	offset += skb->csum_offset;
1900	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1901
1902	if (skb_cloned(skb) &&
1903	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1904		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1905		if (ret)
1906			goto out;
1907	}
1908
1909	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1910out_set_summed:
1911	skb->ip_summed = CHECKSUM_NONE;
1912out:
1913	return ret;
1914}
1915EXPORT_SYMBOL(skb_checksum_help);
1916
1917/**
1918 *	skb_gso_segment - Perform segmentation on skb.
1919 *	@skb: buffer to segment
1920 *	@features: features for the output path (see dev->features)
1921 *
1922 *	This function segments the given skb and returns a list of segments.
1923 *
1924 *	It may return NULL if the skb requires no segmentation.  This is
1925 *	only possible when GSO is used for verifying header integrity.
1926 */
1927struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
1928{
1929	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1930	struct packet_type *ptype;
1931	__be16 type = skb->protocol;
1932	int vlan_depth = ETH_HLEN;
1933	int err;
1934
1935	while (type == htons(ETH_P_8021Q)) {
1936		struct vlan_hdr *vh;
1937
1938		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1939			return ERR_PTR(-EINVAL);
1940
1941		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1942		type = vh->h_vlan_encapsulated_proto;
1943		vlan_depth += VLAN_HLEN;
1944	}
1945
1946	skb_reset_mac_header(skb);
1947	skb->mac_len = skb->network_header - skb->mac_header;
1948	__skb_pull(skb, skb->mac_len);
1949
1950	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1951		struct net_device *dev = skb->dev;
1952		struct ethtool_drvinfo info = {};
1953
1954		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1955			dev->ethtool_ops->get_drvinfo(dev, &info);
1956
1957		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1958		     info.driver, dev ? dev->features : 0L,
1959		     skb->sk ? skb->sk->sk_route_caps : 0L,
1960		     skb->len, skb->data_len, skb->ip_summed);
1961
1962		if (skb_header_cloned(skb) &&
1963		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1964			return ERR_PTR(err);
1965	}
1966
1967	rcu_read_lock();
1968	list_for_each_entry_rcu(ptype,
1969			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1970		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1971			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1972				err = ptype->gso_send_check(skb);
1973				segs = ERR_PTR(err);
1974				if (err || skb_gso_ok(skb, features))
1975					break;
1976				__skb_push(skb, (skb->data -
1977						 skb_network_header(skb)));
1978			}
1979			segs = ptype->gso_segment(skb, features);
1980			break;
1981		}
1982	}
1983	rcu_read_unlock();
1984
1985	__skb_push(skb, skb->data - skb_mac_header(skb));
1986
1987	return segs;
1988}
1989EXPORT_SYMBOL(skb_gso_segment);
1990
/*
 * Take action when hardware reception checksum errors are detected:
 * log the device name (or "<unknown>" when @dev is NULL) and dump the
 * stack, subject to net_ratelimit().
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	printk(KERN_ERR "%s: hw csum failure.\n",
		dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2003
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 if @skb has a page fragment that @dev cannot DMA from:
 * either a highmem page while the device lacks NETIF_F_HIGHDMA, or (on
 * buses where DMA addresses are physical) a page beyond the parent
 * device's DMA mask. Returns 0 otherwise, and always 0 without
 * CONFIG_HIGHMEM.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	struct device *pdev;
	int i;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < shinfo->nr_frags; i++) {
			if (PageHighMem(skb_frag_page(&shinfo->frags[i])))
				return 1;
		}
	}

	if (!(PCI_DMA_BUS_IS_PHYS))
		return 0;

	pdev = dev->dev.parent;
	if (!pdev)
		return 0;

	for (i = 0; i < shinfo->nr_frags; i++) {
		dma_addr_t addr = page_to_phys(skb_frag_page(&shinfo->frags[i]));

		/* Last byte of the page must be reachable through the mask. */
		if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
			return 1;
	}
#endif
	return 0;
}
2036
2037struct dev_gso_cb {
2038	void (*destructor)(struct sk_buff *skb);
2039};
2040
2041#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2042
2043static void dev_gso_skb_destructor(struct sk_buff *skb)
2044{
2045	struct dev_gso_cb *cb;
2046
2047	do {
2048		struct sk_buff *nskb = skb->next;
2049
2050		skb->next = nskb->next;
2051		nskb->next = NULL;
2052		kfree_skb(nskb);
2053	} while (skb->next);
2054
2055	cb = DEV_GSO_CB(skb);
2056	if (cb->destructor)
2057		cb->destructor(skb);
2058}
2059
2060/**
2061 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
2062 *	@skb: buffer to segment
2063 *	@features: device features as applicable to this skb
2064 *
2065 *	This function segments the given skb and stores the list of segments
2066 *	in skb->next.
2067 */
2068static int dev_gso_segment(struct sk_buff *skb, int features)
2069{
2070	struct sk_buff *segs;
2071
2072	segs = skb_gso_segment(skb, features);
2073
2074	/* Verifying header integrity only. */
2075	if (!segs)
2076		return 0;
2077
2078	if (IS_ERR(segs))
2079		return PTR_ERR(segs);
2080
2081	skb->next = segs;
2082	DEV_GSO_CB(skb)->destructor = skb->destructor;
2083	skb->destructor = dev_gso_skb_destructor;
2084
2085	return 0;
2086}
2087
2088/*
2089 * Try to orphan skb early, right before transmission by the device.
2090 * We cannot orphan skb if tx timestamp is requested or the sk-reference
2091 * is needed on driver level for other reasons, e.g. see net/can/raw.c
2092 */
2093static inline void skb_orphan_try(struct sk_buff *skb)
2094{
2095	struct sock *sk = skb->sk;
2096
2097	if (sk && !skb_shinfo(skb)->tx_flags) {
2098		/* skb_tx_hash() wont be able to get sk.
2099		 * We copy sk_hash into skb->rxhash
2100		 */
2101		if (!skb->rxhash)
2102			skb->rxhash = sk->sk_hash;
2103		skb_orphan(skb);
2104	}
2105}
2106
2107static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2108{
2109	return ((features & NETIF_F_GEN_CSUM) ||
2110		((features & NETIF_F_V4_CSUM) &&
2111		 protocol == htons(ETH_P_IP)) ||
2112		((features & NETIF_F_V6_CSUM) &&
2113		 protocol == htons(ETH_P_IPV6)) ||
2114		((features & NETIF_F_FCOE_CRC) &&
2115		 protocol == htons(ETH_P_FCOE)));
2116}
2117
2118static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2119{
2120	if (!can_checksum_protocol(features, protocol)) {
2121		features &= ~NETIF_F_ALL_CSUM;
2122		features &= ~NETIF_F_SG;
2123	} else if (illegal_highdma(skb->dev, skb)) {
2124		features &= ~NETIF_F_SG;
2125	}
2126
2127	return features;
2128}
2129
2130u32 netif_skb_features(struct sk_buff *skb)
2131{
2132	__be16 protocol = skb->protocol;
2133	u32 features = skb->dev->features;
2134
2135	if (protocol == htons(ETH_P_8021Q)) {
2136		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2137		protocol = veh->h_vlan_encapsulated_proto;
2138	} else if (!vlan_tx_tag_present(skb)) {
2139		return harmonize_features(skb, protocol, features);
2140	}
2141
2142	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2143
2144	if (protocol != htons(ETH_P_8021Q)) {
2145		return harmonize_features(skb, protocol, features);
2146	} else {
2147		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2148				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2149		return harmonize_features(skb, protocol, features);
2150	}
2151}
2152EXPORT_SYMBOL(netif_skb_features);
2153
2154/*
2155 * Returns true if either:
2156 *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2157 *	2. skb is fragmented and the device does not support SG, or if
2158 *	   at least one of fragments is in highmem and device does not
2159 *	   support DMA from it.
2160 */
2161static inline int skb_needs_linearize(struct sk_buff *skb,
2162				      int features)
2163{
2164	return skb_is_nonlinear(skb) &&
2165			((skb_has_frag_list(skb) &&
2166				!(features & NETIF_F_FRAGLIST)) ||
2167			(skb_shinfo(skb)->nr_frags &&
2168				!(features & NETIF_F_SG)));
2169}
2170
/*
 * Hand an skb to the driver's ndo_start_xmit().
 *
 * @skb: packet to send; if skb->next is already set it is treated as a
 *       partially sent GSO chain and only the segment loop is run.
 * @dev: device to transmit on.
 * @txq: TX queue used for trans_start updates and stop checks.
 *
 * For a single skb this performs the pre-transmit steps (dst release,
 * tap delivery, early orphan, software VLAN tag insertion, software GSO,
 * linearization and checksum completion) according to the features
 * returned by netif_skb_features().
 *
 * Returns the driver's return code.  On the internal failure paths the
 * skb is freed here and the current value of rc is returned, which is
 * still NETDEV_TX_OK if the driver has not been called yet.
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
			struct netdev_queue *txq)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int rc = NETDEV_TX_OK;
	unsigned int skb_len;

	if (likely(!skb->next)) {
		u32 features;

		/*
		 * If device doesn't need skb->dst, release it right now while
		 * its hot in this cpu cache
		 */
		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
			skb_dst_drop(skb);

		/* Give registered ptype_all taps a look at the packet. */
		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(skb, dev);

		skb_orphan_try(skb);

		features = netif_skb_features(skb);

		/* No hardware tag insertion: put the VLAN tag in-band. */
		if (vlan_tx_tag_present(skb) &&
		    !(features & NETIF_F_HW_VLAN_TX)) {
			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
			if (unlikely(!skb))
				goto out;

			skb->vlan_tci = 0;
		}

		if (netif_needs_gso(skb, features)) {
			if (unlikely(dev_gso_segment(skb, features)))
				goto out_kfree_skb;
			/* Segments were chained on skb->next: send them one by one. */
			if (skb->next)
				goto gso;
		} else {
			if (skb_needs_linearize(skb, features) &&
			    __skb_linearize(skb))
				goto out_kfree_skb;

			/* If packet is not checksummed and device does not
			 * support checksumming for this protocol, complete
			 * checksumming here.
			 */
			if (skb->ip_summed == CHECKSUM_PARTIAL) {
				skb_set_transport_header(skb,
					skb_checksum_start_offset(skb));
				if (!(features & NETIF_F_ALL_CSUM) &&
				     skb_checksum_help(skb))
					goto out_kfree_skb;
			}
		}

		/* Sample the length first; skb is not touched after the driver call. */
		skb_len = skb->len;
		rc = ops->ndo_start_xmit(skb, dev);
		trace_net_dev_xmit(skb, rc, dev, skb_len);
		if (rc == NETDEV_TX_OK)
			txq_trans_update(txq);
		return rc;
	}

gso:
	do {
		/* Unlink the next segment from the chain before sending it. */
		struct sk_buff *nskb = skb->next;

		skb->next = nskb->next;
		nskb->next = NULL;

		/*
		 * If device doesn't need nskb->dst, release it right now while
		 * its hot in this cpu cache
		 */
		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
			skb_dst_drop(nskb);

		skb_len = nskb->len;
		rc = ops->ndo_start_xmit(nskb, dev);
		trace_net_dev_xmit(nskb, rc, dev, skb_len);
		if (unlikely(rc != NETDEV_TX_OK)) {
			/* Codes with bits outside NETDEV_TX_MASK: give up on the chain. */
			if (rc & ~NETDEV_TX_MASK)
				goto out_kfree_gso_skb;
			/* Otherwise relink the segment at the head so it stays pending. */
			nskb->next = skb->next;
			skb->next = nskb;
			return rc;
		}
		txq_trans_update(txq);
		/* Queue got stopped with segments left: report busy, chain is kept. */
		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
			return NETDEV_TX_BUSY;
	} while (skb->next);

out_kfree_gso_skb:
	/*
	 * With no segment left, restore the original destructor so that
	 * dev_gso_skb_destructor() is not run on an empty chain.
	 */
	if (likely(skb->next == NULL))
		skb->destructor = DEV_GSO_CB(skb)->destructor;
out_kfree_skb:
	kfree_skb(skb);
out:
	return rc;
}
2272
/* Seed handed to the jhash_1word()/jhash_3words() calls in this file. */
static u32 hashrnd __read_mostly;
2274
2275/*
2276 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2277 * to be used as a distribution range.
2278 */
2279u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2280		  unsigned int num_tx_queues)
2281{
2282	u32 hash;
2283	u16 qoffset = 0;
2284	u16 qcount = num_tx_queues;
2285
2286	if (skb_rx_queue_recorded(skb)) {
2287		hash = skb_get_rx_queue(skb);
2288		while (unlikely(hash >= num_tx_queues))
2289			hash -= num_tx_queues;
2290		return hash;
2291	}
2292
2293	if (dev->num_tc) {
2294		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2295		qoffset = dev->tc_to_txq[tc].offset;
2296		qcount = dev->tc_to_txq[tc].count;
2297	}
2298
2299	if (skb->sk && skb->sk->sk_hash)
2300		hash = skb->sk->sk_hash;
2301	else
2302		hash = (__force u16) skb->protocol ^ skb->rxhash;
2303	hash = jhash_1word(hash, hashrnd);
2304
2305	return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2306}
2307EXPORT_SYMBOL(__skb_tx_hash);
2308
2309static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2310{
2311	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2312		if (net_ratelimit()) {
2313			pr_warning("%s selects TX queue %d, but "
2314				"real number of TX queues is %d\n",
2315				dev->name, queue_index, dev->real_num_tx_queues);
2316		}
2317		return 0;
2318	}
2319	return queue_index;
2320}
2321
/*
 * Look up a TX queue for @skb in the device's XPS map of the current
 * CPU.  Returns the queue index, or -1 when XPS is not configured, no
 * map exists for this CPU, or the mapped queue is beyond the device's
 * real number of TX queues.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *maps;
	struct xps_map *cpu_map = NULL;
	int picked = -1;

	rcu_read_lock();
	maps = rcu_dereference(dev->xps_maps);
	if (maps)
		cpu_map = rcu_dereference(
		    maps->cpu_map[raw_smp_processor_id()]);

	if (cpu_map) {
		if (cpu_map->len == 1) {
			picked = cpu_map->queues[0];
		} else {
			u32 hash;

			/* Several candidates: spread flows by hash. */
			if (!skb->sk || !skb->sk->sk_hash)
				hash = (__force u16) skb->protocol ^
				    skb->rxhash;
			else
				hash = skb->sk->sk_hash;
			hash = jhash_1word(hash, hashrnd);
			picked = cpu_map->queues[
			    ((u64)hash * cpu_map->len) >> 32];
		}
		if (unlikely(picked >= dev->real_num_tx_queues))
			picked = -1;
	}
	rcu_read_unlock();

	return picked;
#else
	return -1;
#endif
}
2359
2360static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2361					struct sk_buff *skb)
2362{
2363	int queue_index;
2364	const struct net_device_ops *ops = dev->netdev_ops;
2365
2366	if (dev->real_num_tx_queues == 1)
2367		queue_index = 0;
2368	else if (ops->ndo_select_queue) {
2369		queue_index = ops->ndo_select_queue(dev, skb);
2370		queue_index = dev_cap_txqueue(dev, queue_index);
2371	} else {
2372		struct sock *sk = skb->sk;
2373		queue_index = sk_tx_queue_get(sk);
2374
2375		if (queue_index < 0 || skb->ooo_okay ||
2376		    queue_index >= dev->real_num_tx_queues) {
2377			int old_index = queue_index;
2378
2379			queue_index = get_xps_queue(dev, skb);
2380			if (queue_index < 0)
2381				queue_index = skb_tx_hash(dev, skb);
2382
2383			if (queue_index != old_index && sk) {
2384				struct dst_entry *dst =
2385				    rcu_dereference_check(sk->sk_dst_cache, 1);
2386
2387				if (dst && skb_dst(skb) == dst)
2388					sk_tx_queue_set(sk, queue_index);
2389			}
2390		}
2391	}
2392
2393	skb_set_queue_mapping(skb, queue_index);
2394	return netdev_get_tx_queue(dev, queue_index);
2395}
2396
/*
 * Submit @skb to qdisc @q of @txq on @dev.
 *
 * Under the qdisc root lock the skb is either dropped (qdisc
 * deactivated), transmitted directly (bypass-capable, empty and idle
 * qdisc) or enqueued, after which the qdisc is run if this context
 * managed to become its runner.
 *
 * Returns NET_XMIT_DROP, NET_XMIT_SUCCESS, or the enqueue result masked
 * with NET_XMIT_MASK.
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	bool contended;
	int rc;

	qdisc_skb_cb(skb)->pkt_len = skb->len;
	qdisc_calculate_pkt_len(skb, q);
	/*
	 * Heuristic to force contended enqueues to serialize on a
	 * separate lock before trying to get qdisc main lock.
	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
	 * and dequeue packets faster.
	 */
	contended = qdisc_is_running(q);
	if (unlikely(contended))
		spin_lock(&q->busylock);

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		kfree_skb(skb);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   qdisc_run_begin(q)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */
		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
			skb_dst_force(skb);

		qdisc_bstats_update(q, skb);

		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
			/* Release busylock before the run so waiters can proceed. */
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		} else
			qdisc_run_end(q);

		rc = NET_XMIT_SUCCESS;
	} else {
		skb_dst_force(skb);
		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
		if (qdisc_run_begin(q)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		}
	}
	spin_unlock(root_lock);
	/* Still holding busylock only if it was not dropped above. */
	if (unlikely(contended))
		spin_unlock(&q->busylock);
	return rc;
}
2459
/*
 * Per-CPU depth counter bumped around dev_hard_start_xmit() on the
 * queueless path of dev_queue_xmit(); transmission is refused once the
 * counter exceeds RECURSION_LIMIT.
 */
static DEFINE_PER_CPU(int, xmit_recursion);
#define RECURSION_LIMIT 10
2462
/**
 *	dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 */
int dev_queue_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	txq = dev_pick_tx(dev, skb);
	q = rcu_dereference_bh(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
	trace_net_dev_queue(skb);
	/* A qdisc with an enqueue method handles the packet from here. */
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all the sorts of tunnels...

	   It is unlikely that netif_tx_lock protection is really necessary
	   here (e.g. loopback and IP tunnels are clean, ignoring statistics
	   counters).  However, such devices may rely on the protection
	   provided by us here.

	   So check the lock owner and take the lock; this cannot deadlock.
	   The noqueue qdisc case is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		/* Owning the lock already means we re-entered on this CPU. */
		if (txq->xmit_lock_owner != cpu) {

			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
				goto recursion_alert;

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_tx_queue_stopped(txq)) {
				__this_cpu_inc(xmit_recursion);
				rc = dev_hard_start_xmit(skb, dev, txq);
				__this_cpu_dec(xmit_recursion);
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			if (net_ratelimit())
				printk(KERN_CRIT "Virtual device %s asks to "
				       "queue packet!\n", dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
recursion_alert:
			if (net_ratelimit())
				printk(KERN_CRIT "Dead loop on virtual device "
				       "%s, fix it urgently!\n", dev->name);
		}
	}

	/* Device down, queue stopped, or recursion: drop the packet. */
	rc = -ENETDOWN;
	rcu_read_unlock_bh();

	kfree_skb(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
EXPORT_SYMBOL(dev_queue_xmit);
2568
2569
2570/*=======================================================================
2571			Receiver routines
2572  =======================================================================*/
2573
/* Limit on the per-CPU input_pkt_queue length checked in enqueue_to_backlog(). */
int netdev_max_backlog __read_mostly = 1000;
/* Non-zero: timestamp check happens in netif_rx(); zero: in __netif_receive_skb(). */
int netdev_tstamp_prequeue __read_mostly = 1;
/* NOTE(review): presumably the per-softirq packet budget; its use is outside this chunk. */
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */
2578
2579/* Called with irq disabled */
2580static inline void ____napi_schedule(struct softnet_data *sd,
2581				     struct napi_struct *napi)
2582{
2583	list_add_tail(&napi->poll_list, &sd->poll_list);
2584	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2585}
2586
2587/*
2588 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2589 * and src/dst port numbers.  Sets rxhash in skb to non-zero hash value
2590 * on success, zero indicates no valid hash.  Also, sets l4_rxhash in skb
2591 * if hash is a canonical 4-tuple hash over transport ports.
2592 */
2593void __skb_get_rxhash(struct sk_buff *skb)
2594{
2595	int nhoff, hash = 0, poff;
2596	const struct ipv6hdr *ip6;
2597	const struct iphdr *ip;
2598	const struct vlan_hdr *vlan;
2599	u8 ip_proto;
2600	u32 addr1, addr2;
2601	u16 proto;
2602	union {
2603		u32 v32;
2604		u16 v16[2];
2605	} ports;
2606
2607	nhoff = skb_network_offset(skb);
2608	proto = skb->protocol;
2609
2610again:
2611	switch (proto) {
2612	case __constant_htons(ETH_P_IP):
2613ip:
2614		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2615			goto done;
2616
2617		ip = (const struct iphdr *) (skb->data + nhoff);
2618		if (ip_is_fragment(ip))
2619			ip_proto = 0;
2620		else
2621			ip_proto = ip->protocol;
2622		addr1 = (__force u32) ip->saddr;
2623		addr2 = (__force u32) ip->daddr;
2624		nhoff += ip->ihl * 4;
2625		break;
2626	case __constant_htons(ETH_P_IPV6):
2627ipv6:
2628		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2629			goto done;
2630
2631		ip6 = (const struct ipv6hdr *) (skb->data + nhoff);
2632		ip_proto = ip6->nexthdr;
2633		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2634		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2635		nhoff += 40;
2636		break;
2637	case __constant_htons(ETH_P_8021Q):
2638		if (!pskb_may_pull(skb, sizeof(*vlan) + nhoff))
2639			goto done;
2640		vlan = (const struct vlan_hdr *) (skb->data + nhoff);
2641		proto = vlan->h_vlan_encapsulated_proto;
2642		nhoff += sizeof(*vlan);
2643		goto again;
2644	case __constant_htons(ETH_P_PPP_SES):
2645		if (!pskb_may_pull(skb, PPPOE_SES_HLEN + nhoff))
2646			goto done;
2647		proto = *((__be16 *) (skb->data + nhoff +
2648				      sizeof(struct pppoe_hdr)));
2649		nhoff += PPPOE_SES_HLEN;
2650		switch (proto) {
2651		case __constant_htons(PPP_IP):
2652			goto ip;
2653		case __constant_htons(PPP_IPV6):
2654			goto ipv6;
2655		default:
2656			goto done;
2657		}
2658	default:
2659		goto done;
2660	}
2661
2662	switch (ip_proto) {
2663	case IPPROTO_GRE:
2664		if (pskb_may_pull(skb, nhoff + 16)) {
2665			u8 *h = skb->data + nhoff;
2666			__be16 flags = *(__be16 *)h;
2667
2668			/*
2669			 * Only look inside GRE if version zero and no
2670			 * routing
2671			 */
2672			if (!(flags & (GRE_VERSION|GRE_ROUTING))) {
2673				proto = *(__be16 *)(h + 2);
2674				nhoff += 4;
2675				if (flags & GRE_CSUM)
2676					nhoff += 4;
2677				if (flags & GRE_KEY)
2678					nhoff += 4;
2679				if (flags & GRE_SEQ)
2680					nhoff += 4;
2681				goto again;
2682			}
2683		}
2684		break;
2685	case IPPROTO_IPIP:
2686		goto again;
2687	default:
2688		break;
2689	}
2690
2691	ports.v32 = 0;
2692	poff = proto_ports_offset(ip_proto);
2693	if (poff >= 0) {
2694		nhoff += poff;
2695		if (pskb_may_pull(skb, nhoff + 4)) {
2696			ports.v32 = * (__force u32 *) (skb->data + nhoff);
2697			if (ports.v16[1] < ports.v16[0])
2698				swap(ports.v16[0], ports.v16[1]);
2699			skb->l4_rxhash = 1;
2700		}
2701	}
2702
2703	/* get a consistent hash (same value on both flow directions) */
2704	if (addr2 < addr1)
2705		swap(addr1, addr2);
2706
2707	hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2708	if (!hash)
2709		hash = 1;
2710
2711done:
2712	skb->rxhash = hash;
2713}
2714EXPORT_SYMBOL(__skb_get_rxhash);
2715
2716#ifdef CONFIG_RPS
2717
/*
 * One global table that all flow-based protocols share.
 * Read under RCU in get_rps_cpu().
 */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
2721
/*
 * Point flow entry @rflow at @next_cpu and return the entry that now
 * describes the flow.
 *
 * With CONFIG_RFS_ACCEL, and when the device supports it, the driver is
 * additionally asked (ndo_rx_flow_steer) to steer the flow to the
 * hardware queue associated with @next_cpu; on success the entry in that
 * queue's flow table becomes the returned entry and records the filter
 * id.  In all cases the returned entry's last_qtail is refreshed from
 * @next_cpu's input_queue_head (unless @next_cpu is RPS_NO_CPU) and its
 * cpu is set to @next_cpu.
 */
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
	    struct rps_dev_flow *rflow, u16 next_cpu)
{
	if (next_cpu != RPS_NO_CPU) {
#ifdef CONFIG_RFS_ACCEL
		struct netdev_rx_queue *rxqueue;
		struct rps_dev_flow_table *flow_table;
		struct rps_dev_flow *old_rflow;
		u32 flow_id;
		u16 rxq_index;
		int rc;

		/* Should we steer this flow to a different hardware queue? */
		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
		    !(dev->features & NETIF_F_NTUPLE))
			goto out;
		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
		/* Already arriving on the right queue: nothing to steer. */
		if (rxq_index == skb_get_rx_queue(skb))
			goto out;

		rxqueue = dev->_rx + rxq_index;
		flow_table = rcu_dereference(rxqueue->rps_flow_table);
		if (!flow_table)
			goto out;
		flow_id = skb->rxhash & flow_table->mask;
		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
							rxq_index, flow_id);
		if (rc < 0)
			goto out;
		/* Switch to the entry in the target queue's table. */
		old_rflow = rflow;
		rflow = &flow_table->flows[flow_id];
		rflow->filter = rc;
		/* The old entry no longer owns this filter id. */
		if (old_rflow->filter == rflow->filter)
			old_rflow->filter = RPS_NO_FILTER;
	out:
#endif
		rflow->last_qtail =
			per_cpu(softnet_data, next_cpu).input_queue_head;
	}

	rflow->cpu = next_cpu;
	return rflow;
}
2766
2767/*
2768 * get_rps_cpu is called from netif_receive_skb and returns the target
2769 * CPU from the RPS map of the receiving queue for a given skb.
2770 * rcu_read_lock must be held on entry.
2771 */
2772static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2773		       struct rps_dev_flow **rflowp)
2774{
2775	struct netdev_rx_queue *rxqueue;
2776	struct rps_map *map;
2777	struct rps_dev_flow_table *flow_table;
2778	struct rps_sock_flow_table *sock_flow_table;
2779	int cpu = -1;
2780	u16 tcpu;
2781
2782	if (skb_rx_queue_recorded(skb)) {
2783		u16 index = skb_get_rx_queue(skb);
2784		if (unlikely(index >= dev->real_num_rx_queues)) {
2785			WARN_ONCE(dev->real_num_rx_queues > 1,
2786				  "%s received packet on queue %u, but number "
2787				  "of RX queues is %u\n",
2788				  dev->name, index, dev->real_num_rx_queues);
2789			goto done;
2790		}
2791		rxqueue = dev->_rx + index;
2792	} else
2793		rxqueue = dev->_rx;
2794
2795	map = rcu_dereference(rxqueue->rps_map);
2796	if (map) {
2797		if (map->len == 1 &&
2798		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
2799			tcpu = map->cpus[0];
2800			if (cpu_online(tcpu))
2801				cpu = tcpu;
2802			goto done;
2803		}
2804	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2805		goto done;
2806	}
2807
2808	skb_reset_network_header(skb);
2809	if (!skb_get_rxhash(skb))
2810		goto done;
2811
2812	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2813	sock_flow_table = rcu_dereference(rps_sock_flow_table);
2814	if (flow_table && sock_flow_table) {
2815		u16 next_cpu;
2816		struct rps_dev_flow *rflow;
2817
2818		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2819		tcpu = rflow->cpu;
2820
2821		next_cpu = sock_flow_table->ents[skb->rxhash &
2822		    sock_flow_table->mask];
2823
2824		/*
2825		 * If the desired CPU (where last recvmsg was done) is
2826		 * different from current CPU (one in the rx-queue flow
2827		 * table entry), switch if one of the following holds:
2828		 *   - Current CPU is unset (equal to RPS_NO_CPU).
2829		 *   - Current CPU is offline.
2830		 *   - The current CPU's queue tail has advanced beyond the
2831		 *     last packet that was enqueued using this table entry.
2832		 *     This guarantees that all previous packets for the flow
2833		 *     have been dequeued, thus preserving in order delivery.
2834		 */
2835		if (unlikely(tcpu != next_cpu) &&
2836		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2837		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2838		      rflow->last_qtail)) >= 0))
2839			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2840
2841		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2842			*rflowp = rflow;
2843			cpu = tcpu;
2844			goto done;
2845		}
2846	}
2847
2848	if (map) {
2849		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2850
2851		if (cpu_online(tcpu)) {
2852			cpu = tcpu;
2853			goto done;
2854		}
2855	}
2856
2857done:
2858	return cpu;
2859}
2860
2861#ifdef CONFIG_RFS_ACCEL
2862
2863/**
2864 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2865 * @dev: Device on which the filter was set
2866 * @rxq_index: RX queue index
2867 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2868 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2869 *
2870 * Drivers that implement ndo_rx_flow_steer() should periodically call
2871 * this function for each installed filter and remove the filters for
2872 * which it returns %true.
2873 */
2874bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2875			 u32 flow_id, u16 filter_id)
2876{
2877	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2878	struct rps_dev_flow_table *flow_table;
2879	struct rps_dev_flow *rflow;
2880	bool expire = true;
2881	int cpu;
2882
2883	rcu_read_lock();
2884	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2885	if (flow_table && flow_id <= flow_table->mask) {
2886		rflow = &flow_table->flows[flow_id];
2887		cpu = ACCESS_ONCE(rflow->cpu);
2888		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2889		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2890			   rflow->last_qtail) <
2891		     (int)(10 * flow_table->mask)))
2892			expire = false;
2893	}
2894	rcu_read_unlock();
2895	return expire;
2896}
2897EXPORT_SYMBOL(rps_may_expire_flow);
2898
2899#endif /* CONFIG_RFS_ACCEL */
2900
2901/* Called from hardirq (IPI) context */
2902static void rps_trigger_softirq(void *data)
2903{
2904	struct softnet_data *sd = data;
2905
2906	____napi_schedule(sd, &sd->backlog);
2907	sd->received_rps++;
2908}
2909
2910#endif /* CONFIG_RPS */
2911
/*
 * Tell whether @sd belongs to another CPU.
 * If so, chain it on this CPU's IPI list, raise NET_RX_SOFTIRQ and
 * return 1.  Otherwise (or without CONFIG_RPS) return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local = &__get_cpu_var(softnet_data);

	if (sd == local)
		return 0;

	/* Push sd on the head of our pending-IPI list. */
	sd->rps_ipi_next = local->rps_ipi_list;
	local->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
2932
2933/*
2934 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2935 * queue (may be a remote CPU queue).
2936 */
2937static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2938			      unsigned int *qtail)
2939{
2940	struct softnet_data *sd;
2941	unsigned long flags;
2942
2943	sd = &per_cpu(softnet_data, cpu);
2944
2945	local_irq_save(flags);
2946
2947	rps_lock(sd);
2948	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2949		if (skb_queue_len(&sd->input_pkt_queue)) {
2950enqueue:
2951			__skb_queue_tail(&sd->input_pkt_queue, skb);
2952			input_queue_tail_incr_save(sd, qtail);
2953			rps_unlock(sd);
2954			local_irq_restore(flags);
2955			return NET_RX_SUCCESS;
2956		}
2957
2958		/* Schedule NAPI for backlog device
2959		 * We can use non atomic operation since we own the queue lock
2960		 */
2961		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2962			if (!rps_ipi_queued(sd))
2963				____napi_schedule(sd, &sd->backlog);
2964		}
2965		goto enqueue;
2966	}
2967
2968	sd->dropped++;
2969	rps_unlock(sd);
2970
2971	local_irq_restore(flags);
2972
2973	atomic_long_inc(&skb->dev->rx_dropped);
2974	kfree_skb(skb);
2975	return NET_RX_DROP;
2976}
2977
2978/**
2979 *	netif_rx	-	post buffer to the network code
2980 *	@skb: buffer to post
2981 *
2982 *	This function receives a packet from a device driver and queues it for
2983 *	the upper (protocol) levels to process.  It always succeeds. The buffer
2984 *	may be dropped during processing for congestion control or by the
2985 *	protocol layers.
2986 *
2987 *	return values:
2988 *	NET_RX_SUCCESS	(no congestion)
2989 *	NET_RX_DROP     (packet was dropped)
2990 *
2991 */
2992
2993int netif_rx(struct sk_buff *skb)
2994{
2995	int ret;
2996
2997	/* if netpoll wants it, pretend we never saw it */
2998	if (netpoll_rx(skb))
2999		return NET_RX_DROP;
3000
3001	if (netdev_tstamp_prequeue)
3002		net_timestamp_check(skb);
3003
3004	trace_netif_rx(skb);
3005#ifdef CONFIG_RPS
3006	{
3007		struct rps_dev_flow voidflow, *rflow = &voidflow;
3008		int cpu;
3009
3010		preempt_disable();
3011		rcu_read_lock();
3012
3013		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3014		if (cpu < 0)
3015			cpu = smp_processor_id();
3016
3017		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3018
3019		rcu_read_unlock();
3020		preempt_enable();
3021	}
3022#else
3023	{
3024		unsigned int qtail;
3025		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3026		put_cpu();
3027	}
3028#endif
3029	return ret;
3030}
3031EXPORT_SYMBOL(netif_rx);
3032
/*
 * Variant of netif_rx() that, with preemption disabled, runs any
 * softirq left pending after queueing the packet.  Returns the
 * netif_rx() result.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int status;

	preempt_disable();

	status = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return status;
}
EXPORT_SYMBOL(netif_rx_ni);
3046
/*
 * TX softirq handler for this CPU's softnet_data.
 *
 * First frees every skb parked on the completion queue, then walks the
 * list of scheduled qdiscs and runs each one whose root lock can be
 * taken without spinning.
 */
static void net_tx_action(struct softirq_action *h)
{
	struct softnet_data *sd = &__get_cpu_var(softnet_data);

	if (sd->completion_queue) {
		struct sk_buff *clist;

		/* Detach the whole list with irqs off, process it with irqs on. */
		local_irq_disable();
		clist = sd->completion_queue;
		sd->completion_queue = NULL;
		local_irq_enable();

		while (clist) {
			struct sk_buff *skb = clist;
			clist = clist->next;

			/* skbs on this list are expected to have no users left. */
			WARN_ON(atomic_read(&skb->users));
			trace_kfree_skb(skb, net_tx_action);
			__kfree_skb(skb);
		}
	}

	if (sd->output_queue) {
		struct Qdisc *head;

		/* Same pattern: grab the list and reset the tail pointer. */
		local_irq_disable();
		head = sd->output_queue;
		sd->output_queue = NULL;
		sd->output_queue_tailp = &sd->output_queue;
		local_irq_enable();

		while (head) {
			struct Qdisc *q = head;
			spinlock_t *root_lock;

			head = head->next_sched;

			root_lock = qdisc_lock(q);
			if (spin_trylock(root_lock)) {
				smp_mb__before_clear_bit();
				clear_bit(__QDISC_STATE_SCHED,
					  &q->state);
				qdisc_run(q);
				spin_unlock(root_lock);
			} else {
				/*
				 * Lock is busy: try again later unless the
				 * qdisc was deactivated, in which case just
				 * drop the scheduled state.
				 */
				if (!test_bit(__QDISC_STATE_DEACTIVATED,
					      &q->state)) {
					__netif_reschedule(q);
				} else {
					smp_mb__before_clear_bit();
					clear_bit(__QDISC_STATE_SCHED,
						  &q->state);
				}
			}
		}
	}
}
3104
#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/*
 * This hook is defined here for ATM LANE.
 * It only exists when both bridge and ATM LANE support are configured.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
3112
3113#ifdef CONFIG_NET_CLS_ACT
3114/* TODO: Maybe we should just force sch_ingress to be compiled in
3115 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3116 * a compare and 2 stores extra right now if we dont have it on
3117 * but have CONFIG_NET_CLS_ACT
3118 * NOTE: This doesn't stop any functionality; if you dont have
3119 * the ingress scheduler, you just can't add policies on ingress.
3120 *
3121 */
3122static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3123{
3124	struct net_device *dev = skb->dev;
3125	u32 ttl = G_TC_RTTL(skb->tc_verd);
3126	int result = TC_ACT_OK;
3127	struct Qdisc *q;
3128
3129	if (unlikely(MAX_RED_LOOP < ttl++)) {
3130		if (net_ratelimit())
3131			pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
3132			       skb->skb_iif, dev->ifindex);
3133		return TC_ACT_SHOT;
3134	}
3135
3136	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3137	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3138
3139	q = rxq->qdisc;
3140	if (q != &noop_qdisc) {
3141		spin_lock(qdisc_lock(q));
3142		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3143			result = qdisc_enqueue_root(skb, q);
3144		spin_unlock(qdisc_lock(q));
3145	}
3146
3147	return result;
3148}
3149
3150static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3151					 struct packet_type **pt_prev,
3152					 int *ret, struct net_device *orig_dev)
3153{
3154	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3155
3156	if (!rxq || rxq->qdisc == &noop_qdisc)
3157		goto out;
3158
3159	if (*pt_prev) {
3160		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3161		*pt_prev = NULL;
3162	}
3163
3164	switch (ing_filter(skb, rxq)) {
3165	case TC_ACT_SHOT:
3166	case TC_ACT_STOLEN:
3167		kfree_skb(skb);
3168		return NULL;
3169	}
3170
3171out:
3172	skb->tc_verd = 0;
3173	return skb;
3174}
3175#endif
3176
3177/**
3178 *	netdev_rx_handler_register - register receive handler
3179 *	@dev: device to register a handler for
3180 *	@rx_handler: receive handler to register
3181 *	@rx_handler_data: data pointer that is used by rx handler
3182 *
3183 *	Register a receive hander for a device. This handler will then be
3184 *	called from __netif_receive_skb. A negative errno code is returned
3185 *	on a failure.
3186 *
3187 *	The caller must hold the rtnl_mutex.
3188 *
3189 *	For a general description of rx_handler, see enum rx_handler_result.
3190 */
3191int netdev_rx_handler_register(struct net_device *dev,
3192			       rx_handler_func_t *rx_handler,
3193			       void *rx_handler_data)
3194{
3195	ASSERT_RTNL();
3196
3197	if (dev->rx_handler)
3198		return -EBUSY;
3199
3200	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3201	rcu_assign_pointer(dev->rx_handler, rx_handler);
3202
3203	return 0;
3204}
3205EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3206
3207/**
3208 *	netdev_rx_handler_unregister - unregister receive handler
3209 *	@dev: device to unregister a handler from
3210 *
 *	Unregister a receive handler from a device.
3212 *
3213 *	The caller must hold the rtnl_mutex.
3214 */
3215void netdev_rx_handler_unregister(struct net_device *dev)
3216{
3217
3218	ASSERT_RTNL();
3219	RCU_INIT_POINTER(dev->rx_handler, NULL);
3220	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3221}
3222EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3223
/*
 * Core receive path: hand one skb to every interested consumer.
 *
 * In order, the packet is offered to the ptype_all taps, the ingress
 * qdisc (CONFIG_NET_CLS_ACT), VLAN processing, the device's rx_handler and
 * finally the protocol handlers hashed in ptype_base.  The whole walk runs
 * inside one rcu_read_lock() section.
 *
 * Delivery is deferred by one step through @pt_prev: a matching handler is
 * only invoked via deliver_skb() once the next match (or a stage that may
 * consume the skb) is reached, and the very last match is called directly
 * through ->func.
 *
 * Returns the result of the last delivery, or NET_RX_DROP when netpoll
 * claimed the packet or no handler accepted it.
 */
static int __netif_receive_skb(struct sk_buff *skb)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct net_device *orig_dev;
	struct net_device *null_or_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;

	/* Timestamp here unless it was already done before queueing. */
	if (!netdev_tstamp_prequeue)
		net_timestamp_check(skb);

	trace_netif_receive_skb(skb);

	/* if we've gotten here through NAPI, check netpoll */
	if (netpoll_receive_skb(skb))
		return NET_RX_DROP;

	/* Record the ingress interface only if nobody has set it yet. */
	if (!skb->skb_iif)
		skb->skb_iif = skb->dev->ifindex;
	orig_dev = skb->dev;

	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb_reset_mac_len(skb);

	pt_prev = NULL;

	rcu_read_lock();

	/*
	 * Re-entry point: taken again when vlan_do_receive() returns true or
	 * the rx_handler returns RX_HANDLER_ANOTHER.
	 */
another_round:

	__this_cpu_inc(softnet_data.processed);

	if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
		skb = vlan_untag(skb);
		if (unlikely(!skb))
			goto out;
	}

#ifdef CONFIG_NET_CLS_ACT
	/* TC_NCLS set: clear it and skip both the taps and the ingress qdisc. */
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	/* Taps registered for all protocols, globally or on this device. */
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (!ptype->dev || ptype->dev == skb->dev) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

#ifdef CONFIG_NET_CLS_ACT
	/* A NULL return means the ingress path dropped or stole the skb. */
	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
ncls:
#endif

	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (vlan_tx_tag_present(skb)) {
		/* Flush the pending tap before VLAN code may replace the skb. */
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		if (vlan_do_receive(&skb, !rx_handler))
			goto another_round;
		else if (unlikely(!skb))
			goto out;
	}

	if (rx_handler) {
		/* Flush the pending tap before the handler may take the skb. */
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		switch (rx_handler(&skb)) {
		case RX_HANDLER_CONSUMED:
			goto out;
		case RX_HANDLER_ANOTHER:
			goto another_round;
		case RX_HANDLER_EXACT:
			deliver_exact = true;
			/* deliberate fall through to RX_HANDLER_PASS */
		case RX_HANDLER_PASS:
			break;
		default:
			BUG();
		}
	}

	/* deliver only exact match when indicated */
	null_or_dev = deliver_exact ? skb->dev : NULL;

	/*
	 * Protocol handlers: accept wildcard handlers (dev == NULL, unless
	 * exact delivery was requested) and those bound to the current or
	 * the original device.
	 */
	type = skb->protocol;
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type &&
		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
		     ptype->dev == orig_dev)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	if (pt_prev) {
		/* The last matching handler is called directly. */
		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
		/* Nobody wanted the packet: account the drop and free it. */
		atomic_long_inc(&skb->dev->rx_dropped);
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

out:
	rcu_read_unlock();
	return ret;
}
3348
3349/**
3350 *	netif_receive_skb - process receive buffer from network
3351 *	@skb: buffer to process
3352 *
3353 *	netif_receive_skb() is the main receive data processing function.
3354 *	It always succeeds. The buffer may be dropped during processing
3355 *	for congestion control or by the protocol layers.
3356 *
3357 *	This function may only be called from softirq context and interrupts
3358 *	should be enabled.
3359 *
3360 *	Return values (usually ignored):
3361 *	NET_RX_SUCCESS: no congestion
3362 *	NET_RX_DROP: packet was dropped
3363 */
3364int netif_receive_skb(struct sk_buff *skb)
3365{
3366	if (netdev_tstamp_prequeue)
3367		net_timestamp_check(skb);
3368
3369	if (skb_defer_rx_timestamp(skb))
3370		return NET_RX_SUCCESS;
3371
3372#ifdef CONFIG_RPS
3373	{
3374		struct rps_dev_flow voidflow, *rflow = &voidflow;
3375		int cpu, ret;
3376
3377		rcu_read_lock();
3378
3379		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3380
3381		if (cpu >= 0) {
3382			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3383			rcu_read_unlock();
3384		} else {
3385			rcu_read_unlock();
3386			ret = __netif_receive_skb(skb);
3387		}
3388
3389		return ret;
3390	}
3391#else
3392	return __netif_receive_skb(skb);
3393#endif
3394}
3395EXPORT_SYMBOL(netif_receive_skb);
3396
3397/* Network device is going away, flush any packets still pending
3398 * Called with irqs disabled.
3399 */
3400static void flush_backlog(void *arg)
3401{
3402	struct net_device *dev = arg;
3403	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3404	struct sk_buff *skb, *tmp;
3405
3406	rps_lock(sd);
3407	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3408		if (skb->dev == dev) {
3409			__skb_unlink(skb, &sd->input_pkt_queue);
3410			kfree_skb(skb);
3411			input_queue_head_incr(sd);
3412		}
3413	}
3414	rps_unlock(sd);
3415
3416	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3417		if (skb->dev == dev) {
3418			__skb_unlink(skb, &sd->process_queue);
3419			kfree_skb(skb);
3420			input_queue_head_incr(sd);
3421		}
3422	}
3423}
3424
3425static int napi_gro_complete(struct sk_buff *skb)
3426{
3427	struct packet_type *ptype;
3428	__be16 type = skb->protocol;
3429	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3430	int err = -ENOENT;
3431
3432	if (NAPI_GRO_CB(skb)->count == 1) {
3433		skb_shinfo(skb)->gso_size = 0;
3434		goto out;
3435	}
3436
3437	rcu_read_lock();
3438	list_for_each_entry_rcu(ptype, head, list) {
3439		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3440			continue;
3441
3442		err = ptype->gro_complete(skb);
3443		break;
3444	}
3445	rcu_read_unlock();
3446
3447	if (err) {
3448		WARN_ON(&ptype->list == head);
3449		kfree_skb(skb);
3450		return NET_RX_SUCCESS;
3451	}
3452
3453out:
3454	return netif_receive_skb(skb);
3455}
3456
3457inline void napi_gro_flush(struct napi_struct *napi)
3458{
3459	struct sk_buff *skb, *next;
3460
3461	for (skb = napi->gro_list; skb; skb = next) {
3462		next = skb->next;
3463		skb->next = NULL;
3464		napi_gro_complete(skb);
3465	}
3466
3467	napi->gro_count = 0;
3468	napi->gro_list = NULL;
3469}
3470EXPORT_SYMBOL(napi_gro_flush);
3471
/*
 * Try to merge @skb into one of the packets held on @napi->gro_list.
 *
 * Returns GRO_NORMAL when the packet must take the regular receive path,
 * GRO_HELD when it was added to the GRO list as a new flow, and GRO_MERGED
 * or GRO_MERGED_FREE (chosen by the ->free flag of the control block) when
 * the protocol's ->gro_receive reported the packet as part of an existing
 * flow.
 */
enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct packet_type *ptype;
	__be16 type = skb->protocol;
	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
	int same_flow;
	int mac_len;
	enum gro_result ret;

	/* GRO is skipped when the device has it off or netpoll is receiving. */
	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
		goto normal;

	/* Packets that are already GSO or carry a frag list are not merged. */
	if (skb_is_gso(skb) || skb_has_frag_list(skb))
		goto normal;

	/* Use the first device-independent handler offering ->gro_receive. */
	rcu_read_lock();
	list_for_each_entry_rcu(ptype, head, list) {
		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
			continue;

		skb_set_network_header(skb, skb_gro_offset(skb));
		mac_len = skb->network_header - skb->mac_header;
		skb->mac_len = mac_len;
		NAPI_GRO_CB(skb)->same_flow = 0;
		NAPI_GRO_CB(skb)->flush = 0;
		NAPI_GRO_CB(skb)->free = 0;

		pp = ptype->gro_receive(&napi->gro_list, skb);
		break;
	}
	rcu_read_unlock();

	/* The walk reached the list head: no handler was found. */
	if (&ptype->list == head)
		goto normal;

	same_flow = NAPI_GRO_CB(skb)->same_flow;
	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

	/*
	 * A non-NULL pp points at the list slot of a held packet that must be
	 * completed now: unlink it and send it up the stack.
	 */
	if (pp) {
		struct sk_buff *nskb = *pp;

		*pp = nskb->next;
		nskb->next = NULL;
		napi_gro_complete(nskb);
		napi->gro_count--;
	}

	/* Merged into an existing flow: return without touching the skb. */
	if (same_flow)
		goto ok;

	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
		goto normal;

	/* Start a new flow: hold the packet at the front of the GRO list. */
	napi->gro_count++;
	NAPI_GRO_CB(skb)->count = 1;
	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
	skb->next = napi->gro_list;
	napi->gro_list = skb;
	ret = GRO_HELD;

	/*
	 * Reached for both GRO_HELD and GRO_NORMAL: if the GRO offset lies
	 * beyond the linear area, copy the missing bytes from frag0 into the
	 * linear area and shrink the first page fragment accordingly.
	 */
pull:
	if (skb_headlen(skb) < skb_gro_offset(skb)) {
		int grow = skb_gro_offset(skb) - skb_headlen(skb);

		BUG_ON(skb->end - skb->tail < grow);

		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);

		skb->tail += grow;
		skb->data_len -= grow;

		skb_shinfo(skb)->frags[0].page_offset += grow;
		skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);

		/* First fragment is now empty: release it and close the gap. */
		if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
			skb_frag_unref(skb, 0);
			memmove(skb_shinfo(skb)->frags,
				skb_shinfo(skb)->frags + 1,
				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
		}
	}

ok:
	return ret;

normal:
	ret = GRO_NORMAL;
	goto pull;
}
3562EXPORT_SYMBOL(dev_gro_receive);
3563
3564static inline gro_result_t
3565__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3566{
3567	struct sk_buff *p;
3568
3569	for (p = napi->gro_list; p; p = p->next) {
3570		unsigned long diffs;
3571
3572		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3573		diffs |= p->vlan_tci ^ skb->vlan_tci;
3574		diffs |= compare_ether_header(skb_mac_header(p),
3575					      skb_gro_mac_header(skb));
3576		NAPI_GRO_CB(p)->same_flow = !diffs;
3577		NAPI_GRO_CB(p)->flush = 0;
3578	}
3579
3580	return dev_gro_receive(napi, skb);
3581}
3582
3583gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3584{
3585	switch (ret) {
3586	case GRO_NORMAL:
3587		if (netif_receive_skb(skb))
3588			ret = GRO_DROP;
3589		break;
3590
3591	case GRO_DROP:
3592	case GRO_MERGED_FREE:
3593		kfree_skb(skb);
3594		break;
3595
3596	case GRO_HELD:
3597	case GRO_MERGED:
3598		break;
3599	}
3600
3601	return ret;
3602}
3603EXPORT_SYMBOL(napi_skb_finish);
3604
3605void skb_gro_reset_offset(struct sk_buff *skb)
3606{
3607	NAPI_GRO_CB(skb)->data_offset = 0;
3608	NAPI_GRO_CB(skb)->frag0 = NULL;
3609	NAPI_GRO_CB(skb)->frag0_len = 0;
3610
3611	if (skb->mac_header == skb->tail &&
3612	    !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
3613		NAPI_GRO_CB(skb)->frag0 =
3614			skb_frag_address(&skb_shinfo(skb)->frags[0]);
3615		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]);
3616	}
3617}
3618EXPORT_SYMBOL(skb_gro_reset_offset);
3619
3620gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3621{
3622	skb_gro_reset_offset(skb);
3623
3624	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3625}
3626EXPORT_SYMBOL(napi_gro_receive);
3627
3628static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3629{
3630	__skb_pull(skb, skb_headlen(skb));
3631	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3632	skb->vlan_tci = 0;
3633	skb->dev = napi->dev;
3634	skb->skb_iif = 0;
3635
3636	napi->skb = skb;
3637}
3638
3639struct sk_buff *napi_get_frags(struct napi_struct *napi)
3640{
3641	struct sk_buff *skb = napi->skb;
3642
3643	if (!skb) {
3644		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3645		if (skb)
3646			napi->skb = skb;
3647	}
3648	return skb;
3649}
3650EXPORT_SYMBOL(napi_get_frags);
3651
3652gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3653			       gro_result_t ret)
3654{
3655	switch (ret) {
3656	case GRO_NORMAL:
3657	case GRO_HELD:
3658		skb->protocol = eth_type_trans(skb, skb->dev);
3659
3660		if (ret == GRO_HELD)
3661			skb_gro_pull(skb, -ETH_HLEN);
3662		else if (netif_receive_skb(skb))
3663			ret = GRO_DROP;
3664		break;
3665
3666	case GRO_DROP:
3667	case GRO_MERGED_FREE:
3668		napi_reuse_skb(napi, skb);
3669		break;
3670
3671	case GRO_MERGED:
3672		break;
3673	}
3674
3675	return ret;
3676}
3677EXPORT_SYMBOL(napi_frags_finish);
3678
3679struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3680{
3681	struct sk_buff *skb = napi->skb;
3682	struct ethhdr *eth;
3683	unsigned int hlen;
3684	unsigned int off;
3685
3686	napi->skb = NULL;
3687
3688	skb_reset_mac_header(skb);
3689	skb_gro_reset_offset(skb);
3690
3691	off = skb_gro_offset(skb);
3692	hlen = off + sizeof(*eth);
3693	eth = skb_gro_header_fast(skb, off);
3694	if (skb_gro_header_hard(skb, hlen)) {
3695		eth = skb_gro_header_slow(skb, hlen, off);
3696		if (unlikely(!eth)) {
3697			napi_reuse_skb(napi, skb);
3698			skb = NULL;
3699			goto out;
3700		}
3701	}
3702
3703	skb_gro_pull(skb, sizeof(*eth));
3704
3705	/*
3706	 * This works because the only protocols we care about don't require
3707	 * special handling.  We'll fix it up properly at the end.
3708	 */
3709	skb->protocol = eth->h_proto;
3710
3711out:
3712	return skb;
3713}
3714EXPORT_SYMBOL(napi_frags_skb);
3715
3716gro_result_t napi_gro_frags(struct napi_struct *napi)
3717{
3718	struct sk_buff *skb = napi_frags_skb(napi);
3719
3720	if (!skb)
3721		return GRO_DROP;
3722
3723	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3724}
3725EXPORT_SYMBOL(napi_gro_frags);
3726
3727/*
3728 * net_rps_action sends any pending IPI's for rps.
3729 * Note: called with local irq disabled, but exits with local irq enabled.
3730 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *pending = sd->rps_ipi_list;

	if (pending) {
		/* Take over the whole list before interrupts come back on. */
		sd->rps_ipi_list = NULL;

		local_irq_enable();

		/* Send pending IPI's to kick RPS processing on remote cpus. */
		while (pending) {
			struct softnet_data *target = pending;

			/* Read the link first, then kick the target. */
			pending = target->rps_ipi_next;

			if (cpu_online(target->cpu))
				__smp_call_function_single(target->cpu,
							   &target->csd, 0);
		}
		return;
	}
#endif
	local_irq_enable();
}
3754
/*
 * NAPI poll routine of the per-CPU backlog device.
 *
 * Feeds up to @quota packets to __netif_receive_skb().  Packets are taken
 * from sd->process_queue, which is refilled from sd->input_pkt_queue under
 * rps_lock().  Returns the number of packets processed.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
	int work = 0;
	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);

#ifdef CONFIG_RPS
	/* Check if we have pending IPIs; it's better to send them now
	 * than to wait for the end of net_rx_action().
	 */
	if (sd->rps_ipi_list) {
		local_irq_disable();
		net_rps_action_and_irq_enable(sd);
	}
#endif
	napi->weight = weight_p;
	local_irq_disable();
	while (work < quota) {
		struct sk_buff *skb;
		unsigned int qlen;

		/*
		 * Interrupts are off while the queue is touched and are
		 * re-enabled around the processing of each packet.
		 */
		while ((skb = __skb_dequeue(&sd->process_queue))) {
			local_irq_enable();
			__netif_receive_skb(skb);
			local_irq_disable();
			input_queue_head_incr(sd);
			if (++work >= quota) {
				local_irq_enable();
				return work;
			}
		}

		/* process_queue is empty: move over everything queued so far. */
		rps_lock(sd);
		qlen = skb_queue_len(&sd->input_pkt_queue);
		if (qlen)
			skb_queue_splice_tail_init(&sd->input_pkt_queue,
						   &sd->process_queue);

		/*
		 * What was just moved fits in the remaining quota: complete
		 * the NAPI instance now and shrink quota so the outer loop
		 * stops once those qlen packets are done.
		 */
		if (qlen < quota - work) {
			/*
			 * Inline a custom version of __napi_complete().
			 * only current cpu owns and manipulates this napi,
			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
			 * we can use a plain write instead of clear_bit(),
			 * and we dont need an smp_mb() memory barrier.
			 */
			list_del(&napi->poll_list);
			napi->state = 0;

			quota = work + qlen;
		}
		rps_unlock(sd);
	}
	local_irq_enable();

	return work;
}
3811
3812/**
3813 * __napi_schedule - schedule for receive
3814 * @n: entry to schedule
3815 *
3816 * The entry's receive function will be scheduled to run
3817 */
3818void __napi_schedule(struct napi_struct *n)
3819{
3820	unsigned long flags;
3821
3822	local_irq_save(flags);
3823	____napi_schedule(&__get_cpu_var(softnet_data), n);
3824	local_irq_restore(flags);
3825}
3826EXPORT_SYMBOL(__napi_schedule);
3827
/*
 * Take @n off its poll list and clear NAPI_STATE_SCHED.
 *
 * It is a BUG to call this on an instance that is not scheduled or that
 * still holds packets on its GRO list.
 */
void __napi_complete(struct napi_struct *n)
{
	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
	BUG_ON(n->gro_list);

	list_del(&n->poll_list);
	/* Barrier between the list removal and the clearing of SCHED. */
	smp_mb__before_clear_bit();
	clear_bit(NAPI_STATE_SCHED, &n->state);
}
3837EXPORT_SYMBOL(__napi_complete);
3838
3839void napi_complete(struct napi_struct *n)
3840{
3841	unsigned long flags;
3842
3843	/*
3844	 * don't let napi dequeue from the cpu poll list
3845	 * just in case its running on a different cpu
3846	 */
3847	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3848		return;
3849
3850	napi_gro_flush(n);
3851	local_irq_save(flags);
3852	__napi_complete(n);
3853	local_irq_restore(flags);
3854}
3855EXPORT_SYMBOL(napi_complete);
3856
3857void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3858		    int (*poll)(struct napi_struct *, int), int weight)
3859{
3860	INIT_LIST_HEAD(&napi->poll_list);
3861	napi->gro_count = 0;
3862	napi->gro_list = NULL;
3863	napi->skb = NULL;
3864	napi->poll = poll;
3865	napi->weight = weight;
3866	list_add(&napi->dev_list, &dev->napi_list);
3867	napi->dev = dev;
3868#ifdef CONFIG_NETPOLL
3869	spin_lock_init(&napi->poll_lock);
3870	napi->poll_owner = -1;
3871#endif
3872	set_bit(NAPI_STATE_SCHED, &napi->state);
3873}
3874EXPORT_SYMBOL(netif_napi_add);
3875
3876void netif_napi_del(struct napi_struct *napi)
3877{
3878	struct sk_buff *skb, *next;
3879
3880	list_del_init(&napi->dev_list);
3881	napi_free_frags(napi);
3882
3883	for (skb = napi->gro_list; skb; skb = next) {
3884		next = skb->next;
3885		skb->next = NULL;
3886		kfree_skb(skb);
3887	}
3888
3889	napi->gro_list = NULL;
3890	napi->gro_count = 0;
3891}
3892EXPORT_SYMBOL(netif_napi_del);
3893
/*
 * Receive softirq handler: poll the NAPI instances queued on this CPU's
 * poll list until the list is empty, the packet budget is used up or the
 * time limit has passed.  In the latter two cases the softirq is raised
 * again and the squeeze counter is incremented.
 */
static void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = &__get_cpu_var(softnet_data);
	unsigned long time_limit = jiffies + 2;
	int budget = netdev_budget;
	void *have;

	local_irq_disable();

	while (!list_empty(&sd->poll_list)) {
		struct napi_struct *n;
		int work, weight;

		/* If the softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies, which will allow
		 * an average latency of 1.5/HZ.
		 */
		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
			goto softnet_break;

		local_irq_enable();

		/* Even though interrupts have been re-enabled, this
		 * access is safe because interrupts can only add new
		 * entries to the tail of this list, and only ->poll()
		 * calls can remove this head entry from the list.
		 */
		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);

		have = netpoll_poll_lock(n);

		weight = n->weight;

		/* This NAPI_STATE_SCHED test is for avoiding a race
		 * with netpoll's poll_napi().  Only the entity which
		 * obtains the lock and sees NAPI_STATE_SCHED set will
		 * actually make the ->poll() call.  Therefore we avoid
		 * accidentally calling ->poll() when NAPI is not scheduled.
		 */
		work = 0;
		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
			work = n->poll(n, weight);
			trace_napi_poll(n);
		}

		/* A poll routine must never report more work than its weight. */
		WARN_ON_ONCE(work > weight);

		budget -= work;

		local_irq_disable();

		/* Drivers must not modify the NAPI state if they
		 * consume the entire weight.  In such cases this code
		 * still "owns" the NAPI instance and therefore can
		 * move the instance around on the list at-will.
		 */
		if (unlikely(work == weight)) {
			if (unlikely(napi_disable_pending(n))) {
				/* napi_complete() is called with irqs enabled. */
				local_irq_enable();
				napi_complete(n);
				local_irq_disable();
			} else
				list_move_tail(&n->poll_list, &sd->poll_list);
		}

		netpoll_poll_unlock(have);
	}
out:
	/* Entered with irqs disabled; this call re-enables them. */
	net_rps_action_and_irq_enable(sd);

#ifdef CONFIG_NET_DMA
	/*
	 * There may not be any more sk_buffs coming right now, so push
	 * any pending DMA copies to hardware
	 */
	dma_issue_pending_all();
#endif

	return;

softnet_break:
	/* Out of budget or time: count the squeeze and reschedule ourselves. */
	sd->time_squeeze++;
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	goto out;
}
3979
/* SIOCGIFCONF dump handlers, indexed by address family (see register_gifconf()). */
static gifconf_func_t *gifconf_list[NPROTO];
3981
3982/**
3983 *	register_gifconf	-	register a SIOCGIF handler
3984 *	@family: Address family
3985 *	@gifconf: Function handler
3986 *
3987 *	Register protocol dependent address dumping routines. The handler
3988 *	that is passed must not be freed or reused until it has been replaced
3989 *	by another handler.
3990 */
3991int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3992{
3993	if (family >= NPROTO)
3994		return -EINVAL;
3995	gifconf_list[family] = gifconf;
3996	return 0;
3997}
3998EXPORT_SYMBOL(register_gifconf);
3999
4000
4001/*
4002 *	Map an interface index to its name (SIOCGIFNAME)
4003 */
4004
4005/*
4006 *	We need this ioctl for efficient implementation of the
4007 *	if_indextoname() function required by the IPv6 API.  Without
4008 *	it, we would have to search all the interfaces to find a
4009 *	match.  --pb
4010 */
4011
4012static int dev_ifname(struct net *net, struct ifreq __user *arg)
4013{
4014	struct net_device *dev;
4015	struct ifreq ifr;
4016
4017	/*
4018	 *	Fetch the caller's info block.
4019	 */
4020
4021	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4022		return -EFAULT;
4023
4024	rcu_read_lock();
4025	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
4026	if (!dev) {
4027		rcu_read_unlock();
4028		return -ENODEV;
4029	}
4030
4031	strcpy(ifr.ifr_name, dev->name);
4032	rcu_read_unlock();
4033
4034	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
4035		return -EFAULT;
4036	return 0;
4037}
4038
4039/*
4040 *	Perform a SIOCGIFCONF call. This structure will change
4041 *	size eventually, and there is nothing I can do about it.
4042 *	Thus we will need a 'compatibility mode'.
4043 */
4044
4045static int dev_ifconf(struct net *net, char __user *arg)
4046{
4047	struct ifconf ifc;
4048	struct net_device *dev;
4049	char __user *pos;
4050	int len;
4051	int total;
4052	int i;
4053
4054	/*
4055	 *	Fetch the caller's info block.
4056	 */
4057
4058	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
4059		return -EFAULT;
4060
4061	pos = ifc.ifc_buf;
4062	len = ifc.ifc_len;
4063
4064	/*
4065	 *	Loop over the interfaces, and write an info block for each.
4066	 */
4067
4068	total = 0;
4069	for_each_netdev(net, dev) {
4070		for (i = 0; i < NPROTO; i++) {
4071			if (gifconf_list[i]) {
4072				int done;
4073				if (!pos)
4074					done = gifconf_list[i](dev, NULL, 0);
4075				else
4076					done = gifconf_list[i](dev, pos + total,
4077							       len - total);
4078				if (done < 0)
4079					return -EFAULT;
4080				total += done;
4081			}
4082		}
4083	}
4084
4085	/*
4086	 *	All done.  Write the updated control block back to the caller.
4087	 */
4088	ifc.ifc_len = total;
4089
4090	/*
4091	 * 	Both BSD and Solaris return 0 here, so we do too.
4092	 */
4093	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4094}
4095
4096#ifdef CONFIG_PROC_FS
4097
4098#define BUCKET_SPACE (32 - NETDEV_HASHBITS)
4099
4100struct dev_iter_state {
4101	struct seq_net_private p;
4102	unsigned int pos; /* bucket << BUCKET_SPACE + offset */
4103};
4104
4105#define get_bucket(x) ((x) >> BUCKET_SPACE)
4106#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4107#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4108
4109static inline struct net_device *dev_from_same_bucket(struct seq_file *seq)
4110{
4111	struct dev_iter_state *state = seq->private;
4112	struct net *net = seq_file_net(seq);
4113	struct net_device *dev;
4114	struct hlist_node *p;
4115	struct hlist_head *h;
4116	unsigned int count, bucket, offset;
4117
4118	bucket = get_bucket(state->pos);
4119	offset = get_offset(state->pos);
4120	h = &net->dev_name_head[bucket];
4121	count = 0;
4122	hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4123		if (count++ == offset) {
4124			state->pos = set_bucket_offset(bucket, count);
4125			return dev;
4126		}
4127	}
4128
4129	return NULL;
4130}
4131
4132static inline struct net_device *dev_from_new_bucket(struct seq_file *seq)
4133{
4134	struct dev_iter_state *state = seq->private;
4135	struct net_device *dev;
4136	unsigned int bucket;
4137
4138	bucket = get_bucket(state->pos);
4139	do {
4140		dev = dev_from_same_bucket(seq);
4141		if (dev)
4142			return dev;
4143
4144		bucket++;
4145		state->pos = set_bucket_offset(bucket, 0);
4146	} while (bucket < NETDEV_HASHENTRIES);
4147
4148	return NULL;
4149}
4150
4151/*
4152 *	This is invoked by the /proc filesystem handler to display a device
4153 *	in detail.
4154 */
4155void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4156	__acquires(RCU)
4157{
4158	struct dev_iter_state *state = seq->private;
4159
4160	rcu_read_lock();
4161	if (!*pos)
4162		return SEQ_START_TOKEN;
4163
4164	/* check for end of the hash */
4165	if (state->pos == 0 && *pos > 1)
4166		return NULL;
4167
4168	return dev_from_new_bucket(seq);
4169}
4170
4171void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4172{
4173	struct net_device *dev;
4174
4175	++*pos;
4176
4177	if (v == SEQ_START_TOKEN)
4178		return dev_from_new_bucket(seq);
4179
4180	dev = dev_from_same_bucket(seq);
4181	if (dev)
4182		return dev;
4183
4184	return dev_from_new_bucket(seq);
4185}
4186
/* seq_file ->stop: drop the RCU read lock taken in dev_seq_start(). */
void dev_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
4192
4193static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4194{
4195	struct rtnl_link_stats64 temp;
4196	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4197
4198	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4199		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4200		   dev->name, stats->rx_bytes, stats->rx_packets,
4201		   stats->rx_errors,
4202		   stats->rx_dropped + stats->rx_missed_errors,
4203		   stats->rx_fifo_errors,
4204		   stats->rx_length_errors + stats->rx_over_errors +
4205		    stats->rx_crc_errors + stats->rx_frame_errors,
4206		   stats->rx_compressed, stats->multicast,
4207		   stats->tx_bytes, stats->tx_packets,
4208		   stats->tx_errors, stats->tx_dropped,
4209		   stats->tx_fifo_errors, stats->collisions,
4210		   stats->tx_carrier_errors +
4211		    stats->tx_aborted_errors +
4212		    stats->tx_window_errors +
4213		    stats->tx_heartbeat_errors,
4214		   stats->tx_compressed);
4215}
4216
4217/*
4218 *	Called from the PROCfs module. This now uses the new arbitrary sized
4219 *	/proc/net interface to create /proc/net/dev
4220 */
4221static int dev_seq_show(struct seq_file *seq, void *v)
4222{
4223	if (v == SEQ_START_TOKEN)
4224		seq_puts(seq, "Inter-|   Receive                            "
4225			      "                    |  Transmit\n"
4226			      " face |bytes    packets errs drop fifo frame "
4227			      "compressed multicast|bytes    packets errs "
4228			      "drop fifo colls carrier compressed\n");
4229	else
4230		dev_seq_printf_stats(seq, v);
4231	return 0;
4232}
4233
4234static struct softnet_data *softnet_get_online(loff_t *pos)
4235{
4236	struct softnet_data *sd = NULL;
4237
4238	while (*pos < nr_cpu_ids)
4239		if (cpu_online(*pos)) {
4240			sd = &per_cpu(softnet_data, *pos);
4241			break;
4242		} else
4243			++*pos;
4244	return sd;
4245}
4246
/* seq_file ->start: position on the first online CPU at or after *pos. */
static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
{
	return softnet_get_online(pos);
}
4251
4252static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4253{
4254	++*pos;
4255	return softnet_get_online(pos);
4256}
4257
/* seq_file ->stop: nothing to release, ->start takes no lock. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
4261
4262static int softnet_seq_show(struct seq_file *seq, void *v)
4263{
4264	struct softnet_data *sd = v;
4265
4266	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4267		   sd->processed, sd->dropped, sd->time_squeeze, 0,
4268		   0, 0, 0, 0, /* was fastroute */
4269		   sd->cpu_collision, sd->received_rps);
4270	return 0;
4271}
4272
/* seq_file iterator callbacks used by dev_seq_open(). */
static const struct seq_operations dev_seq_ops = {
	.start = dev_seq_start,
	.next  = dev_seq_next,
	.stop  = dev_seq_stop,
	.show  = dev_seq_show,
};
4279
4280static int dev_seq_open(struct inode *inode, struct file *file)
4281{
4282	return seq_open_net(inode, file, &dev_seq_ops,
4283			    sizeof(struct dev_iter_state));
4284}
4285
4286int dev_seq_open_ops(struct inode *inode, struct file *file,
4287		     const struct seq_operations *ops)
4288{
4289	return seq_open_net(inode, file, ops, sizeof(struct dev_iter_state));
4290}
4291
/* File operations registered for the "dev" proc entry. */
static const struct file_operations dev_seq_fops = {
	.owner	 = THIS_MODULE,
	.open    = dev_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net,
};
4299
/* seq_file iterator callbacks used by softnet_seq_open(). */
static const struct seq_operations softnet_seq_ops = {
	.start = softnet_seq_start,
	.next  = softnet_seq_next,
	.stop  = softnet_seq_stop,
	.show  = softnet_seq_show,
};
4306
/* ->open for the "softnet_stat" proc entry; uses plain seq_open(), not the per-net variant. */
static int softnet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &softnet_seq_ops);
}
4311
/* File operations registered for the "softnet_stat" proc entry. */
static const struct file_operations softnet_seq_fops = {
	.owner	 = THIS_MODULE,
	.open    = softnet_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};
4319
4320static void *ptype_get_idx(loff_t pos)
4321{
4322	struct packet_type *pt = NULL;
4323	loff_t i = 0;
4324	int t;
4325
4326	list_for_each_entry_rcu(pt, &ptype_all, list) {
4327		if (i == pos)
4328			return pt;
4329		++i;
4330	}
4331
4332	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4333		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4334			if (i == pos)
4335				return pt;
4336			++i;
4337		}
4338	}
4339	return NULL;
4340}
4341
4342static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4343	__acquires(RCU)
4344{
4345	rcu_read_lock();
4346	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4347}
4348
4349static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4350{
4351	struct packet_type *pt;
4352	struct list_head *nxt;
4353	int hash;
4354
4355	++*pos;
4356	if (v == SEQ_START_TOKEN)
4357		return ptype_get_idx(0);
4358
4359	pt = v;
4360	nxt = pt->list.next;
4361	if (pt->type == htons(ETH_P_ALL)) {
4362		if (nxt != &ptype_all)
4363			goto found;
4364		hash = 0;
4365		nxt = ptype_base[0].next;
4366	} else
4367		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4368
4369	while (nxt == &ptype_base[hash]) {
4370		if (++hash >= PTYPE_HASH_SIZE)
4371			return NULL;
4372		nxt = ptype_base[hash].next;
4373	}
4374found:
4375	return list_entry(nxt, struct packet_type, list);
4376}
4377
/* seq_file ->stop: drop the RCU read lock taken in ptype_seq_start(). */
static void ptype_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
4383
4384static int ptype_seq_show(struct seq_file *seq, void *v)
4385{
4386	struct packet_type *pt = v;
4387
4388	if (v == SEQ_START_TOKEN)
4389		seq_puts(seq, "Type Device      Function\n");
4390	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4391		if (pt->type == htons(ETH_P_ALL))
4392			seq_puts(seq, "ALL ");
4393		else
4394			seq_printf(seq, "%04x", ntohs(pt->type));
4395
4396		seq_printf(seq, " %-8s %pF\n",
4397			   pt->dev ? pt->dev->name : "", pt->func);
4398	}
4399
4400	return 0;
4401}
4402
/* seq_file iterator callbacks used by ptype_seq_open(). */
static const struct seq_operations ptype_seq_ops = {
	.start = ptype_seq_start,
	.next  = ptype_seq_next,
	.stop  = ptype_seq_stop,
	.show  = ptype_seq_show,
};
4409
4410static int ptype_seq_open(struct inode *inode, struct file *file)
4411{
4412	return seq_open_net(inode, file, &ptype_seq_ops,
4413			sizeof(struct seq_net_private));
4414}
4415
/* File operations registered for the "ptype" proc entry. */
static const struct file_operations ptype_seq_fops = {
	.owner	 = THIS_MODULE,
	.open    = ptype_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net,
};
4423
4424
4425static int __net_init dev_proc_net_init(struct net *net)
4426{
4427	int rc = -ENOMEM;
4428
4429	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4430		goto out;
4431	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4432		goto out_dev;
4433	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4434		goto out_softnet;
4435
4436	if (wext_proc_init(net))
4437		goto out_ptype;
4438	rc = 0;
4439out:
4440	return rc;
4441out_ptype:
4442	proc_net_remove(net, "ptype");
4443out_softnet:
4444	proc_net_remove(net, "softnet_stat");
4445out_dev:
4446	proc_net_remove(net, "dev");
4447	goto out;
4448}
4449
4450static void __net_exit dev_proc_net_exit(struct net *net)
4451{
4452	wext_proc_exit(net);
4453
4454	proc_net_remove(net, "ptype");
4455	proc_net_remove(net, "softnet_stat");
4456	proc_net_remove(net, "dev");
4457}
4458
4459static struct pernet_operations __net_initdata dev_proc_ops = {
4460	.init = dev_proc_net_init,
4461	.exit = dev_proc_net_exit,
4462};
4463
4464static int __init dev_proc_init(void)
4465{
4466	return register_pernet_subsys(&dev_proc_ops);
4467}
4468#else
4469#define dev_proc_init() 0
4470#endif	/* CONFIG_PROC_FS */
4471
4472
4473/**
4474 *	netdev_set_master	-	set up master pointer
4475 *	@slave: slave device
4476 *	@master: new master device
4477 *
4478 *	Changes the master device of the slave. Pass %NULL to break the
4479 *	bonding. The caller must hold the RTNL semaphore. On a failure
4480 *	a negative errno code is returned. On success the reference counts
4481 *	are adjusted and the function returns zero.
4482 */
4483int netdev_set_master(struct net_device *slave, struct net_device *master)
4484{
4485	struct net_device *old = slave->master;
4486
4487	ASSERT_RTNL();
4488
4489	if (master) {
4490		if (old)
4491			return -EBUSY;
4492		dev_hold(master);
4493	}
4494
4495	slave->master = master;
4496
4497	if (old)
4498		dev_put(old);
4499	return 0;
4500}
4501EXPORT_SYMBOL(netdev_set_master);
4502
4503/**
4504 *	netdev_set_bond_master	-	set up bonding master/slave pair
4505 *	@slave: slave device
4506 *	@master: new master device
4507 *
4508 *	Changes the master device of the slave. Pass %NULL to break the
4509 *	bonding. The caller must hold the RTNL semaphore. On a failure
4510 *	a negative errno code is returned. On success %RTM_NEWLINK is sent
4511 *	to the routing socket and the function returns zero.
4512 */
4513int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4514{
4515	int err;
4516
4517	ASSERT_RTNL();
4518
4519	err = netdev_set_master(slave, master);
4520	if (err)
4521		return err;
4522	if (master)
4523		slave->flags |= IFF_SLAVE;
4524	else
4525		slave->flags &= ~IFF_SLAVE;
4526
4527	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4528	return 0;
4529}
4530EXPORT_SYMBOL(netdev_set_bond_master);
4531
4532static void dev_change_rx_flags(struct net_device *dev, int flags)
4533{
4534	const struct net_device_ops *ops = dev->netdev_ops;
4535
4536	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4537		ops->ndo_change_rx_flags(dev, flags);
4538}
4539
4540static int __dev_set_promiscuity(struct net_device *dev, int inc)
4541{
4542	unsigned short old_flags = dev->flags;
4543	uid_t uid;
4544	gid_t gid;
4545
4546	ASSERT_RTNL();
4547
4548	dev->flags |= IFF_PROMISC;
4549	dev->promiscuity += inc;
4550	if (dev->promiscuity == 0) {
4551		/*
4552		 * Avoid overflow.
4553		 * If inc causes overflow, untouch promisc and return error.
4554		 */
4555		if (inc < 0)
4556			dev->flags &= ~IFF_PROMISC;
4557		else {
4558			dev->promiscuity -= inc;
4559			printk(KERN_WARNING "%s: promiscuity touches roof, "
4560				"set promiscuity failed, promiscuity feature "
4561				"of device might be broken.\n", dev->name);
4562			return -EOVERFLOW;
4563		}
4564	}
4565	if (dev->flags != old_flags) {
4566		printk(KERN_INFO "device %s %s promiscuous mode\n",
4567		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4568							       "left");
4569		if (audit_enabled) {
4570			current_uid_gid(&uid, &gid);
4571			audit_log(current->audit_context, GFP_ATOMIC,
4572				AUDIT_ANOM_PROMISCUOUS,
4573				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4574				dev->name, (dev->flags & IFF_PROMISC),
4575				(old_flags & IFF_PROMISC),
4576				audit_get_loginuid(current),
4577				uid, gid,
4578				audit_get_sessionid(current));
4579		}
4580
4581		dev_change_rx_flags(dev, IFF_PROMISC);
4582	}
4583	return 0;
4584}
4585
4586/**
4587 *	dev_set_promiscuity	- update promiscuity count on a device
4588 *	@dev: device
4589 *	@inc: modifier
4590 *
4591 *	Add or remove promiscuity from a device. While the count in the device
4592 *	remains above zero the interface remains promiscuous. Once it hits zero
4593 *	the device reverts back to normal filtering operation. A negative inc
4594 *	value is used to drop promiscuity on the device.
4595 *	Return 0 if successful or a negative errno code on error.
4596 */
4597int dev_set_promiscuity(struct net_device *dev, int inc)
4598{
4599	unsigned short old_flags = dev->flags;
4600	int err;
4601
4602	err = __dev_set_promiscuity(dev, inc);
4603	if (err < 0)
4604		return err;
4605	if (dev->flags != old_flags)
4606		dev_set_rx_mode(dev);
4607	return err;
4608}
4609EXPORT_SYMBOL(dev_set_promiscuity);
4610
4611/**
4612 *	dev_set_allmulti	- update allmulti count on a device
4613 *	@dev: device
4614 *	@inc: modifier
4615 *
4616 *	Add or remove reception of all multicast frames to a device. While the
4617 *	count in the device remains above zero the interface remains listening
4618 *	to all interfaces. Once it hits zero the device reverts back to normal
4619 *	filtering operation. A negative @inc value is used to drop the counter
4620 *	when releasing a resource needing all multicasts.
4621 *	Return 0 if successful or a negative errno code on error.
4622 */
4623
4624int dev_set_allmulti(struct net_device *dev, int inc)
4625{
4626	unsigned short old_flags = dev->flags;
4627
4628	ASSERT_RTNL();
4629
4630	dev->flags |= IFF_ALLMULTI;
4631	dev->allmulti += inc;
4632	if (dev->allmulti == 0) {
4633		/*
4634		 * Avoid overflow.
4635		 * If inc causes overflow, untouch allmulti and return error.
4636		 */
4637		if (inc < 0)
4638			dev->flags &= ~IFF_ALLMULTI;
4639		else {
4640			dev->allmulti -= inc;
4641			printk(KERN_WARNING "%s: allmulti touches roof, "
4642				"set allmulti failed, allmulti feature of "
4643				"device might be broken.\n", dev->name);
4644			return -EOVERFLOW;
4645		}
4646	}
4647	if (dev->flags ^ old_flags) {
4648		dev_change_rx_flags(dev, IFF_ALLMULTI);
4649		dev_set_rx_mode(dev);
4650	}
4651	return 0;
4652}
4653EXPORT_SYMBOL(dev_set_allmulti);
4654
4655/*
4656 *	Upload unicast and multicast address lists to device and
4657 *	configure RX filtering. When the device doesn't support unicast
4658 *	filtering it is put in promiscuous mode while unicast addresses
4659 *	are present.
4660 */
4661void __dev_set_rx_mode(struct net_device *dev)
4662{
4663	const struct net_device_ops *ops = dev->netdev_ops;
4664
4665	/* dev_open will call this function so the list will stay sane. */
4666	if (!(dev->flags&IFF_UP))
4667		return;
4668
4669	if (!netif_device_present(dev))
4670		return;
4671
4672	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4673		/* Unicast addresses changes may only happen under the rtnl,
4674		 * therefore calling __dev_set_promiscuity here is safe.
4675		 */
4676		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4677			__dev_set_promiscuity(dev, 1);
4678			dev->uc_promisc = true;
4679		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4680			__dev_set_promiscuity(dev, -1);
4681			dev->uc_promisc = false;
4682		}
4683	}
4684
4685	if (ops->ndo_set_rx_mode)
4686		ops->ndo_set_rx_mode(dev);
4687}
4688
/*
 * Locked wrapper: runs __dev_set_rx_mode() between netif_addr_lock_bh()
 * and netif_addr_unlock_bh() on @dev.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
4695
4696/**
4697 *	dev_get_flags - get flags reported to userspace
4698 *	@dev: device
4699 *
4700 *	Get the combination of flag bits exported through APIs to userspace.
4701 */
4702unsigned dev_get_flags(const struct net_device *dev)
4703{
4704	unsigned flags;
4705
4706	flags = (dev->flags & ~(IFF_PROMISC |
4707				IFF_ALLMULTI |
4708				IFF_RUNNING |
4709				IFF_LOWER_UP |
4710				IFF_DORMANT)) |
4711		(dev->gflags & (IFF_PROMISC |
4712				IFF_ALLMULTI));
4713
4714	if (netif_running(dev)) {
4715		if (netif_oper_up(dev))
4716			flags |= IFF_RUNNING;
4717		if (netif_carrier_ok(dev))
4718			flags |= IFF_LOWER_UP;
4719		if (netif_dormant(dev))
4720			flags |= IFF_DORMANT;
4721	}
4722
4723	return flags;
4724}
4725EXPORT_SYMBOL(dev_get_flags);
4726
4727int __dev_change_flags(struct net_device *dev, unsigned int flags)
4728{
4729	int old_flags = dev->flags;
4730	int ret;
4731
4732	ASSERT_RTNL();
4733
4734	/*
4735	 *	Set the flags on our device.
4736	 */
4737
4738	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4739			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4740			       IFF_AUTOMEDIA)) |
4741		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4742				    IFF_ALLMULTI));
4743
4744	/*
4745	 *	Load in the correct multicast list now the flags have changed.
4746	 */
4747
4748	if ((old_flags ^ flags) & IFF_MULTICAST)
4749		dev_change_rx_flags(dev, IFF_MULTICAST);
4750
4751	dev_set_rx_mode(dev);
4752
4753	/*
4754	 *	Have we downed the interface. We handle IFF_UP ourselves
4755	 *	according to user attempts to set it, rather than blindly
4756	 *	setting it.
4757	 */
4758
4759	ret = 0;
4760	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4761		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4762
4763		if (!ret)
4764			dev_set_rx_mode(dev);
4765	}
4766
4767	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4768		int inc = (flags & IFF_PROMISC) ? 1 : -1;
4769
4770		dev->gflags ^= IFF_PROMISC;
4771		dev_set_promiscuity(dev, inc);
4772	}
4773
4774	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4775	   is important. Some (broken) drivers set IFF_PROMISC, when
4776	   IFF_ALLMULTI is requested not asking us and not reporting.
4777	 */
4778	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4779		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4780
4781		dev->gflags ^= IFF_ALLMULTI;
4782		dev_set_allmulti(dev, inc);
4783	}
4784
4785	return ret;
4786}
4787
4788void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4789{
4790	unsigned int changes = dev->flags ^ old_flags;
4791
4792	if (changes & IFF_UP) {
4793		if (dev->flags & IFF_UP)
4794			call_netdevice_notifiers(NETDEV_UP, dev);
4795		else
4796			call_netdevice_notifiers(NETDEV_DOWN, dev);
4797	}
4798
4799	if (dev->flags & IFF_UP &&
4800	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4801		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4802}
4803
4804/**
4805 *	dev_change_flags - change device settings
4806 *	@dev: device
4807 *	@flags: device state flags
4808 *
4809 *	Change settings on device based state flags. The flags are
4810 *	in the userspace exported format.
4811 */
4812int dev_change_flags(struct net_device *dev, unsigned flags)
4813{
4814	int ret, changes;
4815	int old_flags = dev->flags;
4816
4817	ret = __dev_change_flags(dev, flags);
4818	if (ret < 0)
4819		return ret;
4820
4821	changes = old_flags ^ dev->flags;
4822	if (changes)
4823		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4824
4825	__dev_notify_flags(dev, old_flags);
4826	return ret;
4827}
4828EXPORT_SYMBOL(dev_change_flags);
4829
4830/**
4831 *	dev_set_mtu - Change maximum transfer unit
4832 *	@dev: device
4833 *	@new_mtu: new transfer unit
4834 *
4835 *	Change the maximum transfer size of the network device.
4836 */
4837int dev_set_mtu(struct net_device *dev, int new_mtu)
4838{
4839	const struct net_device_ops *ops = dev->netdev_ops;
4840	int err;
4841
4842	if (new_mtu == dev->mtu)
4843		return 0;
4844
4845	/*	MTU must be positive.	 */
4846	if (new_mtu < 0)
4847		return -EINVAL;
4848
4849	if (!netif_device_present(dev))
4850		return -ENODEV;
4851
4852	err = 0;
4853	if (ops->ndo_change_mtu)
4854		err = ops->ndo_change_mtu(dev, new_mtu);
4855	else
4856		dev->mtu = new_mtu;
4857
4858	if (!err && dev->flags & IFF_UP)
4859		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4860	return err;
4861}
4862EXPORT_SYMBOL(dev_set_mtu);
4863
4864/**
4865 *	dev_set_group - Change group this device belongs to
4866 *	@dev: device
4867 *	@new_group: group this device should belong to
4868 */
4869void dev_set_group(struct net_device *dev, int new_group)
4870{
4871	dev->group = new_group;
4872}
4873EXPORT_SYMBOL(dev_set_group);
4874
4875/**
4876 *	dev_set_mac_address - Change Media Access Control Address
4877 *	@dev: device
4878 *	@sa: new address
4879 *
4880 *	Change the hardware (MAC) address of the device
4881 */
4882int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4883{
4884	const struct net_device_ops *ops = dev->netdev_ops;
4885	int err;
4886
4887	if (!ops->ndo_set_mac_address)
4888		return -EOPNOTSUPP;
4889	if (sa->sa_family != dev->type)
4890		return -EINVAL;
4891	if (!netif_device_present(dev))
4892		return -ENODEV;
4893	err = ops->ndo_set_mac_address(dev, sa);
4894	if (!err)
4895		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4896	return err;
4897}
4898EXPORT_SYMBOL(dev_set_mac_address);
4899
4900/*
4901 *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4902 */
4903static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4904{
4905	int err;
4906	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4907
4908	if (!dev)
4909		return -ENODEV;
4910
4911	switch (cmd) {
4912	case SIOCGIFFLAGS:	/* Get interface flags */
4913		ifr->ifr_flags = (short) dev_get_flags(dev);
4914		return 0;
4915
4916	case SIOCGIFMETRIC:	/* Get the metric on the interface
4917				   (currently unused) */
4918		ifr->ifr_metric = 0;
4919		return 0;
4920
4921	case SIOCGIFMTU:	/* Get the MTU of a device */
4922		ifr->ifr_mtu = dev->mtu;
4923		return 0;
4924
4925	case SIOCGIFHWADDR:
4926		if (!dev->addr_len)
4927			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4928		else
4929			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4930			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4931		ifr->ifr_hwaddr.sa_family = dev->type;
4932		return 0;
4933
4934	case SIOCGIFSLAVE:
4935		err = -EINVAL;
4936		break;
4937
4938	case SIOCGIFMAP:
4939		ifr->ifr_map.mem_start = dev->mem_start;
4940		ifr->ifr_map.mem_end   = dev->mem_end;
4941		ifr->ifr_map.base_addr = dev->base_addr;
4942		ifr->ifr_map.irq       = dev->irq;
4943		ifr->ifr_map.dma       = dev->dma;
4944		ifr->ifr_map.port      = dev->if_port;
4945		return 0;
4946
4947	case SIOCGIFINDEX:
4948		ifr->ifr_ifindex = dev->ifindex;
4949		return 0;
4950
4951	case SIOCGIFTXQLEN:
4952		ifr->ifr_qlen = dev->tx_queue_len;
4953		return 0;
4954
4955	default:
4956		/* dev_ioctl() should ensure this case
4957		 * is never reached
4958		 */
4959		WARN_ON(1);
4960		err = -ENOTTY;
4961		break;
4962
4963	}
4964	return err;
4965}
4966
4967/*
4968 *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
4969 */
4970static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4971{
4972	int err;
4973	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4974	const struct net_device_ops *ops;
4975
4976	if (!dev)
4977		return -ENODEV;
4978
4979	ops = dev->netdev_ops;
4980
4981	switch (cmd) {
4982	case SIOCSIFFLAGS:	/* Set interface flags */
4983		return dev_change_flags(dev, ifr->ifr_flags);
4984
4985	case SIOCSIFMETRIC:	/* Set the metric on the interface
4986				   (currently unused) */
4987		return -EOPNOTSUPP;
4988
4989	case SIOCSIFMTU:	/* Set the MTU of a device */
4990		return dev_set_mtu(dev, ifr->ifr_mtu);
4991
4992	case SIOCSIFHWADDR:
4993		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4994
4995	case SIOCSIFHWBROADCAST:
4996		if (ifr->ifr_hwaddr.sa_family != dev->type)
4997			return -EINVAL;
4998		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4999		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
5000		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5001		return 0;
5002
5003	case SIOCSIFMAP:
5004		if (ops->ndo_set_config) {
5005			if (!netif_device_present(dev))
5006				return -ENODEV;
5007			return ops->ndo_set_config(dev, &ifr->ifr_map);
5008		}
5009		return -EOPNOTSUPP;
5010
5011	case SIOCADDMULTI:
5012		if (!ops->ndo_set_rx_mode ||
5013		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5014			return -EINVAL;
5015		if (!netif_device_present(dev))
5016			return -ENODEV;
5017		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
5018
5019	case SIOCDELMULTI:
5020		if (!ops->ndo_set_rx_mode ||
5021		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5022			return -EINVAL;
5023		if (!netif_device_present(dev))
5024			return -ENODEV;
5025		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
5026
5027	case SIOCSIFTXQLEN:
5028		if (ifr->ifr_qlen < 0)
5029			return -EINVAL;
5030		dev->tx_queue_len = ifr->ifr_qlen;
5031		return 0;
5032
5033	case SIOCSIFNAME:
5034		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
5035		return dev_change_name(dev, ifr->ifr_newname);
5036
5037	case SIOCSHWTSTAMP:
5038		err = net_hwtstamp_validate(ifr);
5039		if (err)
5040			return err;
5041		/* fall through */
5042
5043	/*
5044	 *	Unknown or private ioctl
5045	 */
5046	default:
5047		if ((cmd >= SIOCDEVPRIVATE &&
5048		    cmd <= SIOCDEVPRIVATE + 15) ||
5049		    cmd == SIOCBONDENSLAVE ||
5050		    cmd == SIOCBONDRELEASE ||
5051		    cmd == SIOCBONDSETHWADDR ||
5052		    cmd == SIOCBONDSLAVEINFOQUERY ||
5053		    cmd == SIOCBONDINFOQUERY ||
5054		    cmd == SIOCBONDCHANGEACTIVE ||
5055		    cmd == SIOCGMIIPHY ||
5056		    cmd == SIOCGMIIREG ||
5057		    cmd == SIOCSMIIREG ||
5058		    cmd == SIOCBRADDIF ||
5059		    cmd == SIOCBRDELIF ||
5060		    cmd == SIOCSHWTSTAMP ||
5061		    cmd == SIOCWANDEV) {
5062			err = -EOPNOTSUPP;
5063			if (ops->ndo_do_ioctl) {
5064				if (netif_device_present(dev))
5065					err = ops->ndo_do_ioctl(dev, ifr, cmd);
5066				else
5067					err = -ENODEV;
5068			}
5069		} else
5070			err = -EINVAL;
5071
5072	}
5073	return err;
5074}
5075
5076/*
5077 *	This function handles all "interface"-type I/O control requests. The actual
5078 *	'doing' part of this is dev_ifsioc above.
5079 */
5080
5081/**
5082 *	dev_ioctl	-	network device ioctl
5083 *	@net: the applicable net namespace
5084 *	@cmd: command to issue
5085 *	@arg: pointer to a struct ifreq in user space
5086 *
5087 *	Issue ioctl functions to devices. This is normally called by the
5088 *	user space syscall interfaces but can sometimes be useful for
5089 *	other purposes. The return value is the return from the syscall if
5090 *	positive or a negative errno code on error.
5091 */
5092
5093int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
5094{
5095	struct ifreq ifr;
5096	int ret;
5097	char *colon;
5098
5099	/* One special case: SIOCGIFCONF takes ifconf argument
5100	   and requires shared lock, because it sleeps writing
5101	   to user space.
5102	 */
5103
5104	if (cmd == SIOCGIFCONF) {
5105		rtnl_lock();
5106		ret = dev_ifconf(net, (char __user *) arg);
5107		rtnl_unlock();
5108		return ret;
5109	}
5110	if (cmd == SIOCGIFNAME)
5111		return dev_ifname(net, (struct ifreq __user *)arg);
5112
5113	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5114		return -EFAULT;
5115
5116	ifr.ifr_name[IFNAMSIZ-1] = 0;
5117
5118	colon = strchr(ifr.ifr_name, ':');
5119	if (colon)
5120		*colon = 0;
5121
5122	/*
5123	 *	See which interface the caller is talking about.
5124	 */
5125
5126	switch (cmd) {
5127	/*
5128	 *	These ioctl calls:
5129	 *	- can be done by all.
5130	 *	- atomic and do not require locking.
5131	 *	- return a value
5132	 */
5133	case SIOCGIFFLAGS:
5134	case SIOCGIFMETRIC:
5135	case SIOCGIFMTU:
5136	case SIOCGIFHWADDR:
5137	case SIOCGIFSLAVE:
5138	case SIOCGIFMAP:
5139	case SIOCGIFINDEX:
5140	case SIOCGIFTXQLEN:
5141		dev_load(net, ifr.ifr_name);
5142		rcu_read_lock();
5143		ret = dev_ifsioc_locked(net, &ifr, cmd);
5144		rcu_read_unlock();
5145		if (!ret) {
5146			if (colon)
5147				*colon = ':';
5148			if (copy_to_user(arg, &ifr,
5149					 sizeof(struct ifreq)))
5150				ret = -EFAULT;
5151		}
5152		return ret;
5153
5154	case SIOCETHTOOL:
5155		dev_load(net, ifr.ifr_name);
5156		rtnl_lock();
5157		ret = dev_ethtool(net, &ifr);
5158		rtnl_unlock();
5159		if (!ret) {
5160			if (colon)
5161				*colon = ':';
5162			if (copy_to_user(arg, &ifr,
5163					 sizeof(struct ifreq)))
5164				ret = -EFAULT;
5165		}
5166		return ret;
5167
5168	/*
5169	 *	These ioctl calls:
5170	 *	- require superuser power.
5171	 *	- require strict serialization.
5172	 *	- return a value
5173	 */
5174	case SIOCGMIIPHY:
5175	case SIOCGMIIREG:
5176	case SIOCSIFNAME:
5177		if (!capable(CAP_NET_ADMIN))
5178			return -EPERM;
5179		dev_load(net, ifr.ifr_name);
5180		rtnl_lock();
5181		ret = dev_ifsioc(net, &ifr, cmd);
5182		rtnl_unlock();
5183		if (!ret) {
5184			if (colon)
5185				*colon = ':';
5186			if (copy_to_user(arg, &ifr,
5187					 sizeof(struct ifreq)))
5188				ret = -EFAULT;
5189		}
5190		return ret;
5191
5192	/*
5193	 *	These ioctl calls:
5194	 *	- require superuser power.
5195	 *	- require strict serialization.
5196	 *	- do not return a value
5197	 */
5198	case SIOCSIFFLAGS:
5199	case SIOCSIFMETRIC:
5200	case SIOCSIFMTU:
5201	case SIOCSIFMAP:
5202	case SIOCSIFHWADDR:
5203	case SIOCSIFSLAVE:
5204	case SIOCADDMULTI:
5205	case SIOCDELMULTI:
5206	case SIOCSIFHWBROADCAST:
5207	case SIOCSIFTXQLEN:
5208	case SIOCSMIIREG:
5209	case SIOCBONDENSLAVE:
5210	case SIOCBONDRELEASE:
5211	case SIOCBONDSETHWADDR:
5212	case SIOCBONDCHANGEACTIVE:
5213	case SIOCBRADDIF:
5214	case SIOCBRDELIF:
5215	case SIOCSHWTSTAMP:
5216		if (!capable(CAP_NET_ADMIN))
5217			return -EPERM;
5218		/* fall through */
5219	case SIOCBONDSLAVEINFOQUERY:
5220	case SIOCBONDINFOQUERY:
5221		dev_load(net, ifr.ifr_name);
5222		rtnl_lock();
5223		ret = dev_ifsioc(net, &ifr, cmd);
5224		rtnl_unlock();
5225		return ret;
5226
5227	case SIOCGIFMEM:
5228		/* Get the per device memory space. We can add this but
5229		 * currently do not support it */
5230	case SIOCSIFMEM:
5231		/* Set the per device memory buffer space.
5232		 * Not applicable in our case */
5233	case SIOCSIFLINK:
5234		return -ENOTTY;
5235
5236	/*
5237	 *	Unknown or private ioctl.
5238	 */
5239	default:
5240		if (cmd == SIOCWANDEV ||
5241		    (cmd >= SIOCDEVPRIVATE &&
5242		     cmd <= SIOCDEVPRIVATE + 15)) {
5243			dev_load(net, ifr.ifr_name);
5244			rtnl_lock();
5245			ret = dev_ifsioc(net, &ifr, cmd);
5246			rtnl_unlock();
5247			if (!ret && copy_to_user(arg, &ifr,
5248						 sizeof(struct ifreq)))
5249				ret = -EFAULT;
5250			return ret;
5251		}
5252		/* Take care of Wireless Extensions */
5253		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5254			return wext_handle_ioctl(net, &ifr, cmd, arg);
5255		return -ENOTTY;
5256	}
5257}
5258
5259
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns a suitable unique value for a new device interface
 *	number.  The caller must hold the rtnl semaphore or the
 *	dev_base_lock to be sure it remains unique. Candidates come from
 *	a function-local static counter that restarts at 1 whenever it
 *	would become non-positive; values already in use in @net are
 *	skipped.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	do {
		if (++ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
5278
5279/* Delayed registration/unregisteration */
5280static LIST_HEAD(net_todo_list);
5281
5282static void net_set_todo(struct net_device *dev)
5283{
5284	list_add_tail(&dev->todo_list, &net_todo_list);
5285}
5286
5287static void rollback_registered_many(struct list_head *head)
5288{
5289	struct net_device *dev, *tmp;
5290
5291	BUG_ON(dev_boot_phase);
5292	ASSERT_RTNL();
5293
5294	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5295		/* Some devices call without registering
5296		 * for initialization unwind. Remove those
5297		 * devices and proceed with the remaining.
5298		 */
5299		if (dev->reg_state == NETREG_UNINITIALIZED) {
5300			pr_debug("unregister_netdevice: device %s/%p never "
5301				 "was registered\n", dev->name, dev);
5302
5303			WARN_ON(1);
5304			list_del(&dev->unreg_list);
5305			continue;
5306		}
5307		dev->dismantle = true;
5308		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5309	}
5310
5311	/* If device is running, close it first. */
5312	dev_close_many(head);
5313
5314	list_for_each_entry(dev, head, unreg_list) {
5315		/* And unlink it from device chain. */
5316		unlist_netdevice(dev);
5317
5318		dev->reg_state = NETREG_UNREGISTERING;
5319	}
5320
5321	synchronize_net();
5322
5323	list_for_each_entry(dev, head, unreg_list) {
5324		/* Shutdown queueing discipline. */
5325		dev_shutdown(dev);
5326
5327
5328		/* Notify protocols, that we are about to destroy
5329		   this device. They should clean all the things.
5330		*/
5331		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5332
5333		if (!dev->rtnl_link_ops ||
5334		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5335			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5336
5337		/*
5338		 *	Flush the unicast and multicast chains
5339		 */
5340		dev_uc_flush(dev);
5341		dev_mc_flush(dev);
5342
5343		if (dev->netdev_ops->ndo_uninit)
5344			dev->netdev_ops->ndo_uninit(dev);
5345
5346		/* Notifier chain MUST detach us from master device. */
5347		WARN_ON(dev->master);
5348
5349		/* Remove entries from kobject tree */
5350		netdev_unregister_kobject(dev);
5351	}
5352
5353	/* Process any work delayed until the end of the batch */
5354	dev = list_first_entry(head, struct net_device, unreg_list);
5355	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5356
5357	synchronize_net();
5358
5359	list_for_each_entry(dev, head, unreg_list)
5360		dev_put(dev);
5361}
5362
5363static void rollback_registered(struct net_device *dev)
5364{
5365	LIST_HEAD(single);
5366
5367	list_add(&dev->unreg_list, &single);
5368	rollback_registered_many(&single);
5369	list_del(&single);
5370}
5371
5372static u32 netdev_fix_features(struct net_device *dev, u32 features)
5373{
5374	/* Fix illegal checksum combinations */
5375	if ((features & NETIF_F_HW_CSUM) &&
5376	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5377		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5378		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5379	}
5380
5381	if ((features & NETIF_F_NO_CSUM) &&
5382	    (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5383		netdev_warn(dev, "mixed no checksumming and other settings.\n");
5384		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5385	}
5386
5387	/* Fix illegal SG+CSUM combinations. */
5388	if ((features & NETIF_F_SG) &&
5389	    !(features & NETIF_F_ALL_CSUM)) {
5390		netdev_dbg(dev,
5391			"Dropping NETIF_F_SG since no checksum feature.\n");
5392		features &= ~NETIF_F_SG;
5393	}
5394
5395	/* TSO requires that SG is present as well. */
5396	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5397		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5398		features &= ~NETIF_F_ALL_TSO;
5399	}
5400
5401	/* TSO ECN requires that TSO is present as well. */
5402	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5403		features &= ~NETIF_F_TSO_ECN;
5404
5405	/* Software GSO depends on SG. */
5406	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5407		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5408		features &= ~NETIF_F_GSO;
5409	}
5410
5411	/* UFO needs SG and checksumming */
5412	if (features & NETIF_F_UFO) {
5413		/* maybe split UFO into V4 and V6? */
5414		if (!((features & NETIF_F_GEN_CSUM) ||
5415		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5416			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5417			netdev_dbg(dev,
5418				"Dropping NETIF_F_UFO since no checksum offload features.\n");
5419			features &= ~NETIF_F_UFO;
5420		}
5421
5422		if (!(features & NETIF_F_SG)) {
5423			netdev_dbg(dev,
5424				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5425			features &= ~NETIF_F_UFO;
5426		}
5427	}
5428
5429	return features;
5430}
5431
5432int __netdev_update_features(struct net_device *dev)
5433{
5434	u32 features;
5435	int err = 0;
5436
5437	ASSERT_RTNL();
5438
5439	features = netdev_get_wanted_features(dev);
5440
5441	if (dev->netdev_ops->ndo_fix_features)
5442		features = dev->netdev_ops->ndo_fix_features(dev, features);
5443
5444	/* driver might be less strict about feature dependencies */
5445	features = netdev_fix_features(dev, features);
5446
5447	if (dev->features == features)
5448		return 0;
5449
5450	netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n",
5451		dev->features, features);
5452
5453	if (dev->netdev_ops->ndo_set_features)
5454		err = dev->netdev_ops->ndo_set_features(dev, features);
5455
5456	if (unlikely(err < 0)) {
5457		netdev_err(dev,
5458			"set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
5459			err, features, dev->features);
5460		return -1;
5461	}
5462
5463	if (!err)
5464		dev->features = features;
5465
5466	return 1;
5467}
5468
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications whenever
 *	__netdev_update_features() returns non-zero. Should be called
 *	after driver or hardware dependent conditions might have changed
 *	that influence the features.
 */
void netdev_update_features(struct net_device *dev)
{
	if (__netdev_update_features(dev) != 0)
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5483
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications
 *	unconditionally, i.e. even if the set has not changed. Should be
 *	called instead of netdev_update_features() if also
 *	dev->vlan_features might have changed to allow the changes to be
 *	propagated to stacked VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5500
5501/**
5502 *	netif_stacked_transfer_operstate -	transfer operstate
5503 *	@rootdev: the root or lower level device to transfer state from
5504 *	@dev: the device to transfer operstate to
5505 *
5506 *	Transfer operational state from root to device. This is normally
5507 *	called when a stacking relationship exists between the root
5508 *	device and the device(a leaf device).
5509 */
5510void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5511					struct net_device *dev)
5512{
5513	if (rootdev->operstate == IF_OPER_DORMANT)
5514		netif_dormant_on(dev);
5515	else
5516		netif_dormant_off(dev);
5517
5518	if (netif_carrier_ok(rootdev)) {
5519		if (!netif_carrier_ok(dev))
5520			netif_carrier_on(dev);
5521	} else {
5522		if (netif_carrier_ok(dev))
5523			netif_carrier_off(dev);
5524	}
5525}
5526EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5527
5528#ifdef CONFIG_RPS
5529static int netif_alloc_rx_queues(struct net_device *dev)
5530{
5531	unsigned int i, count = dev->num_rx_queues;
5532	struct netdev_rx_queue *rx;
5533
5534	BUG_ON(count < 1);
5535
5536	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5537	if (!rx) {
5538		pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5539		return -ENOMEM;
5540	}
5541	dev->_rx = rx;
5542
5543	for (i = 0; i < count; i++)
5544		rx[i].dev = dev;
5545	return 0;
5546}
5547#endif
5548
5549static void netdev_init_one_queue(struct net_device *dev,
5550				  struct netdev_queue *queue, void *_unused)
5551{
5552	/* Initialize queue lock */
5553	spin_lock_init(&queue->_xmit_lock);
5554	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5555	queue->xmit_lock_owner = -1;
5556	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5557	queue->dev = dev;
5558}
5559
5560static int netif_alloc_netdev_queues(struct net_device *dev)
5561{
5562	unsigned int count = dev->num_tx_queues;
5563	struct netdev_queue *tx;
5564
5565	BUG_ON(count < 1);
5566
5567	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5568	if (!tx) {
5569		pr_err("netdev: Unable to allocate %u tx queues.\n",
5570		       count);
5571		return -ENOMEM;
5572	}
5573	dev->_tx = tx;
5574
5575	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5576	spin_lock_init(&dev->tx_global_lock);
5577
5578	return 0;
5579}
5580
/**
 *	register_netdevice	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	Callers must hold the rtnl semaphore. You may want
 *	register_netdev() instead of this.
 *
 *	The device must be in the %NETREG_UNINITIALIZED state and must not
 *	be registered while dev_boot_phase is still set; both conditions
 *	are enforced with BUG_ON().
 *
 *	BUGS:
 *	The locking appears insufficient to guarantee two parallel registers
 *	will not get the same name.
 */

int register_netdevice(struct net_device *dev)
{
	int ret;
	struct net *net = dev_net(dev);

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	might_sleep();

	/* When net_device's are persistent, this will be fatal. */
	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
	BUG_ON(!net);

	spin_lock_init(&dev->addr_list_lock);
	netdev_set_addr_lockdep_class(dev);

	/* -1 marks "not set"; it is replaced by ifindex below unless
	 * something (presumably ndo_init) has assigned a value meanwhile.
	 */
	dev->iflink = -1;

	ret = dev_get_valid_name(dev, dev->name);
	if (ret < 0)
		goto out;

	/* Init, if this function is available */
	if (dev->netdev_ops->ndo_init) {
		ret = dev->netdev_ops->ndo_init(dev);
		if (ret) {
			/* Callers only see negative errnos: map a positive
			 * driver return value to -EIO.
			 */
			if (ret > 0)
				ret = -EIO;
			goto out;
		}
	}

	dev->ifindex = dev_new_index(net);
	if (dev->iflink == -1)
		dev->iflink = dev->ifindex;

	/* Transfer changeable features to wanted_features and enable
	 * software offloads (GSO and GRO).
	 */
	dev->hw_features |= NETIF_F_SOFT_FEATURES;
	dev->features |= NETIF_F_SOFT_FEATURES;
	dev->wanted_features = dev->features & dev->hw_features;

	/* Turn on no cache copy if HW is doing checksum */
	dev->hw_features |= NETIF_F_NOCACHE_COPY;
	if ((dev->features & NETIF_F_ALL_CSUM) &&
	    !(dev->features & NETIF_F_NO_CSUM)) {
		dev->wanted_features |= NETIF_F_NOCACHE_COPY;
		dev->features |= NETIF_F_NOCACHE_COPY;
	}

	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
	 */
	dev->vlan_features |= NETIF_F_HIGHDMA;

	/* From here on ndo_init has run, so failures unwind via err_uninit */
	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		goto err_uninit;

	ret = netdev_register_kobject(dev);
	if (ret)
		goto err_uninit;
	dev->reg_state = NETREG_REGISTERED;

	__netdev_update_features(dev);

	/*
	 *	Default initial state at registry is that the
	 *	device is present.
	 */

	set_bit(__LINK_STATE_PRESENT, &dev->state);

	dev_init_scheduler(dev);
	dev_hold(dev);
	list_netdevice(dev);

	/* Notify protocols, that a new device appeared. */
	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
	ret = notifier_to_errno(ret);
	if (ret) {
		/* A notifier vetoed the device: undo the registration.
		 * NOTE(review): execution still falls through to the
		 * RTM_NEWLINK check below on this path.
		 */
		rollback_registered(dev);
		dev->reg_state = NETREG_UNREGISTERED;
	}
	/*
	 *	Prevent userspace races by waiting until the network
	 *	device is fully setup before sending notifications.
	 */
	if (!dev->rtnl_link_ops ||
	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);

out:
	return ret;

err_uninit:
	/* Give the driver the chance to undo its ndo_init work */
	if (dev->netdev_ops->ndo_uninit)
		dev->netdev_ops->ndo_uninit(dev);
	goto out;
}
EXPORT_SYMBOL(register_netdevice);
5701
5702/**
5703 *	init_dummy_netdev	- init a dummy network device for NAPI
5704 *	@dev: device to init
5705 *
5706 *	This takes a network device structure and initialize the minimum
5707 *	amount of fields so it can be used to schedule NAPI polls without
5708 *	registering a full blown interface. This is to be used by drivers
5709 *	that need to tie several hardware interfaces to a single NAPI
5710 *	poll scheduler due to HW limitations.
5711 */
5712int init_dummy_netdev(struct net_device *dev)
5713{
5714	/* Clear everything. Note we don't initialize spinlocks
5715	 * are they aren't supposed to be taken by any of the
5716	 * NAPI code and this dummy netdev is supposed to be
5717	 * only ever used for NAPI polls
5718	 */
5719	memset(dev, 0, sizeof(struct net_device));
5720
5721	/* make sure we BUG if trying to hit standard
5722	 * register/unregister code path
5723	 */
5724	dev->reg_state = NETREG_DUMMY;
5725
5726	/* NAPI wants this */
5727	INIT_LIST_HEAD(&dev->napi_list);
5728
5729	/* a dummy interface is started by default */
5730	set_bit(__LINK_STATE_PRESENT, &dev->state);
5731	set_bit(__LINK_STATE_START, &dev->state);
5732
5733	/* Note : We dont allocate pcpu_refcnt for dummy devices,
5734	 * because users of this 'device' dont need to change
5735	 * its refcount.
5736	 */
5737
5738	return 0;
5739}
5740EXPORT_SYMBOL_GPL(init_dummy_netdev);
5741
5742
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Locked convenience wrapper: takes the rtnl semaphore, hands @dev to
 *	register_netdevice() and drops the semaphore again. The device name
 *	is expanded if a format string was passed to alloc_netdev.
 *
 *	Returns whatever register_netdevice() returned: 0 on success, or a
 *	negative errno code on a failure to set up the device or if the
 *	name is a duplicate.
 */
int register_netdev(struct net_device *dev)
{
	int ret;

	rtnl_lock();
	ret = register_netdevice(dev);
	rtnl_unlock();

	return ret;
}
EXPORT_SYMBOL(register_netdev);
5766
5767int netdev_refcnt_read(const struct net_device *dev)
5768{
5769	int i, refcnt = 0;
5770
5771	for_each_possible_cpu(i)
5772		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5773	return refcnt;
5774}
5775EXPORT_SYMBOL(netdev_refcnt_read);
5776
5777/*
5778 * netdev_wait_allrefs - wait until all references are gone.
5779 *
5780 * This is called when unregistering network devices.
5781 *
5782 * Any protocol or device that holds a reference should register
5783 * for netdevice notification, and cleanup and put back the
5784 * reference if they receive an UNREGISTER event.
5785 * We can get stuck here if buggy protocols don't correctly
5786 * call dev_put.
5787 */
5788static void netdev_wait_allrefs(struct net_device *dev)
5789{
5790	unsigned long rebroadcast_time, warning_time;
5791	int refcnt;
5792
5793	linkwatch_forget_dev(dev);
5794
5795	rebroadcast_time = warning_time = jiffies;
5796	refcnt = netdev_refcnt_read(dev);
5797
5798	while (refcnt != 0) {
5799		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5800			rtnl_lock();
5801
5802			/* Rebroadcast unregister notification */
5803			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5804			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5805			 * should have already handle it the first time */
5806
5807			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5808				     &dev->state)) {
5809				/* We must not have linkwatch events
5810				 * pending on unregister. If this
5811				 * happens, we simply run the queue
5812				 * unscheduled, resulting in a noop
5813				 * for this device.
5814				 */
5815				linkwatch_run_queue();
5816			}
5817
5818			__rtnl_unlock();
5819
5820			rebroadcast_time = jiffies;
5821		}
5822
5823		msleep(250);
5824
5825		refcnt = netdev_refcnt_read(dev);
5826
5827		if (time_after(jiffies, warning_time + 10 * HZ)) {
5828			printk(KERN_EMERG "unregister_netdevice: "
5829			       "waiting for %s to become free. Usage "
5830			       "count = %d\n",
5831			       dev->name, refcnt);
5832			warning_time = jiffies;
5833		}
5834	}
5835}
5836
5837/* The sequence is:
5838 *
5839 *	rtnl_lock();
5840 *	...
5841 *	register_netdevice(x1);
5842 *	register_netdevice(x2);
5843 *	...
5844 *	unregister_netdevice(y1);
5845 *	unregister_netdevice(y2);
5846 *      ...
5847 *	rtnl_unlock();
5848 *	free_netdev(y1);
5849 *	free_netdev(y2);
5850 *
5851 * We are invoked by rtnl_unlock().
5852 * This allows us to deal with problems:
5853 * 1) We can delete sysfs objects which invoke hotplug
5854 *    without deadlocking with linkwatch via keventd.
5855 * 2) Since we run with the RTNL semaphore not held, we can sleep
5856 *    safely in order to wait for the netdev refcnt to drop to zero.
5857 *
5858 * We must not return until all unregister events added during
5859 * the interval the lock was held have been completed.
5860 */
5861void netdev_run_todo(void)
5862{
5863	struct list_head list;
5864
5865	/* Snapshot list, allow later requests */
5866	list_replace_init(&net_todo_list, &list);
5867
5868	__rtnl_unlock();
5869
5870	/* Wait for rcu callbacks to finish before attempting to drain
5871	 * the device list.  This usually avoids a 250ms wait.
5872	 */
5873	if (!list_empty(&list))
5874		rcu_barrier();
5875
5876	while (!list_empty(&list)) {
5877		struct net_device *dev
5878			= list_first_entry(&list, struct net_device, todo_list);
5879		list_del(&dev->todo_list);
5880
5881		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5882			printk(KERN_ERR "network todo '%s' but state %d\n",
5883			       dev->name, dev->reg_state);
5884			dump_stack();
5885			continue;
5886		}
5887
5888		dev->reg_state = NETREG_UNREGISTERED;
5889
5890		on_each_cpu(flush_backlog, dev, 1);
5891
5892		netdev_wait_allrefs(dev);
5893
5894		/* paranoia */
5895		BUG_ON(netdev_refcnt_read(dev));
5896		WARN_ON(rcu_access_pointer(dev->ip_ptr));
5897		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5898		WARN_ON(dev->dn_ptr);
5899
5900		if (dev->destructor)
5901			dev->destructor(dev);
5902
5903		/* Free network device */
5904		kobject_put(&dev->dev.kobj);
5905	}
5906}
5907
5908/* Convert net_device_stats to rtnl_link_stats64.  They have the same
5909 * fields in the same order, with only the type differing.
5910 */
5911static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5912				    const struct net_device_stats *netdev_stats)
5913{
5914#if BITS_PER_LONG == 64
5915        BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5916        memcpy(stats64, netdev_stats, sizeof(*stats64));
5917#else
5918	size_t i, n = sizeof(*stats64) / sizeof(u64);
5919	const unsigned long *src = (const unsigned long *)netdev_stats;
5920	u64 *dst = (u64 *)stats64;
5921
5922	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5923		     sizeof(*stats64) / sizeof(u64));
5924	for (i = 0; i < n; i++)
5925		dst[i] = src[i];
5926#endif
5927}
5928
5929/**
5930 *	dev_get_stats	- get network device statistics
5931 *	@dev: device to get statistics from
5932 *	@storage: place to store stats
5933 *
5934 *	Get network statistics from device. Return @storage.
5935 *	The device driver may provide its own method by setting
5936 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5937 *	otherwise the internal statistics structure is used.
5938 */
5939struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5940					struct rtnl_link_stats64 *storage)
5941{
5942	const struct net_device_ops *ops = dev->netdev_ops;
5943
5944	if (ops->ndo_get_stats64) {
5945		memset(storage, 0, sizeof(*storage));
5946		ops->ndo_get_stats64(dev, storage);
5947	} else if (ops->ndo_get_stats) {
5948		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5949	} else {
5950		netdev_stats_to_stats64(storage, &dev->stats);
5951	}
5952	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5953	return storage;
5954}
5955EXPORT_SYMBOL(dev_get_stats);
5956
/*
 * Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT a missing
 * queue is allocated first, initialised with the noop qdisc and
 * published in dev->ingress_queue; NULL is returned if that allocation
 * fails.  Without CONFIG_NET_CLS_ACT the result of dev_ingress_queue()
 * is returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!queue) {
		queue = kzalloc(sizeof(*queue), GFP_KERNEL);
		if (queue) {
			netdev_init_one_queue(dev, queue, NULL);
			queue->qdisc = &noop_qdisc;
			queue->qdisc_sleeping = &noop_qdisc;
			/* Publish only once fully initialised */
			rcu_assign_pointer(dev->ingress_queue, queue);
		}
	}
#endif
	return queue;
}
5974
5975/**
5976 *	alloc_netdev_mqs - allocate network device
5977 *	@sizeof_priv:	size of private data to allocate space for
5978 *	@name:		device name format string
5979 *	@setup:		callback to initialize device
5980 *	@txqs:		the number of TX subqueues to allocate
5981 *	@rxqs:		the number of RX subqueues to allocate
5982 *
5983 *	Allocates a struct net_device with private data area for driver use
5984 *	and performs basic initialization.  Also allocates subquue structs
5985 *	for each queue on the device.
5986 */
5987struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5988		void (*setup)(struct net_device *),
5989		unsigned int txqs, unsigned int rxqs)
5990{
5991	struct net_device *dev;
5992	size_t alloc_size;
5993	struct net_device *p;
5994
5995	BUG_ON(strlen(name) >= sizeof(dev->name));
5996
5997	if (txqs < 1) {
5998		pr_err("alloc_netdev: Unable to allocate device "
5999		       "with zero queues.\n");
6000		return NULL;
6001	}
6002
6003#ifdef CONFIG_RPS
6004	if (rxqs < 1) {
6005		pr_err("alloc_netdev: Unable to allocate device "
6006		       "with zero RX queues.\n");
6007		return NULL;
6008	}
6009#endif
6010
6011	alloc_size = sizeof(struct net_device);
6012	if (sizeof_priv) {
6013		/* ensure 32-byte alignment of private area */
6014		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6015		alloc_size += sizeof_priv;
6016	}
6017	/* ensure 32-byte alignment of whole construct */
6018	alloc_size += NETDEV_ALIGN - 1;
6019
6020	p = kzalloc(alloc_size, GFP_KERNEL);
6021	if (!p) {
6022		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
6023		return NULL;
6024	}
6025
6026	dev = PTR_ALIGN(p, NETDEV_ALIGN);
6027	dev->padded = (char *)dev - (char *)p;
6028
6029	dev->pcpu_refcnt = alloc_percpu(int);
6030	if (!dev->pcpu_refcnt)
6031		goto free_p;
6032
6033	if (dev_addr_init(dev))
6034		goto free_pcpu;
6035
6036	dev_mc_init(dev);
6037	dev_uc_init(dev);
6038
6039	dev_net_set(dev, &init_net);
6040
6041	dev->gso_max_size = GSO_MAX_SIZE;
6042
6043	INIT_LIST_HEAD(&dev->napi_list);
6044	INIT_LIST_HEAD(&dev->unreg_list);
6045	INIT_LIST_HEAD(&dev->link_watch_list);
6046	dev->priv_flags = IFF_XMIT_DST_RELEASE;
6047	setup(dev);
6048
6049	dev->num_tx_queues = txqs;
6050	dev->real_num_tx_queues = txqs;
6051	if (netif_alloc_netdev_queues(dev))
6052		goto free_all;
6053
6054#ifdef CONFIG_RPS
6055	dev->num_rx_queues = rxqs;
6056	dev->real_num_rx_queues = rxqs;
6057	if (netif_alloc_rx_queues(dev))
6058		goto free_all;
6059#endif
6060
6061	strcpy(dev->name, name);
6062	dev->group = INIT_NETDEV_GROUP;
6063	return dev;
6064
6065free_all:
6066	free_netdev(dev);
6067	return NULL;
6068
6069free_pcpu:
6070	free_percpu(dev->pcpu_refcnt);
6071	kfree(dev->_tx);
6072#ifdef CONFIG_RPS
6073	kfree(dev->_rx);
6074#endif
6075
6076free_p:
6077	kfree(p);
6078	return NULL;
6079}
6080EXPORT_SYMBOL(alloc_netdev_mqs);
6081
6082/**
6083 *	free_netdev - free network device
6084 *	@dev: device
6085 *
6086 *	This function does the last stage of destroying an allocated device
6087 * 	interface. The reference to the device object is released.
6088 *	If this is the last reference then it will be freed.
6089 */
6090void free_netdev(struct net_device *dev)
6091{
6092	struct napi_struct *p, *n;
6093
6094	release_net(dev_net(dev));
6095
6096	kfree(dev->_tx);
6097#ifdef CONFIG_RPS
6098	kfree(dev->_rx);
6099#endif
6100
6101	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6102
6103	/* Flush device addresses */
6104	dev_addr_flush(dev);
6105
6106	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6107		netif_napi_del(p);
6108
6109	free_percpu(dev->pcpu_refcnt);
6110	dev->pcpu_refcnt = NULL;
6111
6112	/*  Compatibility with error handling in drivers */
6113	if (dev->reg_state == NETREG_UNINITIALIZED) {
6114		kfree((char *)dev - dev->padded);
6115		return;
6116	}
6117
6118	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6119	dev->reg_state = NETREG_RELEASED;
6120
6121	/* will free via device release */
6122	put_device(&dev->dev);
6123}
6124EXPORT_SYMBOL(free_netdev);
6125
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	May sleep.  The expedited RCU grace period is used when the rtnl
 *	lock is held, the normal one otherwise.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked())
		synchronize_rcu();
	else
		synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6141
6142/**
6143 *	unregister_netdevice_queue - remove device from the kernel
6144 *	@dev: device
6145 *	@head: list
6146 *
6147 *	This function shuts down a device interface and removes it
6148 *	from the kernel tables.
6149 *	If head not NULL, device is queued to be unregistered later.
6150 *
6151 *	Callers must hold the rtnl semaphore.  You may want
6152 *	unregister_netdev() instead of this.
6153 */
6154
6155void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6156{
6157	ASSERT_RTNL();
6158
6159	if (head) {
6160		list_move_tail(&dev->unreg_list, head);
6161	} else {
6162		rollback_registered(dev);
6163		/* Finish processing unregister after unlock */
6164		net_set_todo(dev);
6165	}
6166}
6167EXPORT_SYMBOL(unregister_netdevice_queue);
6168
6169/**
6170 *	unregister_netdevice_many - unregister many devices
6171 *	@head: list of devices
6172 */
6173void unregister_netdevice_many(struct list_head *head)
6174{
6175	struct net_device *dev;
6176
6177	if (!list_empty(head)) {
6178		rollback_registered_many(head);
6179		list_for_each_entry(dev, head, unreg_list)
6180			net_set_todo(dev);
6181	}
6182}
6183EXPORT_SYMBOL(unregister_netdevice_many);
6184
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore around the call.  In general you want to use
 *	this and not unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6203
/**
 *	dev_change_net_namespace - move device to a different network namespace
 *	@dev: device
 *	@net: network namespace
 *	@pat: If not NULL name pattern to try if the current device name
 *	      is already taken in the destination network namespace.
 *
 *	This function shuts down a device interface and moves it
 *	to a new network namespace. On success 0 is returned, on
 *	a failure a negative errno code is returned: -EINVAL if the
 *	device is namespace local or not registered, -EEXIST if no
 *	usable name could be found in the destination namespace.
 *	Moving a device to the namespace it is already in succeeds
 *	without doing anything.
 *
 *	Callers must hold the rtnl semaphore.
 */

int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
	int err;

	ASSERT_RTNL();

	/* Don't allow namespace local devices to be moved. */
	err = -EINVAL;
	if (dev->features & NETIF_F_NETNS_LOCAL)
		goto out;

	/* Ensure the device has been registered */
	err = -EINVAL;
	if (dev->reg_state != NETREG_REGISTERED)
		goto out;

	/* Get out if there is nothing to do */
	err = 0;
	if (net_eq(dev_net(dev), net))
		goto out;

	/* Pick the destination device name, and ensure
	 * we can use it in the destination network namespace.
	 */
	err = -EEXIST;
	if (__dev_get_by_name(net, dev->name)) {
		/* We get here if we can't use the current device name */
		if (!pat)
			goto out;
		if (dev_get_valid_name(dev, pat) < 0)
			goto out;
	}

	/*
	 * And now a mini version of register_netdevice unregister_netdevice.
	 * Nothing below this point jumps to the out label.
	 */

	/* If device is running close it first. */
	dev_close(dev);

	/* And unlink it from device chain */
	err = -ENODEV;	/* overwritten before it can be returned */
	unlist_netdevice(dev);

	synchronize_net();

	/* Shutdown queueing discipline. */
	dev_shutdown(dev);

	/* Notify protocols, that we are about to destroy
	   this device. They should clean all the things.

	   Note that dev->reg_state stays at NETREG_REGISTERED.
	   This is wanted because this way 8021q and macvlan know
	   the device is just moving and can keep their slaves up.
	*/
	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);

	/*
	 *	Flush the unicast and multicast chains
	 */
	dev_uc_flush(dev);
	dev_mc_flush(dev);

	/* Actually switch the network namespace */
	dev_net_set(dev, net);

	/* If there is an ifindex conflict assign a new one.  An iflink
	 * that was equal to the old ifindex follows the new ifindex.
	 */
	if (__dev_get_by_index(net, dev->ifindex)) {
		int iflink = (dev->iflink == dev->ifindex);
		dev->ifindex = dev_new_index(net);
		if (iflink)
			dev->iflink = dev->ifindex;
	}

	/* Fixup kobjects; a failure is only warned about, not returned */
	err = device_rename(&dev->dev, dev->name);
	WARN_ON(err);

	/* Add the device back in the hashes */
	list_netdevice(dev);

	/* Notify protocols, that a new device appeared. */
	call_netdevice_notifiers(NETDEV_REGISTER, dev);

	/*
	 *	Prevent userspace races by waiting until the network
	 *	device is fully setup before sending notifications.
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);

	synchronize_net();
	err = 0;
out:
	return err;
}
EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6317
/*
 * CPU notifier callback.  Only CPU_DEAD and CPU_DEAD_FROZEN are acted
 * upon; every other action returns NOTIFY_OK immediately.
 *
 * @ocpu carries the number of the CPU that went away.  With local
 * interrupts disabled, the completion queue, the output queue and the
 * NAPI poll list of that CPU's softnet_data are appended to those of
 * the CPU running this callback.  Afterwards the packets still sitting
 * in the dead CPU's process_queue and input_pkt_queue are fed back in
 * through netif_rx().
 *
 * Always returns NOTIFY_OK.
 */
static int dev_cpu_callback(struct notifier_block *nfb,
			    unsigned long action,
			    void *ocpu)
{
	struct sk_buff **list_skb;
	struct sk_buff *skb;
	unsigned int cpu, oldcpu = (unsigned long)ocpu;
	struct softnet_data *sd, *oldsd;

	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Append output queue from offline CPU. */
	if (oldsd->output_queue) {
		*sd->output_queue_tailp = oldsd->output_queue;
		sd->output_queue_tailp = oldsd->output_queue_tailp;
		oldsd->output_queue = NULL;
		/* An empty queue has its tail pointer aimed at its head */
		oldsd->output_queue_tailp = &oldsd->output_queue;
	}
	/* Append NAPI poll list from offline CPU. */
	if (!list_empty(&oldsd->poll_list)) {
		list_splice_init(&oldsd->poll_list, &sd->poll_list);
		raise_softirq_irqoff(NET_RX_SOFTIRQ);
	}

	/* Raised unconditionally, whether or not anything was moved */
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

	/* Re-inject the offline CPU's process_queue and input_pkt_queue */
	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
		netif_rx(skb);
		input_queue_head_incr(oldsd);
	}
	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
		netif_rx(skb);
		input_queue_head_incr(oldsd);
	}

	return NOTIFY_OK;
}
6371
6372
6373/**
6374 *	netdev_increment_features - increment feature set by one
6375 *	@all: current feature set
6376 *	@one: new feature set
6377 *	@mask: mask feature set
6378 *
6379 *	Computes a new feature set after adding a device with feature set
6380 *	@one to the master device with current feature set @all.  Will not
6381 *	enable anything that is off in @mask. Returns the new feature set.
6382 */
6383u32 netdev_increment_features(u32 all, u32 one, u32 mask)
6384{
6385	if (mask & NETIF_F_GEN_CSUM)
6386		mask |= NETIF_F_ALL_CSUM;
6387	mask |= NETIF_F_VLAN_CHALLENGED;
6388
6389	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6390	all &= one | ~NETIF_F_ALL_FOR_ALL;
6391
6392	/* If device needs checksumming, downgrade to it. */
6393	if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM))
6394		all &= ~NETIF_F_NO_CSUM;
6395
6396	/* If one device supports hw checksumming, set for all. */
6397	if (all & NETIF_F_GEN_CSUM)
6398		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6399
6400	return all;
6401}
6402EXPORT_SYMBOL(netdev_increment_features);
6403
6404static struct hlist_head *netdev_create_hash(void)
6405{
6406	int i;
6407	struct hlist_head *hash;
6408
6409	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6410	if (hash != NULL)
6411		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6412			INIT_HLIST_HEAD(&hash[i]);
6413
6414	return hash;
6415}
6416
6417/* Initialize per network namespace state */
6418static int __net_init netdev_init(struct net *net)
6419{
6420	INIT_LIST_HEAD(&net->dev_base_head);
6421
6422	net->dev_name_head = netdev_create_hash();
6423	if (net->dev_name_head == NULL)
6424		goto err_name;
6425
6426	net->dev_index_head = netdev_create_hash();
6427	if (net->dev_index_head == NULL)
6428		goto err_idx;
6429
6430	return 0;
6431
6432err_idx:
6433	kfree(net->dev_name_head);
6434err_name:
6435	return -ENOMEM;
6436}
6437
6438/**
6439 *	netdev_drivername - network driver for the device
6440 *	@dev: network device
6441 *
6442 *	Determine network driver for device.
6443 */
6444const char *netdev_drivername(const struct net_device *dev)
6445{
6446	const struct device_driver *driver;
6447	const struct device *parent;
6448	const char *empty = "";
6449
6450	parent = dev->dev.parent;
6451	if (!parent)
6452		return empty;
6453
6454	driver = parent->driver;
6455	if (driver && driver->name)
6456		return driver->name;
6457	return empty;
6458}
6459
6460int __netdev_printk(const char *level, const struct net_device *dev,
6461			   struct va_format *vaf)
6462{
6463	int r;
6464
6465	if (dev && dev->dev.parent)
6466		r = dev_printk(level, dev->dev.parent, "%s: %pV",
6467			       netdev_name(dev), vaf);
6468	else if (dev)
6469		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6470	else
6471		r = printk("%s(NULL net_device): %pV", level, vaf);
6472
6473	return r;
6474}
6475EXPORT_SYMBOL(__netdev_printk);
6476
6477int netdev_printk(const char *level, const struct net_device *dev,
6478		  const char *format, ...)
6479{
6480	struct va_format vaf;
6481	va_list args;
6482	int r;
6483
6484	va_start(args, format);
6485
6486	vaf.fmt = format;
6487	vaf.va = &args;
6488
6489	r = __netdev_printk(level, dev, &vaf);
6490	va_end(args);
6491
6492	return r;
6493}
6494EXPORT_SYMBOL(netdev_printk);
6495
6496#define define_netdev_printk_level(func, level)			\
6497int func(const struct net_device *dev, const char *fmt, ...)	\
6498{								\
6499	int r;							\
6500	struct va_format vaf;					\
6501	va_list args;						\
6502								\
6503	va_start(args, fmt);					\
6504								\
6505	vaf.fmt = fmt;						\
6506	vaf.va = &args;						\
6507								\
6508	r = __netdev_printk(level, dev, &vaf);			\
6509	va_end(args);						\
6510								\
6511	return r;						\
6512}								\
6513EXPORT_SYMBOL(func);
6514
6515define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6516define_netdev_printk_level(netdev_alert, KERN_ALERT);
6517define_netdev_printk_level(netdev_crit, KERN_CRIT);
6518define_netdev_printk_level(netdev_err, KERN_ERR);
6519define_netdev_printk_level(netdev_warn, KERN_WARNING);
6520define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6521define_netdev_printk_level(netdev_info, KERN_INFO);
6522
6523static void __net_exit netdev_exit(struct net *net)
6524{
6525	kfree(net->dev_name_head);
6526	kfree(net->dev_index_head);
6527}
6528
/*
 * Per network namespace hooks that create (netdev_init) and destroy
 * (netdev_exit) the device list and hash tables of a namespace.
 * Registered from net_dev_init() with register_pernet_subsys().
 */
static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};
6533
6534static void __net_exit default_device_exit(struct net *net)
6535{
6536	struct net_device *dev, *aux;
6537	/*
6538	 * Push all migratable network devices back to the
6539	 * initial network namespace
6540	 */
6541	rtnl_lock();
6542	for_each_netdev_safe(net, dev, aux) {
6543		int err;
6544		char fb_name[IFNAMSIZ];
6545
6546		/* Ignore unmoveable devices (i.e. loopback) */
6547		if (dev->features & NETIF_F_NETNS_LOCAL)
6548			continue;
6549
6550		/* Leave virtual devices for the generic cleanup */
6551		if (dev->rtnl_link_ops)
6552			continue;
6553
6554		/* Push remaining network devices to init_net */
6555		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6556		err = dev_change_net_namespace(dev, &init_net, fb_name);
6557		if (err) {
6558			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6559				__func__, dev->name, err);
6560			BUG();
6561		}
6562	}
6563	rtnl_unlock();
6564}
6565
6566static void __net_exit default_device_exit_batch(struct list_head *net_list)
6567{
6568	/* At exit all network devices most be removed from a network
6569	 * namespace.  Do this in the reverse order of registration.
6570	 * Do this across as many network namespaces as possible to
6571	 * improve batching efficiency.
6572	 */
6573	struct net_device *dev;
6574	struct net *net;
6575	LIST_HEAD(dev_kill_list);
6576
6577	rtnl_lock();
6578	list_for_each_entry(net, net_list, exit_list) {
6579		for_each_netdev_reverse(net, dev) {
6580			if (dev->rtnl_link_ops)
6581				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6582			else
6583				unregister_netdevice_queue(dev, &dev_kill_list);
6584		}
6585	}
6586	unregister_netdevice_many(&dev_kill_list);
6587	list_del(&dev_kill_list);
6588	rtnl_unlock();
6589}
6590
/*
 * Per network namespace exit hooks for the devices themselves:
 * default_device_exit moves migratable devices back to init_net and
 * default_device_exit_batch unregisters the rest.  Registered from
 * net_dev_init() with register_pernet_device().
 */
static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};
6595
/*
 *	Initialize the DEV module. At boot time this walks the device list and
 *	unhooks any devices that fail to initialise (normally hardware not
 *	present) and leaves us with a valid list of present and active devices.
 *
 */

/*
 *       This is called single threaded during boot, so no need
 *       to take the rtnl semaphore.
 *
 *       Returns 0 on success.  Every failing step bails out with
 *       -ENOMEM, the initial value of rc, without undoing the steps
 *       that already succeeded.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	/* Start with empty packet type handler lists */
	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 *	Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		memset(sd, 0, sizeof(*sd));
		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
		sd->completion_queue = NULL;
		INIT_LIST_HEAD(&sd->poll_list);
		sd->output_queue = NULL;
		/* An empty output queue has its tail aimed at its head */
		sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
		sd->csd.func = rps_trigger_softirq;
		sd->csd.info = sd;
		sd->csd.flags = 0;
		sd->cpu = i;
#endif

		/* The per-cpu backlog is polled by process_backlog */
		sd->backlog.poll = process_backlog;
		sd->backlog.weight = weight_p;
		sd->backlog.gro_list = NULL;
		sd->backlog.gro_count = 0;
	}

	/* register_netdevice() refuses to run while this is still set */
	dev_boot_phase = 0;

	/* The loopback device is special if any other network devices
	 * is present in a network namespace the loopback device must
	 * be present. Since we now dynamically allocate and free the
	 * loopback device ensure this invariant is maintained by
	 * keeping the loopback device as the first device on the
	 * list of network devices.  Ensuring the loopback devices
	 * is the first device that appears and the last network device
	 * that disappears.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	hotcpu_notifier(dev_cpu_callback, 0);
	dst_init();
	dev_mcast_init();
	rc = 0;
out:
	return rc;
}

subsys_initcall(net_dev_init);
6682
/*
 * Fill hashrnd with random bytes.  Runs as a late_initcall_sync.
 * Always returns 0.
 */
static int __init initialize_hashrnd(void)
{
	get_random_bytes(&hashrnd, sizeof(hashrnd));
	return 0;
}

late_initcall_sync(initialize_hashrnd);
6690