net/core/dev.c at v3.3-rc3 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / core / dev.c
at v3.3-rc3 164 kB view raw
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <asm/system.h>
  77#include <linux/bitops.h>
  78#include <linux/capability.h>
  79#include <linux/cpu.h>
  80#include <linux/types.h>
  81#include <linux/kernel.h>
  82#include <linux/hash.h>
  83#include <linux/slab.h>
  84#include <linux/sched.h>
  85#include <linux/mutex.h>
  86#include <linux/string.h>
  87#include <linux/mm.h>
  88#include <linux/socket.h>
  89#include <linux/sockios.h>
  90#include <linux/errno.h>
  91#include <linux/interrupt.h>
  92#include <linux/if_ether.h>
  93#include <linux/netdevice.h>
  94#include <linux/etherdevice.h>
  95#include <linux/ethtool.h>
  96#include <linux/notifier.h>
  97#include <linux/skbuff.h>
  98#include <net/net_namespace.h>
  99#include <net/sock.h>
 100#include <linux/rtnetlink.h>
 101#include <linux/proc_fs.h>
 102#include <linux/seq_file.h>
 103#include <linux/stat.h>
 104#include <net/dst.h>
 105#include <net/pkt_sched.h>
 106#include <net/checksum.h>
 107#include <net/xfrm.h>
 108#include <linux/highmem.h>
 109#include <linux/init.h>
 110#include <linux/kmod.h>
 111#include <linux/module.h>
 112#include <linux/netpoll.h>
 113#include <linux/rcupdate.h>
 114#include <linux/delay.h>
 115#include <net/wext.h>
 116#include <net/iw_handler.h>
 117#include <asm/current.h>
 118#include <linux/audit.h>
 119#include <linux/dmaengine.h>
 120#include <linux/err.h>
 121#include <linux/ctype.h>
 122#include <linux/if_arp.h>
 123#include <linux/if_vlan.h>
 124#include <linux/ip.h>
 125#include <net/ip.h>
 126#include <linux/ipv6.h>
 127#include <linux/in.h>
 128#include <linux/jhash.h>
 129#include <linux/random.h>
 130#include <trace/events/napi.h>
 131#include <trace/events/net.h>
 132#include <trace/events/skb.h>
 133#include <linux/pci.h>
 134#include <linux/inetdevice.h>
 135#include <linux/cpu_rmap.h>
 136#include <linux/net_tstamp.h>
 137#include <linux/jump_label.h>
 138#include <net/flow_keys.h>
 139
 140#include "net-sysfs.h"
 141
/*
 * Fixed cap of 8; the code that uses it is outside this part of the file.
 * Instead of increasing this, you should create a hash table.
 */
#define MAX_GRO_SKBS 8

/*
 * MAX_HEADER plus 128 bytes.
 * This should be increased if a protocol with a bigger head is added.
 */
#define GRO_MAX_HEAD (MAX_HEADER + 128)
 147
 148/*
 149 *	The list of packet types we will receive (as opposed to discard)
 150 *	and the routines to invoke.
 151 *
 152 *	Why 16. Because with 16 the only overlap we get on a hash of the
 153 *	low nibble of the protocol value is RARP/SNAP/X.25.
 154 *
 155 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 156 *             sure which should go first, but I bet it won't make much
 157 *             difference if we are running VLANs.  The good news is that
 158 *             this protocol won't be in the list unless compiled in, so
 159 *             the average user (w/out VLANs) will not be adversely affected.
 160 *             --BLG
 161 *
 162 *		0800	IP
 163 *		8100    802.1Q VLAN
 164 *		0001	802.3
 165 *		0002	AX.25
 166 *		0004	802.2
 167 *		8035	RARP
 168 *		0005	SNAP
 169 *		0805	X.25
 170 *		0806	ARP
 171 *		8137	IPX
 172 *		0009	Localtalk
 173 *		86DD	IPv6
 174 */
 175
/* Number of ptype_base buckets; the mask below only works for a power of two. */
#define PTYPE_HASH_SIZE	(16)
/* ptype_head() ANDs the host-order protocol value with this to pick a bucket. */
#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)

/* Held by dev_add_pack() and __dev_remove_pack() while they edit the lists. */
static DEFINE_SPINLOCK(ptype_lock);
/* Handlers for one specific protocol, bucketed as computed by ptype_head(). */
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
/* Handlers whose type is ETH_P_ALL (see ptype_head()). */
static struct list_head ptype_all __read_mostly;	/* Taps */
 182
 183/*
 184 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 185 * semaphore.
 186 *
 187 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 188 *
 189 * Writers must hold the rtnl semaphore while they loop through the
 190 * dev_base_head list, and hold dev_base_lock for writing when they do the
 191 * actual updates.  This allows pure readers to access the list even
 192 * while a writer is preparing to update it.
 193 *
 194 * To put it another way, dev_base_lock is held for writing only to
 195 * protect against pure readers; the rtnl semaphore provides the
 196 * protection against other writers.
 197 *
 198 * See, for example usages, register_netdevice() and
 199 * unregister_netdevice(), which must be called with the rtnl
 200 * semaphore held.
 201 */
/*
 * Below, the lock is taken for writing with write_lock_bh() by
 * list_netdevice(), unlist_netdevice() and dev_change_name().
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
 204
 205static inline void dev_base_seq_inc(struct net *net)
 206{
 207	while (++net->dev_base_seq == 0);
 208}
 209
 210static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 211{
 212	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 213	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 214}
 215
 216static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 217{
 218	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 219}
 220
 221static inline void rps_lock(struct softnet_data *sd)
 222{
 223#ifdef CONFIG_RPS
 224	spin_lock(&sd->input_pkt_queue.lock);
 225#endif
 226}
 227
 228static inline void rps_unlock(struct softnet_data *sd)
 229{
 230#ifdef CONFIG_RPS
 231	spin_unlock(&sd->input_pkt_queue.lock);
 232#endif
 233}
 234
 235/* Device list insertion */
 236static int list_netdevice(struct net_device *dev)
 237{
 238	struct net *net = dev_net(dev);
 239
 240	ASSERT_RTNL();
 241
 242	write_lock_bh(&dev_base_lock);
 243	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 244	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 245	hlist_add_head_rcu(&dev->index_hlist,
 246			   dev_index_hash(net, dev->ifindex));
 247	write_unlock_bh(&dev_base_lock);
 248
 249	dev_base_seq_inc(net);
 250
 251	return 0;
 252}
 253
 254/* Device list removal
 255 * caller must respect a RCU grace period before freeing/reusing dev
 256 */
 257static void unlist_netdevice(struct net_device *dev)
 258{
 259	ASSERT_RTNL();
 260
 261	/* Unlink dev from the device chain */
 262	write_lock_bh(&dev_base_lock);
 263	list_del_rcu(&dev->dev_list);
 264	hlist_del_rcu(&dev->name_hlist);
 265	hlist_del_rcu(&dev->index_hlist);
 266	write_unlock_bh(&dev_base_lock);
 267
 268	dev_base_seq_inc(dev_net(dev));
 269}
 270
/*
 *	Our notifier list
 *
 *	Declared with RAW_NOTIFIER_HEAD; the code that registers on it and
 *	walks it is outside this part of the file.
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 *
 *	One struct softnet_data is defined per CPU and exported.  rps_lock()
 *	and rps_unlock() take the spinlock of its input_pkt_queue when
 *	CONFIG_RPS is set.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
 284
 285#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 *
 * netdev_lock_type[] lists the ARPHRD_* device types that get their own
 * lock class.  netdev_lock_pos() searches it linearly and falls back to
 * the last entry for any type that is not listed.
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
	 ARPHRD_VOID, ARPHRD_NONE};

/*
 * Lock class names.  Indexed with the position returned by
 * netdev_lock_pos(), so entry N must name netdev_lock_type[N].
 */
static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
	 "_xmit_VOID", "_xmit_NONE"};

/*
 * One lock class key per netdev_lock_type[] entry: the first array is used
 * for the xmit lock, the second for dev->addr_list_lock.
 */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 328
 329static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 330{
 331	int i;
 332
 333	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 334		if (netdev_lock_type[i] == dev_type)
 335			return i;
 336	/* the last key is used by default */
 337	return ARRAY_SIZE(netdev_lock_type) - 1;
 338}
 339
 340static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 341						 unsigned short dev_type)
 342{
 343	int i;
 344
 345	i = netdev_lock_pos(dev_type);
 346	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 347				   netdev_lock_name[i]);
 348}
 349
 350static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 351{
 352	int i;
 353
 354	i = netdev_lock_pos(dev->type);
 355	lockdep_set_class_and_name(&dev->addr_list_lock,
 356				   &netdev_addr_lock_key[i],
 357				   netdev_lock_name[i]);
 358}
 359#else
/* Variant built when CONFIG_LOCKDEP is not defined: does nothing. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
/* Variant built when CONFIG_LOCKDEP is not defined: does nothing. */
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
 367#endif
 368
 369/*******************************************************************************
 370
 371		Protocol management and registration routines
 372
 373*******************************************************************************/
 374
 375/*
 376 *	Add a protocol ID to the list. Now that the input handler is
 377 *	smarter we can dispense with all the messy stuff that used to be
 378 *	here.
 379 *
 380 *	BEWARE!!! Protocol handlers, mangling input packets,
 381 *	MUST BE last in hash buckets and checking protocol handlers
 382 *	MUST start from promiscuous ptype_all chain in net_bh.
 383 *	It is true now, do not change it.
 384 *	Explanation follows: if protocol handler, mangling packet, will
 385 *	be the first on list, it is not able to sense, that packet
 386 *	is cloned and should be copied-on-write, so that it will
 387 *	change it and subsequent readers will get broken packet.
 388 *							--ANK (980803)
 389 */
 390
 391static inline struct list_head *ptype_head(const struct packet_type *pt)
 392{
 393	if (pt->type == htons(ETH_P_ALL))
 394		return &ptype_all;
 395	else
 396		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 397}
 398
 399/**
 400 *	dev_add_pack - add packet handler
 401 *	@pt: packet type declaration
 402 *
 403 *	Add a protocol handler to the networking stack. The passed &packet_type
 404 *	is linked into kernel lists and may not be freed until it has been
 405 *	removed from the kernel lists.
 406 *
 407 *	This call does not sleep therefore it can not
 408 *	guarantee all CPU's that are in middle of receiving packets
 409 *	will see the new packet type (until the next received packet).
 410 */
 411
 412void dev_add_pack(struct packet_type *pt)
 413{
 414	struct list_head *head = ptype_head(pt);
 415
 416	spin_lock(&ptype_lock);
 417	list_add_rcu(&pt->list, head);
 418	spin_unlock(&ptype_lock);
 419}
 420EXPORT_SYMBOL(dev_add_pack);
 421
 422/**
 423 *	__dev_remove_pack	 - remove packet handler
 424 *	@pt: packet type declaration
 425 *
 426 *	Remove a protocol handler that was previously added to the kernel
 427 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 428 *	from the kernel lists and can be freed or reused once this function
 429 *	returns.
 430 *
 431 *      The packet type might still be in use by receivers
 432 *	and must not be freed until after all the CPU's have gone
 433 *	through a quiescent state.
 434 */
 435void __dev_remove_pack(struct packet_type *pt)
 436{
 437	struct list_head *head = ptype_head(pt);
 438	struct packet_type *pt1;
 439
 440	spin_lock(&ptype_lock);
 441
 442	list_for_each_entry(pt1, head, list) {
 443		if (pt == pt1) {
 444			list_del_rcu(&pt->list);
 445			goto out;
 446		}
 447	}
 448
 449	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 450out:
 451	spin_unlock(&ptype_lock);
 452}
 453EXPORT_SYMBOL(__dev_remove_pack);
 454
 455/**
 456 *	dev_remove_pack	 - remove packet handler
 457 *	@pt: packet type declaration
 458 *
 459 *	Remove a protocol handler that was previously added to the kernel
 460 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 461 *	from the kernel lists and can be freed or reused once this function
 462 *	returns.
 463 *
 464 *	This call sleeps to guarantee that no CPU is looking at the packet
 465 *	type after return.
 466 */
 467void dev_remove_pack(struct packet_type *pt)
 468{
 469	__dev_remove_pack(pt);
 470
 471	synchronize_net();
 472}
 473EXPORT_SYMBOL(dev_remove_pack);
 474
 475/******************************************************************************
 476
 477		      Device Boot-time Settings Routines
 478
 479*******************************************************************************/
 480
/*
 * Boot time configuration table
 *
 * Filled by netdev_boot_setup_add() and read by netdev_boot_setup_check()
 * and netdev_boot_base().  Those two helpers treat an entry whose name
 * starts with '\0' or ' ' as unused.
 */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 483
 484/**
 485 *	netdev_boot_setup_add	- add new setup entry
 486 *	@name: name of the device
 487 *	@map: configured settings for the device
 488 *
 489 *	Adds new setup entry to the dev_boot_setup list.  The function
 490 *	returns 0 on error and 1 on success.  This is a generic routine to
 491 *	all netdevices.
 492 */
 493static int netdev_boot_setup_add(char *name, struct ifmap *map)
 494{
 495	struct netdev_boot_setup *s;
 496	int i;
 497
 498	s = dev_boot_setup;
 499	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 500		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 501			memset(s[i].name, 0, sizeof(s[i].name));
 502			strlcpy(s[i].name, name, IFNAMSIZ);
 503			memcpy(&s[i].map, map, sizeof(s[i].map));
 504			break;
 505		}
 506	}
 507
 508	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 509}
 510
 511/**
 512 *	netdev_boot_setup_check	- check boot time settings
 513 *	@dev: the netdevice
 514 *
 515 * 	Check boot time settings for the device.
 516 *	The found settings are set for the device to be used
 517 *	later in the device probing.
 518 *	Returns 0 if no settings found, 1 if they are.
 519 */
 520int netdev_boot_setup_check(struct net_device *dev)
 521{
 522	struct netdev_boot_setup *s = dev_boot_setup;
 523	int i;
 524
 525	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 526		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 527		    !strcmp(dev->name, s[i].name)) {
 528			dev->irq 	= s[i].map.irq;
 529			dev->base_addr 	= s[i].map.base_addr;
 530			dev->mem_start 	= s[i].map.mem_start;
 531			dev->mem_end 	= s[i].map.mem_end;
 532			return 1;
 533		}
 534	}
 535	return 0;
 536}
 537EXPORT_SYMBOL(netdev_boot_setup_check);
 538
 539
 540/**
 541 *	netdev_boot_base	- get address from boot time settings
 542 *	@prefix: prefix for network device
 543 *	@unit: id for network device
 544 *
 545 * 	Check boot time settings for the base address of device.
 546 *	The found settings are set for the device to be used
 547 *	later in the device probing.
 548 *	Returns 0 if no settings found.
 549 */
 550unsigned long netdev_boot_base(const char *prefix, int unit)
 551{
 552	const struct netdev_boot_setup *s = dev_boot_setup;
 553	char name[IFNAMSIZ];
 554	int i;
 555
 556	sprintf(name, "%s%d", prefix, unit);
 557
 558	/*
 559	 * If device already registered then return base of 1
 560	 * to indicate not to probe for this interface
 561	 */
 562	if (__dev_get_by_name(&init_net, name))
 563		return 1;
 564
 565	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 566		if (!strcmp(name, s[i].name))
 567			return s[i].map.base_addr;
 568	return 0;
 569}
 570
 571/*
 572 * Saves at boot time configured settings for any netdevice.
 573 */
 574int __init netdev_boot_setup(char *str)
 575{
 576	int ints[5];
 577	struct ifmap map;
 578
 579	str = get_options(str, ARRAY_SIZE(ints), ints);
 580	if (!str || !*str)
 581		return 0;
 582
 583	/* Save settings */
 584	memset(&map, 0, sizeof(map));
 585	if (ints[0] > 0)
 586		map.irq = ints[1];
 587	if (ints[0] > 1)
 588		map.base_addr = ints[2];
 589	if (ints[0] > 2)
 590		map.mem_start = ints[3];
 591	if (ints[0] > 3)
 592		map.mem_end = ints[4];
 593
 594	/* Add new entry to the list */
 595	return netdev_boot_setup_add(str, &map);
 596}
 597
 598__setup("netdev=", netdev_boot_setup);
 599
 600/*******************************************************************************
 601
 602			    Device Interface Subroutines
 603
 604*******************************************************************************/
 605
 606/**
 607 *	__dev_get_by_name	- find a device by its name
 608 *	@net: the applicable net namespace
 609 *	@name: name to find
 610 *
 611 *	Find an interface by name. Must be called under RTNL semaphore
 612 *	or @dev_base_lock. If the name is found a pointer to the device
 613 *	is returned. If the name is not found then %NULL is returned. The
 614 *	reference counters are not incremented so the caller must be
 615 *	careful with locks.
 616 */
 617
 618struct net_device *__dev_get_by_name(struct net *net, const char *name)
 619{
 620	struct hlist_node *p;
 621	struct net_device *dev;
 622	struct hlist_head *head = dev_name_hash(net, name);
 623
 624	hlist_for_each_entry(dev, p, head, name_hlist)
 625		if (!strncmp(dev->name, name, IFNAMSIZ))
 626			return dev;
 627
 628	return NULL;
 629}
 630EXPORT_SYMBOL(__dev_get_by_name);
 631
 632/**
 633 *	dev_get_by_name_rcu	- find a device by its name
 634 *	@net: the applicable net namespace
 635 *	@name: name to find
 636 *
 637 *	Find an interface by name.
 638 *	If the name is found a pointer to the device is returned.
 639 * 	If the name is not found then %NULL is returned.
 640 *	The reference counters are not incremented so the caller must be
 641 *	careful with locks. The caller must hold RCU lock.
 642 */
 643
 644struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 645{
 646	struct hlist_node *p;
 647	struct net_device *dev;
 648	struct hlist_head *head = dev_name_hash(net, name);
 649
 650	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 651		if (!strncmp(dev->name, name, IFNAMSIZ))
 652			return dev;
 653
 654	return NULL;
 655}
 656EXPORT_SYMBOL(dev_get_by_name_rcu);
 657
 658/**
 659 *	dev_get_by_name		- find a device by its name
 660 *	@net: the applicable net namespace
 661 *	@name: name to find
 662 *
 663 *	Find an interface by name. This can be called from any
 664 *	context and does its own locking. The returned handle has
 665 *	the usage count incremented and the caller must use dev_put() to
 666 *	release it when it is no longer needed. %NULL is returned if no
 667 *	matching device is found.
 668 */
 669
 670struct net_device *dev_get_by_name(struct net *net, const char *name)
 671{
 672	struct net_device *dev;
 673
 674	rcu_read_lock();
 675	dev = dev_get_by_name_rcu(net, name);
 676	if (dev)
 677		dev_hold(dev);
 678	rcu_read_unlock();
 679	return dev;
 680}
 681EXPORT_SYMBOL(dev_get_by_name);
 682
 683/**
 684 *	__dev_get_by_index - find a device by its ifindex
 685 *	@net: the applicable net namespace
 686 *	@ifindex: index of device
 687 *
 688 *	Search for an interface by index. Returns %NULL if the device
 689 *	is not found or a pointer to the device. The device has not
 690 *	had its reference counter increased so the caller must be careful
 691 *	about locking. The caller must hold either the RTNL semaphore
 692 *	or @dev_base_lock.
 693 */
 694
 695struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 696{
 697	struct hlist_node *p;
 698	struct net_device *dev;
 699	struct hlist_head *head = dev_index_hash(net, ifindex);
 700
 701	hlist_for_each_entry(dev, p, head, index_hlist)
 702		if (dev->ifindex == ifindex)
 703			return dev;
 704
 705	return NULL;
 706}
 707EXPORT_SYMBOL(__dev_get_by_index);
 708
 709/**
 710 *	dev_get_by_index_rcu - find a device by its ifindex
 711 *	@net: the applicable net namespace
 712 *	@ifindex: index of device
 713 *
 714 *	Search for an interface by index. Returns %NULL if the device
 715 *	is not found or a pointer to the device. The device has not
 716 *	had its reference counter increased so the caller must be careful
 717 *	about locking. The caller must hold RCU lock.
 718 */
 719
 720struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 721{
 722	struct hlist_node *p;
 723	struct net_device *dev;
 724	struct hlist_head *head = dev_index_hash(net, ifindex);
 725
 726	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 727		if (dev->ifindex == ifindex)
 728			return dev;
 729
 730	return NULL;
 731}
 732EXPORT_SYMBOL(dev_get_by_index_rcu);
 733
 734
 735/**
 736 *	dev_get_by_index - find a device by its ifindex
 737 *	@net: the applicable net namespace
 738 *	@ifindex: index of device
 739 *
 740 *	Search for an interface by index. Returns NULL if the device
 741 *	is not found or a pointer to the device. The device returned has
 742 *	had a reference added and the pointer is safe until the user calls
 743 *	dev_put to indicate they have finished with it.
 744 */
 745
 746struct net_device *dev_get_by_index(struct net *net, int ifindex)
 747{
 748	struct net_device *dev;
 749
 750	rcu_read_lock();
 751	dev = dev_get_by_index_rcu(net, ifindex);
 752	if (dev)
 753		dev_hold(dev);
 754	rcu_read_unlock();
 755	return dev;
 756}
 757EXPORT_SYMBOL(dev_get_by_index);
 758
 759/**
 760 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 761 *	@net: the applicable net namespace
 762 *	@type: media type of device
 763 *	@ha: hardware address
 764 *
 765 *	Search for an interface by MAC address. Returns NULL if the device
 766 *	is not found or a pointer to the device.
 767 *	The caller must hold RCU or RTNL.
 768 *	The returned device has not had its ref count increased
 769 *	and the caller must therefore be careful about locking
 770 *
 771 */
 772
 773struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 774				       const char *ha)
 775{
 776	struct net_device *dev;
 777
 778	for_each_netdev_rcu(net, dev)
 779		if (dev->type == type &&
 780		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 781			return dev;
 782
 783	return NULL;
 784}
 785EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 786
 787struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 788{
 789	struct net_device *dev;
 790
 791	ASSERT_RTNL();
 792	for_each_netdev(net, dev)
 793		if (dev->type == type)
 794			return dev;
 795
 796	return NULL;
 797}
 798EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 799
 800struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 801{
 802	struct net_device *dev, *ret = NULL;
 803
 804	rcu_read_lock();
 805	for_each_netdev_rcu(net, dev)
 806		if (dev->type == type) {
 807			dev_hold(dev);
 808			ret = dev;
 809			break;
 810		}
 811	rcu_read_unlock();
 812	return ret;
 813}
 814EXPORT_SYMBOL(dev_getfirstbyhwtype);
 815
 816/**
 817 *	dev_get_by_flags_rcu - find any device with given flags
 818 *	@net: the applicable net namespace
 819 *	@if_flags: IFF_* values
 820 *	@mask: bitmask of bits in if_flags to check
 821 *
 822 *	Search for any interface with the given flags. Returns NULL if a device
 823 *	is not found or a pointer to the device. Must be called inside
 824 *	rcu_read_lock(), and result refcount is unchanged.
 825 */
 826
 827struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 828				    unsigned short mask)
 829{
 830	struct net_device *dev, *ret;
 831
 832	ret = NULL;
 833	for_each_netdev_rcu(net, dev) {
 834		if (((dev->flags ^ if_flags) & mask) == 0) {
 835			ret = dev;
 836			break;
 837		}
 838	}
 839	return ret;
 840}
 841EXPORT_SYMBOL(dev_get_by_flags_rcu);
 842
 843/**
 844 *	dev_valid_name - check if name is okay for network device
 845 *	@name: name string
 846 *
 847 *	Network device names need to be valid file names to
 848 *	to allow sysfs to work.  We also disallow any kind of
 849 *	whitespace.
 850 */
 851int dev_valid_name(const char *name)
 852{
 853	if (*name == '\0')
 854		return 0;
 855	if (strlen(name) >= IFNAMSIZ)
 856		return 0;
 857	if (!strcmp(name, ".") || !strcmp(name, ".."))
 858		return 0;
 859
 860	while (*name) {
 861		if (*name == '/' || isspace(*name))
 862			return 0;
 863		name++;
 864	}
 865	return 1;
 866}
 867EXPORT_SYMBOL(dev_valid_name);
 868
/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 *
 *	If @name contains no '%' it is used as is and the unit is 0.
 *	Errors: -EINVAL if the '%' is not a single "%d", -ENOMEM if the
 *	bitmap page cannot be allocated, -ENFILE if the resulting name is
 *	already used by a device in @net.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	/* only the first IFNAMSIZ-1 bytes are searched for a '%' */
	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		/* mark the unit number of every device that fits the pattern */
		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			/* units outside the bitmap cannot be recorded */
			if (i < 0 || i >= max_netdevices)
				continue;

			/*  avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		/* lowest unit number that no device is using */
		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	/* build the candidate name, unless the caller's buffer is the format itself */
	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}
 934
 935/**
 936 *	dev_alloc_name - allocate a name for a device
 937 *	@dev: device
 938 *	@name: name format string
 939 *
 940 *	Passed a format string - eg "lt%d" it will try and find a suitable
 941 *	id. It scans list of devices to build up a free map, then chooses
 942 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 943 *	while allocating the name and adding the device in order to avoid
 944 *	duplicates.
 945 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 946 *	Returns the number of the unit assigned or a negative errno code.
 947 */
 948
 949int dev_alloc_name(struct net_device *dev, const char *name)
 950{
 951	char buf[IFNAMSIZ];
 952	struct net *net;
 953	int ret;
 954
 955	BUG_ON(!dev_net(dev));
 956	net = dev_net(dev);
 957	ret = __dev_alloc_name(net, name, buf);
 958	if (ret >= 0)
 959		strlcpy(dev->name, buf, IFNAMSIZ);
 960	return ret;
 961}
 962EXPORT_SYMBOL(dev_alloc_name);
 963
 964static int dev_get_valid_name(struct net_device *dev, const char *name)
 965{
 966	struct net *net;
 967
 968	BUG_ON(!dev_net(dev));
 969	net = dev_net(dev);
 970
 971	if (!dev_valid_name(name))
 972		return -EINVAL;
 973
 974	if (strchr(name, '%'))
 975		return dev_alloc_name(dev, name);
 976	else if (__dev_get_by_name(net, name))
 977		return -EEXIST;
 978	else if (dev->name != name)
 979		strlcpy(dev->name, name, IFNAMSIZ);
 980
 981	return 0;
 982}
 983
/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change the name of a device. A format string such as "eth%d" may be
 *	passed for wildcarding.
 *
 *	Returns -EBUSY if the device is up, 0 if @newname equals the current
 *	name, the error from dev_get_valid_name() or device_rename() if one
 *	of them fails, and otherwise the value described at the end of the
 *	function (result of name validation, or the notifier errno after a
 *	rollback).
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
		return 0;

	/* Keep the current name so it can be restored on failure. */
	memcpy(oldname, dev->name, IFNAMSIZ);

	/* On success dev->name already holds the new name. */
	err = dev_get_valid_name(dev, newname);
	if (err < 0)
		return err;

	/*
	 * Entered a second time (via goto) with dev->name restored to
	 * oldname when a notifier vetoes the change.
	 */
rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		return ret;
	}

	/* Unhash under the old name ... */
	write_lock_bh(&dev_base_lock);
	hlist_del_rcu(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	/* ... and rehash under whatever dev->name holds now. */
	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			/*
			 * First failure: remember the notifier's errno and
			 * redo the rename with the old name.
			 * NOTE(review): relies on notifier_to_errno() giving
			 * a negative value so a second failure lands in the
			 * else branch - confirm.
			 */
			err = ret;
			memcpy(dev->name, oldname, IFNAMSIZ);
			goto rollback;
		} else {
			printk(KERN_ERR
			       "%s: name change rollback failed: %d.\n",
			       dev->name, ret);
		}
	}

	return err;
}
1050
1051/**
1052 *	dev_set_alias - change ifalias of a device
1053 *	@dev: device
1054 *	@alias: name up to IFALIASZ
1055 *	@len: limit of bytes to copy from info
1056 *
1057 *	Set ifalias for a device,
1058 */
1059int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1060{
1061	ASSERT_RTNL();
1062
1063	if (len >= IFALIASZ)
1064		return -EINVAL;
1065
1066	if (!len) {
1067		if (dev->ifalias) {
1068			kfree(dev->ifalias);
1069			dev->ifalias = NULL;
1070		}
1071		return 0;
1072	}
1073
1074	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1075	if (!dev->ifalias)
1076		return -ENOMEM;
1077
1078	strlcpy(dev->ifalias, alias, len+1);
1079	return len;
1080}
1081
1082
/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features. Sends
 *	%NETDEV_FEAT_CHANGE through call_netdevice_notifiers(); the
 *	notifier result is ignored.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);
1094
1095/**
1096 *	netdev_state_change - device changes state
1097 *	@dev: device to cause notification
1098 *
1099 *	Called to indicate a device has changed state. This function calls
1100 *	the notifier chains for netdev_chain and sends a NEWLINK message
1101 *	to the routing socket.
1102 */
1103void netdev_state_change(struct net_device *dev)
1104{
1105	if (dev->flags & IFF_UP) {
1106		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1107		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1108	}
1109}
1110EXPORT_SYMBOL(netdev_state_change);
1111
/*
 * Send @event for @dev down the netdev notifier chain and return the
 * raw result of call_netdevice_notifiers().
 */
int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	return call_netdevice_notifiers(event, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);
1117
1118/**
1119 *	dev_load 	- load a network module
1120 *	@net: the applicable net namespace
1121 *	@name: name of interface
1122 *
1123 *	If a network interface is not present and the process has suitable
1124 *	privileges this function loads the module. If module loading is not
1125 *	available in this kernel then it becomes a nop.
1126 */
1127
1128void dev_load(struct net *net, const char *name)
1129{
1130	struct net_device *dev;
1131	int no_module;
1132
1133	rcu_read_lock();
1134	dev = dev_get_by_name_rcu(net, name);
1135	rcu_read_unlock();
1136
1137	no_module = !dev;
1138	if (no_module && capable(CAP_NET_ADMIN))
1139		no_module = request_module("netdev-%s", name);
1140	if (no_module && capable(CAP_SYS_MODULE)) {
1141		if (!request_module("%s", name))
1142			pr_err("Loading kernel module for a network device "
1143"with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
1144"instead\n", name);
1145	}
1146}
1147EXPORT_SYMBOL(dev_load);
1148
1149static int __dev_open(struct net_device *dev)
1150{
1151	const struct net_device_ops *ops = dev->netdev_ops;
1152	int ret;
1153
1154	ASSERT_RTNL();
1155
1156	if (!netif_device_present(dev))
1157		return -ENODEV;
1158
1159	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1160	ret = notifier_to_errno(ret);
1161	if (ret)
1162		return ret;
1163
1164	set_bit(__LINK_STATE_START, &dev->state);
1165
1166	if (ops->ndo_validate_addr)
1167		ret = ops->ndo_validate_addr(dev);
1168
1169	if (!ret && ops->ndo_open)
1170		ret = ops->ndo_open(dev);
1171
1172	if (ret)
1173		clear_bit(__LINK_STATE_START, &dev->state);
1174	else {
1175		dev->flags |= IFF_UP;
1176		net_dmaengine_get();
1177		dev_set_rx_mode(dev);
1178		dev_activate(dev);
1179	}
1180
1181	return ret;
1182}
1183
1184/**
1185 *	dev_open	- prepare an interface for use.
1186 *	@dev:	device to open
1187 *
1188 *	Takes a device from down to up state. The device's private open
1189 *	function is invoked and then the multicast lists are loaded. Finally
1190 *	the device is moved into the up state and a %NETDEV_UP message is
1191 *	sent to the netdev notifier chain.
1192 *
1193 *	Calling this function on an active interface is a nop. On a failure
1194 *	a negative errno code is returned.
1195 */
1196int dev_open(struct net_device *dev)
1197{
1198	int ret;
1199
1200	if (dev->flags & IFF_UP)
1201		return 0;
1202
1203	ret = __dev_open(dev);
1204	if (ret < 0)
1205		return ret;
1206
1207	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1208	call_netdevice_notifiers(NETDEV_UP, dev);
1209
1210	return ret;
1211}
1212EXPORT_SYMBOL(dev_open);
1213
/*
 * Close every device linked on @head through its unreg_list member.
 *
 * Works in three passes over the list: announce NETDEV_GOING_DOWN and
 * clear __LINK_STATE_START for each device, deactivate them all with
 * dev_deactivate_many(), then call each driver's ndo_stop (if any),
 * clear IFF_UP and drop the dmaengine reference. NETDEV_DOWN is not sent
 * here. Always returns 0.
 */
static int __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	ASSERT_RTNL();
	might_sleep();

	list_for_each_entry(dev, head, unreg_list) {
		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * can be even on different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of its
		 * napi_struct instances on this device.
		 */
		smp_mb__after_clear_bit(); /* Commit netif_running(). */
	}

	dev_deactivate_many(head);

	list_for_each_entry(dev, head, unreg_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 *	Call the device specific close. This cannot fail.
		 *	Only if device is UP
		 *
		 *	We allow it to be called even after a DETACH hot-plug
		 *	event.
		 */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		dev->flags &= ~IFF_UP;
		/* Pairs with the net_dmaengine_get() done in __dev_open(). */
		net_dmaengine_put();
	}

	return 0;
}
1256
1257static int __dev_close(struct net_device *dev)
1258{
1259	int retval;
1260	LIST_HEAD(single);
1261
1262	list_add(&dev->unreg_list, &single);
1263	retval = __dev_close_many(&single);
1264	list_del(&single);
1265	return retval;
1266}
1267
1268static int dev_close_many(struct list_head *head)
1269{
1270	struct net_device *dev, *tmp;
1271	LIST_HEAD(tmp_list);
1272
1273	list_for_each_entry_safe(dev, tmp, head, unreg_list)
1274		if (!(dev->flags & IFF_UP))
1275			list_move(&dev->unreg_list, &tmp_list);
1276
1277	__dev_close_many(head);
1278
1279	list_for_each_entry(dev, head, unreg_list) {
1280		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1281		call_netdevice_notifiers(NETDEV_DOWN, dev);
1282	}
1283
1284	/* rollback_registered_many needs the complete original list */
1285	list_splice(&tmp_list, head);
1286	return 0;
1287}
1288
1289/**
1290 *	dev_close - shutdown an interface.
1291 *	@dev: device to shutdown
1292 *
1293 *	This function moves an active device into down state. A
1294 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1295 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1296 *	chain.
1297 */
1298int dev_close(struct net_device *dev)
1299{
1300	if (dev->flags & IFF_UP) {
1301		LIST_HEAD(single);
1302
1303		list_add(&dev->unreg_list, &single);
1304		dev_close_many(&single);
1305		list_del(&single);
1306	}
1307	return 0;
1308}
1309EXPORT_SYMBOL(dev_close);
1310
1311
1312/**
1313 *	dev_disable_lro - disable Large Receive Offload on a device
1314 *	@dev: device
1315 *
1316 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1317 *	called under RTNL.  This is needed if received packets may be
1318 *	forwarded to another interface.
1319 */
1320void dev_disable_lro(struct net_device *dev)
1321{
1322	/*
1323	 * If we're trying to disable lro on a vlan device
1324	 * use the underlying physical device instead
1325	 */
1326	if (is_vlan_dev(dev))
1327		dev = vlan_dev_real_dev(dev);
1328
1329	dev->wanted_features &= ~NETIF_F_LRO;
1330	netdev_update_features(dev);
1331
1332	if (unlikely(dev->features & NETIF_F_LRO))
1333		netdev_WARN(dev, "failed to disable LRO!\n");
1334}
1335EXPORT_SYMBOL(dev_disable_lro);
1336
1337
/*
 * While non-zero, register_netdevice_notifier() does not replay
 * REGISTER/UP events to a newly registered notifier. Starts at 1.
 * NOTE(review): where this is cleared is outside this section - confirm.
 */
static int dev_boot_phase = 1;
1339
/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered. A negative errno code
 *	is returned on a failure.
 *
 * 	When registered all registration and up events are replayed
 *	to the new notifier to allow it to have a race free
 *	view of the network device list. If the notifier rejects one of
 *	the replayed %NETDEV_REGISTER events, the devices already replayed
 *	are unwound and the notifier is unregistered again.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	/* No replay while dev_boot_phase is set. */
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			/* Result of the UP replay is ignored. */
			nb->notifier_call(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	/*
	 * Walk the devices again in the same order and undo the replay for
	 * each one seen before the device that failed.
	 */
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				goto outroll;

			if (dev->flags & IFF_UP) {
				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
				nb->notifier_call(nb, NETDEV_DOWN, dev);
			}
			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
		}
	}

outroll:
	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
1406
/**
 *	unregister_netdevice_notifier - unregister a network notifier block
 *	@nb: notifier
 *
 *	Unregister a notifier previously registered by
 *	register_netdevice_notifier(). The notifier is unlinked from the
 *	kernel structures and may then be reused. A negative errno code
 *	is returned on a failure.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
	int err;

	/* The chain is modified with the rtnl lock held. */
	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);
1427
/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *      @val: value passed unmodified to notifier function
 *      @dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain(). Asserts that the rtnl lock
 *	is held.
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	ASSERT_RTNL();
	return raw_notifier_call_chain(&netdev_chain, val, dev);
}
EXPORT_SYMBOL(call_netdevice_notifiers);
1443
/*
 * Jump-label key consulted with static_branch() before an skb is
 * timestamped; raised by net_enable_timestamp() and lowered by
 * net_disable_timestamp().
 */
static struct jump_label_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call jump_label_dec() from irq context.
 * If net_disable_timestamp() is called from irq context, defer the
 * jump_label_dec() calls by counting them here.
 */
static atomic_t netstamp_needed_deferred;
#endif
1452
/*
 * Take one reference on the netstamp_needed key.
 *
 * With HAVE_JUMP_LABEL, any decrements that net_disable_timestamp()
 * deferred are settled here first.
 */
void net_enable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);

	if (deferred) {
		/*
		 * One deferred decrement cancels against this enable, so
		 * only deferred - 1 decrements are applied and the
		 * increment below is skipped.
		 */
		while (--deferred)
			jump_label_dec(&netstamp_needed);
		return;
	}
#endif
	WARN_ON(in_interrupt());
	jump_label_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);
1468
/*
 * Drop one reference on the netstamp_needed key.
 *
 * With HAVE_JUMP_LABEL, a call made in interrupt context only bumps
 * netstamp_needed_deferred; the actual decrement is settled by a later
 * net_enable_timestamp().
 */
void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	if (in_interrupt()) {
		atomic_inc(&netstamp_needed_deferred);
		return;
	}
#endif
	jump_label_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);
1480
/*
 * Clear the timestamp of @skb, then stamp it with __net_timestamp() if
 * the netstamp_needed key is enabled.
 */
static inline void net_timestamp_set(struct sk_buff *skb)
{
	skb->tstamp.tv64 = 0;
	if (static_branch(&netstamp_needed))
		__net_timestamp(skb);
}
1487
/*
 * net_timestamp_check - timestamp @SKB if timestamping is enabled, @COND
 * holds and the skb does not carry a timestamp yet.
 *
 * Wrapped in do { } while (0) so the macro behaves as a single statement:
 * it can be followed by ';' and used as the body of an if/else without
 * capturing the else. The last line deliberately has no trailing
 * backslash, so the macro ends here and cannot absorb the next line.
 */
#define net_timestamp_check(COND, SKB)				\
	do {							\
		if (static_branch(&netstamp_needed)) {		\
			if ((COND) && !(SKB)->tstamp.tv64)	\
				__net_timestamp(SKB);		\
		}						\
	} while (0)
1493
1494static int net_hwtstamp_validate(struct ifreq *ifr)
1495{
1496	struct hwtstamp_config cfg;
1497	enum hwtstamp_tx_types tx_type;
1498	enum hwtstamp_rx_filters rx_filter;
1499	int tx_type_valid = 0;
1500	int rx_filter_valid = 0;
1501
1502	if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1503		return -EFAULT;
1504
1505	if (cfg.flags) /* reserved for future extensions */
1506		return -EINVAL;
1507
1508	tx_type = cfg.tx_type;
1509	rx_filter = cfg.rx_filter;
1510
1511	switch (tx_type) {
1512	case HWTSTAMP_TX_OFF:
1513	case HWTSTAMP_TX_ON:
1514	case HWTSTAMP_TX_ONESTEP_SYNC:
1515		tx_type_valid = 1;
1516		break;
1517	}
1518
1519	switch (rx_filter) {
1520	case HWTSTAMP_FILTER_NONE:
1521	case HWTSTAMP_FILTER_ALL:
1522	case HWTSTAMP_FILTER_SOME:
1523	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1524	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1525	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1526	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1527	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1528	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1529	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1530	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1531	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1532	case HWTSTAMP_FILTER_PTP_V2_EVENT:
1533	case HWTSTAMP_FILTER_PTP_V2_SYNC:
1534	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1535		rx_filter_valid = 1;
1536		break;
1537	}
1538
1539	if (!tx_type_valid || !rx_filter_valid)
1540		return -ERANGE;
1541
1542	return 0;
1543}
1544
1545static inline bool is_skb_forwardable(struct net_device *dev,
1546				      struct sk_buff *skb)
1547{
1548	unsigned int len;
1549
1550	if (!(dev->flags & IFF_UP))
1551		return false;
1552
1553	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1554	if (skb->len <= len)
1555		return true;
1556
1557	/* if TSO is enabled, we don't care about the length as the packet
1558	 * could be forwarded without being segmented before
1559	 */
1560	if (skb_is_gso(skb))
1561		return true;
1562
1563	return false;
1564}
1565
1566/**
1567 * dev_forward_skb - loopback an skb to another netif
1568 *
1569 * @dev: destination network device
1570 * @skb: buffer to forward
1571 *
1572 * return values:
1573 *	NET_RX_SUCCESS	(no congestion)
1574 *	NET_RX_DROP     (packet was dropped, but freed)
1575 *
1576 * dev_forward_skb can be used for injecting an skb from the
1577 * start_xmit function of one device into the receive queue
1578 * of another device.
1579 *
1580 * The receiving device may be in another namespace, so
1581 * we have to clear all information in the skb that could
1582 * impact namespace isolation.
1583 */
1584int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1585{
1586	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1587		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1588			atomic_long_inc(&dev->rx_dropped);
1589			kfree_skb(skb);
1590			return NET_RX_DROP;
1591		}
1592	}
1593
1594	skb_orphan(skb);
1595	nf_reset(skb);
1596
1597	if (unlikely(!is_skb_forwardable(dev, skb))) {
1598		atomic_long_inc(&dev->rx_dropped);
1599		kfree_skb(skb);
1600		return NET_RX_DROP;
1601	}
1602	skb_set_dev(skb, dev);
1603	skb->tstamp.tv64 = 0;
1604	skb->pkt_type = PACKET_HOST;
1605	skb->protocol = eth_type_trans(skb, dev);
1606	return netif_rx(skb);
1607}
1608EXPORT_SYMBOL_GPL(dev_forward_skb);
1609
/*
 * Hand @skb to the handler of @pt_prev, taking an extra reference on the
 * skb first. Returns the handler's return value.
 */
static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	atomic_inc(&skb->users);
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
1617
/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 *
 *	Walks ptype_all under rcu_read_lock(). The skb is cloned once, for
 *	the first matching tap; every matching tap then receives that same
 *	clone. Delivery is deferred by one entry (pt_prev): all but the last
 *	tap go through deliver_skb(), which takes an extra reference, and
 *	the last tap is called directly with the clone.
 */

static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;
	struct sk_buff *skb2 = NULL;
	struct packet_type *pt_prev = NULL;

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if ((ptype->dev == dev || !ptype->dev) &&
		    (ptype->af_packet_priv == NULL ||
		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
			/* Clone already prepared: deliver to the previous tap. */
			if (pt_prev) {
				deliver_skb(skb2, pt_prev, skb->dev);
				pt_prev = ptype;
				continue;
			}

			/* First matching tap: create and prepare the clone. */
			skb2 = skb_clone(skb, GFP_ATOMIC);
			if (!skb2)
				break;

			net_timestamp_set(skb2);

			/* skb->nh should be correctly
			   set by sender, so that the second statement is
			   just protection against buggy protocols.
			 */
			skb_reset_mac_header(skb2);

			if (skb_network_header(skb2) < skb2->data ||
			    skb2->network_header > skb2->tail) {
				if (net_ratelimit())
					printk(KERN_CRIT "protocol %04x is "
					       "buggy, dev %s\n",
					       ntohs(skb2->protocol),
					       dev->name);
				skb_reset_network_header(skb2);
			}

			skb2->transport_header = skb2->network_header;
			skb2->pkt_type = PACKET_OUTGOING;
			pt_prev = ptype;
		}
	}
	/* Last tap: called without taking an extra reference on the clone. */
	if (pt_prev)
		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
	rcu_read_unlock();
}
1674
1675/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1676 * @dev: Network device
1677 * @txq: number of queues available
1678 *
1679 * If real_num_tx_queues is changed the tc mappings may no longer be
1680 * valid. To resolve this verify the tc mapping remains valid and if
1681 * not NULL the mapping. With no priorities mapping to this
1682 * offset/count pair it will no longer be used. In the worst case TC0
1683 * is invalid nothing can be done so disable priority mappings. If is
1684 * expected that drivers will fix this mapping if they can before
1685 * calling netif_set_real_num_tx_queues.
1686 */
1687static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1688{
1689	int i;
1690	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1691
1692	/* If TC0 is invalidated disable TC mapping */
1693	if (tc->offset + tc->count > txq) {
1694		pr_warning("Number of in use tx queues changed "
1695			   "invalidating tc mappings. Priority "
1696			   "traffic classification disabled!\n");
1697		dev->num_tc = 0;
1698		return;
1699	}
1700
1701	/* Invalidated prio to tc mappings set to TC0 */
1702	for (i = 1; i < TC_BITMASK + 1; i++) {
1703		int q = netdev_get_prio_tc_map(dev, i);
1704
1705		tc = &dev->tc_to_txq[q];
1706		if (tc->offset + tc->count > txq) {
1707			pr_warning("Number of in use tx queues "
1708				   "changed. Priority %i to tc "
1709				   "mapping %i is no longer valid "
1710				   "setting map to 0\n",
1711				   i, q);
1712			netdev_set_prio_tc_map(dev, i, 0);
1713		}
1714	}
1715}
1716
1717/*
1718 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1719 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1720 */
1721int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1722{
1723	int rc;
1724
1725	if (txq < 1 || txq > dev->num_tx_queues)
1726		return -EINVAL;
1727
1728	if (dev->reg_state == NETREG_REGISTERED ||
1729	    dev->reg_state == NETREG_UNREGISTERING) {
1730		ASSERT_RTNL();
1731
1732		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1733						  txq);
1734		if (rc)
1735			return rc;
1736
1737		if (dev->num_tc)
1738			netif_setup_tc(dev, txq);
1739
1740		if (txq < dev->real_num_tx_queues)
1741			qdisc_reset_all_tx_gt(dev, txq);
1742	}
1743
1744	dev->real_num_tx_queues = txq;
1745	return 0;
1746}
1747EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1748
#ifdef CONFIG_RPS
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	This must be called either with the rtnl_lock held or before
 *	registration of the net device.  Returns 0 on success, or a
 *	negative error code: -EINVAL when @rxq is 0 or larger than
 *	dev->num_rx_queues, or the error from updating the queue kobjects
 *	of a registered device.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	if (rxq < 1 || rxq > dev->num_rx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED) {
		int err;

		ASSERT_RTNL();

		err = net_rx_queue_update_kobjects(dev,
						   dev->real_num_rx_queues,
						   rxq);
		if (err)
			return err;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
1781
/*
 * Append @q to the tail of this CPU's softnet output queue and raise
 * NET_TX_SOFTIRQ. Runs with local interrupts disabled while the queue
 * pointers are updated.
 */
static inline void __netif_reschedule(struct Qdisc *q)
{
	struct softnet_data *sd;
	unsigned long flags;

	local_irq_save(flags);
	sd = &__get_cpu_var(softnet_data);
	/* Link q in at the tail, then move the tail pointer to q's link. */
	q->next_sched = NULL;
	*sd->output_queue_tailp = q;
	sd->output_queue_tailp = &q->next_sched;
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}
1795
/*
 * Schedule @q for transmission unless its __QDISC_STATE_SCHED bit was
 * already set; the bit is set atomically as part of the test.
 */
void __netif_schedule(struct Qdisc *q)
{
	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
		__netif_reschedule(q);
}
EXPORT_SYMBOL(__netif_schedule);
1802
1803void dev_kfree_skb_irq(struct sk_buff *skb)
1804{
1805	if (atomic_dec_and_test(&skb->users)) {
1806		struct softnet_data *sd;
1807		unsigned long flags;
1808
1809		local_irq_save(flags);
1810		sd = &__get_cpu_var(softnet_data);
1811		skb->next = sd->completion_queue;
1812		sd->completion_queue = skb;
1813		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1814		local_irq_restore(flags);
1815	}
1816}
1817EXPORT_SYMBOL(dev_kfree_skb_irq);
1818
/*
 * Free @skb from any context: uses dev_kfree_skb_irq() when called in
 * hard-irq context or with interrupts disabled, dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (in_irq() || irqs_disabled())
		dev_kfree_skb_irq(skb);
	else
		dev_kfree_skb(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1827
1828
1829/**
1830 * netif_device_detach - mark device as removed
1831 * @dev: network device
1832 *
1833 * Mark device as removed from system and therefore no longer available.
1834 */
1835void netif_device_detach(struct net_device *dev)
1836{
1837	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1838	    netif_running(dev)) {
1839		netif_tx_stop_all_queues(dev);
1840	}
1841}
1842EXPORT_SYMBOL(netif_device_detach);
1843
1844/**
1845 * netif_device_attach - mark device as attached
1846 * @dev: network device
1847 *
1848 * Mark device as attached from system and restart if needed.
1849 */
1850void netif_device_attach(struct net_device *dev)
1851{
1852	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1853	    netif_running(dev)) {
1854		netif_tx_wake_all_queues(dev);
1855		__netdev_watchdog_up(dev);
1856	}
1857}
1858EXPORT_SYMBOL(netif_device_attach);
1859
/**
 * skb_set_dev -- assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * If an skb is owned by a device already, we have to reset
 * all data private to the namespace a device belongs to
 * before assigning it a new device. The dst reference is always
 * dropped; the remaining fields are only reset when the old and new
 * device live in different network namespaces.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
	skb_dst_drop(skb);
	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
		secpath_reset(skb);
		nf_reset(skb);
		skb_init_secmark(skb);
		skb->mark = 0;
		skb->priority = 0;
		skb->nf_trace = 0;
		skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
		skb->tc_index = 0;
#endif
	}
	skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */
1889
1890static void skb_warn_bad_offload(const struct sk_buff *skb)
1891{
1892	static const netdev_features_t null_features = 0;
1893	struct net_device *dev = skb->dev;
1894	const char *driver = "";
1895
1896	if (dev && dev->dev.parent)
1897		driver = dev_driver_string(dev->dev.parent);
1898
1899	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
1900	     "gso_type=%d ip_summed=%d\n",
1901	     driver, dev ? &dev->features : &null_features,
1902	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
1903	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
1904	     skb_shinfo(skb)->gso_type, skb->ip_summed);
1905}
1906
/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 *
 * For CHECKSUM_COMPLETE skbs only ip_summed is reset. Otherwise the
 * checksum is computed in software from the checksum start offset to the
 * end of the packet and stored at csum_offset behind that start.
 *
 * Returns 0 on success, -EINVAL for a GSO skb (after warning), or the
 * error from pskb_expand_head(); in both error cases ip_summed is left
 * unchanged.
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	if (unlikely(skb_shinfo(skb)->gso_size)) {
		skb_warn_bad_offload(skb);
		return -EINVAL;
	}

	offset = skb_checksum_start_offset(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	/* From here on offset points at the checksum field itself. */
	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	/* Get a private copy of the header if the field is not writable. */
	if (skb_cloned(skb) &&
	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}
EXPORT_SYMBOL(skb_checksum_help);
1945
/**
 *	skb_gso_segment - Perform segmentation on skb.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *
 *	This function segments the given skb and returns a list of segments.
 *
 *	It may return NULL if the skb requires no segmentation.  This is
 *	only possible when GSO is used for verifying header integrity.
 *
 *	On failure an ERR_PTR() is returned; -EPROTONOSUPPORT is the result
 *	when no packet type with a gso_segment handler matches the protocol.
 */
struct sk_buff *skb_gso_segment(struct sk_buff *skb,
	netdev_features_t features)
{
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_type *ptype;
	__be16 type = skb->protocol;
	int vlan_depth = ETH_HLEN;
	int err;

	/* Step over any 802.1Q headers to find the encapsulated protocol. */
	while (type == htons(ETH_P_8021Q)) {
		struct vlan_hdr *vh;

		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
			return ERR_PTR(-EINVAL);

		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
		type = vh->h_vlan_encapsulated_proto;
		vlan_depth += VLAN_HLEN;
	}

	/* Strip the link-layer header; it is pushed back before returning. */
	skb_reset_mac_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;
	__skb_pull(skb, skb->mac_len);

	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
		skb_warn_bad_offload(skb);

		if (skb_header_cloned(skb) &&
		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
			return ERR_PTR(err);
	}

	rcu_read_lock();
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
				err = ptype->gso_send_check(skb);
				/* On success this is ERR_PTR(0), i.e. NULL. */
				segs = ERR_PTR(err);
				if (err || skb_gso_ok(skb, features))
					break;
				__skb_push(skb, (skb->data -
						 skb_network_header(skb)));
			}
			segs = ptype->gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	/* Restore skb->data to the start of the mac header. */
	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}
EXPORT_SYMBOL(skb_gso_segment);
2011
/* Take action when hardware reception checksum errors are detected:
 * log the device name (or "<unknown>" when @dev is NULL) and dump the
 * stack, subject to net_ratelimit().
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	const char *name = dev ? dev->name : "<unknown>";

	if (!net_ratelimit())
		return;

	printk(KERN_ERR "%s: hw csum failure.\n", name);
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2024
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 if @skb has a page fragment that @dev must not be given for
 * DMA, 0 otherwise. Without CONFIG_HIGHMEM the result is always 0.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int i;
	/* Device cannot DMA from high memory: reject any highmem fragment. */
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *pdev = dev->dev.parent;

		/* No parent device means no DMA mask to check against. */
		if (!pdev)
			return 0;
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			dma_addr_t addr = page_to_phys(skb_frag_page(frag));
			/* Reject if the mask is missing or the page ends beyond it. */
			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}
2057
/*
 * Per-skb state kept in skb->cb while software GSO is pending: the skb's
 * original destructor, saved by dev_gso_segment() and invoked by
 * dev_gso_skb_destructor().
 */
struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

/* View the control buffer of @skb as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2063
/*
 * Destructor installed by dev_gso_segment(): frees every segment still
 * chained on skb->next, then calls the skb's original destructor if one
 * was saved.
 */
static void dev_gso_skb_destructor(struct sk_buff *skb)
{
	struct dev_gso_cb *cb;

	/*
	 * NOTE(review): the do/while dereferences skb->next before testing
	 * it, so at least one segment is assumed to be present - confirm
	 * against the callers.
	 */
	do {
		struct sk_buff *nskb = skb->next;

		/* Unlink the head segment before freeing it. */
		skb->next = nskb->next;
		nskb->next = NULL;
		kfree_skb(nskb);
	} while (skb->next);

	cb = DEV_GSO_CB(skb);
	if (cb->destructor)
		cb->destructor(skb);
}
2080
2081/**
2082 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
2083 *	@skb: buffer to segment
2084 *	@features: device features as applicable to this skb
2085 *
2086 *	This function segments the given skb and stores the list of segments
2087 *	in skb->next.
2088 */
2089static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2090{
2091	struct sk_buff *segs;
2092
2093	segs = skb_gso_segment(skb, features);
2094
2095	/* Verifying header integrity only. */
2096	if (!segs)
2097		return 0;
2098
2099	if (IS_ERR(segs))
2100		return PTR_ERR(segs);
2101
2102	skb->next = segs;
2103	DEV_GSO_CB(skb)->destructor = skb->destructor;
2104	skb->destructor = dev_gso_skb_destructor;
2105
2106	return 0;
2107}
2108
2109/*
2110 * Try to orphan skb early, right before transmission by the device.
2111 * We cannot orphan skb if tx timestamp is requested or the sk-reference
2112 * is needed on driver level for other reasons, e.g. see net/can/raw.c
2113 */
2114static inline void skb_orphan_try(struct sk_buff *skb)
2115{
2116	struct sock *sk = skb->sk;
2117
2118	if (sk && !skb_shinfo(skb)->tx_flags) {
2119		/* skb_tx_hash() wont be able to get sk.
2120		 * We copy sk_hash into skb->rxhash
2121		 */
2122		if (!skb->rxhash)
2123			skb->rxhash = sk->sk_hash;
2124		skb_orphan(skb);
2125	}
2126}
2127
2128static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2129{
2130	return ((features & NETIF_F_GEN_CSUM) ||
2131		((features & NETIF_F_V4_CSUM) &&
2132		 protocol == htons(ETH_P_IP)) ||
2133		((features & NETIF_F_V6_CSUM) &&
2134		 protocol == htons(ETH_P_IPV6)) ||
2135		((features & NETIF_F_FCOE_CRC) &&
2136		 protocol == htons(ETH_P_FCOE)));
2137}
2138
2139static netdev_features_t harmonize_features(struct sk_buff *skb,
2140	__be16 protocol, netdev_features_t features)
2141{
2142	if (!can_checksum_protocol(features, protocol)) {
2143		features &= ~NETIF_F_ALL_CSUM;
2144		features &= ~NETIF_F_SG;
2145	} else if (illegal_highdma(skb->dev, skb)) {
2146		features &= ~NETIF_F_SG;
2147	}
2148
2149	return features;
2150}
2151
2152netdev_features_t netif_skb_features(struct sk_buff *skb)
2153{
2154	__be16 protocol = skb->protocol;
2155	netdev_features_t features = skb->dev->features;
2156
2157	if (protocol == htons(ETH_P_8021Q)) {
2158		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2159		protocol = veh->h_vlan_encapsulated_proto;
2160	} else if (!vlan_tx_tag_present(skb)) {
2161		return harmonize_features(skb, protocol, features);
2162	}
2163
2164	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2165
2166	if (protocol != htons(ETH_P_8021Q)) {
2167		return harmonize_features(skb, protocol, features);
2168	} else {
2169		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2170				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2171		return harmonize_features(skb, protocol, features);
2172	}
2173}
2174EXPORT_SYMBOL(netif_skb_features);
2175
2176/*
2177 * Returns true if either:
2178 *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2179 *	2. skb is fragmented and the device does not support SG, or if
2180 *	   at least one of fragments is in highmem and device does not
2181 *	   support DMA from it.
2182 */
2183static inline int skb_needs_linearize(struct sk_buff *skb,
2184				      int features)
2185{
2186	return skb_is_nonlinear(skb) &&
2187			((skb_has_frag_list(skb) &&
2188				!(features & NETIF_F_FRAGLIST)) ||
2189			(skb_shinfo(skb)->nr_frags &&
2190				!(features & NETIF_F_SG)));
2191}
2192
/*
 * dev_hard_start_xmit - hand an skb, or its list of GSO segments, to the driver
 * @skb: buffer to transmit; if skb->next is already set on entry, the
 *	 segments chained there are transmitted instead of @skb itself
 * @dev: device whose ndo_start_xmit() is called
 * @txq: transmit queue; txq_trans_update() is called on it after each
 *	 successful transmit and it is checked with netif_xmit_stopped()
 *	 between segments
 *
 * Returns the value of the last ndo_start_xmit() call.  Paths that drop the
 * skb locally before any driver call (VLAN tag insertion, segmentation,
 * linearization or checksum failure) return rc at its initial value,
 * NETDEV_TX_OK.  NETDEV_TX_BUSY is returned when the queue is stopped while
 * segments are still pending.
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
			struct netdev_queue *txq)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int rc = NETDEV_TX_OK;
	unsigned int skb_len;

	if (likely(!skb->next)) {
		netdev_features_t features;

		/*
		 * If device doesn't need skb->dst, release it right now while
		 * its hot in this cpu cache
		 */
		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
			skb_dst_drop(skb);

		/* Pass the skb to dev_queue_xmit_nit() if ptype_all is non-empty. */
		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(skb, dev);

		skb_orphan_try(skb);

		features = netif_skb_features(skb);

		/* Insert the VLAN tag in software if the features lack HW_VLAN_TX. */
		if (vlan_tx_tag_present(skb) &&
		    !(features & NETIF_F_HW_VLAN_TX)) {
			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
			if (unlikely(!skb))
				goto out;

			skb->vlan_tci = 0;
		}

		if (netif_needs_gso(skb, features)) {
			if (unlikely(dev_gso_segment(skb, features)))
				goto out_kfree_skb;
			/* Segments were produced: send them one by one below. */
			if (skb->next)
				goto gso;
		} else {
			if (skb_needs_linearize(skb, features) &&
			    __skb_linearize(skb))
				goto out_kfree_skb;

			/* If packet is not checksummed and device does not
			 * support checksumming for this protocol, complete
			 * checksumming here.
			 */
			if (skb->ip_summed == CHECKSUM_PARTIAL) {
				skb_set_transport_header(skb,
					skb_checksum_start_offset(skb));
				if (!(features & NETIF_F_ALL_CSUM) &&
				     skb_checksum_help(skb))
					goto out_kfree_skb;
			}
		}

		/* Sample the length before the driver call for the tracepoint. */
		skb_len = skb->len;
		rc = ops->ndo_start_xmit(skb, dev);
		trace_net_dev_xmit(skb, rc, dev, skb_len);
		if (rc == NETDEV_TX_OK)
			txq_trans_update(txq);
		return rc;
	}

gso:
	do {
		/* Pop the first pending segment off the list headed by skb. */
		struct sk_buff *nskb = skb->next;

		skb->next = nskb->next;
		nskb->next = NULL;

		/*
		 * If device doesn't need nskb->dst, release it right now while
		 * its hot in this cpu cache
		 */
		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
			skb_dst_drop(nskb);

		skb_len = nskb->len;
		rc = ops->ndo_start_xmit(nskb, dev);
		trace_net_dev_xmit(nskb, rc, dev, skb_len);
		if (unlikely(rc != NETDEV_TX_OK)) {
			/* Codes outside NETDEV_TX_MASK: give up and free the rest. */
			if (rc & ~NETDEV_TX_MASK)
				goto out_kfree_gso_skb;
			/* Otherwise put the segment back at the head and report rc. */
			nskb->next = skb->next;
			skb->next = nskb;
			return rc;
		}
		txq_trans_update(txq);
		if (unlikely(netif_xmit_stopped(txq) && skb->next))
			return NETDEV_TX_BUSY;
	} while (skb->next);

out_kfree_gso_skb:
	/*
	 * No segments left: restore the destructor saved by dev_gso_segment()
	 * so that kfree_skb() below does not run dev_gso_skb_destructor() on
	 * an empty list.  If segments remain, that destructor frees them.
	 */
	if (likely(skb->next == NULL))
		skb->destructor = DEV_GSO_CB(skb)->destructor;
out_kfree_skb:
	kfree_skb(skb);
out:
	return rc;
}
2294
2295static u32 hashrnd __read_mostly;
2296
2297/*
2298 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2299 * to be used as a distribution range.
2300 */
2301u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2302		  unsigned int num_tx_queues)
2303{
2304	u32 hash;
2305	u16 qoffset = 0;
2306	u16 qcount = num_tx_queues;
2307
2308	if (skb_rx_queue_recorded(skb)) {
2309		hash = skb_get_rx_queue(skb);
2310		while (unlikely(hash >= num_tx_queues))
2311			hash -= num_tx_queues;
2312		return hash;
2313	}
2314
2315	if (dev->num_tc) {
2316		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2317		qoffset = dev->tc_to_txq[tc].offset;
2318		qcount = dev->tc_to_txq[tc].count;
2319	}
2320
2321	if (skb->sk && skb->sk->sk_hash)
2322		hash = skb->sk->sk_hash;
2323	else
2324		hash = (__force u16) skb->protocol ^ skb->rxhash;
2325	hash = jhash_1word(hash, hashrnd);
2326
2327	return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2328}
2329EXPORT_SYMBOL(__skb_tx_hash);
2330
2331static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2332{
2333	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2334		if (net_ratelimit()) {
2335			pr_warning("%s selects TX queue %d, but "
2336				"real number of TX queues is %d\n",
2337				dev->name, queue_index, dev->real_num_tx_queues);
2338		}
2339		return 0;
2340	}
2341	return queue_index;
2342}
2343
/*
 * Look up a Tx queue for @skb in the device's XPS map of the current CPU.
 *
 * Returns the queue index, or -1 when XPS is not compiled in, no map is
 * configured for this CPU, or the mapped queue is outside
 * dev->real_num_tx_queues.  With more than one queue in the map, the
 * choice is made by hashing (socket hash if set, else protocol ^ rxhash).
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *dev_maps;
	struct xps_map *map = NULL;
	int queue_index = -1;

	rcu_read_lock();

	dev_maps = rcu_dereference(dev->xps_maps);
	if (dev_maps)
		map = rcu_dereference(
		    dev_maps->cpu_map[raw_smp_processor_id()]);

	if (map) {
		if (map->len != 1) {
			u32 hash;

			if (!skb->sk || !skb->sk->sk_hash)
				hash = (__force u16) skb->protocol ^
				    skb->rxhash;
			else
				hash = skb->sk->sk_hash;
			hash = jhash_1word(hash, hashrnd);

			/* Scale the hash onto [0, map->len). */
			queue_index = map->queues[
			    ((u64)hash * map->len) >> 32];
		} else {
			queue_index = map->queues[0];
		}

		if (unlikely(queue_index >= dev->real_num_tx_queues))
			queue_index = -1;
	}

	rcu_read_unlock();

	return queue_index;
#else
	return -1;
#endif
}
2381
2382static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2383					struct sk_buff *skb)
2384{
2385	int queue_index;
2386	const struct net_device_ops *ops = dev->netdev_ops;
2387
2388	if (dev->real_num_tx_queues == 1)
2389		queue_index = 0;
2390	else if (ops->ndo_select_queue) {
2391		queue_index = ops->ndo_select_queue(dev, skb);
2392		queue_index = dev_cap_txqueue(dev, queue_index);
2393	} else {
2394		struct sock *sk = skb->sk;
2395		queue_index = sk_tx_queue_get(sk);
2396
2397		if (queue_index < 0 || skb->ooo_okay ||
2398		    queue_index >= dev->real_num_tx_queues) {
2399			int old_index = queue_index;
2400
2401			queue_index = get_xps_queue(dev, skb);
2402			if (queue_index < 0)
2403				queue_index = skb_tx_hash(dev, skb);
2404
2405			if (queue_index != old_index && sk) {
2406				struct dst_entry *dst =
2407				    rcu_dereference_check(sk->sk_dst_cache, 1);
2408
2409				if (dst && skb_dst(skb) == dst)
2410					sk_tx_queue_set(sk, queue_index);
2411			}
2412		}
2413	}
2414
2415	skb_set_queue_mapping(skb, queue_index);
2416	return netdev_get_tx_queue(dev, queue_index);
2417}
2418
/*
 * __dev_xmit_skb - send @skb through queueing discipline @q
 * @skb: buffer to send
 * @q: qdisc attached to @txq
 * @dev: device the skb is sent on
 * @txq: transmit queue selected for the skb
 *
 * Under the qdisc root lock, either transmits the skb directly (bypass
 * path) or enqueues it and runs the qdisc.
 *
 * Returns NET_XMIT_DROP if the qdisc is deactivated (the skb is freed),
 * NET_XMIT_SUCCESS on the bypass path, otherwise the enqueue result masked
 * with NET_XMIT_MASK.
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	bool contended;
	int rc;

	qdisc_skb_cb(skb)->pkt_len = skb->len;
	qdisc_calculate_pkt_len(skb, q);
	/*
	 * Heuristic to force contended enqueues to serialize on a
	 * separate lock before trying to get qdisc main lock.
	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
	 * and dequeue packets faster.
	 */
	contended = qdisc_is_running(q);
	if (unlikely(contended))
		spin_lock(&q->busylock);

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		kfree_skb(skb);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   qdisc_run_begin(q)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */
		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
			skb_dst_force(skb);

		qdisc_bstats_update(q, skb);

		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
			/* Drop busylock before the (possibly long) qdisc run. */
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		} else
			qdisc_run_end(q);

		rc = NET_XMIT_SUCCESS;
	} else {
		skb_dst_force(skb);
		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
		if (qdisc_run_begin(q)) {
			/* Drop busylock before the (possibly long) qdisc run. */
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		}
	}
	spin_unlock(root_lock);
	/* Still holding busylock only if no branch above released it. */
	if (unlikely(contended))
		spin_unlock(&q->busylock);
	return rc;
}
2481
2482#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2483static void skb_update_prio(struct sk_buff *skb)
2484{
2485	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2486
2487	if ((!skb->priority) && (skb->sk) && map)
2488		skb->priority = map->priomap[skb->sk->sk_cgrp_prioidx];
2489}
2490#else
2491#define skb_update_prio(skb)
2492#endif
2493
/*
 * Per-CPU nesting depth of dev_hard_start_xmit() calls made from
 * dev_queue_xmit() on queue-less devices.  Transmission is refused once
 * the depth exceeds RECURSION_LIMIT.
 */
static DEFINE_PER_CPU(int, xmit_recursion);
#define RECURSION_LIMIT 10

/**
 *	dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 */
int dev_queue_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	skb_update_prio(skb);

	txq = dev_pick_tx(dev, skb);
	q = rcu_dereference_bh(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
	trace_net_dev_queue(skb);
	/* A qdisc with an enqueue method: go through the queueing layer. */
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. This is the common case for software
	   devices: loopback, all sorts of tunnels...

	   It is unlikely that netif_tx_lock protection is really necessary
	   here (e.g. loopback and IP tunnels are clean, ignoring statistics
	   counters).
	   However, such devices may rely on the protection provided by us
	   here.

	   So check for recursion and take the lock. This cannot deadlock.
	   Alternatively the noqueue qdisc could be removed, which would be
	   even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		/* This CPU already owning the tx lock means we recursed. */
		if (txq->xmit_lock_owner != cpu) {

			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
				goto recursion_alert;

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_xmit_stopped(txq)) {
				__this_cpu_inc(xmit_recursion);
				rc = dev_hard_start_xmit(skb, dev, txq);
				__this_cpu_dec(xmit_recursion);
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			if (net_ratelimit())
				printk(KERN_CRIT "Virtual device %s asks to "
				       "queue packet!\n", dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
recursion_alert:
			if (net_ratelimit())
				printk(KERN_CRIT "Dead loop on virtual device "
				       "%s, fix it urgently!\n", dev->name);
		}
	}

	/* Every path reaching this point drops the skb with -ENETDOWN. */
	rc = -ENETDOWN;
	rcu_read_unlock_bh();

	kfree_skb(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
EXPORT_SYMBOL(dev_queue_xmit);
2604
2605
2606/*=======================================================================
2607			Receiver routines
2608  =======================================================================*/
2609
/* enqueue_to_backlog() drops packets once a backlog queue is longer than this. */
int netdev_max_backlog __read_mostly = 1000;
/*
 * Passed to net_timestamp_check(): as-is in netif_rx(), negated in
 * __netif_receive_skb().
 */
int netdev_tstamp_prequeue __read_mostly = 1;
/* NOTE(review): not used in this part of the file; presumably the softirq rx budget - confirm. */
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */
2614
/*
 * Append @napi to the poll list of @sd and raise NET_RX_SOFTIRQ.
 * Called with irq disabled.
 */
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
2622
2623/*
2624 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2625 * and src/dst port numbers.  Sets rxhash in skb to non-zero hash value
2626 * on success, zero indicates no valid hash.  Also, sets l4_rxhash in skb
2627 * if hash is a canonical 4-tuple hash over transport ports.
2628 */
2629void __skb_get_rxhash(struct sk_buff *skb)
2630{
2631	struct flow_keys keys;
2632	u32 hash;
2633
2634	if (!skb_flow_dissect(skb, &keys))
2635		return;
2636
2637	if (keys.ports) {
2638		if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0])
2639			swap(keys.port16[0], keys.port16[1]);
2640		skb->l4_rxhash = 1;
2641	}
2642
2643	/* get a consistent hash (same value on both flow directions) */
2644	if ((__force u32)keys.dst < (__force u32)keys.src)
2645		swap(keys.dst, keys.src);
2646
2647	hash = jhash_3words((__force u32)keys.dst,
2648			    (__force u32)keys.src,
2649			    (__force u32)keys.ports, hashrnd);
2650	if (!hash)
2651		hash = 1;
2652
2653	skb->rxhash = hash;
2654}
2655EXPORT_SYMBOL(__skb_get_rxhash);
2656
2657#ifdef CONFIG_RPS
2658
/* One global table that all flow-based protocols share. */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);

/* Jump label tested with static_branch() in netif_rx() to gate the RPS path. */
struct jump_label_key rps_needed __read_mostly;
2664
/*
 * set_rps_cpu - record @next_cpu as the CPU handling a flow
 * @dev: receiving device
 * @skb: packet belonging to the flow
 * @rflow: flow table entry currently describing the flow
 * @next_cpu: new CPU for the flow, or RPS_NO_CPU
 *
 * When @next_cpu is a real CPU, last_qtail of the entry is set to that
 * CPU's current input_queue_head.  With CONFIG_RFS_ACCEL, the driver may
 * first be asked (ndo_rx_flow_steer) to steer the flow to the rx queue
 * that the device's cpu_rmap associates with @next_cpu; if that succeeds,
 * the entry in that queue's flow table takes over from @rflow.
 *
 * Returns the flow entry that now describes the flow, with ->cpu set to
 * @next_cpu.
 */
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
	    struct rps_dev_flow *rflow, u16 next_cpu)
{
	if (next_cpu != RPS_NO_CPU) {
#ifdef CONFIG_RFS_ACCEL
		struct netdev_rx_queue *rxqueue;
		struct rps_dev_flow_table *flow_table;
		struct rps_dev_flow *old_rflow;
		u32 flow_id;
		u16 rxq_index;
		int rc;

		/* Should we steer this flow to a different hardware queue? */
		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
		    !(dev->features & NETIF_F_NTUPLE))
			goto out;
		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
		/* Already arriving on the queue mapped to next_cpu. */
		if (rxq_index == skb_get_rx_queue(skb))
			goto out;

		rxqueue = dev->_rx + rxq_index;
		flow_table = rcu_dereference(rxqueue->rps_flow_table);
		if (!flow_table)
			goto out;
		flow_id = skb->rxhash & flow_table->mask;
		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
							rxq_index, flow_id);
		if (rc < 0)
			goto out;
		/* Switch to the entry in the target queue's table. */
		old_rflow = rflow;
		rflow = &flow_table->flows[flow_id];
		rflow->filter = rc;
		/* The old entry no longer owns this filter id. */
		if (old_rflow->filter == rflow->filter)
			old_rflow->filter = RPS_NO_FILTER;
	out:
#endif
		rflow->last_qtail =
			per_cpu(softnet_data, next_cpu).input_queue_head;
	}

	rflow->cpu = next_cpu;
	return rflow;
}
2709
2710/*
2711 * get_rps_cpu is called from netif_receive_skb and returns the target
2712 * CPU from the RPS map of the receiving queue for a given skb.
2713 * rcu_read_lock must be held on entry.
2714 */
2715static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2716		       struct rps_dev_flow **rflowp)
2717{
2718	struct netdev_rx_queue *rxqueue;
2719	struct rps_map *map;
2720	struct rps_dev_flow_table *flow_table;
2721	struct rps_sock_flow_table *sock_flow_table;
2722	int cpu = -1;
2723	u16 tcpu;
2724
2725	if (skb_rx_queue_recorded(skb)) {
2726		u16 index = skb_get_rx_queue(skb);
2727		if (unlikely(index >= dev->real_num_rx_queues)) {
2728			WARN_ONCE(dev->real_num_rx_queues > 1,
2729				  "%s received packet on queue %u, but number "
2730				  "of RX queues is %u\n",
2731				  dev->name, index, dev->real_num_rx_queues);
2732			goto done;
2733		}
2734		rxqueue = dev->_rx + index;
2735	} else
2736		rxqueue = dev->_rx;
2737
2738	map = rcu_dereference(rxqueue->rps_map);
2739	if (map) {
2740		if (map->len == 1 &&
2741		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
2742			tcpu = map->cpus[0];
2743			if (cpu_online(tcpu))
2744				cpu = tcpu;
2745			goto done;
2746		}
2747	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2748		goto done;
2749	}
2750
2751	skb_reset_network_header(skb);
2752	if (!skb_get_rxhash(skb))
2753		goto done;
2754
2755	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2756	sock_flow_table = rcu_dereference(rps_sock_flow_table);
2757	if (flow_table && sock_flow_table) {
2758		u16 next_cpu;
2759		struct rps_dev_flow *rflow;
2760
2761		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2762		tcpu = rflow->cpu;
2763
2764		next_cpu = sock_flow_table->ents[skb->rxhash &
2765		    sock_flow_table->mask];
2766
2767		/*
2768		 * If the desired CPU (where last recvmsg was done) is
2769		 * different from current CPU (one in the rx-queue flow
2770		 * table entry), switch if one of the following holds:
2771		 *   - Current CPU is unset (equal to RPS_NO_CPU).
2772		 *   - Current CPU is offline.
2773		 *   - The current CPU's queue tail has advanced beyond the
2774		 *     last packet that was enqueued using this table entry.
2775		 *     This guarantees that all previous packets for the flow
2776		 *     have been dequeued, thus preserving in order delivery.
2777		 */
2778		if (unlikely(tcpu != next_cpu) &&
2779		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2780		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2781		      rflow->last_qtail)) >= 0))
2782			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2783
2784		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2785			*rflowp = rflow;
2786			cpu = tcpu;
2787			goto done;
2788		}
2789	}
2790
2791	if (map) {
2792		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2793
2794		if (cpu_online(tcpu)) {
2795			cpu = tcpu;
2796			goto done;
2797		}
2798	}
2799
2800done:
2801	return cpu;
2802}
2803
2804#ifdef CONFIG_RFS_ACCEL
2805
2806/**
2807 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2808 * @dev: Device on which the filter was set
2809 * @rxq_index: RX queue index
2810 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2811 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2812 *
2813 * Drivers that implement ndo_rx_flow_steer() should periodically call
2814 * this function for each installed filter and remove the filters for
2815 * which it returns %true.
2816 */
2817bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2818			 u32 flow_id, u16 filter_id)
2819{
2820	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2821	struct rps_dev_flow_table *flow_table;
2822	struct rps_dev_flow *rflow;
2823	bool expire = true;
2824	int cpu;
2825
2826	rcu_read_lock();
2827	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2828	if (flow_table && flow_id <= flow_table->mask) {
2829		rflow = &flow_table->flows[flow_id];
2830		cpu = ACCESS_ONCE(rflow->cpu);
2831		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2832		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2833			   rflow->last_qtail) <
2834		     (int)(10 * flow_table->mask)))
2835			expire = false;
2836	}
2837	rcu_read_unlock();
2838	return expire;
2839}
2840EXPORT_SYMBOL(rps_may_expire_flow);
2841
2842#endif /* CONFIG_RFS_ACCEL */
2843
/*
 * Schedule the backlog NAPI of the softnet_data passed in @data and count
 * the event in its received_rps counter.
 * Called from hardirq (IPI) context.
 */
static void rps_trigger_softirq(void *data)
{
	struct softnet_data *sd = data;

	____napi_schedule(sd, &sd->backlog);
	sd->received_rps++;
}
2852
2853#endif /* CONFIG_RPS */
2854
/*
 * Check whether this softnet_data structure belongs to another CPU.
 * If so, link it at the head of the local CPU's IPI list, raise
 * NET_RX_SOFTIRQ locally and return 1.
 * If it is the local one (or RPS is not configured), return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local_sd = &__get_cpu_var(softnet_data);

	if (sd == local_sd)
		return 0;

	sd->rps_ipi_next = local_sd->rps_ipi_list;
	local_sd->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
2875
2876/*
2877 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2878 * queue (may be a remote CPU queue).
2879 */
2880static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2881			      unsigned int *qtail)
2882{
2883	struct softnet_data *sd;
2884	unsigned long flags;
2885
2886	sd = &per_cpu(softnet_data, cpu);
2887
2888	local_irq_save(flags);
2889
2890	rps_lock(sd);
2891	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2892		if (skb_queue_len(&sd->input_pkt_queue)) {
2893enqueue:
2894			__skb_queue_tail(&sd->input_pkt_queue, skb);
2895			input_queue_tail_incr_save(sd, qtail);
2896			rps_unlock(sd);
2897			local_irq_restore(flags);
2898			return NET_RX_SUCCESS;
2899		}
2900
2901		/* Schedule NAPI for backlog device
2902		 * We can use non atomic operation since we own the queue lock
2903		 */
2904		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2905			if (!rps_ipi_queued(sd))
2906				____napi_schedule(sd, &sd->backlog);
2907		}
2908		goto enqueue;
2909	}
2910
2911	sd->dropped++;
2912	rps_unlock(sd);
2913
2914	local_irq_restore(flags);
2915
2916	atomic_long_inc(&skb->dev->rx_dropped);
2917	kfree_skb(skb);
2918	return NET_RX_DROP;
2919}
2920
/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	This function receives a packet from a device driver and queues it for
 *	the upper (protocol) levels to process.  It always succeeds. The buffer
 *	may be dropped during processing for congestion control or by the
 *	protocol layers.
 *
 *	The packet is queued on a per-CPU backlog: the CPU chosen by
 *	get_rps_cpu() when RPS is in use, the current CPU otherwise.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
	int ret;

	/* if netpoll wants it, pretend we never saw it */
	if (netpoll_rx(skb))
		return NET_RX_DROP;

	net_timestamp_check(netdev_tstamp_prequeue, skb);

	trace_netif_rx(skb);
#ifdef CONFIG_RPS
	if (static_branch(&rps_needed))	{
		/*
		 * voidflow is a scratch entry so that &rflow->last_qtail is
		 * always a valid target, even if get_rps_cpu() does not set
		 * rflow.
		 */
		struct rps_dev_flow voidflow, *rflow = &voidflow;
		int cpu;

		preempt_disable();
		rcu_read_lock();

		cpu = get_rps_cpu(skb->dev, skb, &rflow);
		/* No steering target: queue on the current CPU. */
		if (cpu < 0)
			cpu = smp_processor_id();

		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);

		rcu_read_unlock();
		preempt_enable();
	} else
#endif
	{
		/* qtail is required by enqueue_to_backlog() but unused here. */
		unsigned int qtail;
		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
		put_cpu();
	}
	return ret;
}
EXPORT_SYMBOL(netif_rx);
2973
/*
 * netif_rx_ni - netif_rx() wrapper that also runs pending softirqs
 * @skb: buffer to post
 *
 * With preemption disabled, posts @skb via netif_rx() and then calls
 * do_softirq() if any softirq is pending on this CPU.
 *
 * Returns the netif_rx() result.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	preempt_disable();

	ret = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
2987
2988static void net_tx_action(struct softirq_action *h)
2989{
2990	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2991
2992	if (sd->completion_queue) {
2993		struct sk_buff *clist;
2994
2995		local_irq_disable();
2996		clist = sd->completion_queue;
2997		sd->completion_queue = NULL;
2998		local_irq_enable();
2999
3000		while (clist) {
3001			struct sk_buff *skb = clist;
3002			clist = clist->next;
3003
3004			WARN_ON(atomic_read(&skb->users));
3005			trace_kfree_skb(skb, net_tx_action);
3006			__kfree_skb(skb);
3007		}
3008	}
3009
3010	if (sd->output_queue) {
3011		struct Qdisc *head;
3012
3013		local_irq_disable();
3014		head = sd->output_queue;
3015		sd->output_queue = NULL;
3016		sd->output_queue_tailp = &sd->output_queue;
3017		local_irq_enable();
3018
3019		while (head) {
3020			struct Qdisc *q = head;
3021			spinlock_t *root_lock;
3022
3023			head = head->next_sched;
3024
3025			root_lock = qdisc_lock(q);
3026			if (spin_trylock(root_lock)) {
3027				smp_mb__before_clear_bit();
3028				clear_bit(__QDISC_STATE_SCHED,
3029					  &q->state);
3030				qdisc_run(q);
3031				spin_unlock(root_lock);
3032			} else {
3033				if (!test_bit(__QDISC_STATE_DEACTIVATED,
3034					      &q->state)) {
3035					__netif_reschedule(q);
3036				} else {
3037					smp_mb__before_clear_bit();
3038					clear_bit(__QDISC_STATE_SCHED,
3039						  &q->state);
3040				}
3041			}
3042		}
3043	}
3044}
3045
#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/*
 * This hook is defined here for ATM LANE.  It is only built when both
 * bridging and ATM LANE are configured (built-in or modular).
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
3053
3054#ifdef CONFIG_NET_CLS_ACT
/* TODO: Maybe we should just force sch_ingress to be compiled in
 * whenever CONFIG_NET_CLS_ACT is.  Otherwise we execute some useless
 * instructions (a compare and two extra stores right now) when
 * CONFIG_NET_CLS_ACT is set but the ingress scheduler is not.
 * NOTE: This doesn't stop any functionality; if you don't have
 * the ingress scheduler, you just can't add policies on ingress.
 */
3063static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3064{
3065	struct net_device *dev = skb->dev;
3066	u32 ttl = G_TC_RTTL(skb->tc_verd);
3067	int result = TC_ACT_OK;
3068	struct Qdisc *q;
3069
3070	if (unlikely(MAX_RED_LOOP < ttl++)) {
3071		if (net_ratelimit())
3072			pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
3073			       skb->skb_iif, dev->ifindex);
3074		return TC_ACT_SHOT;
3075	}
3076
3077	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3078	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3079
3080	q = rxq->qdisc;
3081	if (q != &noop_qdisc) {
3082		spin_lock(qdisc_lock(q));
3083		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3084			result = qdisc_enqueue_root(skb, q);
3085		spin_unlock(qdisc_lock(q));
3086	}
3087
3088	return result;
3089}
3090
3091static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3092					 struct packet_type **pt_prev,
3093					 int *ret, struct net_device *orig_dev)
3094{
3095	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3096
3097	if (!rxq || rxq->qdisc == &noop_qdisc)
3098		goto out;
3099
3100	if (*pt_prev) {
3101		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3102		*pt_prev = NULL;
3103	}
3104
3105	switch (ing_filter(skb, rxq)) {
3106	case TC_ACT_SHOT:
3107	case TC_ACT_STOLEN:
3108		kfree_skb(skb);
3109		return NULL;
3110	}
3111
3112out:
3113	skb->tc_verd = 0;
3114	return skb;
3115}
3116#endif
3117
/**
 *	netdev_rx_handler_register - register receive handler
 *	@dev: device to register a handler for
 *	@rx_handler: receive handler to register
 *	@rx_handler_data: data pointer that is used by rx handler
 *
 *	Register a receive handler for a device. This handler will then be
 *	called from __netif_receive_skb. A negative errno code is returned
 *	on a failure: -EBUSY if the device already has a handler.
 *
 *	The caller must hold the rtnl_mutex.
 *
 *	For a general description of rx_handler, see enum rx_handler_result.
 */
int netdev_rx_handler_register(struct net_device *dev,
			       rx_handler_func_t *rx_handler,
			       void *rx_handler_data)
{
	ASSERT_RTNL();

	if (dev->rx_handler)
		return -EBUSY;

	/* Publish the data pointer before the handler that will use it. */
	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
	rcu_assign_pointer(dev->rx_handler, rx_handler);

	return 0;
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3147
/**
 *	netdev_rx_handler_unregister - unregister receive handler
 *	@dev: device to unregister a handler from
 *
 *	Unregister a receive handler from a device: both the handler and
 *	its data pointer are reset to NULL.
 *
 *	The caller must hold the rtnl_mutex.
 */
void netdev_rx_handler_unregister(struct net_device *dev)
{

	ASSERT_RTNL();
	RCU_INIT_POINTER(dev->rx_handler, NULL);
	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3164
3165static int __netif_receive_skb(struct sk_buff *skb)
3166{
3167	struct packet_type *ptype, *pt_prev;
3168	rx_handler_func_t *rx_handler;
3169	struct net_device *orig_dev;
3170	struct net_device *null_or_dev;
3171	bool deliver_exact = false;
3172	int ret = NET_RX_DROP;
3173	__be16 type;
3174
3175	net_timestamp_check(!netdev_tstamp_prequeue, skb);
3176
3177	trace_netif_receive_skb(skb);
3178
3179	/* if we've gotten here through NAPI, check netpoll */
3180	if (netpoll_receive_skb(skb))
3181		return NET_RX_DROP;
3182
3183	if (!skb->skb_iif)
3184		skb->skb_iif = skb->dev->ifindex;
3185	orig_dev = skb->dev;
3186
3187	skb_reset_network_header(skb);
3188	skb_reset_transport_header(skb);
3189	skb_reset_mac_len(skb);
3190
3191	pt_prev = NULL;
3192
3193	rcu_read_lock();
3194
3195another_round:
3196
3197	__this_cpu_inc(softnet_data.processed);
3198
3199	if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3200		skb = vlan_untag(skb);
3201		if (unlikely(!skb))
3202			goto out;
3203	}
3204
3205#ifdef CONFIG_NET_CLS_ACT
3206	if (skb->tc_verd & TC_NCLS) {
3207		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3208		goto ncls;
3209	}
3210#endif
3211
3212	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3213		if (!ptype->dev || ptype->dev == skb->dev) {
3214			if (pt_prev)
3215				ret = deliver_skb(skb, pt_prev, orig_dev);
3216			pt_prev = ptype;
3217		}
3218	}
3219
3220#ifdef CONFIG_NET_CLS_ACT
3221	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3222	if (!skb)
3223		goto out;
3224ncls:
3225#endif
3226
3227	rx_handler = rcu_dereference(skb->dev->rx_handler);
3228	if (vlan_tx_tag_present(skb)) {
3229		if (pt_prev) {
3230			ret = deliver_skb(skb, pt_prev, orig_dev);
3231			pt_prev = NULL;
3232		}
3233		if (vlan_do_receive(&skb, !rx_handler))
3234			goto another_round;
3235		else if (unlikely(!skb))
3236			goto out;
3237	}
3238
3239	if (rx_handler) {
3240		if (pt_prev) {
3241			ret = deliver_skb(skb, pt_prev, orig_dev);
3242			pt_prev = NULL;
3243		}
3244		switch (rx_handler(&skb)) {
3245		case RX_HANDLER_CONSUMED:
3246			goto out;
3247		case RX_HANDLER_ANOTHER:
3248			goto another_round;
3249		case RX_HANDLER_EXACT:
3250			deliver_exact = true;
3251		case RX_HANDLER_PASS:
3252			break;
3253		default:
3254			BUG();
3255		}
3256	}
3257
3258	/* deliver only exact match when indicated */
3259	null_or_dev = deliver_exact ? skb->dev : NULL;
3260
3261	type = skb->protocol;
3262	list_for_each_entry_rcu(ptype,
3263			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3264		if (ptype->type == type &&
3265		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3266		     ptype->dev == orig_dev)) {
3267			if (pt_prev)
3268				ret = deliver_skb(skb, pt_prev, orig_dev);
3269			pt_prev = ptype;
3270		}
3271	}
3272
3273	if (pt_prev) {
3274		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3275	} else {
3276		atomic_long_inc(&skb->dev->rx_dropped);
3277		kfree_skb(skb);
3278		/* Jamal, now you will not able to escape explaining
3279		 * me how you were going to use this. :-)
3280		 */
3281		ret = NET_RX_DROP;
3282	}
3283
3284out:
3285	rcu_read_unlock();
3286	return ret;
3287}
3288
3289/**
3290 *	netif_receive_skb - process receive buffer from network
3291 *	@skb: buffer to process
3292 *
3293 *	netif_receive_skb() is the main receive data processing function.
3294 *	It always succeeds. The buffer may be dropped during processing
3295 *	for congestion control or by the protocol layers.
3296 *
3297 *	This function may only be called from softirq context and interrupts
3298 *	should be enabled.
3299 *
3300 *	Return values (usually ignored):
3301 *	NET_RX_SUCCESS: no congestion
3302 *	NET_RX_DROP: packet was dropped
3303 */
3304int netif_receive_skb(struct sk_buff *skb)
3305{
3306	net_timestamp_check(netdev_tstamp_prequeue, skb);
3307
3308	if (skb_defer_rx_timestamp(skb))
3309		return NET_RX_SUCCESS;
3310
3311#ifdef CONFIG_RPS
3312	if (static_branch(&rps_needed)) {
3313		struct rps_dev_flow voidflow, *rflow = &voidflow;
3314		int cpu, ret;
3315
3316		rcu_read_lock();
3317
3318		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3319
3320		if (cpu >= 0) {
3321			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3322			rcu_read_unlock();
3323			return ret;
3324		}
3325		rcu_read_unlock();
3326	}
3327#endif
3328	return __netif_receive_skb(skb);
3329}
3330EXPORT_SYMBOL(netif_receive_skb);
3331
3332/* Network device is going away, flush any packets still pending
3333 * Called with irqs disabled.
3334 */
3335static void flush_backlog(void *arg)
3336{
3337	struct net_device *dev = arg;
3338	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3339	struct sk_buff *skb, *tmp;
3340
3341	rps_lock(sd);
3342	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3343		if (skb->dev == dev) {
3344			__skb_unlink(skb, &sd->input_pkt_queue);
3345			kfree_skb(skb);
3346			input_queue_head_incr(sd);
3347		}
3348	}
3349	rps_unlock(sd);
3350
3351	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3352		if (skb->dev == dev) {
3353			__skb_unlink(skb, &sd->process_queue);
3354			kfree_skb(skb);
3355			input_queue_head_incr(sd);
3356		}
3357	}
3358}
3359
3360static int napi_gro_complete(struct sk_buff *skb)
3361{
3362	struct packet_type *ptype;
3363	__be16 type = skb->protocol;
3364	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3365	int err = -ENOENT;
3366
3367	if (NAPI_GRO_CB(skb)->count == 1) {
3368		skb_shinfo(skb)->gso_size = 0;
3369		goto out;
3370	}
3371
3372	rcu_read_lock();
3373	list_for_each_entry_rcu(ptype, head, list) {
3374		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3375			continue;
3376
3377		err = ptype->gro_complete(skb);
3378		break;
3379	}
3380	rcu_read_unlock();
3381
3382	if (err) {
3383		WARN_ON(&ptype->list == head);
3384		kfree_skb(skb);
3385		return NET_RX_SUCCESS;
3386	}
3387
3388out:
3389	return netif_receive_skb(skb);
3390}
3391
3392inline void napi_gro_flush(struct napi_struct *napi)
3393{
3394	struct sk_buff *skb, *next;
3395
3396	for (skb = napi->gro_list; skb; skb = next) {
3397		next = skb->next;
3398		skb->next = NULL;
3399		napi_gro_complete(skb);
3400	}
3401
3402	napi->gro_count = 0;
3403	napi->gro_list = NULL;
3404}
3405EXPORT_SYMBOL(napi_gro_flush);
3406
3407enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3408{
3409	struct sk_buff **pp = NULL;
3410	struct packet_type *ptype;
3411	__be16 type = skb->protocol;
3412	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3413	int same_flow;
3414	int mac_len;
3415	enum gro_result ret;
3416
3417	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3418		goto normal;
3419
3420	if (skb_is_gso(skb) || skb_has_frag_list(skb))
3421		goto normal;
3422
3423	rcu_read_lock();
3424	list_for_each_entry_rcu(ptype, head, list) {
3425		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3426			continue;
3427
3428		skb_set_network_header(skb, skb_gro_offset(skb));
3429		mac_len = skb->network_header - skb->mac_header;
3430		skb->mac_len = mac_len;
3431		NAPI_GRO_CB(skb)->same_flow = 0;
3432		NAPI_GRO_CB(skb)->flush = 0;
3433		NAPI_GRO_CB(skb)->free = 0;
3434
3435		pp = ptype->gro_receive(&napi->gro_list, skb);
3436		break;
3437	}
3438	rcu_read_unlock();
3439
3440	if (&ptype->list == head)
3441		goto normal;
3442
3443	same_flow = NAPI_GRO_CB(skb)->same_flow;
3444	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3445
3446	if (pp) {
3447		struct sk_buff *nskb = *pp;
3448
3449		*pp = nskb->next;
3450		nskb->next = NULL;
3451		napi_gro_complete(nskb);
3452		napi->gro_count--;
3453	}
3454
3455	if (same_flow)
3456		goto ok;
3457
3458	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3459		goto normal;
3460
3461	napi->gro_count++;
3462	NAPI_GRO_CB(skb)->count = 1;
3463	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3464	skb->next = napi->gro_list;
3465	napi->gro_list = skb;
3466	ret = GRO_HELD;
3467
3468pull:
3469	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3470		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3471
3472		BUG_ON(skb->end - skb->tail < grow);
3473
3474		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3475
3476		skb->tail += grow;
3477		skb->data_len -= grow;
3478
3479		skb_shinfo(skb)->frags[0].page_offset += grow;
3480		skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3481
3482		if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3483			skb_frag_unref(skb, 0);
3484			memmove(skb_shinfo(skb)->frags,
3485				skb_shinfo(skb)->frags + 1,
3486				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3487		}
3488	}
3489
3490ok:
3491	return ret;
3492
3493normal:
3494	ret = GRO_NORMAL;
3495	goto pull;
3496}
3497EXPORT_SYMBOL(dev_gro_receive);
3498
3499static inline gro_result_t
3500__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3501{
3502	struct sk_buff *p;
3503
3504	for (p = napi->gro_list; p; p = p->next) {
3505		unsigned long diffs;
3506
3507		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3508		diffs |= p->vlan_tci ^ skb->vlan_tci;
3509		diffs |= compare_ether_header(skb_mac_header(p),
3510					      skb_gro_mac_header(skb));
3511		NAPI_GRO_CB(p)->same_flow = !diffs;
3512		NAPI_GRO_CB(p)->flush = 0;
3513	}
3514
3515	return dev_gro_receive(napi, skb);
3516}
3517
3518gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3519{
3520	switch (ret) {
3521	case GRO_NORMAL:
3522		if (netif_receive_skb(skb))
3523			ret = GRO_DROP;
3524		break;
3525
3526	case GRO_DROP:
3527	case GRO_MERGED_FREE:
3528		kfree_skb(skb);
3529		break;
3530
3531	case GRO_HELD:
3532	case GRO_MERGED:
3533		break;
3534	}
3535
3536	return ret;
3537}
3538EXPORT_SYMBOL(napi_skb_finish);
3539
3540void skb_gro_reset_offset(struct sk_buff *skb)
3541{
3542	NAPI_GRO_CB(skb)->data_offset = 0;
3543	NAPI_GRO_CB(skb)->frag0 = NULL;
3544	NAPI_GRO_CB(skb)->frag0_len = 0;
3545
3546	if (skb->mac_header == skb->tail &&
3547	    !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
3548		NAPI_GRO_CB(skb)->frag0 =
3549			skb_frag_address(&skb_shinfo(skb)->frags[0]);
3550		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]);
3551	}
3552}
3553EXPORT_SYMBOL(skb_gro_reset_offset);
3554
3555gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3556{
3557	skb_gro_reset_offset(skb);
3558
3559	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3560}
3561EXPORT_SYMBOL(napi_gro_receive);
3562
3563static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3564{
3565	__skb_pull(skb, skb_headlen(skb));
3566	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3567	skb->vlan_tci = 0;
3568	skb->dev = napi->dev;
3569	skb->skb_iif = 0;
3570
3571	napi->skb = skb;
3572}
3573
3574struct sk_buff *napi_get_frags(struct napi_struct *napi)
3575{
3576	struct sk_buff *skb = napi->skb;
3577
3578	if (!skb) {
3579		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3580		if (skb)
3581			napi->skb = skb;
3582	}
3583	return skb;
3584}
3585EXPORT_SYMBOL(napi_get_frags);
3586
3587gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3588			       gro_result_t ret)
3589{
3590	switch (ret) {
3591	case GRO_NORMAL:
3592	case GRO_HELD:
3593		skb->protocol = eth_type_trans(skb, skb->dev);
3594
3595		if (ret == GRO_HELD)
3596			skb_gro_pull(skb, -ETH_HLEN);
3597		else if (netif_receive_skb(skb))
3598			ret = GRO_DROP;
3599		break;
3600
3601	case GRO_DROP:
3602	case GRO_MERGED_FREE:
3603		napi_reuse_skb(napi, skb);
3604		break;
3605
3606	case GRO_MERGED:
3607		break;
3608	}
3609
3610	return ret;
3611}
3612EXPORT_SYMBOL(napi_frags_finish);
3613
3614struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3615{
3616	struct sk_buff *skb = napi->skb;
3617	struct ethhdr *eth;
3618	unsigned int hlen;
3619	unsigned int off;
3620
3621	napi->skb = NULL;
3622
3623	skb_reset_mac_header(skb);
3624	skb_gro_reset_offset(skb);
3625
3626	off = skb_gro_offset(skb);
3627	hlen = off + sizeof(*eth);
3628	eth = skb_gro_header_fast(skb, off);
3629	if (skb_gro_header_hard(skb, hlen)) {
3630		eth = skb_gro_header_slow(skb, hlen, off);
3631		if (unlikely(!eth)) {
3632			napi_reuse_skb(napi, skb);
3633			skb = NULL;
3634			goto out;
3635		}
3636	}
3637
3638	skb_gro_pull(skb, sizeof(*eth));
3639
3640	/*
3641	 * This works because the only protocols we care about don't require
3642	 * special handling.  We'll fix it up properly at the end.
3643	 */
3644	skb->protocol = eth->h_proto;
3645
3646out:
3647	return skb;
3648}
3649EXPORT_SYMBOL(napi_frags_skb);
3650
3651gro_result_t napi_gro_frags(struct napi_struct *napi)
3652{
3653	struct sk_buff *skb = napi_frags_skb(napi);
3654
3655	if (!skb)
3656		return GRO_DROP;
3657
3658	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3659}
3660EXPORT_SYMBOL(napi_gro_frags);
3661
/*
 * Send any IPIs that are pending for RPS.
 * Note: entered with local irqs disabled, returns with local irqs enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *pending = sd->rps_ipi_list;

	if (pending) {
		/* Detach the whole list while irqs are still disabled. */
		sd->rps_ipi_list = NULL;

		local_irq_enable();

		/* Kick RPS processing on each online remote cpu. */
		while (pending) {
			struct softnet_data *target = pending;

			pending = target->rps_ipi_next;

			if (cpu_online(target->cpu))
				__smp_call_function_single(target->cpu,
							   &target->csd, 0);
		}
		return;
	}
#endif
	local_irq_enable();
}
3689
3690static int process_backlog(struct napi_struct *napi, int quota)
3691{
3692	int work = 0;
3693	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3694
3695#ifdef CONFIG_RPS
3696	/* Check if we have pending ipi, its better to send them now,
3697	 * not waiting net_rx_action() end.
3698	 */
3699	if (sd->rps_ipi_list) {
3700		local_irq_disable();
3701		net_rps_action_and_irq_enable(sd);
3702	}
3703#endif
3704	napi->weight = weight_p;
3705	local_irq_disable();
3706	while (work < quota) {
3707		struct sk_buff *skb;
3708		unsigned int qlen;
3709
3710		while ((skb = __skb_dequeue(&sd->process_queue))) {
3711			local_irq_enable();
3712			__netif_receive_skb(skb);
3713			local_irq_disable();
3714			input_queue_head_incr(sd);
3715			if (++work >= quota) {
3716				local_irq_enable();
3717				return work;
3718			}
3719		}
3720
3721		rps_lock(sd);
3722		qlen = skb_queue_len(&sd->input_pkt_queue);
3723		if (qlen)
3724			skb_queue_splice_tail_init(&sd->input_pkt_queue,
3725						   &sd->process_queue);
3726
3727		if (qlen < quota - work) {
3728			/*
3729			 * Inline a custom version of __napi_complete().
3730			 * only current cpu owns and manipulates this napi,
3731			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3732			 * we can use a plain write instead of clear_bit(),
3733			 * and we dont need an smp_mb() memory barrier.
3734			 */
3735			list_del(&napi->poll_list);
3736			napi->state = 0;
3737
3738			quota = work + qlen;
3739		}
3740		rps_unlock(sd);
3741	}
3742	local_irq_enable();
3743
3744	return work;
3745}
3746
3747/**
3748 * __napi_schedule - schedule for receive
3749 * @n: entry to schedule
3750 *
3751 * The entry's receive function will be scheduled to run
3752 */
3753void __napi_schedule(struct napi_struct *n)
3754{
3755	unsigned long flags;
3756
3757	local_irq_save(flags);
3758	____napi_schedule(&__get_cpu_var(softnet_data), n);
3759	local_irq_restore(flags);
3760}
3761EXPORT_SYMBOL(__napi_schedule);
3762
3763void __napi_complete(struct napi_struct *n)
3764{
3765	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3766	BUG_ON(n->gro_list);
3767
3768	list_del(&n->poll_list);
3769	smp_mb__before_clear_bit();
3770	clear_bit(NAPI_STATE_SCHED, &n->state);
3771}
3772EXPORT_SYMBOL(__napi_complete);
3773
3774void napi_complete(struct napi_struct *n)
3775{
3776	unsigned long flags;
3777
3778	/*
3779	 * don't let napi dequeue from the cpu poll list
3780	 * just in case its running on a different cpu
3781	 */
3782	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3783		return;
3784
3785	napi_gro_flush(n);
3786	local_irq_save(flags);
3787	__napi_complete(n);
3788	local_irq_restore(flags);
3789}
3790EXPORT_SYMBOL(napi_complete);
3791
3792void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3793		    int (*poll)(struct napi_struct *, int), int weight)
3794{
3795	INIT_LIST_HEAD(&napi->poll_list);
3796	napi->gro_count = 0;
3797	napi->gro_list = NULL;
3798	napi->skb = NULL;
3799	napi->poll = poll;
3800	napi->weight = weight;
3801	list_add(&napi->dev_list, &dev->napi_list);
3802	napi->dev = dev;
3803#ifdef CONFIG_NETPOLL
3804	spin_lock_init(&napi->poll_lock);
3805	napi->poll_owner = -1;
3806#endif
3807	set_bit(NAPI_STATE_SCHED, &napi->state);
3808}
3809EXPORT_SYMBOL(netif_napi_add);
3810
3811void netif_napi_del(struct napi_struct *napi)
3812{
3813	struct sk_buff *skb, *next;
3814
3815	list_del_init(&napi->dev_list);
3816	napi_free_frags(napi);
3817
3818	for (skb = napi->gro_list; skb; skb = next) {
3819		next = skb->next;
3820		skb->next = NULL;
3821		kfree_skb(skb);
3822	}
3823
3824	napi->gro_list = NULL;
3825	napi->gro_count = 0;
3826}
3827EXPORT_SYMBOL(netif_napi_del);
3828
3829static void net_rx_action(struct softirq_action *h)
3830{
3831	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3832	unsigned long time_limit = jiffies + 2;
3833	int budget = netdev_budget;
3834	void *have;
3835
3836	local_irq_disable();
3837
3838	while (!list_empty(&sd->poll_list)) {
3839		struct napi_struct *n;
3840		int work, weight;
3841
3842		/* If softirq window is exhuasted then punt.
3843		 * Allow this to run for 2 jiffies since which will allow
3844		 * an average latency of 1.5/HZ.
3845		 */
3846		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3847			goto softnet_break;
3848
3849		local_irq_enable();
3850
3851		/* Even though interrupts have been re-enabled, this
3852		 * access is safe because interrupts can only add new
3853		 * entries to the tail of this list, and only ->poll()
3854		 * calls can remove this head entry from the list.
3855		 */
3856		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3857
3858		have = netpoll_poll_lock(n);
3859
3860		weight = n->weight;
3861
3862		/* This NAPI_STATE_SCHED test is for avoiding a race
3863		 * with netpoll's poll_napi().  Only the entity which
3864		 * obtains the lock and sees NAPI_STATE_SCHED set will
3865		 * actually make the ->poll() call.  Therefore we avoid
3866		 * accidentally calling ->poll() when NAPI is not scheduled.
3867		 */
3868		work = 0;
3869		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3870			work = n->poll(n, weight);
3871			trace_napi_poll(n);
3872		}
3873
3874		WARN_ON_ONCE(work > weight);
3875
3876		budget -= work;
3877
3878		local_irq_disable();
3879
3880		/* Drivers must not modify the NAPI state if they
3881		 * consume the entire weight.  In such cases this code
3882		 * still "owns" the NAPI instance and therefore can
3883		 * move the instance around on the list at-will.
3884		 */
3885		if (unlikely(work == weight)) {
3886			if (unlikely(napi_disable_pending(n))) {
3887				local_irq_enable();
3888				napi_complete(n);
3889				local_irq_disable();
3890			} else
3891				list_move_tail(&n->poll_list, &sd->poll_list);
3892		}
3893
3894		netpoll_poll_unlock(have);
3895	}
3896out:
3897	net_rps_action_and_irq_enable(sd);
3898
3899#ifdef CONFIG_NET_DMA
3900	/*
3901	 * There may not be any more sk_buffs coming right now, so push
3902	 * any pending DMA copies to hardware
3903	 */
3904	dma_issue_pending_all();
3905#endif
3906
3907	return;
3908
3909softnet_break:
3910	sd->time_squeeze++;
3911	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3912	goto out;
3913}
3914
3915static gifconf_func_t *gifconf_list[NPROTO];
3916
3917/**
3918 *	register_gifconf	-	register a SIOCGIF handler
3919 *	@family: Address family
3920 *	@gifconf: Function handler
3921 *
3922 *	Register protocol dependent address dumping routines. The handler
3923 *	that is passed must not be freed or reused until it has been replaced
3924 *	by another handler.
3925 */
3926int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3927{
3928	if (family >= NPROTO)
3929		return -EINVAL;
3930	gifconf_list[family] = gifconf;
3931	return 0;
3932}
3933EXPORT_SYMBOL(register_gifconf);
3934
3935
3936/*
3937 *	Map an interface index to its name (SIOCGIFNAME)
3938 */
3939
3940/*
3941 *	We need this ioctl for efficient implementation of the
3942 *	if_indextoname() function required by the IPv6 API.  Without
3943 *	it, we would have to search all the interfaces to find a
3944 *	match.  --pb
3945 */
3946
3947static int dev_ifname(struct net *net, struct ifreq __user *arg)
3948{
3949	struct net_device *dev;
3950	struct ifreq ifr;
3951
3952	/*
3953	 *	Fetch the caller's info block.
3954	 */
3955
3956	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3957		return -EFAULT;
3958
3959	rcu_read_lock();
3960	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3961	if (!dev) {
3962		rcu_read_unlock();
3963		return -ENODEV;
3964	}
3965
3966	strcpy(ifr.ifr_name, dev->name);
3967	rcu_read_unlock();
3968
3969	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3970		return -EFAULT;
3971	return 0;
3972}
3973
3974/*
3975 *	Perform a SIOCGIFCONF call. This structure will change
3976 *	size eventually, and there is nothing I can do about it.
3977 *	Thus we will need a 'compatibility mode'.
3978 */
3979
3980static int dev_ifconf(struct net *net, char __user *arg)
3981{
3982	struct ifconf ifc;
3983	struct net_device *dev;
3984	char __user *pos;
3985	int len;
3986	int total;
3987	int i;
3988
3989	/*
3990	 *	Fetch the caller's info block.
3991	 */
3992
3993	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3994		return -EFAULT;
3995
3996	pos = ifc.ifc_buf;
3997	len = ifc.ifc_len;
3998
3999	/*
4000	 *	Loop over the interfaces, and write an info block for each.
4001	 */
4002
4003	total = 0;
4004	for_each_netdev(net, dev) {
4005		for (i = 0; i < NPROTO; i++) {
4006			if (gifconf_list[i]) {
4007				int done;
4008				if (!pos)
4009					done = gifconf_list[i](dev, NULL, 0);
4010				else
4011					done = gifconf_list[i](dev, pos + total,
4012							       len - total);
4013				if (done < 0)
4014					return -EFAULT;
4015				total += done;
4016			}
4017		}
4018	}
4019
4020	/*
4021	 *	All done.  Write the updated control block back to the caller.
4022	 */
4023	ifc.ifc_len = total;
4024
4025	/*
4026	 * 	Both BSD and Solaris return 0 here, so we do too.
4027	 */
4028	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4029}
4030
4031#ifdef CONFIG_PROC_FS
4032
4033#define BUCKET_SPACE (32 - NETDEV_HASHBITS)
4034
4035struct dev_iter_state {
4036	struct seq_net_private p;
4037	unsigned int pos; /* bucket << BUCKET_SPACE + offset */
4038};
4039
4040#define get_bucket(x) ((x) >> BUCKET_SPACE)
4041#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4042#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4043
4044static inline struct net_device *dev_from_same_bucket(struct seq_file *seq)
4045{
4046	struct dev_iter_state *state = seq->private;
4047	struct net *net = seq_file_net(seq);
4048	struct net_device *dev;
4049	struct hlist_node *p;
4050	struct hlist_head *h;
4051	unsigned int count, bucket, offset;
4052
4053	bucket = get_bucket(state->pos);
4054	offset = get_offset(state->pos);
4055	h = &net->dev_name_head[bucket];
4056	count = 0;
4057	hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4058		if (count++ == offset) {
4059			state->pos = set_bucket_offset(bucket, count);
4060			return dev;
4061		}
4062	}
4063
4064	return NULL;
4065}
4066
4067static inline struct net_device *dev_from_new_bucket(struct seq_file *seq)
4068{
4069	struct dev_iter_state *state = seq->private;
4070	struct net_device *dev;
4071	unsigned int bucket;
4072
4073	bucket = get_bucket(state->pos);
4074	do {
4075		dev = dev_from_same_bucket(seq);
4076		if (dev)
4077			return dev;
4078
4079		bucket++;
4080		state->pos = set_bucket_offset(bucket, 0);
4081	} while (bucket < NETDEV_HASHENTRIES);
4082
4083	return NULL;
4084}
4085
4086/*
4087 *	This is invoked by the /proc filesystem handler to display a device
4088 *	in detail.
4089 */
4090void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4091	__acquires(RCU)
4092{
4093	struct dev_iter_state *state = seq->private;
4094
4095	rcu_read_lock();
4096	if (!*pos)
4097		return SEQ_START_TOKEN;
4098
4099	/* check for end of the hash */
4100	if (state->pos == 0 && *pos > 1)
4101		return NULL;
4102
4103	return dev_from_new_bucket(seq);
4104}
4105
4106void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4107{
4108	struct net_device *dev;
4109
4110	++*pos;
4111
4112	if (v == SEQ_START_TOKEN)
4113		return dev_from_new_bucket(seq);
4114
4115	dev = dev_from_same_bucket(seq);
4116	if (dev)
4117		return dev;
4118
4119	return dev_from_new_bucket(seq);
4120}
4121
4122void dev_seq_stop(struct seq_file *seq, void *v)
4123	__releases(RCU)
4124{
4125	rcu_read_unlock();
4126}
4127
4128static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4129{
4130	struct rtnl_link_stats64 temp;
4131	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4132
4133	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4134		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4135		   dev->name, stats->rx_bytes, stats->rx_packets,
4136		   stats->rx_errors,
4137		   stats->rx_dropped + stats->rx_missed_errors,
4138		   stats->rx_fifo_errors,
4139		   stats->rx_length_errors + stats->rx_over_errors +
4140		    stats->rx_crc_errors + stats->rx_frame_errors,
4141		   stats->rx_compressed, stats->multicast,
4142		   stats->tx_bytes, stats->tx_packets,
4143		   stats->tx_errors, stats->tx_dropped,
4144		   stats->tx_fifo_errors, stats->collisions,
4145		   stats->tx_carrier_errors +
4146		    stats->tx_aborted_errors +
4147		    stats->tx_window_errors +
4148		    stats->tx_heartbeat_errors,
4149		   stats->tx_compressed);
4150}
4151
4152/*
4153 *	Called from the PROCfs module. This now uses the new arbitrary sized
4154 *	/proc/net interface to create /proc/net/dev
4155 */
4156static int dev_seq_show(struct seq_file *seq, void *v)
4157{
4158	if (v == SEQ_START_TOKEN)
4159		seq_puts(seq, "Inter-|   Receive                            "
4160			      "                    |  Transmit\n"
4161			      " face |bytes    packets errs drop fifo frame "
4162			      "compressed multicast|bytes    packets errs "
4163			      "drop fifo colls carrier compressed\n");
4164	else
4165		dev_seq_printf_stats(seq, v);
4166	return 0;
4167}
4168
4169static struct softnet_data *softnet_get_online(loff_t *pos)
4170{
4171	struct softnet_data *sd = NULL;
4172
4173	while (*pos < nr_cpu_ids)
4174		if (cpu_online(*pos)) {
4175			sd = &per_cpu(softnet_data, *pos);
4176			break;
4177		} else
4178			++*pos;
4179	return sd;
4180}
4181
4182static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4183{
4184	return softnet_get_online(pos);
4185}
4186
4187static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4188{
4189	++*pos;
4190	return softnet_get_online(pos);
4191}
4192
/* seq_file ->stop: nothing was acquired in ->start, so nothing to undo. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
4196
4197static int softnet_seq_show(struct seq_file *seq, void *v)
4198{
4199	struct softnet_data *sd = v;
4200
4201	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4202		   sd->processed, sd->dropped, sd->time_squeeze, 0,
4203		   0, 0, 0, 0, /* was fastroute */
4204		   sd->cpu_collision, sd->received_rps);
4205	return 0;
4206}
4207
4208static const struct seq_operations dev_seq_ops = {
4209	.start = dev_seq_start,
4210	.next  = dev_seq_next,
4211	.stop  = dev_seq_stop,
4212	.show  = dev_seq_show,
4213};
4214
4215static int dev_seq_open(struct inode *inode, struct file *file)
4216{
4217	return seq_open_net(inode, file, &dev_seq_ops,
4218			    sizeof(struct dev_iter_state));
4219}
4220
4221int dev_seq_open_ops(struct inode *inode, struct file *file,
4222		     const struct seq_operations *ops)
4223{
4224	return seq_open_net(inode, file, ops, sizeof(struct dev_iter_state));
4225}
4226
4227static const struct file_operations dev_seq_fops = {
4228	.owner	 = THIS_MODULE,
4229	.open    = dev_seq_open,
4230	.read    = seq_read,
4231	.llseek  = seq_lseek,
4232	.release = seq_release_net,
4233};
4234
4235static const struct seq_operations softnet_seq_ops = {
4236	.start = softnet_seq_start,
4237	.next  = softnet_seq_next,
4238	.stop  = softnet_seq_stop,
4239	.show  = softnet_seq_show,
4240};
4241
4242static int softnet_seq_open(struct inode *inode, struct file *file)
4243{
4244	return seq_open(file, &softnet_seq_ops);
4245}
4246
4247static const struct file_operations softnet_seq_fops = {
4248	.owner	 = THIS_MODULE,
4249	.open    = softnet_seq_open,
4250	.read    = seq_read,
4251	.llseek  = seq_lseek,
4252	.release = seq_release,
4253};
4254
4255static void *ptype_get_idx(loff_t pos)
4256{
4257	struct packet_type *pt = NULL;
4258	loff_t i = 0;
4259	int t;
4260
4261	list_for_each_entry_rcu(pt, &ptype_all, list) {
4262		if (i == pos)
4263			return pt;
4264		++i;
4265	}
4266
4267	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4268		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4269			if (i == pos)
4270				return pt;
4271			++i;
4272		}
4273	}
4274	return NULL;
4275}
4276
4277static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4278	__acquires(RCU)
4279{
4280	rcu_read_lock();
4281	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4282}
4283
4284static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4285{
4286	struct packet_type *pt;
4287	struct list_head *nxt;
4288	int hash;
4289
4290	++*pos;
4291	if (v == SEQ_START_TOKEN)
4292		return ptype_get_idx(0);
4293
4294	pt = v;
4295	nxt = pt->list.next;
4296	if (pt->type == htons(ETH_P_ALL)) {
4297		if (nxt != &ptype_all)
4298			goto found;
4299		hash = 0;
4300		nxt = ptype_base[0].next;
4301	} else
4302		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4303
4304	while (nxt == &ptype_base[hash]) {
4305		if (++hash >= PTYPE_HASH_SIZE)
4306			return NULL;
4307		nxt = ptype_base[hash].next;
4308	}
4309found:
4310	return list_entry(nxt, struct packet_type, list);
4311}
4312
4313static void ptype_seq_stop(struct seq_file *seq, void *v)
4314	__releases(RCU)
4315{
4316	rcu_read_unlock();
4317}
4318
4319static int ptype_seq_show(struct seq_file *seq, void *v)
4320{
4321	struct packet_type *pt = v;
4322
4323	if (v == SEQ_START_TOKEN)
4324		seq_puts(seq, "Type Device      Function\n");
4325	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4326		if (pt->type == htons(ETH_P_ALL))
4327			seq_puts(seq, "ALL ");
4328		else
4329			seq_printf(seq, "%04x", ntohs(pt->type));
4330
4331		seq_printf(seq, " %-8s %pF\n",
4332			   pt->dev ? pt->dev->name : "", pt->func);
4333	}
4334
4335	return 0;
4336}
4337
4338static const struct seq_operations ptype_seq_ops = {
4339	.start = ptype_seq_start,
4340	.next  = ptype_seq_next,
4341	.stop  = ptype_seq_stop,
4342	.show  = ptype_seq_show,
4343};
4344
4345static int ptype_seq_open(struct inode *inode, struct file *file)
4346{
4347	return seq_open_net(inode, file, &ptype_seq_ops,
4348			sizeof(struct seq_net_private));
4349}
4350
4351static const struct file_operations ptype_seq_fops = {
4352	.owner	 = THIS_MODULE,
4353	.open    = ptype_seq_open,
4354	.read    = seq_read,
4355	.llseek  = seq_lseek,
4356	.release = seq_release_net,
4357};
4358
4359
4360static int __net_init dev_proc_net_init(struct net *net)
4361{
4362	int rc = -ENOMEM;
4363
4364	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4365		goto out;
4366	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4367		goto out_dev;
4368	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4369		goto out_softnet;
4370
4371	if (wext_proc_init(net))
4372		goto out_ptype;
4373	rc = 0;
4374out:
4375	return rc;
4376out_ptype:
4377	proc_net_remove(net, "ptype");
4378out_softnet:
4379	proc_net_remove(net, "softnet_stat");
4380out_dev:
4381	proc_net_remove(net, "dev");
4382	goto out;
4383}
4384
4385static void __net_exit dev_proc_net_exit(struct net *net)
4386{
4387	wext_proc_exit(net);
4388
4389	proc_net_remove(net, "ptype");
4390	proc_net_remove(net, "softnet_stat");
4391	proc_net_remove(net, "dev");
4392}
4393
4394static struct pernet_operations __net_initdata dev_proc_ops = {
4395	.init = dev_proc_net_init,
4396	.exit = dev_proc_net_exit,
4397};
4398
4399static int __init dev_proc_init(void)
4400{
4401	return register_pernet_subsys(&dev_proc_ops);
4402}
4403#else
4404#define dev_proc_init() 0
4405#endif	/* CONFIG_PROC_FS */
4406
4407
4408/**
4409 *	netdev_set_master	-	set up master pointer
4410 *	@slave: slave device
4411 *	@master: new master device
4412 *
4413 *	Changes the master device of the slave. Pass %NULL to break the
4414 *	bonding. The caller must hold the RTNL semaphore. On a failure
4415 *	a negative errno code is returned. On success the reference counts
4416 *	are adjusted and the function returns zero.
4417 */
4418int netdev_set_master(struct net_device *slave, struct net_device *master)
4419{
4420	struct net_device *old = slave->master;
4421
4422	ASSERT_RTNL();
4423
4424	if (master) {
4425		if (old)
4426			return -EBUSY;
4427		dev_hold(master);
4428	}
4429
4430	slave->master = master;
4431
4432	if (old)
4433		dev_put(old);
4434	return 0;
4435}
4436EXPORT_SYMBOL(netdev_set_master);
4437
4438/**
4439 *	netdev_set_bond_master	-	set up bonding master/slave pair
4440 *	@slave: slave device
4441 *	@master: new master device
4442 *
4443 *	Changes the master device of the slave. Pass %NULL to break the
4444 *	bonding. The caller must hold the RTNL semaphore. On a failure
4445 *	a negative errno code is returned. On success %RTM_NEWLINK is sent
4446 *	to the routing socket and the function returns zero.
4447 */
4448int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4449{
4450	int err;
4451
4452	ASSERT_RTNL();
4453
4454	err = netdev_set_master(slave, master);
4455	if (err)
4456		return err;
4457	if (master)
4458		slave->flags |= IFF_SLAVE;
4459	else
4460		slave->flags &= ~IFF_SLAVE;
4461
4462	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4463	return 0;
4464}
4465EXPORT_SYMBOL(netdev_set_bond_master);
4466
4467static void dev_change_rx_flags(struct net_device *dev, int flags)
4468{
4469	const struct net_device_ops *ops = dev->netdev_ops;
4470
4471	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4472		ops->ndo_change_rx_flags(dev, flags);
4473}
4474
4475static int __dev_set_promiscuity(struct net_device *dev, int inc)
4476{
4477	unsigned int old_flags = dev->flags;
4478	uid_t uid;
4479	gid_t gid;
4480
4481	ASSERT_RTNL();
4482
4483	dev->flags |= IFF_PROMISC;
4484	dev->promiscuity += inc;
4485	if (dev->promiscuity == 0) {
4486		/*
4487		 * Avoid overflow.
4488		 * If inc causes overflow, untouch promisc and return error.
4489		 */
4490		if (inc < 0)
4491			dev->flags &= ~IFF_PROMISC;
4492		else {
4493			dev->promiscuity -= inc;
4494			printk(KERN_WARNING "%s: promiscuity touches roof, "
4495				"set promiscuity failed, promiscuity feature "
4496				"of device might be broken.\n", dev->name);
4497			return -EOVERFLOW;
4498		}
4499	}
4500	if (dev->flags != old_flags) {
4501		printk(KERN_INFO "device %s %s promiscuous mode\n",
4502		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4503							       "left");
4504		if (audit_enabled) {
4505			current_uid_gid(&uid, &gid);
4506			audit_log(current->audit_context, GFP_ATOMIC,
4507				AUDIT_ANOM_PROMISCUOUS,
4508				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4509				dev->name, (dev->flags & IFF_PROMISC),
4510				(old_flags & IFF_PROMISC),
4511				audit_get_loginuid(current),
4512				uid, gid,
4513				audit_get_sessionid(current));
4514		}
4515
4516		dev_change_rx_flags(dev, IFF_PROMISC);
4517	}
4518	return 0;
4519}
4520
4521/**
4522 *	dev_set_promiscuity	- update promiscuity count on a device
4523 *	@dev: device
4524 *	@inc: modifier
4525 *
4526 *	Add or remove promiscuity from a device. While the count in the device
4527 *	remains above zero the interface remains promiscuous. Once it hits zero
4528 *	the device reverts back to normal filtering operation. A negative inc
4529 *	value is used to drop promiscuity on the device.
4530 *	Return 0 if successful or a negative errno code on error.
4531 */
4532int dev_set_promiscuity(struct net_device *dev, int inc)
4533{
4534	unsigned int old_flags = dev->flags;
4535	int err;
4536
4537	err = __dev_set_promiscuity(dev, inc);
4538	if (err < 0)
4539		return err;
4540	if (dev->flags != old_flags)
4541		dev_set_rx_mode(dev);
4542	return err;
4543}
4544EXPORT_SYMBOL(dev_set_promiscuity);
4545
4546/**
4547 *	dev_set_allmulti	- update allmulti count on a device
4548 *	@dev: device
4549 *	@inc: modifier
4550 *
4551 *	Add or remove reception of all multicast frames to a device. While the
4552 *	count in the device remains above zero the interface remains listening
4553 *	to all interfaces. Once it hits zero the device reverts back to normal
4554 *	filtering operation. A negative @inc value is used to drop the counter
4555 *	when releasing a resource needing all multicasts.
4556 *	Return 0 if successful or a negative errno code on error.
4557 */
4558
4559int dev_set_allmulti(struct net_device *dev, int inc)
4560{
4561	unsigned int old_flags = dev->flags;
4562
4563	ASSERT_RTNL();
4564
4565	dev->flags |= IFF_ALLMULTI;
4566	dev->allmulti += inc;
4567	if (dev->allmulti == 0) {
4568		/*
4569		 * Avoid overflow.
4570		 * If inc causes overflow, untouch allmulti and return error.
4571		 */
4572		if (inc < 0)
4573			dev->flags &= ~IFF_ALLMULTI;
4574		else {
4575			dev->allmulti -= inc;
4576			printk(KERN_WARNING "%s: allmulti touches roof, "
4577				"set allmulti failed, allmulti feature of "
4578				"device might be broken.\n", dev->name);
4579			return -EOVERFLOW;
4580		}
4581	}
4582	if (dev->flags ^ old_flags) {
4583		dev_change_rx_flags(dev, IFF_ALLMULTI);
4584		dev_set_rx_mode(dev);
4585	}
4586	return 0;
4587}
4588EXPORT_SYMBOL(dev_set_allmulti);
4589
4590/*
4591 *	Upload unicast and multicast address lists to device and
4592 *	configure RX filtering. When the device doesn't support unicast
4593 *	filtering it is put in promiscuous mode while unicast addresses
4594 *	are present.
4595 */
4596void __dev_set_rx_mode(struct net_device *dev)
4597{
4598	const struct net_device_ops *ops = dev->netdev_ops;
4599
4600	/* dev_open will call this function so the list will stay sane. */
4601	if (!(dev->flags&IFF_UP))
4602		return;
4603
4604	if (!netif_device_present(dev))
4605		return;
4606
4607	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4608		/* Unicast addresses changes may only happen under the rtnl,
4609		 * therefore calling __dev_set_promiscuity here is safe.
4610		 */
4611		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4612			__dev_set_promiscuity(dev, 1);
4613			dev->uc_promisc = true;
4614		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4615			__dev_set_promiscuity(dev, -1);
4616			dev->uc_promisc = false;
4617		}
4618	}
4619
4620	if (ops->ndo_set_rx_mode)
4621		ops->ndo_set_rx_mode(dev);
4622}
4623
/*
 *	Run __dev_set_rx_mode() bracketed by netif_addr_lock_bh() and
 *	netif_addr_unlock_bh().
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
4630
4631/**
4632 *	dev_get_flags - get flags reported to userspace
4633 *	@dev: device
4634 *
4635 *	Get the combination of flag bits exported through APIs to userspace.
4636 */
4637unsigned dev_get_flags(const struct net_device *dev)
4638{
4639	unsigned flags;
4640
4641	flags = (dev->flags & ~(IFF_PROMISC |
4642				IFF_ALLMULTI |
4643				IFF_RUNNING |
4644				IFF_LOWER_UP |
4645				IFF_DORMANT)) |
4646		(dev->gflags & (IFF_PROMISC |
4647				IFF_ALLMULTI));
4648
4649	if (netif_running(dev)) {
4650		if (netif_oper_up(dev))
4651			flags |= IFF_RUNNING;
4652		if (netif_carrier_ok(dev))
4653			flags |= IFF_LOWER_UP;
4654		if (netif_dormant(dev))
4655			flags |= IFF_DORMANT;
4656	}
4657
4658	return flags;
4659}
4660EXPORT_SYMBOL(dev_get_flags);
4661
4662int __dev_change_flags(struct net_device *dev, unsigned int flags)
4663{
4664	unsigned int old_flags = dev->flags;
4665	int ret;
4666
4667	ASSERT_RTNL();
4668
4669	/*
4670	 *	Set the flags on our device.
4671	 */
4672
4673	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4674			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4675			       IFF_AUTOMEDIA)) |
4676		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4677				    IFF_ALLMULTI));
4678
4679	/*
4680	 *	Load in the correct multicast list now the flags have changed.
4681	 */
4682
4683	if ((old_flags ^ flags) & IFF_MULTICAST)
4684		dev_change_rx_flags(dev, IFF_MULTICAST);
4685
4686	dev_set_rx_mode(dev);
4687
4688	/*
4689	 *	Have we downed the interface. We handle IFF_UP ourselves
4690	 *	according to user attempts to set it, rather than blindly
4691	 *	setting it.
4692	 */
4693
4694	ret = 0;
4695	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4696		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4697
4698		if (!ret)
4699			dev_set_rx_mode(dev);
4700	}
4701
4702	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4703		int inc = (flags & IFF_PROMISC) ? 1 : -1;
4704
4705		dev->gflags ^= IFF_PROMISC;
4706		dev_set_promiscuity(dev, inc);
4707	}
4708
4709	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4710	   is important. Some (broken) drivers set IFF_PROMISC, when
4711	   IFF_ALLMULTI is requested not asking us and not reporting.
4712	 */
4713	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4714		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4715
4716		dev->gflags ^= IFF_ALLMULTI;
4717		dev_set_allmulti(dev, inc);
4718	}
4719
4720	return ret;
4721}
4722
4723void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4724{
4725	unsigned int changes = dev->flags ^ old_flags;
4726
4727	if (changes & IFF_UP) {
4728		if (dev->flags & IFF_UP)
4729			call_netdevice_notifiers(NETDEV_UP, dev);
4730		else
4731			call_netdevice_notifiers(NETDEV_DOWN, dev);
4732	}
4733
4734	if (dev->flags & IFF_UP &&
4735	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4736		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4737}
4738
4739/**
4740 *	dev_change_flags - change device settings
4741 *	@dev: device
4742 *	@flags: device state flags
4743 *
4744 *	Change settings on device based state flags. The flags are
4745 *	in the userspace exported format.
4746 */
4747int dev_change_flags(struct net_device *dev, unsigned int flags)
4748{
4749	int ret;
4750	unsigned int changes, old_flags = dev->flags;
4751
4752	ret = __dev_change_flags(dev, flags);
4753	if (ret < 0)
4754		return ret;
4755
4756	changes = old_flags ^ dev->flags;
4757	if (changes)
4758		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4759
4760	__dev_notify_flags(dev, old_flags);
4761	return ret;
4762}
4763EXPORT_SYMBOL(dev_change_flags);
4764
4765/**
4766 *	dev_set_mtu - Change maximum transfer unit
4767 *	@dev: device
4768 *	@new_mtu: new transfer unit
4769 *
4770 *	Change the maximum transfer size of the network device.
4771 */
4772int dev_set_mtu(struct net_device *dev, int new_mtu)
4773{
4774	const struct net_device_ops *ops = dev->netdev_ops;
4775	int err;
4776
4777	if (new_mtu == dev->mtu)
4778		return 0;
4779
4780	/*	MTU must be positive.	 */
4781	if (new_mtu < 0)
4782		return -EINVAL;
4783
4784	if (!netif_device_present(dev))
4785		return -ENODEV;
4786
4787	err = 0;
4788	if (ops->ndo_change_mtu)
4789		err = ops->ndo_change_mtu(dev, new_mtu);
4790	else
4791		dev->mtu = new_mtu;
4792
4793	if (!err && dev->flags & IFF_UP)
4794		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4795	return err;
4796}
4797EXPORT_SYMBOL(dev_set_mtu);
4798
4799/**
4800 *	dev_set_group - Change group this device belongs to
4801 *	@dev: device
4802 *	@new_group: group this device should belong to
4803 */
4804void dev_set_group(struct net_device *dev, int new_group)
4805{
4806	dev->group = new_group;
4807}
4808EXPORT_SYMBOL(dev_set_group);
4809
4810/**
4811 *	dev_set_mac_address - Change Media Access Control Address
4812 *	@dev: device
4813 *	@sa: new address
4814 *
4815 *	Change the hardware (MAC) address of the device
4816 */
4817int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4818{
4819	const struct net_device_ops *ops = dev->netdev_ops;
4820	int err;
4821
4822	if (!ops->ndo_set_mac_address)
4823		return -EOPNOTSUPP;
4824	if (sa->sa_family != dev->type)
4825		return -EINVAL;
4826	if (!netif_device_present(dev))
4827		return -ENODEV;
4828	err = ops->ndo_set_mac_address(dev, sa);
4829	if (!err)
4830		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4831	return err;
4832}
4833EXPORT_SYMBOL(dev_set_mac_address);
4834
4835/*
4836 *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4837 */
4838static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4839{
4840	int err;
4841	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4842
4843	if (!dev)
4844		return -ENODEV;
4845
4846	switch (cmd) {
4847	case SIOCGIFFLAGS:	/* Get interface flags */
4848		ifr->ifr_flags = (short) dev_get_flags(dev);
4849		return 0;
4850
4851	case SIOCGIFMETRIC:	/* Get the metric on the interface
4852				   (currently unused) */
4853		ifr->ifr_metric = 0;
4854		return 0;
4855
4856	case SIOCGIFMTU:	/* Get the MTU of a device */
4857		ifr->ifr_mtu = dev->mtu;
4858		return 0;
4859
4860	case SIOCGIFHWADDR:
4861		if (!dev->addr_len)
4862			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4863		else
4864			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4865			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4866		ifr->ifr_hwaddr.sa_family = dev->type;
4867		return 0;
4868
4869	case SIOCGIFSLAVE:
4870		err = -EINVAL;
4871		break;
4872
4873	case SIOCGIFMAP:
4874		ifr->ifr_map.mem_start = dev->mem_start;
4875		ifr->ifr_map.mem_end   = dev->mem_end;
4876		ifr->ifr_map.base_addr = dev->base_addr;
4877		ifr->ifr_map.irq       = dev->irq;
4878		ifr->ifr_map.dma       = dev->dma;
4879		ifr->ifr_map.port      = dev->if_port;
4880		return 0;
4881
4882	case SIOCGIFINDEX:
4883		ifr->ifr_ifindex = dev->ifindex;
4884		return 0;
4885
4886	case SIOCGIFTXQLEN:
4887		ifr->ifr_qlen = dev->tx_queue_len;
4888		return 0;
4889
4890	default:
4891		/* dev_ioctl() should ensure this case
4892		 * is never reached
4893		 */
4894		WARN_ON(1);
4895		err = -ENOTTY;
4896		break;
4897
4898	}
4899	return err;
4900}
4901
4902/*
4903 *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
4904 */
4905static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4906{
4907	int err;
4908	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4909	const struct net_device_ops *ops;
4910
4911	if (!dev)
4912		return -ENODEV;
4913
4914	ops = dev->netdev_ops;
4915
4916	switch (cmd) {
4917	case SIOCSIFFLAGS:	/* Set interface flags */
4918		return dev_change_flags(dev, ifr->ifr_flags);
4919
4920	case SIOCSIFMETRIC:	/* Set the metric on the interface
4921				   (currently unused) */
4922		return -EOPNOTSUPP;
4923
4924	case SIOCSIFMTU:	/* Set the MTU of a device */
4925		return dev_set_mtu(dev, ifr->ifr_mtu);
4926
4927	case SIOCSIFHWADDR:
4928		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4929
4930	case SIOCSIFHWBROADCAST:
4931		if (ifr->ifr_hwaddr.sa_family != dev->type)
4932			return -EINVAL;
4933		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4934		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4935		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4936		return 0;
4937
4938	case SIOCSIFMAP:
4939		if (ops->ndo_set_config) {
4940			if (!netif_device_present(dev))
4941				return -ENODEV;
4942			return ops->ndo_set_config(dev, &ifr->ifr_map);
4943		}
4944		return -EOPNOTSUPP;
4945
4946	case SIOCADDMULTI:
4947		if (!ops->ndo_set_rx_mode ||
4948		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4949			return -EINVAL;
4950		if (!netif_device_present(dev))
4951			return -ENODEV;
4952		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4953
4954	case SIOCDELMULTI:
4955		if (!ops->ndo_set_rx_mode ||
4956		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4957			return -EINVAL;
4958		if (!netif_device_present(dev))
4959			return -ENODEV;
4960		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4961
4962	case SIOCSIFTXQLEN:
4963		if (ifr->ifr_qlen < 0)
4964			return -EINVAL;
4965		dev->tx_queue_len = ifr->ifr_qlen;
4966		return 0;
4967
4968	case SIOCSIFNAME:
4969		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4970		return dev_change_name(dev, ifr->ifr_newname);
4971
4972	case SIOCSHWTSTAMP:
4973		err = net_hwtstamp_validate(ifr);
4974		if (err)
4975			return err;
4976		/* fall through */
4977
4978	/*
4979	 *	Unknown or private ioctl
4980	 */
4981	default:
4982		if ((cmd >= SIOCDEVPRIVATE &&
4983		    cmd <= SIOCDEVPRIVATE + 15) ||
4984		    cmd == SIOCBONDENSLAVE ||
4985		    cmd == SIOCBONDRELEASE ||
4986		    cmd == SIOCBONDSETHWADDR ||
4987		    cmd == SIOCBONDSLAVEINFOQUERY ||
4988		    cmd == SIOCBONDINFOQUERY ||
4989		    cmd == SIOCBONDCHANGEACTIVE ||
4990		    cmd == SIOCGMIIPHY ||
4991		    cmd == SIOCGMIIREG ||
4992		    cmd == SIOCSMIIREG ||
4993		    cmd == SIOCBRADDIF ||
4994		    cmd == SIOCBRDELIF ||
4995		    cmd == SIOCSHWTSTAMP ||
4996		    cmd == SIOCWANDEV) {
4997			err = -EOPNOTSUPP;
4998			if (ops->ndo_do_ioctl) {
4999				if (netif_device_present(dev))
5000					err = ops->ndo_do_ioctl(dev, ifr, cmd);
5001				else
5002					err = -ENODEV;
5003			}
5004		} else
5005			err = -EINVAL;
5006
5007	}
5008	return err;
5009}
5010
5011/*
5012 *	This function handles all "interface"-type I/O control requests. The actual
5013 *	'doing' part of this is dev_ifsioc above.
5014 */
5015
5016/**
5017 *	dev_ioctl	-	network device ioctl
5018 *	@net: the applicable net namespace
5019 *	@cmd: command to issue
5020 *	@arg: pointer to a struct ifreq in user space
5021 *
5022 *	Issue ioctl functions to devices. This is normally called by the
5023 *	user space syscall interfaces but can sometimes be useful for
5024 *	other purposes. The return value is the return from the syscall if
5025 *	positive or a negative errno code on error.
5026 */
5027
5028int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
5029{
5030	struct ifreq ifr;
5031	int ret;
5032	char *colon;
5033
5034	/* One special case: SIOCGIFCONF takes ifconf argument
5035	   and requires shared lock, because it sleeps writing
5036	   to user space.
5037	 */
5038
5039	if (cmd == SIOCGIFCONF) {
5040		rtnl_lock();
5041		ret = dev_ifconf(net, (char __user *) arg);
5042		rtnl_unlock();
5043		return ret;
5044	}
5045	if (cmd == SIOCGIFNAME)
5046		return dev_ifname(net, (struct ifreq __user *)arg);
5047
5048	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5049		return -EFAULT;
5050
5051	ifr.ifr_name[IFNAMSIZ-1] = 0;
5052
5053	colon = strchr(ifr.ifr_name, ':');
5054	if (colon)
5055		*colon = 0;
5056
5057	/*
5058	 *	See which interface the caller is talking about.
5059	 */
5060
5061	switch (cmd) {
5062	/*
5063	 *	These ioctl calls:
5064	 *	- can be done by all.
5065	 *	- atomic and do not require locking.
5066	 *	- return a value
5067	 */
5068	case SIOCGIFFLAGS:
5069	case SIOCGIFMETRIC:
5070	case SIOCGIFMTU:
5071	case SIOCGIFHWADDR:
5072	case SIOCGIFSLAVE:
5073	case SIOCGIFMAP:
5074	case SIOCGIFINDEX:
5075	case SIOCGIFTXQLEN:
5076		dev_load(net, ifr.ifr_name);
5077		rcu_read_lock();
5078		ret = dev_ifsioc_locked(net, &ifr, cmd);
5079		rcu_read_unlock();
5080		if (!ret) {
5081			if (colon)
5082				*colon = ':';
5083			if (copy_to_user(arg, &ifr,
5084					 sizeof(struct ifreq)))
5085				ret = -EFAULT;
5086		}
5087		return ret;
5088
5089	case SIOCETHTOOL:
5090		dev_load(net, ifr.ifr_name);
5091		rtnl_lock();
5092		ret = dev_ethtool(net, &ifr);
5093		rtnl_unlock();
5094		if (!ret) {
5095			if (colon)
5096				*colon = ':';
5097			if (copy_to_user(arg, &ifr,
5098					 sizeof(struct ifreq)))
5099				ret = -EFAULT;
5100		}
5101		return ret;
5102
5103	/*
5104	 *	These ioctl calls:
5105	 *	- require superuser power.
5106	 *	- require strict serialization.
5107	 *	- return a value
5108	 */
5109	case SIOCGMIIPHY:
5110	case SIOCGMIIREG:
5111	case SIOCSIFNAME:
5112		if (!capable(CAP_NET_ADMIN))
5113			return -EPERM;
5114		dev_load(net, ifr.ifr_name);
5115		rtnl_lock();
5116		ret = dev_ifsioc(net, &ifr, cmd);
5117		rtnl_unlock();
5118		if (!ret) {
5119			if (colon)
5120				*colon = ':';
5121			if (copy_to_user(arg, &ifr,
5122					 sizeof(struct ifreq)))
5123				ret = -EFAULT;
5124		}
5125		return ret;
5126
5127	/*
5128	 *	These ioctl calls:
5129	 *	- require superuser power.
5130	 *	- require strict serialization.
5131	 *	- do not return a value
5132	 */
5133	case SIOCSIFFLAGS:
5134	case SIOCSIFMETRIC:
5135	case SIOCSIFMTU:
5136	case SIOCSIFMAP:
5137	case SIOCSIFHWADDR:
5138	case SIOCSIFSLAVE:
5139	case SIOCADDMULTI:
5140	case SIOCDELMULTI:
5141	case SIOCSIFHWBROADCAST:
5142	case SIOCSIFTXQLEN:
5143	case SIOCSMIIREG:
5144	case SIOCBONDENSLAVE:
5145	case SIOCBONDRELEASE:
5146	case SIOCBONDSETHWADDR:
5147	case SIOCBONDCHANGEACTIVE:
5148	case SIOCBRADDIF:
5149	case SIOCBRDELIF:
5150	case SIOCSHWTSTAMP:
5151		if (!capable(CAP_NET_ADMIN))
5152			return -EPERM;
5153		/* fall through */
5154	case SIOCBONDSLAVEINFOQUERY:
5155	case SIOCBONDINFOQUERY:
5156		dev_load(net, ifr.ifr_name);
5157		rtnl_lock();
5158		ret = dev_ifsioc(net, &ifr, cmd);
5159		rtnl_unlock();
5160		return ret;
5161
5162	case SIOCGIFMEM:
5163		/* Get the per device memory space. We can add this but
5164		 * currently do not support it */
5165	case SIOCSIFMEM:
5166		/* Set the per device memory buffer space.
5167		 * Not applicable in our case */
5168	case SIOCSIFLINK:
5169		return -ENOTTY;
5170
5171	/*
5172	 *	Unknown or private ioctl.
5173	 */
5174	default:
5175		if (cmd == SIOCWANDEV ||
5176		    (cmd >= SIOCDEVPRIVATE &&
5177		     cmd <= SIOCDEVPRIVATE + 15)) {
5178			dev_load(net, ifr.ifr_name);
5179			rtnl_lock();
5180			ret = dev_ifsioc(net, &ifr, cmd);
5181			rtnl_unlock();
5182			if (!ret && copy_to_user(arg, &ifr,
5183						 sizeof(struct ifreq)))
5184				ret = -EFAULT;
5185			return ret;
5186		}
5187		/* Take care of Wireless Extensions */
5188		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5189			return wext_handle_ioctl(net, &ifr, cmd, arg);
5190		return -ENOTTY;
5191	}
5192}
5193
5194
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns a positive interface index that no device in @net uses at
 *	the moment. Candidates are taken from a counter shared by all
 *	namespaces which restarts at 1 once it is no longer positive.
 *	The caller must hold the rtnl semaphore or the dev_base_lock to be
 *	sure the value remains unique.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	do {
		ifindex++;
		if (ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
5213
/* Delayed registration/unregistration */
5215static LIST_HEAD(net_todo_list);
5216
5217static void net_set_todo(struct net_device *dev)
5218{
5219	list_add_tail(&dev->todo_list, &net_todo_list);
5220}
5221
5222static void rollback_registered_many(struct list_head *head)
5223{
5224	struct net_device *dev, *tmp;
5225
5226	BUG_ON(dev_boot_phase);
5227	ASSERT_RTNL();
5228
5229	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5230		/* Some devices call without registering
5231		 * for initialization unwind. Remove those
5232		 * devices and proceed with the remaining.
5233		 */
5234		if (dev->reg_state == NETREG_UNINITIALIZED) {
5235			pr_debug("unregister_netdevice: device %s/%p never "
5236				 "was registered\n", dev->name, dev);
5237
5238			WARN_ON(1);
5239			list_del(&dev->unreg_list);
5240			continue;
5241		}
5242		dev->dismantle = true;
5243		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5244	}
5245
5246	/* If device is running, close it first. */
5247	dev_close_many(head);
5248
5249	list_for_each_entry(dev, head, unreg_list) {
5250		/* And unlink it from device chain. */
5251		unlist_netdevice(dev);
5252
5253		dev->reg_state = NETREG_UNREGISTERING;
5254	}
5255
5256	synchronize_net();
5257
5258	list_for_each_entry(dev, head, unreg_list) {
5259		/* Shutdown queueing discipline. */
5260		dev_shutdown(dev);
5261
5262
5263		/* Notify protocols, that we are about to destroy
5264		   this device. They should clean all the things.
5265		*/
5266		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5267
5268		if (!dev->rtnl_link_ops ||
5269		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5270			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5271
5272		/*
5273		 *	Flush the unicast and multicast chains
5274		 */
5275		dev_uc_flush(dev);
5276		dev_mc_flush(dev);
5277
5278		if (dev->netdev_ops->ndo_uninit)
5279			dev->netdev_ops->ndo_uninit(dev);
5280
5281		/* Notifier chain MUST detach us from master device. */
5282		WARN_ON(dev->master);
5283
5284		/* Remove entries from kobject tree */
5285		netdev_unregister_kobject(dev);
5286	}
5287
5288	/* Process any work delayed until the end of the batch */
5289	dev = list_first_entry(head, struct net_device, unreg_list);
5290	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5291
5292	synchronize_net();
5293
5294	list_for_each_entry(dev, head, unreg_list)
5295		dev_put(dev);
5296}
5297
5298static void rollback_registered(struct net_device *dev)
5299{
5300	LIST_HEAD(single);
5301
5302	list_add(&dev->unreg_list, &single);
5303	rollback_registered_many(&single);
5304	list_del(&single);
5305}
5306
5307static netdev_features_t netdev_fix_features(struct net_device *dev,
5308	netdev_features_t features)
5309{
5310	/* Fix illegal checksum combinations */
5311	if ((features & NETIF_F_HW_CSUM) &&
5312	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5313		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5314		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5315	}
5316
5317	/* Fix illegal SG+CSUM combinations. */
5318	if ((features & NETIF_F_SG) &&
5319	    !(features & NETIF_F_ALL_CSUM)) {
5320		netdev_dbg(dev,
5321			"Dropping NETIF_F_SG since no checksum feature.\n");
5322		features &= ~NETIF_F_SG;
5323	}
5324
5325	/* TSO requires that SG is present as well. */
5326	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5327		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5328		features &= ~NETIF_F_ALL_TSO;
5329	}
5330
5331	/* TSO ECN requires that TSO is present as well. */
5332	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5333		features &= ~NETIF_F_TSO_ECN;
5334
5335	/* Software GSO depends on SG. */
5336	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5337		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5338		features &= ~NETIF_F_GSO;
5339	}
5340
5341	/* UFO needs SG and checksumming */
5342	if (features & NETIF_F_UFO) {
5343		/* maybe split UFO into V4 and V6? */
5344		if (!((features & NETIF_F_GEN_CSUM) ||
5345		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5346			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5347			netdev_dbg(dev,
5348				"Dropping NETIF_F_UFO since no checksum offload features.\n");
5349			features &= ~NETIF_F_UFO;
5350		}
5351
5352		if (!(features & NETIF_F_SG)) {
5353			netdev_dbg(dev,
5354				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5355			features &= ~NETIF_F_UFO;
5356		}
5357	}
5358
5359	return features;
5360}
5361
5362int __netdev_update_features(struct net_device *dev)
5363{
5364	netdev_features_t features;
5365	int err = 0;
5366
5367	ASSERT_RTNL();
5368
5369	features = netdev_get_wanted_features(dev);
5370
5371	if (dev->netdev_ops->ndo_fix_features)
5372		features = dev->netdev_ops->ndo_fix_features(dev, features);
5373
5374	/* driver might be less strict about feature dependencies */
5375	features = netdev_fix_features(dev, features);
5376
5377	if (dev->features == features)
5378		return 0;
5379
5380	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5381		&dev->features, &features);
5382
5383	if (dev->netdev_ops->ndo_set_features)
5384		err = dev->netdev_ops->ndo_set_features(dev, features);
5385
5386	if (unlikely(err < 0)) {
5387		netdev_err(dev,
5388			"set_features() failed (%d); wanted %pNF, left %pNF\n",
5389			err, &features, &dev->features);
5390		return -1;
5391	}
5392
5393	if (!err)
5394		dev->features = features;
5395
5396	return 1;
5397}
5398
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications if
 *	__netdev_update_features() reports anything other than "no change".
 *	Should be called after driver or hardware dependent conditions
 *	might have changed that influence the features.
 */
void netdev_update_features(struct net_device *dev)
{
	int changed = __netdev_update_features(dev);

	if (changed)
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5413
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications
 *	unconditionally, whatever the outcome of the recalculation.
 *	Should be called instead of netdev_update_features() if also
 *	dev->vlan_features might have changed to allow the changes to be
 *	propagated to stacked VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* The result is deliberately ignored: always notify. */
	__netdev_update_features(dev);

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5430
5431/**
5432 *	netif_stacked_transfer_operstate -	transfer operstate
5433 *	@rootdev: the root or lower level device to transfer state from
5434 *	@dev: the device to transfer operstate to
5435 *
5436 *	Transfer operational state from root to device. This is normally
5437 *	called when a stacking relationship exists between the root
5438 *	device and the device(a leaf device).
5439 */
5440void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5441					struct net_device *dev)
5442{
5443	if (rootdev->operstate == IF_OPER_DORMANT)
5444		netif_dormant_on(dev);
5445	else
5446		netif_dormant_off(dev);
5447
5448	if (netif_carrier_ok(rootdev)) {
5449		if (!netif_carrier_ok(dev))
5450			netif_carrier_on(dev);
5451	} else {
5452		if (netif_carrier_ok(dev))
5453			netif_carrier_off(dev);
5454	}
5455}
5456EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5457
5458#ifdef CONFIG_RPS
5459static int netif_alloc_rx_queues(struct net_device *dev)
5460{
5461	unsigned int i, count = dev->num_rx_queues;
5462	struct netdev_rx_queue *rx;
5463
5464	BUG_ON(count < 1);
5465
5466	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5467	if (!rx) {
5468		pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5469		return -ENOMEM;
5470	}
5471	dev->_rx = rx;
5472
5473	for (i = 0; i < count; i++)
5474		rx[i].dev = dev;
5475	return 0;
5476}
5477#endif
5478
5479static void netdev_init_one_queue(struct net_device *dev,
5480				  struct netdev_queue *queue, void *_unused)
5481{
5482	/* Initialize queue lock */
5483	spin_lock_init(&queue->_xmit_lock);
5484	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5485	queue->xmit_lock_owner = -1;
5486	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5487	queue->dev = dev;
5488#ifdef CONFIG_BQL
5489	dql_init(&queue->dql, HZ);
5490#endif
5491}
5492
5493static int netif_alloc_netdev_queues(struct net_device *dev)
5494{
5495	unsigned int count = dev->num_tx_queues;
5496	struct netdev_queue *tx;
5497
5498	BUG_ON(count < 1);
5499
5500	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5501	if (!tx) {
5502		pr_err("netdev: Unable to allocate %u tx queues.\n",
5503		       count);
5504		return -ENOMEM;
5505	}
5506	dev->_tx = tx;
5507
5508	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5509	spin_lock_init(&dev->tx_global_lock);
5510
5511	return 0;
5512}
5513
5514/**
5515 *	register_netdevice	- register a network device
5516 *	@dev: device to register
5517 *
5518 *	Take a completed network device structure and add it to the kernel
5519 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5520 *	chain. 0 is returned on success. A negative errno code is returned
5521 *	on a failure to set up the device, or if the name is a duplicate.
5522 *
5523 *	Callers must hold the rtnl semaphore. You may want
5524 *	register_netdev() instead of this.
5525 *
5526 *	BUGS:
5527 *	The locking appears insufficient to guarantee two parallel registers
5528 *	will not get the same name.
5529 */
5530
5531int register_netdevice(struct net_device *dev)
5532{
5533	int ret;
5534	struct net *net = dev_net(dev);
5535
5536	BUG_ON(dev_boot_phase);
5537	ASSERT_RTNL();
5538
5539	might_sleep();
5540
5541	/* When net_device's are persistent, this will be fatal. */
5542	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5543	BUG_ON(!net);
5544
5545	spin_lock_init(&dev->addr_list_lock);
5546	netdev_set_addr_lockdep_class(dev);
5547
5548	dev->iflink = -1;
5549
5550	ret = dev_get_valid_name(dev, dev->name);
5551	if (ret < 0)
5552		goto out;
5553
5554	/* Init, if this function is available */
5555	if (dev->netdev_ops->ndo_init) {
5556		ret = dev->netdev_ops->ndo_init(dev);
5557		if (ret) {
5558			if (ret > 0)
5559				ret = -EIO;
5560			goto out;
5561		}
5562	}
5563
5564	dev->ifindex = dev_new_index(net);
5565	if (dev->iflink == -1)
5566		dev->iflink = dev->ifindex;
5567
5568	/* Transfer changeable features to wanted_features and enable
5569	 * software offloads (GSO and GRO).
5570	 */
5571	dev->hw_features |= NETIF_F_SOFT_FEATURES;
5572	dev->features |= NETIF_F_SOFT_FEATURES;
5573	dev->wanted_features = dev->features & dev->hw_features;
5574
5575	/* Turn on no cache copy if HW is doing checksum */
5576	if (!(dev->flags & IFF_LOOPBACK)) {
5577		dev->hw_features |= NETIF_F_NOCACHE_COPY;
5578		if (dev->features & NETIF_F_ALL_CSUM) {
5579			dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5580			dev->features |= NETIF_F_NOCACHE_COPY;
5581		}
5582	}
5583
5584	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5585	 */
5586	dev->vlan_features |= NETIF_F_HIGHDMA;
5587
5588	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5589	ret = notifier_to_errno(ret);
5590	if (ret)
5591		goto err_uninit;
5592
5593	ret = netdev_register_kobject(dev);
5594	if (ret)
5595		goto err_uninit;
5596	dev->reg_state = NETREG_REGISTERED;
5597
5598	__netdev_update_features(dev);
5599
5600	/*
5601	 *	Default initial state at registry is that the
5602	 *	device is present.
5603	 */
5604
5605	set_bit(__LINK_STATE_PRESENT, &dev->state);
5606
5607	dev_init_scheduler(dev);
5608	dev_hold(dev);
5609	list_netdevice(dev);
5610
5611	/* Notify protocols, that a new device appeared. */
5612	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5613	ret = notifier_to_errno(ret);
5614	if (ret) {
5615		rollback_registered(dev);
5616		dev->reg_state = NETREG_UNREGISTERED;
5617	}
5618	/*
5619	 *	Prevent userspace races by waiting until the network
5620	 *	device is fully setup before sending notifications.
5621	 */
5622	if (!dev->rtnl_link_ops ||
5623	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5624		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5625
5626out:
5627	return ret;
5628
5629err_uninit:
5630	if (dev->netdev_ops->ndo_uninit)
5631		dev->netdev_ops->ndo_uninit(dev);
5632	goto out;
5633}
5634EXPORT_SYMBOL(register_netdevice);
5635
5636/**
5637 *	init_dummy_netdev	- init a dummy network device for NAPI
5638 *	@dev: device to init
5639 *
5640 *	This takes a network device structure and initialize the minimum
5641 *	amount of fields so it can be used to schedule NAPI polls without
5642 *	registering a full blown interface. This is to be used by drivers
5643 *	that need to tie several hardware interfaces to a single NAPI
5644 *	poll scheduler due to HW limitations.
5645 */
5646int init_dummy_netdev(struct net_device *dev)
5647{
5648	/* Clear everything. Note we don't initialize spinlocks
5649	 * are they aren't supposed to be taken by any of the
5650	 * NAPI code and this dummy netdev is supposed to be
5651	 * only ever used for NAPI polls
5652	 */
5653	memset(dev, 0, sizeof(struct net_device));
5654
5655	/* make sure we BUG if trying to hit standard
5656	 * register/unregister code path
5657	 */
5658	dev->reg_state = NETREG_DUMMY;
5659
5660	/* NAPI wants this */
5661	INIT_LIST_HEAD(&dev->napi_list);
5662
5663	/* a dummy interface is started by default */
5664	set_bit(__LINK_STATE_PRESENT, &dev->state);
5665	set_bit(__LINK_STATE_START, &dev->state);
5666
5667	/* Note : We dont allocate pcpu_refcnt for dummy devices,
5668	 * because users of this 'device' dont need to change
5669	 * its refcount.
5670	 */
5671
5672	return 0;
5673}
5674EXPORT_SYMBOL_GPL(init_dummy_netdev);
5675
5676
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is register_netdevice() wrapped in rtnl_lock()/rtnl_unlock();
 *	the device name is expanded if a format string was passed to
 *	alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int ret;

	rtnl_lock();
	ret = register_netdevice(dev);
	rtnl_unlock();

	return ret;
}
EXPORT_SYMBOL(register_netdev);
5700
5701int netdev_refcnt_read(const struct net_device *dev)
5702{
5703	int i, refcnt = 0;
5704
5705	for_each_possible_cpu(i)
5706		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5707	return refcnt;
5708}
5709EXPORT_SYMBOL(netdev_refcnt_read);
5710
5711/*
5712 * netdev_wait_allrefs - wait until all references are gone.
5713 *
5714 * This is called when unregistering network devices.
5715 *
5716 * Any protocol or device that holds a reference should register
5717 * for netdevice notification, and cleanup and put back the
5718 * reference if they receive an UNREGISTER event.
5719 * We can get stuck here if buggy protocols don't correctly
5720 * call dev_put.
5721 */
5722static void netdev_wait_allrefs(struct net_device *dev)
5723{
5724	unsigned long rebroadcast_time, warning_time;
5725	int refcnt;
5726
5727	linkwatch_forget_dev(dev);
5728
5729	rebroadcast_time = warning_time = jiffies;
5730	refcnt = netdev_refcnt_read(dev);
5731
5732	while (refcnt != 0) {
5733		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5734			rtnl_lock();
5735
5736			/* Rebroadcast unregister notification */
5737			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5738			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5739			 * should have already handle it the first time */
5740
5741			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5742				     &dev->state)) {
5743				/* We must not have linkwatch events
5744				 * pending on unregister. If this
5745				 * happens, we simply run the queue
5746				 * unscheduled, resulting in a noop
5747				 * for this device.
5748				 */
5749				linkwatch_run_queue();
5750			}
5751
5752			__rtnl_unlock();
5753
5754			rebroadcast_time = jiffies;
5755		}
5756
5757		msleep(250);
5758
5759		refcnt = netdev_refcnt_read(dev);
5760
5761		if (time_after(jiffies, warning_time + 10 * HZ)) {
5762			printk(KERN_EMERG "unregister_netdevice: "
5763			       "waiting for %s to become free. Usage "
5764			       "count = %d\n",
5765			       dev->name, refcnt);
5766			warning_time = jiffies;
5767		}
5768	}
5769}
5770
5771/* The sequence is:
5772 *
5773 *	rtnl_lock();
5774 *	...
5775 *	register_netdevice(x1);
5776 *	register_netdevice(x2);
5777 *	...
5778 *	unregister_netdevice(y1);
5779 *	unregister_netdevice(y2);
5780 *      ...
5781 *	rtnl_unlock();
5782 *	free_netdev(y1);
5783 *	free_netdev(y2);
5784 *
5785 * We are invoked by rtnl_unlock().
5786 * This allows us to deal with problems:
5787 * 1) We can delete sysfs objects which invoke hotplug
5788 *    without deadlocking with linkwatch via keventd.
5789 * 2) Since we run with the RTNL semaphore not held, we can sleep
5790 *    safely in order to wait for the netdev refcnt to drop to zero.
5791 *
5792 * We must not return until all unregister events added during
5793 * the interval the lock was held have been completed.
5794 */
5795void netdev_run_todo(void)
5796{
5797	struct list_head list;
5798
5799	/* Snapshot list, allow later requests */
5800	list_replace_init(&net_todo_list, &list);
5801
5802	__rtnl_unlock();
5803
5804	/* Wait for rcu callbacks to finish before attempting to drain
5805	 * the device list.  This usually avoids a 250ms wait.
5806	 */
5807	if (!list_empty(&list))
5808		rcu_barrier();
5809
5810	while (!list_empty(&list)) {
5811		struct net_device *dev
5812			= list_first_entry(&list, struct net_device, todo_list);
5813		list_del(&dev->todo_list);
5814
5815		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5816			printk(KERN_ERR "network todo '%s' but state %d\n",
5817			       dev->name, dev->reg_state);
5818			dump_stack();
5819			continue;
5820		}
5821
5822		dev->reg_state = NETREG_UNREGISTERED;
5823
5824		on_each_cpu(flush_backlog, dev, 1);
5825
5826		netdev_wait_allrefs(dev);
5827
5828		/* paranoia */
5829		BUG_ON(netdev_refcnt_read(dev));
5830		WARN_ON(rcu_access_pointer(dev->ip_ptr));
5831		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5832		WARN_ON(dev->dn_ptr);
5833
5834		if (dev->destructor)
5835			dev->destructor(dev);
5836
5837		/* Free network device */
5838		kobject_put(&dev->dev.kobj);
5839	}
5840}
5841
5842/* Convert net_device_stats to rtnl_link_stats64.  They have the same
5843 * fields in the same order, with only the type differing.
5844 */
5845static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5846				    const struct net_device_stats *netdev_stats)
5847{
5848#if BITS_PER_LONG == 64
5849        BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5850        memcpy(stats64, netdev_stats, sizeof(*stats64));
5851#else
5852	size_t i, n = sizeof(*stats64) / sizeof(u64);
5853	const unsigned long *src = (const unsigned long *)netdev_stats;
5854	u64 *dst = (u64 *)stats64;
5855
5856	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5857		     sizeof(*stats64) / sizeof(u64));
5858	for (i = 0; i < n; i++)
5859		dst[i] = src[i];
5860#endif
5861}
5862
5863/**
5864 *	dev_get_stats	- get network device statistics
5865 *	@dev: device to get statistics from
5866 *	@storage: place to store stats
5867 *
5868 *	Get network statistics from device. Return @storage.
5869 *	The device driver may provide its own method by setting
5870 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5871 *	otherwise the internal statistics structure is used.
5872 */
5873struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5874					struct rtnl_link_stats64 *storage)
5875{
5876	const struct net_device_ops *ops = dev->netdev_ops;
5877
5878	if (ops->ndo_get_stats64) {
5879		memset(storage, 0, sizeof(*storage));
5880		ops->ndo_get_stats64(dev, storage);
5881	} else if (ops->ndo_get_stats) {
5882		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5883	} else {
5884		netdev_stats_to_stats64(storage, &dev->stats);
5885	}
5886	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5887	return storage;
5888}
5889EXPORT_SYMBOL(dev_get_stats);
5890
/*
 * Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT, a missing
 * queue is allocated, initialised, pointed at noop_qdisc and published
 * through dev->ingress_queue; NULL is returned if that allocation
 * fails.  Without CONFIG_NET_CLS_ACT the result of dev_ingress_queue()
 * is returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			ingress->qdisc = &noop_qdisc;
			ingress->qdisc_sleeping = &noop_qdisc;
			/* Publish only once fully initialised. */
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
5908
5909/**
5910 *	alloc_netdev_mqs - allocate network device
5911 *	@sizeof_priv:	size of private data to allocate space for
5912 *	@name:		device name format string
5913 *	@setup:		callback to initialize device
5914 *	@txqs:		the number of TX subqueues to allocate
5915 *	@rxqs:		the number of RX subqueues to allocate
5916 *
5917 *	Allocates a struct net_device with private data area for driver use
5918 *	and performs basic initialization.  Also allocates subquue structs
5919 *	for each queue on the device.
5920 */
5921struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5922		void (*setup)(struct net_device *),
5923		unsigned int txqs, unsigned int rxqs)
5924{
5925	struct net_device *dev;
5926	size_t alloc_size;
5927	struct net_device *p;
5928
5929	BUG_ON(strlen(name) >= sizeof(dev->name));
5930
5931	if (txqs < 1) {
5932		pr_err("alloc_netdev: Unable to allocate device "
5933		       "with zero queues.\n");
5934		return NULL;
5935	}
5936
5937#ifdef CONFIG_RPS
5938	if (rxqs < 1) {
5939		pr_err("alloc_netdev: Unable to allocate device "
5940		       "with zero RX queues.\n");
5941		return NULL;
5942	}
5943#endif
5944
5945	alloc_size = sizeof(struct net_device);
5946	if (sizeof_priv) {
5947		/* ensure 32-byte alignment of private area */
5948		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5949		alloc_size += sizeof_priv;
5950	}
5951	/* ensure 32-byte alignment of whole construct */
5952	alloc_size += NETDEV_ALIGN - 1;
5953
5954	p = kzalloc(alloc_size, GFP_KERNEL);
5955	if (!p) {
5956		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5957		return NULL;
5958	}
5959
5960	dev = PTR_ALIGN(p, NETDEV_ALIGN);
5961	dev->padded = (char *)dev - (char *)p;
5962
5963	dev->pcpu_refcnt = alloc_percpu(int);
5964	if (!dev->pcpu_refcnt)
5965		goto free_p;
5966
5967	if (dev_addr_init(dev))
5968		goto free_pcpu;
5969
5970	dev_mc_init(dev);
5971	dev_uc_init(dev);
5972
5973	dev_net_set(dev, &init_net);
5974
5975	dev->gso_max_size = GSO_MAX_SIZE;
5976
5977	INIT_LIST_HEAD(&dev->napi_list);
5978	INIT_LIST_HEAD(&dev->unreg_list);
5979	INIT_LIST_HEAD(&dev->link_watch_list);
5980	dev->priv_flags = IFF_XMIT_DST_RELEASE;
5981	setup(dev);
5982
5983	dev->num_tx_queues = txqs;
5984	dev->real_num_tx_queues = txqs;
5985	if (netif_alloc_netdev_queues(dev))
5986		goto free_all;
5987
5988#ifdef CONFIG_RPS
5989	dev->num_rx_queues = rxqs;
5990	dev->real_num_rx_queues = rxqs;
5991	if (netif_alloc_rx_queues(dev))
5992		goto free_all;
5993#endif
5994
5995	strcpy(dev->name, name);
5996	dev->group = INIT_NETDEV_GROUP;
5997	return dev;
5998
5999free_all:
6000	free_netdev(dev);
6001	return NULL;
6002
6003free_pcpu:
6004	free_percpu(dev->pcpu_refcnt);
6005	kfree(dev->_tx);
6006#ifdef CONFIG_RPS
6007	kfree(dev->_rx);
6008#endif
6009
6010free_p:
6011	kfree(p);
6012	return NULL;
6013}
6014EXPORT_SYMBOL(alloc_netdev_mqs);
6015
6016/**
6017 *	free_netdev - free network device
6018 *	@dev: device
6019 *
6020 *	This function does the last stage of destroying an allocated device
6021 * 	interface. The reference to the device object is released.
6022 *	If this is the last reference then it will be freed.
6023 */
6024void free_netdev(struct net_device *dev)
6025{
6026	struct napi_struct *p, *n;
6027
6028	release_net(dev_net(dev));
6029
6030	kfree(dev->_tx);
6031#ifdef CONFIG_RPS
6032	kfree(dev->_rx);
6033#endif
6034
6035	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6036
6037	/* Flush device addresses */
6038	dev_addr_flush(dev);
6039
6040	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6041		netif_napi_del(p);
6042
6043	free_percpu(dev->pcpu_refcnt);
6044	dev->pcpu_refcnt = NULL;
6045
6046	/*  Compatibility with error handling in drivers */
6047	if (dev->reg_state == NETREG_UNINITIALIZED) {
6048		kfree((char *)dev - dev->padded);
6049		return;
6050	}
6051
6052	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6053	dev->reg_state = NETREG_RELEASED;
6054
6055	/* will free via device release */
6056	put_device(&dev->dev);
6057}
6058EXPORT_SYMBOL(free_netdev);
6059
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *	The expedited RCU grace period is used when the rtnl lock is
 *	held, the normal one otherwise.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6075
6076/**
6077 *	unregister_netdevice_queue - remove device from the kernel
6078 *	@dev: device
6079 *	@head: list
6080 *
6081 *	This function shuts down a device interface and removes it
6082 *	from the kernel tables.
6083 *	If head not NULL, device is queued to be unregistered later.
6084 *
6085 *	Callers must hold the rtnl semaphore.  You may want
6086 *	unregister_netdev() instead of this.
6087 */
6088
6089void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6090{
6091	ASSERT_RTNL();
6092
6093	if (head) {
6094		list_move_tail(&dev->unreg_list, head);
6095	} else {
6096		rollback_registered(dev);
6097		/* Finish processing unregister after unlock */
6098		net_set_todo(dev);
6099	}
6100}
6101EXPORT_SYMBOL(unregister_netdevice_queue);
6102
6103/**
6104 *	unregister_netdevice_many - unregister many devices
6105 *	@head: list of devices
6106 */
6107void unregister_netdevice_many(struct list_head *head)
6108{
6109	struct net_device *dev;
6110
6111	if (!list_empty(head)) {
6112		rollback_registered_many(head);
6113		list_for_each_entry(dev, head, unreg_list)
6114			net_set_todo(dev);
6115	}
6116}
6117EXPORT_SYMBOL(unregister_netdevice_many);
6118
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	It simply calls unregister_netdevice() between rtnl_lock() and
 *	rtnl_unlock().  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6137
6138/**
6139 *	dev_change_net_namespace - move device to different nethost namespace
6140 *	@dev: device
6141 *	@net: network namespace
6142 *	@pat: If not NULL name pattern to try if the current device name
6143 *	      is already taken in the destination network namespace.
6144 *
6145 *	This function shuts down a device interface and moves it
6146 *	to a new network namespace. On success 0 is returned, on
6147 *	a failure a netagive errno code is returned.
6148 *
6149 *	Callers must hold the rtnl semaphore.
6150 */
6151
6152int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6153{
6154	int err;
6155
6156	ASSERT_RTNL();
6157
6158	/* Don't allow namespace local devices to be moved. */
6159	err = -EINVAL;
6160	if (dev->features & NETIF_F_NETNS_LOCAL)
6161		goto out;
6162
6163	/* Ensure the device has been registrered */
6164	err = -EINVAL;
6165	if (dev->reg_state != NETREG_REGISTERED)
6166		goto out;
6167
6168	/* Get out if there is nothing todo */
6169	err = 0;
6170	if (net_eq(dev_net(dev), net))
6171		goto out;
6172
6173	/* Pick the destination device name, and ensure
6174	 * we can use it in the destination network namespace.
6175	 */
6176	err = -EEXIST;
6177	if (__dev_get_by_name(net, dev->name)) {
6178		/* We get here if we can't use the current device name */
6179		if (!pat)
6180			goto out;
6181		if (dev_get_valid_name(dev, pat) < 0)
6182			goto out;
6183	}
6184
6185	/*
6186	 * And now a mini version of register_netdevice unregister_netdevice.
6187	 */
6188
6189	/* If device is running close it first. */
6190	dev_close(dev);
6191
6192	/* And unlink it from device chain */
6193	err = -ENODEV;
6194	unlist_netdevice(dev);
6195
6196	synchronize_net();
6197
6198	/* Shutdown queueing discipline. */
6199	dev_shutdown(dev);
6200
6201	/* Notify protocols, that we are about to destroy
6202	   this device. They should clean all the things.
6203
6204	   Note that dev->reg_state stays at NETREG_REGISTERED.
6205	   This is wanted because this way 8021q and macvlan know
6206	   the device is just moving and can keep their slaves up.
6207	*/
6208	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6209	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6210	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6211
6212	/*
6213	 *	Flush the unicast and multicast chains
6214	 */
6215	dev_uc_flush(dev);
6216	dev_mc_flush(dev);
6217
6218	/* Actually switch the network namespace */
6219	dev_net_set(dev, net);
6220
6221	/* If there is an ifindex conflict assign a new one */
6222	if (__dev_get_by_index(net, dev->ifindex)) {
6223		int iflink = (dev->iflink == dev->ifindex);
6224		dev->ifindex = dev_new_index(net);
6225		if (iflink)
6226			dev->iflink = dev->ifindex;
6227	}
6228
6229	/* Fixup kobjects */
6230	err = device_rename(&dev->dev, dev->name);
6231	WARN_ON(err);
6232
6233	/* Add the device back in the hashes */
6234	list_netdevice(dev);
6235
6236	/* Notify protocols, that a new device appeared. */
6237	call_netdevice_notifiers(NETDEV_REGISTER, dev);
6238
6239	/*
6240	 *	Prevent userspace races by waiting until the network
6241	 *	device is fully setup before sending notifications.
6242	 */
6243	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6244
6245	synchronize_net();
6246	err = 0;
6247out:
6248	return err;
6249}
6250EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6251
6252static int dev_cpu_callback(struct notifier_block *nfb,
6253			    unsigned long action,
6254			    void *ocpu)
6255{
6256	struct sk_buff **list_skb;
6257	struct sk_buff *skb;
6258	unsigned int cpu, oldcpu = (unsigned long)ocpu;
6259	struct softnet_data *sd, *oldsd;
6260
6261	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6262		return NOTIFY_OK;
6263
6264	local_irq_disable();
6265	cpu = smp_processor_id();
6266	sd = &per_cpu(softnet_data, cpu);
6267	oldsd = &per_cpu(softnet_data, oldcpu);
6268
6269	/* Find end of our completion_queue. */
6270	list_skb = &sd->completion_queue;
6271	while (*list_skb)
6272		list_skb = &(*list_skb)->next;
6273	/* Append completion queue from offline CPU. */
6274	*list_skb = oldsd->completion_queue;
6275	oldsd->completion_queue = NULL;
6276
6277	/* Append output queue from offline CPU. */
6278	if (oldsd->output_queue) {
6279		*sd->output_queue_tailp = oldsd->output_queue;
6280		sd->output_queue_tailp = oldsd->output_queue_tailp;
6281		oldsd->output_queue = NULL;
6282		oldsd->output_queue_tailp = &oldsd->output_queue;
6283	}
6284	/* Append NAPI poll list from offline CPU. */
6285	if (!list_empty(&oldsd->poll_list)) {
6286		list_splice_init(&oldsd->poll_list, &sd->poll_list);
6287		raise_softirq_irqoff(NET_RX_SOFTIRQ);
6288	}
6289
6290	raise_softirq_irqoff(NET_TX_SOFTIRQ);
6291	local_irq_enable();
6292
6293	/* Process offline CPU's input_pkt_queue */
6294	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6295		netif_rx(skb);
6296		input_queue_head_incr(oldsd);
6297	}
6298	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6299		netif_rx(skb);
6300		input_queue_head_incr(oldsd);
6301	}
6302
6303	return NOTIFY_OK;
6304}
6305
6306
6307/**
6308 *	netdev_increment_features - increment feature set by one
6309 *	@all: current feature set
6310 *	@one: new feature set
6311 *	@mask: mask feature set
6312 *
6313 *	Computes a new feature set after adding a device with feature set
6314 *	@one to the master device with current feature set @all.  Will not
6315 *	enable anything that is off in @mask. Returns the new feature set.
6316 */
6317netdev_features_t netdev_increment_features(netdev_features_t all,
6318	netdev_features_t one, netdev_features_t mask)
6319{
6320	if (mask & NETIF_F_GEN_CSUM)
6321		mask |= NETIF_F_ALL_CSUM;
6322	mask |= NETIF_F_VLAN_CHALLENGED;
6323
6324	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6325	all &= one | ~NETIF_F_ALL_FOR_ALL;
6326
6327	/* If one device supports hw checksumming, set for all. */
6328	if (all & NETIF_F_GEN_CSUM)
6329		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6330
6331	return all;
6332}
6333EXPORT_SYMBOL(netdev_increment_features);
6334
6335static struct hlist_head *netdev_create_hash(void)
6336{
6337	int i;
6338	struct hlist_head *hash;
6339
6340	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6341	if (hash != NULL)
6342		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6343			INIT_HLIST_HEAD(&hash[i]);
6344
6345	return hash;
6346}
6347
6348/* Initialize per network namespace state */
6349static int __net_init netdev_init(struct net *net)
6350{
6351	INIT_LIST_HEAD(&net->dev_base_head);
6352
6353	net->dev_name_head = netdev_create_hash();
6354	if (net->dev_name_head == NULL)
6355		goto err_name;
6356
6357	net->dev_index_head = netdev_create_hash();
6358	if (net->dev_index_head == NULL)
6359		goto err_idx;
6360
6361	return 0;
6362
6363err_idx:
6364	kfree(net->dev_name_head);
6365err_name:
6366	return -ENOMEM;
6367}
6368
6369/**
6370 *	netdev_drivername - network driver for the device
6371 *	@dev: network device
6372 *
6373 *	Determine network driver for device.
6374 */
6375const char *netdev_drivername(const struct net_device *dev)
6376{
6377	const struct device_driver *driver;
6378	const struct device *parent;
6379	const char *empty = "";
6380
6381	parent = dev->dev.parent;
6382	if (!parent)
6383		return empty;
6384
6385	driver = parent->driver;
6386	if (driver && driver->name)
6387		return driver->name;
6388	return empty;
6389}
6390
6391int __netdev_printk(const char *level, const struct net_device *dev,
6392			   struct va_format *vaf)
6393{
6394	int r;
6395
6396	if (dev && dev->dev.parent)
6397		r = dev_printk(level, dev->dev.parent, "%s: %pV",
6398			       netdev_name(dev), vaf);
6399	else if (dev)
6400		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6401	else
6402		r = printk("%s(NULL net_device): %pV", level, vaf);
6403
6404	return r;
6405}
6406EXPORT_SYMBOL(__netdev_printk);
6407
6408int netdev_printk(const char *level, const struct net_device *dev,
6409		  const char *format, ...)
6410{
6411	struct va_format vaf;
6412	va_list args;
6413	int r;
6414
6415	va_start(args, format);
6416
6417	vaf.fmt = format;
6418	vaf.va = &args;
6419
6420	r = __netdev_printk(level, dev, &vaf);
6421	va_end(args);
6422
6423	return r;
6424}
6425EXPORT_SYMBOL(netdev_printk);
6426
6427#define define_netdev_printk_level(func, level)			\
6428int func(const struct net_device *dev, const char *fmt, ...)	\
6429{								\
6430	int r;							\
6431	struct va_format vaf;					\
6432	va_list args;						\
6433								\
6434	va_start(args, fmt);					\
6435								\
6436	vaf.fmt = fmt;						\
6437	vaf.va = &args;						\
6438								\
6439	r = __netdev_printk(level, dev, &vaf);			\
6440	va_end(args);						\
6441								\
6442	return r;						\
6443}								\
6444EXPORT_SYMBOL(func);
6445
6446define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6447define_netdev_printk_level(netdev_alert, KERN_ALERT);
6448define_netdev_printk_level(netdev_crit, KERN_CRIT);
6449define_netdev_printk_level(netdev_err, KERN_ERR);
6450define_netdev_printk_level(netdev_warn, KERN_WARNING);
6451define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6452define_netdev_printk_level(netdev_info, KERN_INFO);
6453
6454static void __net_exit netdev_exit(struct net *net)
6455{
6456	kfree(net->dev_name_head);
6457	kfree(net->dev_index_head);
6458}
6459
/* Per-namespace hooks that create and destroy the device hash tables. */
static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};
6464
6465static void __net_exit default_device_exit(struct net *net)
6466{
6467	struct net_device *dev, *aux;
6468	/*
6469	 * Push all migratable network devices back to the
6470	 * initial network namespace
6471	 */
6472	rtnl_lock();
6473	for_each_netdev_safe(net, dev, aux) {
6474		int err;
6475		char fb_name[IFNAMSIZ];
6476
6477		/* Ignore unmoveable devices (i.e. loopback) */
6478		if (dev->features & NETIF_F_NETNS_LOCAL)
6479			continue;
6480
6481		/* Leave virtual devices for the generic cleanup */
6482		if (dev->rtnl_link_ops)
6483			continue;
6484
6485		/* Push remaining network devices to init_net */
6486		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6487		err = dev_change_net_namespace(dev, &init_net, fb_name);
6488		if (err) {
6489			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6490				__func__, dev->name, err);
6491			BUG();
6492		}
6493	}
6494	rtnl_unlock();
6495}
6496
6497static void __net_exit default_device_exit_batch(struct list_head *net_list)
6498{
6499	/* At exit all network devices most be removed from a network
6500	 * namespace.  Do this in the reverse order of registration.
6501	 * Do this across as many network namespaces as possible to
6502	 * improve batching efficiency.
6503	 */
6504	struct net_device *dev;
6505	struct net *net;
6506	LIST_HEAD(dev_kill_list);
6507
6508	rtnl_lock();
6509	list_for_each_entry(net, net_list, exit_list) {
6510		for_each_netdev_reverse(net, dev) {
6511			if (dev->rtnl_link_ops)
6512				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6513			else
6514				unregister_netdevice_queue(dev, &dev_kill_list);
6515		}
6516	}
6517	unregister_netdevice_many(&dev_kill_list);
6518	list_del(&dev_kill_list);
6519	rtnl_unlock();
6520}
6521
/*
 * Per-namespace exit hooks: default_device_exit() moves migratable
 * devices back to init_net, default_device_exit_batch() unregisters
 * the devices of a batch of exiting namespaces.
 */
static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};
6526
6527/*
6528 *	Initialize the DEV module. At boot time this walks the device list and
6529 *	unhooks any devices that fail to initialise (normally hardware not
6530 *	present) and leaves us with a valid list of present and active devices.
6531 *
6532 */
6533
6534/*
6535 *       This is called single threaded during boot, so no need
6536 *       to take the rtnl semaphore.
6537 */
6538static int __init net_dev_init(void)
6539{
6540	int i, rc = -ENOMEM;
6541
6542	BUG_ON(!dev_boot_phase);
6543
6544	if (dev_proc_init())
6545		goto out;
6546
6547	if (netdev_kobject_init())
6548		goto out;
6549
6550	INIT_LIST_HEAD(&ptype_all);
6551	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6552		INIT_LIST_HEAD(&ptype_base[i]);
6553
6554	if (register_pernet_subsys(&netdev_net_ops))
6555		goto out;
6556
6557	/*
6558	 *	Initialise the packet receive queues.
6559	 */
6560
6561	for_each_possible_cpu(i) {
6562		struct softnet_data *sd = &per_cpu(softnet_data, i);
6563
6564		memset(sd, 0, sizeof(*sd));
6565		skb_queue_head_init(&sd->input_pkt_queue);
6566		skb_queue_head_init(&sd->process_queue);
6567		sd->completion_queue = NULL;
6568		INIT_LIST_HEAD(&sd->poll_list);
6569		sd->output_queue = NULL;
6570		sd->output_queue_tailp = &sd->output_queue;
6571#ifdef CONFIG_RPS
6572		sd->csd.func = rps_trigger_softirq;
6573		sd->csd.info = sd;
6574		sd->csd.flags = 0;
6575		sd->cpu = i;
6576#endif
6577
6578		sd->backlog.poll = process_backlog;
6579		sd->backlog.weight = weight_p;
6580		sd->backlog.gro_list = NULL;
6581		sd->backlog.gro_count = 0;
6582	}
6583
6584	dev_boot_phase = 0;
6585
6586	/* The loopback device is special if any other network devices
6587	 * is present in a network namespace the loopback device must
6588	 * be present. Since we now dynamically allocate and free the
6589	 * loopback device ensure this invariant is maintained by
6590	 * keeping the loopback device as the first device on the
6591	 * list of network devices.  Ensuring the loopback devices
6592	 * is the first device that appears and the last network device
6593	 * that disappears.
6594	 */
6595	if (register_pernet_device(&loopback_net_ops))
6596		goto out;
6597
6598	if (register_pernet_device(&default_device_ops))
6599		goto out;
6600
6601	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6602	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6603
6604	hotcpu_notifier(dev_cpu_callback, 0);
6605	dst_init();
6606	dev_mcast_init();
6607	rc = 0;
6608out:
6609	return rc;
6610}
6611
6612subsys_initcall(net_dev_init);
6613
6614static int __init initialize_hashrnd(void)
6615{
6616	get_random_bytes(&hashrnd, sizeof(hashrnd));
6617	return 0;
6618}
6619
6620late_initcall_sync(initialize_hashrnd);
6621