net/core/dev.c at v4.0-rc7 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / core / dev.c
at v4.0-rc7 7484 lines 190 kB view raw
   1/*
   2 * 	NET3	Protocol independent device support routines.
   3 *
   4 *		This program is free software; you can redistribute it and/or
   5 *		modify it under the terms of the GNU General Public License
   6 *		as published by the Free Software Foundation; either version
   7 *		2 of the License, or (at your option) any later version.
   8 *
   9 *	Derived from the non IP parts of dev.c 1.0.19
  10 * 		Authors:	Ross Biro
  11 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
  13 *
  14 *	Additional Authors:
  15 *		Florian la Roche <rzsfl@rz.uni-sb.de>
  16 *		Alan Cox <gw4pts@gw4pts.ampr.org>
  17 *		David Hinds <dahinds@users.sourceforge.net>
  18 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19 *		Adam Sulmicki <adam@cfar.umd.edu>
  20 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21 *
  22 *	Changes:
  23 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24 *              			to 2 if register_netdev gets called
  25 *              			before net_dev_init & also removed a
  26 *              			few lines of code in the process.
  27 *		Alan Cox	:	device private ioctl copies fields back.
  28 *		Alan Cox	:	Transmit queue code does relevant
  29 *					stunts to keep the queue safe.
  30 *		Alan Cox	:	Fixed double lock.
  31 *		Alan Cox	:	Fixed promisc NULL pointer trap
  32 *		????????	:	Support the full private ioctl range
  33 *		Alan Cox	:	Moved ioctl permission check into
  34 *					drivers
  35 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
  36 *		Alan Cox	:	100 backlog just doesn't cut it when
  37 *					you start doing multicast video 8)
  38 *		Alan Cox	:	Rewrote net_bh and list manager.
  39 *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
  40 *		Alan Cox	:	Took out transmit every packet pass
  41 *					Saved a few bytes in the ioctl handler
  42 *		Alan Cox	:	Network driver sets packet type before
  43 *					calling netif_rx. Saves a function
  44 *					call a packet.
  45 *		Alan Cox	:	Hashed net_bh()
  46 *		Richard Kooijman:	Timestamp fixes.
  47 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
  48 *		Alan Cox	:	Device lock protection.
  49 *		Alan Cox	: 	Fixed nasty side effect of device close
  50 *					changes.
  51 *		Rudi Cilibrasi	:	Pass the right thing to
  52 *					set_mac_address()
  53 *		Dave Miller	:	32bit quantity for the device lock to
  54 *					make it work out on a Sparc.
  55 *		Bjorn Ekwall	:	Added KERNELD hack.
  56 *		Alan Cox	:	Cleaned up the backlog initialise.
  57 *		Craig Metz	:	SIOCGIFCONF fix if space for under
  58 *					1 device.
  59 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
  60 *					is no device open function.
  61 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
  62 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
  63 *		Cyrus Durgin	:	Cleaned for KMOD
  64 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
  65 *					A network device unload needs to purge
  66 *					the backlog queue.
  67 *	Paul Rusty Russell	:	SIOCSIFNAME
  68 *              Pekka Riikonen  :	Netdev boot-time settings code
  69 *              Andrew Morton   :       Make unregister_netdevice wait
  70 *              			indefinitely on dev->refcnt
  71 * 		J Hadi Salim	:	- Backlog queue sampling
  72 *				        - netif_rx() feedback
  73 */
  74
  75#include <asm/uaccess.h>
  76#include <linux/bitops.h>
  77#include <linux/capability.h>
  78#include <linux/cpu.h>
  79#include <linux/types.h>
  80#include <linux/kernel.h>
  81#include <linux/hash.h>
  82#include <linux/slab.h>
  83#include <linux/sched.h>
  84#include <linux/mutex.h>
  85#include <linux/string.h>
  86#include <linux/mm.h>
  87#include <linux/socket.h>
  88#include <linux/sockios.h>
  89#include <linux/errno.h>
  90#include <linux/interrupt.h>
  91#include <linux/if_ether.h>
  92#include <linux/netdevice.h>
  93#include <linux/etherdevice.h>
  94#include <linux/ethtool.h>
  95#include <linux/notifier.h>
  96#include <linux/skbuff.h>
  97#include <net/net_namespace.h>
  98#include <net/sock.h>
  99#include <linux/rtnetlink.h>
 100#include <linux/stat.h>
 101#include <net/dst.h>
 102#include <net/pkt_sched.h>
 103#include <net/checksum.h>
 104#include <net/xfrm.h>
 105#include <linux/highmem.h>
 106#include <linux/init.h>
 107#include <linux/module.h>
 108#include <linux/netpoll.h>
 109#include <linux/rcupdate.h>
 110#include <linux/delay.h>
 111#include <net/iw_handler.h>
 112#include <asm/current.h>
 113#include <linux/audit.h>
 114#include <linux/dmaengine.h>
 115#include <linux/err.h>
 116#include <linux/ctype.h>
 117#include <linux/if_arp.h>
 118#include <linux/if_vlan.h>
 119#include <linux/ip.h>
 120#include <net/ip.h>
 121#include <net/mpls.h>
 122#include <linux/ipv6.h>
 123#include <linux/in.h>
 124#include <linux/jhash.h>
 125#include <linux/random.h>
 126#include <trace/events/napi.h>
 127#include <trace/events/net.h>
 128#include <trace/events/skb.h>
 129#include <linux/pci.h>
 130#include <linux/inetdevice.h>
 131#include <linux/cpu_rmap.h>
 132#include <linux/static_key.h>
 133#include <linux/hashtable.h>
 134#include <linux/vmalloc.h>
 135#include <linux/if_macvlan.h>
 136#include <linux/errqueue.h>
 137#include <linux/hrtimer.h>
 138
 139#include "net-sysfs.h"
 140
 141/* Instead of increasing this, you should create a hash table. */
 142#define MAX_GRO_SKBS 8
 143
 144/* This should be increased if a protocol with a bigger head is added. */
 145#define GRO_MAX_HEAD (MAX_HEADER + 128)
 146
/* Taken around every add/remove on the packet-type lists in this file. */
static DEFINE_SPINLOCK(ptype_lock);
/* Taken around every add/remove on offload_base in this file. */
static DEFINE_SPINLOCK(offload_lock);
/*
 * Protocol-specific handlers that are not bound to a device; ptype_head()
 * picks the bucket as ntohs(type) & PTYPE_HASH_MASK.
 */
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;	/* Taps */
/* List that dev_add_offload() links packet_offload entries into. */
static struct list_head offload_base __read_mostly;

/*
 * Forward declarations of file-local helpers; their definitions are not
 * part of this section of the file.
 */
static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info);
 157
 158/*
 159 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 160 * semaphore.
 161 *
 162 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 163 *
 164 * Writers must hold the rtnl semaphore while they loop through the
 165 * dev_base_head list, and hold dev_base_lock for writing when they do the
 166 * actual updates.  This allows pure readers to access the list even
 167 * while a writer is preparing to update it.
 168 *
 169 * To put it another way, dev_base_lock is held for writing only to
 170 * protect against pure readers; the rtnl semaphore provides the
 171 * protection against other writers.
 172 *
 173 * See, for example usages, register_netdevice() and
 174 * unregister_netdevice(), which must be called with the rtnl
 175 * semaphore held.
 176 */
 177DEFINE_RWLOCK(dev_base_lock);
 178EXPORT_SYMBOL(dev_base_lock);
 179
/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id;
static DEFINE_HASHTABLE(napi_hash, 8);

/*
 * Sequence counter sampled by netdev_get_name() so that a name copy can be
 * retried when the counter changed during the copy.
 * NOTE(review): the writer side is not visible in this section; presumably
 * the device rename path bumps it - confirm.
 */
static seqcount_t devnet_rename_seq;
 187
 188static inline void dev_base_seq_inc(struct net *net)
 189{
 190	while (++net->dev_base_seq == 0);
 191}
 192
 193static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 194{
 195	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 196
 197	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 198}
 199
 200static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 201{
 202	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 203}
 204
 205static inline void rps_lock(struct softnet_data *sd)
 206{
 207#ifdef CONFIG_RPS
 208	spin_lock(&sd->input_pkt_queue.lock);
 209#endif
 210}
 211
 212static inline void rps_unlock(struct softnet_data *sd)
 213{
 214#ifdef CONFIG_RPS
 215	spin_unlock(&sd->input_pkt_queue.lock);
 216#endif
 217}
 218
 219/* Device list insertion */
 220static void list_netdevice(struct net_device *dev)
 221{
 222	struct net *net = dev_net(dev);
 223
 224	ASSERT_RTNL();
 225
 226	write_lock_bh(&dev_base_lock);
 227	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 228	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 229	hlist_add_head_rcu(&dev->index_hlist,
 230			   dev_index_hash(net, dev->ifindex));
 231	write_unlock_bh(&dev_base_lock);
 232
 233	dev_base_seq_inc(net);
 234}
 235
 236/* Device list removal
 237 * caller must respect a RCU grace period before freeing/reusing dev
 238 */
 239static void unlist_netdevice(struct net_device *dev)
 240{
 241	ASSERT_RTNL();
 242
 243	/* Unlink dev from the device chain */
 244	write_lock_bh(&dev_base_lock);
 245	list_del_rcu(&dev->dev_list);
 246	hlist_del_rcu(&dev->name_hlist);
 247	hlist_del_rcu(&dev->index_hlist);
 248	write_unlock_bh(&dev_base_lock);
 249
 250	dev_base_seq_inc(dev_net(dev));
 251}
 252
/*
 *	Our notifier list.
 *
 *	NOTE(review): declared as a raw notifier head, so serialisation is
 *	presumably provided by the callers (RTNL?) - confirm against the
 *	register/unregister paths, which are not in this section.
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 *
 *	One cache-aligned instance exists per CPU; the symbol is exported.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
 266
 267#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 *
 * netdev_lock_type[] and netdev_lock_name[] are parallel tables: entry i of
 * the name table is the lockdep class name used for hardware type i of the
 * type table.  netdev_lock_pos() maps a dev->type to that shared index and
 * falls back to the last entry (ARPHRD_NONE / "_xmit_NONE") for types that
 * are not listed, so the two tables must be kept the same length and in
 * the same order.
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

/* Lockdep class names, index-matched with netdev_lock_type[] */
static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

/* One lock class key per table entry, for the xmit and address-list locks */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 308
 309static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 310{
 311	int i;
 312
 313	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 314		if (netdev_lock_type[i] == dev_type)
 315			return i;
 316	/* the last key is used by default */
 317	return ARRAY_SIZE(netdev_lock_type) - 1;
 318}
 319
 320static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 321						 unsigned short dev_type)
 322{
 323	int i;
 324
 325	i = netdev_lock_pos(dev_type);
 326	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 327				   netdev_lock_name[i]);
 328}
 329
 330static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 331{
 332	int i;
 333
 334	i = netdev_lock_pos(dev->type);
 335	lockdep_set_class_and_name(&dev->addr_list_lock,
 336				   &netdev_addr_lock_key[i],
 337				   netdev_lock_name[i]);
 338}
 339#else
 340static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 341						 unsigned short dev_type)
 342{
 343}
 344static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 345{
 346}
 347#endif
 348
 349/*******************************************************************************
 350
 351		Protocol management and registration routines
 352
 353*******************************************************************************/
 354
 355/*
 356 *	Add a protocol ID to the list. Now that the input handler is
 357 *	smarter we can dispense with all the messy stuff that used to be
 358 *	here.
 359 *
 360 *	BEWARE!!! Protocol handlers, mangling input packets,
 361 *	MUST BE last in hash buckets and checking protocol handlers
 362 *	MUST start from promiscuous ptype_all chain in net_bh.
 363 *	It is true now, do not change it.
 364 *	Explanation follows: if protocol handler, mangling packet, will
 365 *	be the first on list, it is not able to sense, that packet
 366 *	is cloned and should be copied-on-write, so that it will
 367 *	change it and subsequent readers will get broken packet.
 368 *							--ANK (980803)
 369 */
 370
 371static inline struct list_head *ptype_head(const struct packet_type *pt)
 372{
 373	if (pt->type == htons(ETH_P_ALL))
 374		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 375	else
 376		return pt->dev ? &pt->dev->ptype_specific :
 377				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 378}
 379
 380/**
 381 *	dev_add_pack - add packet handler
 382 *	@pt: packet type declaration
 383 *
 384 *	Add a protocol handler to the networking stack. The passed &packet_type
 385 *	is linked into kernel lists and may not be freed until it has been
 386 *	removed from the kernel lists.
 387 *
 388 *	This call does not sleep therefore it can not
 389 *	guarantee all CPU's that are in middle of receiving packets
 390 *	will see the new packet type (until the next received packet).
 391 */
 392
 393void dev_add_pack(struct packet_type *pt)
 394{
 395	struct list_head *head = ptype_head(pt);
 396
 397	spin_lock(&ptype_lock);
 398	list_add_rcu(&pt->list, head);
 399	spin_unlock(&ptype_lock);
 400}
 401EXPORT_SYMBOL(dev_add_pack);
 402
 403/**
 404 *	__dev_remove_pack	 - remove packet handler
 405 *	@pt: packet type declaration
 406 *
 407 *	Remove a protocol handler that was previously added to the kernel
 408 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 409 *	from the kernel lists and can be freed or reused once this function
 410 *	returns.
 411 *
 412 *      The packet type might still be in use by receivers
 413 *	and must not be freed until after all the CPU's have gone
 414 *	through a quiescent state.
 415 */
 416void __dev_remove_pack(struct packet_type *pt)
 417{
 418	struct list_head *head = ptype_head(pt);
 419	struct packet_type *pt1;
 420
 421	spin_lock(&ptype_lock);
 422
 423	list_for_each_entry(pt1, head, list) {
 424		if (pt == pt1) {
 425			list_del_rcu(&pt->list);
 426			goto out;
 427		}
 428	}
 429
 430	pr_warn("dev_remove_pack: %p not found\n", pt);
 431out:
 432	spin_unlock(&ptype_lock);
 433}
 434EXPORT_SYMBOL(__dev_remove_pack);
 435
 436/**
 437 *	dev_remove_pack	 - remove packet handler
 438 *	@pt: packet type declaration
 439 *
 440 *	Remove a protocol handler that was previously added to the kernel
 441 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 442 *	from the kernel lists and can be freed or reused once this function
 443 *	returns.
 444 *
 445 *	This call sleeps to guarantee that no CPU is looking at the packet
 446 *	type after return.
 447 */
 448void dev_remove_pack(struct packet_type *pt)
 449{
 450	__dev_remove_pack(pt);
 451
 452	synchronize_net();
 453}
 454EXPORT_SYMBOL(dev_remove_pack);
 455
 456
 457/**
 458 *	dev_add_offload - register offload handlers
 459 *	@po: protocol offload declaration
 460 *
 461 *	Add protocol offload handlers to the networking stack. The passed
 462 *	&proto_offload is linked into kernel lists and may not be freed until
 463 *	it has been removed from the kernel lists.
 464 *
 465 *	This call does not sleep therefore it can not
 466 *	guarantee all CPU's that are in middle of receiving packets
 467 *	will see the new offload handlers (until the next received packet).
 468 */
 469void dev_add_offload(struct packet_offload *po)
 470{
 471	struct list_head *head = &offload_base;
 472
 473	spin_lock(&offload_lock);
 474	list_add_rcu(&po->list, head);
 475	spin_unlock(&offload_lock);
 476}
 477EXPORT_SYMBOL(dev_add_offload);
 478
 479/**
 480 *	__dev_remove_offload	 - remove offload handler
 481 *	@po: packet offload declaration
 482 *
 483 *	Remove a protocol offload handler that was previously added to the
 484 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 485 *	is removed from the kernel lists and can be freed or reused once this
 486 *	function returns.
 487 *
 488 *      The packet type might still be in use by receivers
 489 *	and must not be freed until after all the CPU's have gone
 490 *	through a quiescent state.
 491 */
 492static void __dev_remove_offload(struct packet_offload *po)
 493{
 494	struct list_head *head = &offload_base;
 495	struct packet_offload *po1;
 496
 497	spin_lock(&offload_lock);
 498
 499	list_for_each_entry(po1, head, list) {
 500		if (po == po1) {
 501			list_del_rcu(&po->list);
 502			goto out;
 503		}
 504	}
 505
 506	pr_warn("dev_remove_offload: %p not found\n", po);
 507out:
 508	spin_unlock(&offload_lock);
 509}
 510
 511/**
 512 *	dev_remove_offload	 - remove packet offload handler
 513 *	@po: packet offload declaration
 514 *
 515 *	Remove a packet offload handler that was previously added to the kernel
 516 *	offload handlers by dev_add_offload(). The passed &offload_type is
 517 *	removed from the kernel lists and can be freed or reused once this
 518 *	function returns.
 519 *
 520 *	This call sleeps to guarantee that no CPU is looking at the packet
 521 *	type after return.
 522 */
 523void dev_remove_offload(struct packet_offload *po)
 524{
 525	__dev_remove_offload(po);
 526
 527	synchronize_net();
 528}
 529EXPORT_SYMBOL(dev_remove_offload);
 530
 531/******************************************************************************
 532
 533		      Device Boot-time Settings Routines
 534
 535*******************************************************************************/
 536
/*
 * Boot time configuration table.
 * A slot whose name starts with '\0' or ' ' is treated as free by the
 * add/check helpers below.
 */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 539
 540/**
 541 *	netdev_boot_setup_add	- add new setup entry
 542 *	@name: name of the device
 543 *	@map: configured settings for the device
 544 *
 545 *	Adds new setup entry to the dev_boot_setup list.  The function
 546 *	returns 0 on error and 1 on success.  This is a generic routine to
 547 *	all netdevices.
 548 */
 549static int netdev_boot_setup_add(char *name, struct ifmap *map)
 550{
 551	struct netdev_boot_setup *s;
 552	int i;
 553
 554	s = dev_boot_setup;
 555	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 556		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 557			memset(s[i].name, 0, sizeof(s[i].name));
 558			strlcpy(s[i].name, name, IFNAMSIZ);
 559			memcpy(&s[i].map, map, sizeof(s[i].map));
 560			break;
 561		}
 562	}
 563
 564	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 565}
 566
 567/**
 568 *	netdev_boot_setup_check	- check boot time settings
 569 *	@dev: the netdevice
 570 *
 571 * 	Check boot time settings for the device.
 572 *	The found settings are set for the device to be used
 573 *	later in the device probing.
 574 *	Returns 0 if no settings found, 1 if they are.
 575 */
 576int netdev_boot_setup_check(struct net_device *dev)
 577{
 578	struct netdev_boot_setup *s = dev_boot_setup;
 579	int i;
 580
 581	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 582		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 583		    !strcmp(dev->name, s[i].name)) {
 584			dev->irq 	= s[i].map.irq;
 585			dev->base_addr 	= s[i].map.base_addr;
 586			dev->mem_start 	= s[i].map.mem_start;
 587			dev->mem_end 	= s[i].map.mem_end;
 588			return 1;
 589		}
 590	}
 591	return 0;
 592}
 593EXPORT_SYMBOL(netdev_boot_setup_check);
 594
 595
 596/**
 597 *	netdev_boot_base	- get address from boot time settings
 598 *	@prefix: prefix for network device
 599 *	@unit: id for network device
 600 *
 601 * 	Check boot time settings for the base address of device.
 602 *	The found settings are set for the device to be used
 603 *	later in the device probing.
 604 *	Returns 0 if no settings found.
 605 */
 606unsigned long netdev_boot_base(const char *prefix, int unit)
 607{
 608	const struct netdev_boot_setup *s = dev_boot_setup;
 609	char name[IFNAMSIZ];
 610	int i;
 611
 612	sprintf(name, "%s%d", prefix, unit);
 613
 614	/*
 615	 * If device already registered then return base of 1
 616	 * to indicate not to probe for this interface
 617	 */
 618	if (__dev_get_by_name(&init_net, name))
 619		return 1;
 620
 621	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 622		if (!strcmp(name, s[i].name))
 623			return s[i].map.base_addr;
 624	return 0;
 625}
 626
 627/*
 628 * Saves at boot time configured settings for any netdevice.
 629 */
 630int __init netdev_boot_setup(char *str)
 631{
 632	int ints[5];
 633	struct ifmap map;
 634
 635	str = get_options(str, ARRAY_SIZE(ints), ints);
 636	if (!str || !*str)
 637		return 0;
 638
 639	/* Save settings */
 640	memset(&map, 0, sizeof(map));
 641	if (ints[0] > 0)
 642		map.irq = ints[1];
 643	if (ints[0] > 1)
 644		map.base_addr = ints[2];
 645	if (ints[0] > 2)
 646		map.mem_start = ints[3];
 647	if (ints[0] > 3)
 648		map.mem_end = ints[4];
 649
 650	/* Add new entry to the list */
 651	return netdev_boot_setup_add(str, &map);
 652}
 653
 654__setup("netdev=", netdev_boot_setup);
 655
 656/*******************************************************************************
 657
 658			    Device Interface Subroutines
 659
 660*******************************************************************************/
 661
 662/**
 663 *	__dev_get_by_name	- find a device by its name
 664 *	@net: the applicable net namespace
 665 *	@name: name to find
 666 *
 667 *	Find an interface by name. Must be called under RTNL semaphore
 668 *	or @dev_base_lock. If the name is found a pointer to the device
 669 *	is returned. If the name is not found then %NULL is returned. The
 670 *	reference counters are not incremented so the caller must be
 671 *	careful with locks.
 672 */
 673
 674struct net_device *__dev_get_by_name(struct net *net, const char *name)
 675{
 676	struct net_device *dev;
 677	struct hlist_head *head = dev_name_hash(net, name);
 678
 679	hlist_for_each_entry(dev, head, name_hlist)
 680		if (!strncmp(dev->name, name, IFNAMSIZ))
 681			return dev;
 682
 683	return NULL;
 684}
 685EXPORT_SYMBOL(__dev_get_by_name);
 686
 687/**
 688 *	dev_get_by_name_rcu	- find a device by its name
 689 *	@net: the applicable net namespace
 690 *	@name: name to find
 691 *
 692 *	Find an interface by name.
 693 *	If the name is found a pointer to the device is returned.
 694 * 	If the name is not found then %NULL is returned.
 695 *	The reference counters are not incremented so the caller must be
 696 *	careful with locks. The caller must hold RCU lock.
 697 */
 698
 699struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 700{
 701	struct net_device *dev;
 702	struct hlist_head *head = dev_name_hash(net, name);
 703
 704	hlist_for_each_entry_rcu(dev, head, name_hlist)
 705		if (!strncmp(dev->name, name, IFNAMSIZ))
 706			return dev;
 707
 708	return NULL;
 709}
 710EXPORT_SYMBOL(dev_get_by_name_rcu);
 711
 712/**
 713 *	dev_get_by_name		- find a device by its name
 714 *	@net: the applicable net namespace
 715 *	@name: name to find
 716 *
 717 *	Find an interface by name. This can be called from any
 718 *	context and does its own locking. The returned handle has
 719 *	the usage count incremented and the caller must use dev_put() to
 720 *	release it when it is no longer needed. %NULL is returned if no
 721 *	matching device is found.
 722 */
 723
 724struct net_device *dev_get_by_name(struct net *net, const char *name)
 725{
 726	struct net_device *dev;
 727
 728	rcu_read_lock();
 729	dev = dev_get_by_name_rcu(net, name);
 730	if (dev)
 731		dev_hold(dev);
 732	rcu_read_unlock();
 733	return dev;
 734}
 735EXPORT_SYMBOL(dev_get_by_name);
 736
 737/**
 738 *	__dev_get_by_index - find a device by its ifindex
 739 *	@net: the applicable net namespace
 740 *	@ifindex: index of device
 741 *
 742 *	Search for an interface by index. Returns %NULL if the device
 743 *	is not found or a pointer to the device. The device has not
 744 *	had its reference counter increased so the caller must be careful
 745 *	about locking. The caller must hold either the RTNL semaphore
 746 *	or @dev_base_lock.
 747 */
 748
 749struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 750{
 751	struct net_device *dev;
 752	struct hlist_head *head = dev_index_hash(net, ifindex);
 753
 754	hlist_for_each_entry(dev, head, index_hlist)
 755		if (dev->ifindex == ifindex)
 756			return dev;
 757
 758	return NULL;
 759}
 760EXPORT_SYMBOL(__dev_get_by_index);
 761
 762/**
 763 *	dev_get_by_index_rcu - find a device by its ifindex
 764 *	@net: the applicable net namespace
 765 *	@ifindex: index of device
 766 *
 767 *	Search for an interface by index. Returns %NULL if the device
 768 *	is not found or a pointer to the device. The device has not
 769 *	had its reference counter increased so the caller must be careful
 770 *	about locking. The caller must hold RCU lock.
 771 */
 772
 773struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 774{
 775	struct net_device *dev;
 776	struct hlist_head *head = dev_index_hash(net, ifindex);
 777
 778	hlist_for_each_entry_rcu(dev, head, index_hlist)
 779		if (dev->ifindex == ifindex)
 780			return dev;
 781
 782	return NULL;
 783}
 784EXPORT_SYMBOL(dev_get_by_index_rcu);
 785
 786
 787/**
 788 *	dev_get_by_index - find a device by its ifindex
 789 *	@net: the applicable net namespace
 790 *	@ifindex: index of device
 791 *
 792 *	Search for an interface by index. Returns NULL if the device
 793 *	is not found or a pointer to the device. The device returned has
 794 *	had a reference added and the pointer is safe until the user calls
 795 *	dev_put to indicate they have finished with it.
 796 */
 797
 798struct net_device *dev_get_by_index(struct net *net, int ifindex)
 799{
 800	struct net_device *dev;
 801
 802	rcu_read_lock();
 803	dev = dev_get_by_index_rcu(net, ifindex);
 804	if (dev)
 805		dev_hold(dev);
 806	rcu_read_unlock();
 807	return dev;
 808}
 809EXPORT_SYMBOL(dev_get_by_index);
 810
 811/**
 812 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 813 *	@net: network namespace
 814 *	@name: a pointer to the buffer where the name will be stored.
 815 *	@ifindex: the ifindex of the interface to get the name from.
 816 *
 817 *	The use of raw_seqcount_begin() and cond_resched() before
 818 *	retrying is required as we want to give the writers a chance
 819 *	to complete when CONFIG_PREEMPT is not set.
 820 */
 821int netdev_get_name(struct net *net, char *name, int ifindex)
 822{
 823	struct net_device *dev;
 824	unsigned int seq;
 825
 826retry:
 827	seq = raw_seqcount_begin(&devnet_rename_seq);
 828	rcu_read_lock();
 829	dev = dev_get_by_index_rcu(net, ifindex);
 830	if (!dev) {
 831		rcu_read_unlock();
 832		return -ENODEV;
 833	}
 834
 835	strcpy(name, dev->name);
 836	rcu_read_unlock();
 837	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 838		cond_resched();
 839		goto retry;
 840	}
 841
 842	return 0;
 843}
 844
 845/**
 846 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 847 *	@net: the applicable net namespace
 848 *	@type: media type of device
 849 *	@ha: hardware address
 850 *
 851 *	Search for an interface by MAC address. Returns NULL if the device
 852 *	is not found or a pointer to the device.
 853 *	The caller must hold RCU or RTNL.
 854 *	The returned device has not had its ref count increased
 855 *	and the caller must therefore be careful about locking
 856 *
 857 */
 858
 859struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 860				       const char *ha)
 861{
 862	struct net_device *dev;
 863
 864	for_each_netdev_rcu(net, dev)
 865		if (dev->type == type &&
 866		    !memcmp(dev->dev_addr, ha, dev->addr_len))
 867			return dev;
 868
 869	return NULL;
 870}
 871EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 872
 873struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 874{
 875	struct net_device *dev;
 876
 877	ASSERT_RTNL();
 878	for_each_netdev(net, dev)
 879		if (dev->type == type)
 880			return dev;
 881
 882	return NULL;
 883}
 884EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 885
 886struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 887{
 888	struct net_device *dev, *ret = NULL;
 889
 890	rcu_read_lock();
 891	for_each_netdev_rcu(net, dev)
 892		if (dev->type == type) {
 893			dev_hold(dev);
 894			ret = dev;
 895			break;
 896		}
 897	rcu_read_unlock();
 898	return ret;
 899}
 900EXPORT_SYMBOL(dev_getfirstbyhwtype);
 901
 902/**
 903 *	__dev_get_by_flags - find any device with given flags
 904 *	@net: the applicable net namespace
 905 *	@if_flags: IFF_* values
 906 *	@mask: bitmask of bits in if_flags to check
 907 *
 908 *	Search for any interface with the given flags. Returns NULL if a device
 909 *	is not found or a pointer to the device. Must be called inside
 910 *	rtnl_lock(), and result refcount is unchanged.
 911 */
 912
 913struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 914				      unsigned short mask)
 915{
 916	struct net_device *dev, *ret;
 917
 918	ASSERT_RTNL();
 919
 920	ret = NULL;
 921	for_each_netdev(net, dev) {
 922		if (((dev->flags ^ if_flags) & mask) == 0) {
 923			ret = dev;
 924			break;
 925		}
 926	}
 927	return ret;
 928}
 929EXPORT_SYMBOL(__dev_get_by_flags);
 930
 931/**
 932 *	dev_valid_name - check if name is okay for network device
 933 *	@name: name string
 934 *
 935 *	Network device names need to be valid file names to
 936 *	to allow sysfs to work.  We also disallow any kind of
 937 *	whitespace.
 938 */
 939bool dev_valid_name(const char *name)
 940{
 941	if (*name == '\0')
 942		return false;
 943	if (strlen(name) >= IFNAMSIZ)
 944		return false;
 945	if (!strcmp(name, ".") || !strcmp(name, ".."))
 946		return false;
 947
 948	while (*name) {
 949		if (*name == '/' || *name == ':' || isspace(*name))
 950			return false;
 951		name++;
 952	}
 953	return true;
 954}
 955EXPORT_SYMBOL(dev_valid_name);
 956
 957/**
 958 *	__dev_alloc_name - allocate a name for a device
 959 *	@net: network namespace to allocate the device name in
 960 *	@name: name format string
 961 *	@buf:  scratch buffer and result name string
 962 *
 963 *	Passed a format string - eg "lt%d" it will try and find a suitable
 964 *	id. It scans list of devices to build up a free map, then chooses
 965 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 966 *	while allocating the name and adding the device in order to avoid
 967 *	duplicates.
 968 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 969 *	Returns the number of the unit assigned or a negative errno code.
 970 */
 971
 972static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 973{
 974	int i = 0;
 975	const char *p;
 976	const int max_netdevices = 8*PAGE_SIZE;
 977	unsigned long *inuse;
 978	struct net_device *d;
 979
 980	p = strnchr(name, IFNAMSIZ-1, '%');
 981	if (p) {
 982		/*
 983		 * Verify the string as this thing may have come from
 984		 * the user.  There must be either one "%d" and no other "%"
 985		 * characters.
 986		 */
 987		if (p[1] != 'd' || strchr(p + 2, '%'))
 988			return -EINVAL;
 989
 990		/* Use one page as a bit array of possible slots */
 991		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 992		if (!inuse)
 993			return -ENOMEM;
 994
 995		for_each_netdev(net, d) {
 996			if (!sscanf(d->name, name, &i))
 997				continue;
 998			if (i < 0 || i >= max_netdevices)
 999				continue;
1000
1001			/*  avoid cases where sscanf is not exact inverse of printf */
1002			snprintf(buf, IFNAMSIZ, name, i);
1003			if (!strncmp(buf, d->name, IFNAMSIZ))
1004				set_bit(i, inuse);
1005		}
1006
1007		i = find_first_zero_bit(inuse, max_netdevices);
1008		free_page((unsigned long) inuse);
1009	}
1010
1011	if (buf != name)
1012		snprintf(buf, IFNAMSIZ, name, i);
1013	if (!__dev_get_by_name(net, buf))
1014		return i;
1015
1016	/* It is possible to run out of possible slots
1017	 * when the name is long and there isn't enough space left
1018	 * for the digits, or if all bits are used.
1019	 */
1020	return -ENFILE;
1021}
1022
1023/**
1024 *	dev_alloc_name - allocate a name for a device
1025 *	@dev: device
1026 *	@name: name format string
1027 *
1028 *	Passed a format string - eg "lt%d" it will try and find a suitable
1029 *	id. It scans list of devices to build up a free map, then chooses
1030 *	the first empty slot. The caller must hold the dev_base or rtnl lock
1031 *	while allocating the name and adding the device in order to avoid
1032 *	duplicates.
1033 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1034 *	Returns the number of the unit assigned or a negative errno code.
1035 */
1036
1037int dev_alloc_name(struct net_device *dev, const char *name)
1038{
1039	char buf[IFNAMSIZ];
1040	struct net *net;
1041	int ret;
1042
1043	BUG_ON(!dev_net(dev));
1044	net = dev_net(dev);
1045	ret = __dev_alloc_name(net, name, buf);
1046	if (ret >= 0)
1047		strlcpy(dev->name, buf, IFNAMSIZ);
1048	return ret;
1049}
1050EXPORT_SYMBOL(dev_alloc_name);
1051
1052static int dev_alloc_name_ns(struct net *net,
1053			     struct net_device *dev,
1054			     const char *name)
1055{
1056	char buf[IFNAMSIZ];
1057	int ret;
1058
1059	ret = __dev_alloc_name(net, name, buf);
1060	if (ret >= 0)
1061		strlcpy(dev->name, buf, IFNAMSIZ);
1062	return ret;
1063}
1064
1065static int dev_get_valid_name(struct net *net,
1066			      struct net_device *dev,
1067			      const char *name)
1068{
1069	BUG_ON(!net);
1070
1071	if (!dev_valid_name(name))
1072		return -EINVAL;
1073
1074	if (strchr(name, '%'))
1075		return dev_alloc_name_ns(net, dev, name);
1076	else if (__dev_get_by_name(net, name))
1077		return -EEXIST;
1078	else if (dev->name != name)
1079		strlcpy(dev->name, name, IFNAMSIZ);
1080
1081	return 0;
1082}
1083
1084/**
1085 *	dev_change_name - change name of a device
1086 *	@dev: device
1087 *	@newname: name (or format string) must be at least IFNAMSIZ
1088 *
1089 *	Change name of a device, can pass format strings "eth%d".
1090 *	for wildcarding.
1091 */
1092int dev_change_name(struct net_device *dev, const char *newname)
1093{
1094	unsigned char old_assign_type;
1095	char oldname[IFNAMSIZ];
1096	int err = 0;
1097	int ret;
1098	struct net *net;
1099
1100	ASSERT_RTNL();
1101	BUG_ON(!dev_net(dev));
1102
1103	net = dev_net(dev);
1104	if (dev->flags & IFF_UP)
1105		return -EBUSY;
1106
1107	write_seqcount_begin(&devnet_rename_seq);
1108
1109	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1110		write_seqcount_end(&devnet_rename_seq);
1111		return 0;
1112	}
1113
1114	memcpy(oldname, dev->name, IFNAMSIZ);
1115
1116	err = dev_get_valid_name(net, dev, newname);
1117	if (err < 0) {
1118		write_seqcount_end(&devnet_rename_seq);
1119		return err;
1120	}
1121
1122	if (oldname[0] && !strchr(oldname, '%'))
1123		netdev_info(dev, "renamed from %s\n", oldname);
1124
1125	old_assign_type = dev->name_assign_type;
1126	dev->name_assign_type = NET_NAME_RENAMED;
1127
1128rollback:
1129	ret = device_rename(&dev->dev, dev->name);
1130	if (ret) {
1131		memcpy(dev->name, oldname, IFNAMSIZ);
1132		dev->name_assign_type = old_assign_type;
1133		write_seqcount_end(&devnet_rename_seq);
1134		return ret;
1135	}
1136
1137	write_seqcount_end(&devnet_rename_seq);
1138
1139	netdev_adjacent_rename_links(dev, oldname);
1140
1141	write_lock_bh(&dev_base_lock);
1142	hlist_del_rcu(&dev->name_hlist);
1143	write_unlock_bh(&dev_base_lock);
1144
1145	synchronize_rcu();
1146
1147	write_lock_bh(&dev_base_lock);
1148	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1149	write_unlock_bh(&dev_base_lock);
1150
1151	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1152	ret = notifier_to_errno(ret);
1153
1154	if (ret) {
1155		/* err >= 0 after dev_alloc_name() or stores the first errno */
1156		if (err >= 0) {
1157			err = ret;
1158			write_seqcount_begin(&devnet_rename_seq);
1159			memcpy(dev->name, oldname, IFNAMSIZ);
1160			memcpy(oldname, newname, IFNAMSIZ);
1161			dev->name_assign_type = old_assign_type;
1162			old_assign_type = NET_NAME_RENAMED;
1163			goto rollback;
1164		} else {
1165			pr_err("%s: name change rollback failed: %d\n",
1166			       dev->name, ret);
1167		}
1168	}
1169
1170	return err;
1171}
1172
1173/**
1174 *	dev_set_alias - change ifalias of a device
1175 *	@dev: device
1176 *	@alias: name up to IFALIASZ
1177 *	@len: limit of bytes to copy from info
1178 *
1179 *	Set ifalias for a device,
1180 */
1181int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1182{
1183	char *new_ifalias;
1184
1185	ASSERT_RTNL();
1186
1187	if (len >= IFALIASZ)
1188		return -EINVAL;
1189
1190	if (!len) {
1191		kfree(dev->ifalias);
1192		dev->ifalias = NULL;
1193		return 0;
1194	}
1195
1196	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1197	if (!new_ifalias)
1198		return -ENOMEM;
1199	dev->ifalias = new_ifalias;
1200
1201	strlcpy(dev->ifalias, alias, len+1);
1202	return len;
1203}
1204
1205
1206/**
1207 *	netdev_features_change - device changes features
1208 *	@dev: device to cause notification
1209 *
1210 *	Called to indicate a device has changed features.
1211 */
1212void netdev_features_change(struct net_device *dev)
1213{
1214	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1215}
1216EXPORT_SYMBOL(netdev_features_change);
1217
1218/**
1219 *	netdev_state_change - device changes state
1220 *	@dev: device to cause notification
1221 *
1222 *	Called to indicate a device has changed state. This function calls
1223 *	the notifier chains for netdev_chain and sends a NEWLINK message
1224 *	to the routing socket.
1225 */
1226void netdev_state_change(struct net_device *dev)
1227{
1228	if (dev->flags & IFF_UP) {
1229		struct netdev_notifier_change_info change_info;
1230
1231		change_info.flags_changed = 0;
1232		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1233					      &change_info.info);
1234		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1235	}
1236}
1237EXPORT_SYMBOL(netdev_state_change);
1238
1239/**
1240 * 	netdev_notify_peers - notify network peers about existence of @dev
1241 * 	@dev: network device
1242 *
1243 * Generate traffic such that interested network peers are aware of
1244 * @dev, such as by generating a gratuitous ARP. This may be used when
1245 * a device wants to inform the rest of the network about some sort of
1246 * reconfiguration such as a failover event or virtual machine
1247 * migration.
1248 */
1249void netdev_notify_peers(struct net_device *dev)
1250{
1251	rtnl_lock();
1252	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1253	rtnl_unlock();
1254}
1255EXPORT_SYMBOL(netdev_notify_peers);
1256
1257static int __dev_open(struct net_device *dev)
1258{
1259	const struct net_device_ops *ops = dev->netdev_ops;
1260	int ret;
1261
1262	ASSERT_RTNL();
1263
1264	if (!netif_device_present(dev))
1265		return -ENODEV;
1266
1267	/* Block netpoll from trying to do any rx path servicing.
1268	 * If we don't do this there is a chance ndo_poll_controller
1269	 * or ndo_poll may be running while we open the device
1270	 */
1271	netpoll_poll_disable(dev);
1272
1273	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1274	ret = notifier_to_errno(ret);
1275	if (ret)
1276		return ret;
1277
1278	set_bit(__LINK_STATE_START, &dev->state);
1279
1280	if (ops->ndo_validate_addr)
1281		ret = ops->ndo_validate_addr(dev);
1282
1283	if (!ret && ops->ndo_open)
1284		ret = ops->ndo_open(dev);
1285
1286	netpoll_poll_enable(dev);
1287
1288	if (ret)
1289		clear_bit(__LINK_STATE_START, &dev->state);
1290	else {
1291		dev->flags |= IFF_UP;
1292		dev_set_rx_mode(dev);
1293		dev_activate(dev);
1294		add_device_randomness(dev->dev_addr, dev->addr_len);
1295	}
1296
1297	return ret;
1298}
1299
1300/**
1301 *	dev_open	- prepare an interface for use.
1302 *	@dev:	device to open
1303 *
1304 *	Takes a device from down to up state. The device's private open
1305 *	function is invoked and then the multicast lists are loaded. Finally
1306 *	the device is moved into the up state and a %NETDEV_UP message is
1307 *	sent to the netdev notifier chain.
1308 *
1309 *	Calling this function on an active interface is a nop. On a failure
1310 *	a negative errno code is returned.
1311 */
1312int dev_open(struct net_device *dev)
1313{
1314	int ret;
1315
1316	if (dev->flags & IFF_UP)
1317		return 0;
1318
1319	ret = __dev_open(dev);
1320	if (ret < 0)
1321		return ret;
1322
1323	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1324	call_netdevice_notifiers(NETDEV_UP, dev);
1325
1326	return ret;
1327}
1328EXPORT_SYMBOL(dev_open);
1329
1330static int __dev_close_many(struct list_head *head)
1331{
1332	struct net_device *dev;
1333
1334	ASSERT_RTNL();
1335	might_sleep();
1336
1337	list_for_each_entry(dev, head, close_list) {
1338		/* Temporarily disable netpoll until the interface is down */
1339		netpoll_poll_disable(dev);
1340
1341		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1342
1343		clear_bit(__LINK_STATE_START, &dev->state);
1344
1345		/* Synchronize to scheduled poll. We cannot touch poll list, it
1346		 * can be even on different cpu. So just clear netif_running().
1347		 *
1348		 * dev->stop() will invoke napi_disable() on all of it's
1349		 * napi_struct instances on this device.
1350		 */
1351		smp_mb__after_atomic(); /* Commit netif_running(). */
1352	}
1353
1354	dev_deactivate_many(head);
1355
1356	list_for_each_entry(dev, head, close_list) {
1357		const struct net_device_ops *ops = dev->netdev_ops;
1358
1359		/*
1360		 *	Call the device specific close. This cannot fail.
1361		 *	Only if device is UP
1362		 *
1363		 *	We allow it to be called even after a DETACH hot-plug
1364		 *	event.
1365		 */
1366		if (ops->ndo_stop)
1367			ops->ndo_stop(dev);
1368
1369		dev->flags &= ~IFF_UP;
1370		netpoll_poll_enable(dev);
1371	}
1372
1373	return 0;
1374}
1375
1376static int __dev_close(struct net_device *dev)
1377{
1378	int retval;
1379	LIST_HEAD(single);
1380
1381	list_add(&dev->close_list, &single);
1382	retval = __dev_close_many(&single);
1383	list_del(&single);
1384
1385	return retval;
1386}
1387
1388static int dev_close_many(struct list_head *head)
1389{
1390	struct net_device *dev, *tmp;
1391
1392	/* Remove the devices that don't need to be closed */
1393	list_for_each_entry_safe(dev, tmp, head, close_list)
1394		if (!(dev->flags & IFF_UP))
1395			list_del_init(&dev->close_list);
1396
1397	__dev_close_many(head);
1398
1399	list_for_each_entry_safe(dev, tmp, head, close_list) {
1400		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1401		call_netdevice_notifiers(NETDEV_DOWN, dev);
1402		list_del_init(&dev->close_list);
1403	}
1404
1405	return 0;
1406}
1407
1408/**
1409 *	dev_close - shutdown an interface.
1410 *	@dev: device to shutdown
1411 *
1412 *	This function moves an active device into down state. A
1413 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1414 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1415 *	chain.
1416 */
1417int dev_close(struct net_device *dev)
1418{
1419	if (dev->flags & IFF_UP) {
1420		LIST_HEAD(single);
1421
1422		list_add(&dev->close_list, &single);
1423		dev_close_many(&single);
1424		list_del(&single);
1425	}
1426	return 0;
1427}
1428EXPORT_SYMBOL(dev_close);
1429
1430
1431/**
1432 *	dev_disable_lro - disable Large Receive Offload on a device
1433 *	@dev: device
1434 *
1435 *	Disable Large Receive Offload (LRO) on a net device.  Must be
1436 *	called under RTNL.  This is needed if received packets may be
1437 *	forwarded to another interface.
1438 */
1439void dev_disable_lro(struct net_device *dev)
1440{
1441	struct net_device *lower_dev;
1442	struct list_head *iter;
1443
1444	dev->wanted_features &= ~NETIF_F_LRO;
1445	netdev_update_features(dev);
1446
1447	if (unlikely(dev->features & NETIF_F_LRO))
1448		netdev_WARN(dev, "failed to disable LRO!\n");
1449
1450	netdev_for_each_lower_dev(dev, lower_dev, iter)
1451		dev_disable_lro(lower_dev);
1452}
1453EXPORT_SYMBOL(dev_disable_lro);
1454
1455static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1456				   struct net_device *dev)
1457{
1458	struct netdev_notifier_info info;
1459
1460	netdev_notifier_info_init(&info, dev);
1461	return nb->notifier_call(nb, val, &info);
1462}
1463
1464static int dev_boot_phase = 1;
1465
1466/**
1467 *	register_netdevice_notifier - register a network notifier block
1468 *	@nb: notifier
1469 *
1470 *	Register a notifier to be called when network device events occur.
1471 *	The notifier passed is linked into the kernel structures and must
1472 *	not be reused until it has been unregistered. A negative errno code
1473 *	is returned on a failure.
1474 *
1475 * 	When registered all registration and up events are replayed
1476 *	to the new notifier to allow device to have a race free
1477 *	view of the network device list.
1478 */
1479
1480int register_netdevice_notifier(struct notifier_block *nb)
1481{
1482	struct net_device *dev;
1483	struct net_device *last;
1484	struct net *net;
1485	int err;
1486
1487	rtnl_lock();
1488	err = raw_notifier_chain_register(&netdev_chain, nb);
1489	if (err)
1490		goto unlock;
1491	if (dev_boot_phase)
1492		goto unlock;
1493	for_each_net(net) {
1494		for_each_netdev(net, dev) {
1495			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1496			err = notifier_to_errno(err);
1497			if (err)
1498				goto rollback;
1499
1500			if (!(dev->flags & IFF_UP))
1501				continue;
1502
1503			call_netdevice_notifier(nb, NETDEV_UP, dev);
1504		}
1505	}
1506
1507unlock:
1508	rtnl_unlock();
1509	return err;
1510
1511rollback:
1512	last = dev;
1513	for_each_net(net) {
1514		for_each_netdev(net, dev) {
1515			if (dev == last)
1516				goto outroll;
1517
1518			if (dev->flags & IFF_UP) {
1519				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1520							dev);
1521				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1522			}
1523			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1524		}
1525	}
1526
1527outroll:
1528	raw_notifier_chain_unregister(&netdev_chain, nb);
1529	goto unlock;
1530}
1531EXPORT_SYMBOL(register_netdevice_notifier);
1532
1533/**
1534 *	unregister_netdevice_notifier - unregister a network notifier block
1535 *	@nb: notifier
1536 *
1537 *	Unregister a notifier previously registered by
1538 *	register_netdevice_notifier(). The notifier is unlinked into the
1539 *	kernel structures and may then be reused. A negative errno code
1540 *	is returned on a failure.
1541 *
1542 * 	After unregistering unregister and down device events are synthesized
1543 *	for all devices on the device list to the removed notifier to remove
1544 *	the need for special case cleanup code.
1545 */
1546
1547int unregister_netdevice_notifier(struct notifier_block *nb)
1548{
1549	struct net_device *dev;
1550	struct net *net;
1551	int err;
1552
1553	rtnl_lock();
1554	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1555	if (err)
1556		goto unlock;
1557
1558	for_each_net(net) {
1559		for_each_netdev(net, dev) {
1560			if (dev->flags & IFF_UP) {
1561				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1562							dev);
1563				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1564			}
1565			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1566		}
1567	}
1568unlock:
1569	rtnl_unlock();
1570	return err;
1571}
1572EXPORT_SYMBOL(unregister_netdevice_notifier);
1573
1574/**
1575 *	call_netdevice_notifiers_info - call all network notifier blocks
1576 *	@val: value passed unmodified to notifier function
1577 *	@dev: net_device pointer passed unmodified to notifier function
1578 *	@info: notifier information data
1579 *
1580 *	Call all network notifier blocks.  Parameters and return value
1581 *	are as for raw_notifier_call_chain().
1582 */
1583
1584static int call_netdevice_notifiers_info(unsigned long val,
1585					 struct net_device *dev,
1586					 struct netdev_notifier_info *info)
1587{
1588	ASSERT_RTNL();
1589	netdev_notifier_info_init(info, dev);
1590	return raw_notifier_call_chain(&netdev_chain, val, info);
1591}
1592
1593/**
1594 *	call_netdevice_notifiers - call all network notifier blocks
1595 *      @val: value passed unmodified to notifier function
1596 *      @dev: net_device pointer passed unmodified to notifier function
1597 *
1598 *	Call all network notifier blocks.  Parameters and return value
1599 *	are as for raw_notifier_call_chain().
1600 */
1601
1602int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1603{
1604	struct netdev_notifier_info info;
1605
1606	return call_netdevice_notifiers_info(val, dev, &info);
1607}
1608EXPORT_SYMBOL(call_netdevice_notifiers);
1609
1610static struct static_key netstamp_needed __read_mostly;
1611#ifdef HAVE_JUMP_LABEL
1612/* We are not allowed to call static_key_slow_dec() from irq context
1613 * If net_disable_timestamp() is called from irq context, defer the
1614 * static_key_slow_dec() calls.
1615 */
1616static atomic_t netstamp_needed_deferred;
1617#endif
1618
1619void net_enable_timestamp(void)
1620{
1621#ifdef HAVE_JUMP_LABEL
1622	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1623
1624	if (deferred) {
1625		while (--deferred)
1626			static_key_slow_dec(&netstamp_needed);
1627		return;
1628	}
1629#endif
1630	static_key_slow_inc(&netstamp_needed);
1631}
1632EXPORT_SYMBOL(net_enable_timestamp);
1633
1634void net_disable_timestamp(void)
1635{
1636#ifdef HAVE_JUMP_LABEL
1637	if (in_interrupt()) {
1638		atomic_inc(&netstamp_needed_deferred);
1639		return;
1640	}
1641#endif
1642	static_key_slow_dec(&netstamp_needed);
1643}
1644EXPORT_SYMBOL(net_disable_timestamp);
1645
1646static inline void net_timestamp_set(struct sk_buff *skb)
1647{
1648	skb->tstamp.tv64 = 0;
1649	if (static_key_false(&netstamp_needed))
1650		__net_timestamp(skb);
1651}
1652
/* Stamp @SKB via __net_timestamp() when the netstamp_needed key is enabled,
 * @COND holds and @SKB does not carry a timestamp yet.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement and
 * cannot capture an "else" that follows the call site.
 */
#define net_timestamp_check(COND, SKB)				\
	do {							\
		if (static_key_false(&netstamp_needed)) {	\
			if ((COND) && !(SKB)->tstamp.tv64)	\
				__net_timestamp(SKB);		\
		}						\
	} while (0)
1658
1659bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1660{
1661	unsigned int len;
1662
1663	if (!(dev->flags & IFF_UP))
1664		return false;
1665
1666	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1667	if (skb->len <= len)
1668		return true;
1669
1670	/* if TSO is enabled, we don't care about the length as the packet
1671	 * could be forwarded without being segmented before
1672	 */
1673	if (skb_is_gso(skb))
1674		return true;
1675
1676	return false;
1677}
1678EXPORT_SYMBOL_GPL(is_skb_forwardable);
1679
1680int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1681{
1682	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1683		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1684			atomic_long_inc(&dev->rx_dropped);
1685			kfree_skb(skb);
1686			return NET_RX_DROP;
1687		}
1688	}
1689
1690	if (unlikely(!is_skb_forwardable(dev, skb))) {
1691		atomic_long_inc(&dev->rx_dropped);
1692		kfree_skb(skb);
1693		return NET_RX_DROP;
1694	}
1695
1696	skb_scrub_packet(skb, true);
1697	skb->protocol = eth_type_trans(skb, dev);
1698	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1699
1700	return 0;
1701}
1702EXPORT_SYMBOL_GPL(__dev_forward_skb);
1703
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int rc = __dev_forward_skb(dev, skb);

	/* A non-zero result means the skb was already dropped and freed */
	if (rc)
		return rc;

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
1727
1728static inline int deliver_skb(struct sk_buff *skb,
1729			      struct packet_type *pt_prev,
1730			      struct net_device *orig_dev)
1731{
1732	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1733		return -ENOMEM;
1734	atomic_inc(&skb->users);
1735	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1736}
1737
1738static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1739					  struct packet_type **pt,
1740					  struct net_device *dev, __be16 type,
1741					  struct list_head *ptype_list)
1742{
1743	struct packet_type *ptype, *pt_prev = *pt;
1744
1745	list_for_each_entry_rcu(ptype, ptype_list, list) {
1746		if (ptype->type != type)
1747			continue;
1748		if (pt_prev)
1749			deliver_skb(skb, pt_prev, dev);
1750		pt_prev = ptype;
1751	}
1752	*pt = pt_prev;
1753}
1754
1755static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1756{
1757	if (!ptype->af_packet_priv || !skb->sk)
1758		return false;
1759
1760	if (ptype->id_match)
1761		return ptype->id_match(ptype, skb->sk);
1762	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1763		return true;
1764
1765	return false;
1766}
1767
1768/*
1769 *	Support routine. Sends outgoing frames to any network
1770 *	taps currently in use.
1771 */
1772
1773static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1774{
1775	struct packet_type *ptype;
1776	struct sk_buff *skb2 = NULL;
1777	struct packet_type *pt_prev = NULL;
1778	struct list_head *ptype_list = &ptype_all;
1779
1780	rcu_read_lock();
1781again:
1782	list_for_each_entry_rcu(ptype, ptype_list, list) {
1783		/* Never send packets back to the socket
1784		 * they originated from - MvS (miquels@drinkel.ow.org)
1785		 */
1786		if (skb_loop_sk(ptype, skb))
1787			continue;
1788
1789		if (pt_prev) {
1790			deliver_skb(skb2, pt_prev, skb->dev);
1791			pt_prev = ptype;
1792			continue;
1793		}
1794
1795		/* need to clone skb, done only once */
1796		skb2 = skb_clone(skb, GFP_ATOMIC);
1797		if (!skb2)
1798			goto out_unlock;
1799
1800		net_timestamp_set(skb2);
1801
1802		/* skb->nh should be correctly
1803		 * set by sender, so that the second statement is
1804		 * just protection against buggy protocols.
1805		 */
1806		skb_reset_mac_header(skb2);
1807
1808		if (skb_network_header(skb2) < skb2->data ||
1809		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1810			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1811					     ntohs(skb2->protocol),
1812					     dev->name);
1813			skb_reset_network_header(skb2);
1814		}
1815
1816		skb2->transport_header = skb2->network_header;
1817		skb2->pkt_type = PACKET_OUTGOING;
1818		pt_prev = ptype;
1819	}
1820
1821	if (ptype_list == &ptype_all) {
1822		ptype_list = &dev->ptype_all;
1823		goto again;
1824	}
1825out_unlock:
1826	if (pt_prev)
1827		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1828	rcu_read_unlock();
1829}
1830
1831/**
1832 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1833 * @dev: Network device
1834 * @txq: number of queues available
1835 *
1836 * If real_num_tx_queues is changed the tc mappings may no longer be
1837 * valid. To resolve this verify the tc mapping remains valid and if
1838 * not NULL the mapping. With no priorities mapping to this
1839 * offset/count pair it will no longer be used. In the worst case TC0
1840 * is invalid nothing can be done so disable priority mappings. If is
1841 * expected that drivers will fix this mapping if they can before
1842 * calling netif_set_real_num_tx_queues.
1843 */
1844static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1845{
1846	int i;
1847	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1848
1849	/* If TC0 is invalidated disable TC mapping */
1850	if (tc->offset + tc->count > txq) {
1851		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1852		dev->num_tc = 0;
1853		return;
1854	}
1855
1856	/* Invalidated prio to tc mappings set to TC0 */
1857	for (i = 1; i < TC_BITMASK + 1; i++) {
1858		int q = netdev_get_prio_tc_map(dev, i);
1859
1860		tc = &dev->tc_to_txq[q];
1861		if (tc->offset + tc->count > txq) {
1862			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1863				i, q);
1864			netdev_set_prio_tc_map(dev, i, 0);
1865		}
1866	}
1867}
1868
1869#ifdef CONFIG_XPS
1870static DEFINE_MUTEX(xps_map_mutex);
1871#define xmap_dereference(P)		\
1872	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1873
1874static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1875					int cpu, u16 index)
1876{
1877	struct xps_map *map = NULL;
1878	int pos;
1879
1880	if (dev_maps)
1881		map = xmap_dereference(dev_maps->cpu_map[cpu]);
1882
1883	for (pos = 0; map && pos < map->len; pos++) {
1884		if (map->queues[pos] == index) {
1885			if (map->len > 1) {
1886				map->queues[pos] = map->queues[--map->len];
1887			} else {
1888				RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1889				kfree_rcu(map, rcu);
1890				map = NULL;
1891			}
1892			break;
1893		}
1894	}
1895
1896	return map;
1897}
1898
1899static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1900{
1901	struct xps_dev_maps *dev_maps;
1902	int cpu, i;
1903	bool active = false;
1904
1905	mutex_lock(&xps_map_mutex);
1906	dev_maps = xmap_dereference(dev->xps_maps);
1907
1908	if (!dev_maps)
1909		goto out_no_maps;
1910
1911	for_each_possible_cpu(cpu) {
1912		for (i = index; i < dev->num_tx_queues; i++) {
1913			if (!remove_xps_queue(dev_maps, cpu, i))
1914				break;
1915		}
1916		if (i == dev->num_tx_queues)
1917			active = true;
1918	}
1919
1920	if (!active) {
1921		RCU_INIT_POINTER(dev->xps_maps, NULL);
1922		kfree_rcu(dev_maps, rcu);
1923	}
1924
1925	for (i = index; i < dev->num_tx_queues; i++)
1926		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1927					     NUMA_NO_NODE);
1928
1929out_no_maps:
1930	mutex_unlock(&xps_map_mutex);
1931}
1932
1933static struct xps_map *expand_xps_map(struct xps_map *map,
1934				      int cpu, u16 index)
1935{
1936	struct xps_map *new_map;
1937	int alloc_len = XPS_MIN_MAP_ALLOC;
1938	int i, pos;
1939
1940	for (pos = 0; map && pos < map->len; pos++) {
1941		if (map->queues[pos] != index)
1942			continue;
1943		return map;
1944	}
1945
1946	/* Need to add queue to this CPU's existing map */
1947	if (map) {
1948		if (pos < map->alloc_len)
1949			return map;
1950
1951		alloc_len = map->alloc_len * 2;
1952	}
1953
1954	/* Need to allocate new map to store queue on this CPU's map */
1955	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1956			       cpu_to_node(cpu));
1957	if (!new_map)
1958		return NULL;
1959
1960	for (i = 0; i < pos; i++)
1961		new_map->queues[i] = map->queues[i];
1962	new_map->alloc_len = alloc_len;
1963	new_map->len = pos;
1964
1965	return new_map;
1966}
1967
1968int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1969			u16 index)
1970{
1971	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1972	struct xps_map *map, *new_map;
1973	int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1974	int cpu, numa_node_id = -2;
1975	bool active = false;
1976
1977	mutex_lock(&xps_map_mutex);
1978
1979	dev_maps = xmap_dereference(dev->xps_maps);
1980
1981	/* allocate memory for queue storage */
1982	for_each_online_cpu(cpu) {
1983		if (!cpumask_test_cpu(cpu, mask))
1984			continue;
1985
1986		if (!new_dev_maps)
1987			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1988		if (!new_dev_maps) {
1989			mutex_unlock(&xps_map_mutex);
1990			return -ENOMEM;
1991		}
1992
1993		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1994				 NULL;
1995
1996		map = expand_xps_map(map, cpu, index);
1997		if (!map)
1998			goto error;
1999
2000		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2001	}
2002
2003	if (!new_dev_maps)
2004		goto out_no_new_maps;
2005
2006	for_each_possible_cpu(cpu) {
2007		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2008			/* add queue to CPU maps */
2009			int pos = 0;
2010
2011			map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2012			while ((pos < map->len) && (map->queues[pos] != index))
2013				pos++;
2014
2015			if (pos == map->len)
2016				map->queues[map->len++] = index;
2017#ifdef CONFIG_NUMA
2018			if (numa_node_id == -2)
2019				numa_node_id = cpu_to_node(cpu);
2020			else if (numa_node_id != cpu_to_node(cpu))
2021				numa_node_id = -1;
2022#endif
2023		} else if (dev_maps) {
2024			/* fill in the new device map from the old device map */
2025			map = xmap_dereference(dev_maps->cpu_map[cpu]);
2026			RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2027		}
2028
2029	}
2030
2031	rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2032
2033	/* Cleanup old maps */
2034	if (dev_maps) {
2035		for_each_possible_cpu(cpu) {
2036			new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2037			map = xmap_dereference(dev_maps->cpu_map[cpu]);
2038			if (map && map != new_map)
2039				kfree_rcu(map, rcu);
2040		}
2041
2042		kfree_rcu(dev_maps, rcu);
2043	}
2044
2045	dev_maps = new_dev_maps;
2046	active = true;
2047
2048out_no_new_maps:
2049	/* update Tx queue numa node */
2050	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2051				     (numa_node_id >= 0) ? numa_node_id :
2052				     NUMA_NO_NODE);
2053
2054	if (!dev_maps)
2055		goto out_no_maps;
2056
2057	/* removes queue from unused CPUs */
2058	for_each_possible_cpu(cpu) {
2059		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2060			continue;
2061
2062		if (remove_xps_queue(dev_maps, cpu, index))
2063			active = true;
2064	}
2065
2066	/* free map if not active */
2067	if (!active) {
2068		RCU_INIT_POINTER(dev->xps_maps, NULL);
2069		kfree_rcu(dev_maps, rcu);
2070	}
2071
2072out_no_maps:
2073	mutex_unlock(&xps_map_mutex);
2074
2075	return 0;
2076error:
2077	/* remove any maps that we added */
2078	for_each_possible_cpu(cpu) {
2079		new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2080		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2081				 NULL;
2082		if (new_map && new_map != map)
2083			kfree(new_map);
2084	}
2085
2086	mutex_unlock(&xps_map_mutex);
2087
2088	kfree(new_dev_maps);
2089	return -ENOMEM;
2090}
2091EXPORT_SYMBOL(netif_set_xps_queue);
2092
2093#endif
2094/*
2095 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2096 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2097 */
2098int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2099{
2100	int rc;
2101
2102	if (txq < 1 || txq > dev->num_tx_queues)
2103		return -EINVAL;
2104
2105	if (dev->reg_state == NETREG_REGISTERED ||
2106	    dev->reg_state == NETREG_UNREGISTERING) {
2107		ASSERT_RTNL();
2108
2109		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2110						  txq);
2111		if (rc)
2112			return rc;
2113
2114		if (dev->num_tc)
2115			netif_setup_tc(dev, txq);
2116
2117		if (txq < dev->real_num_tx_queues) {
2118			qdisc_reset_all_tx_gt(dev, txq);
2119#ifdef CONFIG_XPS
2120			netif_reset_xps_queues_gt(dev, txq);
2121#endif
2122		}
2123	}
2124
2125	dev->real_num_tx_queues = txq;
2126	return 0;
2127}
2128EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2129
2130#ifdef CONFIG_SYSFS
2131/**
2132 *	netif_set_real_num_rx_queues - set actual number of RX queues used
2133 *	@dev: Network device
2134 *	@rxq: Actual number of RX queues
2135 *
2136 *	This must be called either with the rtnl_lock held or before
2137 *	registration of the net device.  Returns 0 on success, or a
2138 *	negative error code.  If called before registration, it always
2139 *	succeeds.
2140 */
2141int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2142{
2143	int rc;
2144
2145	if (rxq < 1 || rxq > dev->num_rx_queues)
2146		return -EINVAL;
2147
2148	if (dev->reg_state == NETREG_REGISTERED) {
2149		ASSERT_RTNL();
2150
2151		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2152						  rxq);
2153		if (rc)
2154			return rc;
2155	}
2156
2157	dev->real_num_rx_queues = rxq;
2158	return 0;
2159}
2160EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2161#endif
2162
2163/**
2164 * netif_get_num_default_rss_queues - default number of RSS queues
2165 *
2166 * This routine should set an upper limit on the number of RSS queues
2167 * used by default by multiqueue devices.
2168 */
2169int netif_get_num_default_rss_queues(void)
2170{
2171	return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2172}
2173EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2174
2175static inline void __netif_reschedule(struct Qdisc *q)
2176{
2177	struct softnet_data *sd;
2178	unsigned long flags;
2179
2180	local_irq_save(flags);
2181	sd = this_cpu_ptr(&softnet_data);
2182	q->next_sched = NULL;
2183	*sd->output_queue_tailp = q;
2184	sd->output_queue_tailp = &q->next_sched;
2185	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2186	local_irq_restore(flags);
2187}
2188
2189void __netif_schedule(struct Qdisc *q)
2190{
2191	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2192		__netif_reschedule(q);
2193}
2194EXPORT_SYMBOL(__netif_schedule);
2195
/* Layout overlaid on skb->cb (see get_kfree_skb_cb()) for deferred frees. */
struct dev_kfree_skb_cb {
	/* Free reason stored by __dev_kfree_skb_irq(). */
	enum skb_free_reason reason;
};
2199
2200static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2201{
2202	return (struct dev_kfree_skb_cb *)skb->cb;
2203}
2204
2205void netif_schedule_queue(struct netdev_queue *txq)
2206{
2207	rcu_read_lock();
2208	if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2209		struct Qdisc *q = rcu_dereference(txq->qdisc);
2210
2211		__netif_schedule(q);
2212	}
2213	rcu_read_unlock();
2214}
2215EXPORT_SYMBOL(netif_schedule_queue);
2216
2217/**
2218 *	netif_wake_subqueue - allow sending packets on subqueue
2219 *	@dev: network device
2220 *	@queue_index: sub queue index
2221 *
2222 * Resume individual transmit queue of a device with multiple transmit queues.
2223 */
2224void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2225{
2226	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2227
2228	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2229		struct Qdisc *q;
2230
2231		rcu_read_lock();
2232		q = rcu_dereference(txq->qdisc);
2233		__netif_schedule(q);
2234		rcu_read_unlock();
2235	}
2236}
2237EXPORT_SYMBOL(netif_wake_subqueue);
2238
2239void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2240{
2241	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2242		struct Qdisc *q;
2243
2244		rcu_read_lock();
2245		q = rcu_dereference(dev_queue->qdisc);
2246		__netif_schedule(q);
2247		rcu_read_unlock();
2248	}
2249}
2250EXPORT_SYMBOL(netif_tx_wake_queue);
2251
/*
 * Drop one reference on @skb and, if it was the last one, defer the actual
 * free: @reason is stored in skb->cb and the skb is pushed onto this CPU's
 * softnet_data.completion_queue, then NET_TX_SOFTIRQ is raised.
 * NOTE(review): the queue is presumably drained by the NET_TX softirq
 * handler, which is outside this section.
 */
void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
{
	unsigned long flags;

	if (likely(atomic_read(&skb->users) == 1)) {
		/* Only reference: zero the count without an atomic decrement. */
		smp_rmb();
		atomic_set(&skb->users, 0);
	} else if (likely(!atomic_dec_and_test(&skb->users))) {
		/* Other references remain; nothing to queue. */
		return;
	}
	get_kfree_skb_cb(skb)->reason = reason;
	/* Push onto the head of the per-CPU list with local irqs disabled. */
	local_irq_save(flags);
	skb->next = __this_cpu_read(softnet_data.completion_queue);
	__this_cpu_write(softnet_data.completion_queue, skb);
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(__dev_kfree_skb_irq);
2270
2271void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2272{
2273	if (in_irq() || irqs_disabled())
2274		__dev_kfree_skb_irq(skb, reason);
2275	else
2276		dev_kfree_skb(skb);
2277}
2278EXPORT_SYMBOL(__dev_kfree_skb_any);
2279
2280
2281/**
2282 * netif_device_detach - mark device as removed
2283 * @dev: network device
2284 *
2285 * Mark device as removed from system and therefore no longer available.
2286 */
2287void netif_device_detach(struct net_device *dev)
2288{
2289	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2290	    netif_running(dev)) {
2291		netif_tx_stop_all_queues(dev);
2292	}
2293}
2294EXPORT_SYMBOL(netif_device_detach);
2295
2296/**
2297 * netif_device_attach - mark device as attached
2298 * @dev: network device
2299 *
2300 * Mark device as attached from system and restart if needed.
2301 */
2302void netif_device_attach(struct net_device *dev)
2303{
2304	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2305	    netif_running(dev)) {
2306		netif_tx_wake_all_queues(dev);
2307		__netdev_watchdog_up(dev);
2308	}
2309}
2310EXPORT_SYMBOL(netif_device_attach);
2311
2312static void skb_warn_bad_offload(const struct sk_buff *skb)
2313{
2314	static const netdev_features_t null_features = 0;
2315	struct net_device *dev = skb->dev;
2316	const char *driver = "";
2317
2318	if (!net_ratelimit())
2319		return;
2320
2321	if (dev && dev->dev.parent)
2322		driver = dev_driver_string(dev->dev.parent);
2323
2324	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2325	     "gso_type=%d ip_summed=%d\n",
2326	     driver, dev ? &dev->features : &null_features,
2327	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
2328	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2329	     skb_shinfo(skb)->gso_type, skb->ip_summed);
2330}
2331
/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 *
 * Returns 0 on success, -EINVAL if the skb has a gso_size set, or the
 * error from __skb_linearize() / pskb_expand_head().  On success
 * skb->ip_summed is left as CHECKSUM_NONE.
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	/* CHECKSUM_COMPLETE: nothing to compute, only reset ip_summed. */
	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	/* A GSO skb is not expected here: warn and refuse. */
	if (unlikely(skb_shinfo(skb)->gso_size)) {
		skb_warn_bad_offload(skb);
		return -EINVAL;
	}

	/* Before computing a checksum, we should make sure no frag could
	 * be modified by an external entity : checksum could be wrong.
	 */
	if (skb_has_shared_frag(skb)) {
		ret = __skb_linearize(skb);
		if (ret)
			goto out;
	}

	/* Checksum everything from the checksum start to the end of the skb. */
	offset = skb_checksum_start_offset(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	/* offset now points at the checksum field, which must be in the head. */
	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	/* Get a private head before writing if the clone is not writable there. */
	if (skb_cloned(skb) &&
	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}
EXPORT_SYMBOL(skb_checksum_help);
2379
2380__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2381{
2382	__be16 type = skb->protocol;
2383
2384	/* Tunnel gso handlers can set protocol to ethernet. */
2385	if (type == htons(ETH_P_TEB)) {
2386		struct ethhdr *eth;
2387
2388		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2389			return 0;
2390
2391		eth = (struct ethhdr *)skb_mac_header(skb);
2392		type = eth->h_proto;
2393	}
2394
2395	return __vlan_get_protocol(skb, type, depth);
2396}
2397
2398/**
2399 *	skb_mac_gso_segment - mac layer segmentation handler.
2400 *	@skb: buffer to segment
2401 *	@features: features for the output path (see dev->features)
2402 */
2403struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2404				    netdev_features_t features)
2405{
2406	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2407	struct packet_offload *ptype;
2408	int vlan_depth = skb->mac_len;
2409	__be16 type = skb_network_protocol(skb, &vlan_depth);
2410
2411	if (unlikely(!type))
2412		return ERR_PTR(-EINVAL);
2413
2414	__skb_pull(skb, vlan_depth);
2415
2416	rcu_read_lock();
2417	list_for_each_entry_rcu(ptype, &offload_base, list) {
2418		if (ptype->type == type && ptype->callbacks.gso_segment) {
2419			segs = ptype->callbacks.gso_segment(skb, features);
2420			break;
2421		}
2422	}
2423	rcu_read_unlock();
2424
2425	__skb_push(skb, skb->data - skb_mac_header(skb));
2426
2427	return segs;
2428}
2429EXPORT_SYMBOL(skb_mac_gso_segment);
2430
2431
2432/* openvswitch calls this on rx path, so we need a different check.
2433 */
2434static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2435{
2436	if (tx_path)
2437		return skb->ip_summed != CHECKSUM_PARTIAL;
2438	else
2439		return skb->ip_summed == CHECKSUM_NONE;
2440}
2441
2442/**
2443 *	__skb_gso_segment - Perform segmentation on skb.
2444 *	@skb: buffer to segment
2445 *	@features: features for the output path (see dev->features)
2446 *	@tx_path: whether it is called in TX path
2447 *
2448 *	This function segments the given skb and returns a list of segments.
2449 *
2450 *	It may return NULL if the skb requires no segmentation.  This is
2451 *	only possible when GSO is used for verifying header integrity.
2452 */
2453struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2454				  netdev_features_t features, bool tx_path)
2455{
2456	if (unlikely(skb_needs_check(skb, tx_path))) {
2457		int err;
2458
2459		skb_warn_bad_offload(skb);
2460
2461		err = skb_cow_head(skb, 0);
2462		if (err < 0)
2463			return ERR_PTR(err);
2464	}
2465
2466	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2467	SKB_GSO_CB(skb)->encap_level = 0;
2468
2469	skb_reset_mac_header(skb);
2470	skb_reset_mac_len(skb);
2471
2472	return skb_mac_gso_segment(skb, features);
2473}
2474EXPORT_SYMBOL(__skb_gso_segment);
2475
/* Take action when hardware reception checksum errors are detected:
 * log the device name (rate limited) and dump the stack.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2487
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 if any page fragment of @skb cannot be reached by @dev:
 * a highmem page without NETIF_F_HIGHDMA, or (when the bus is physically
 * addressed) a page beyond the parent device's DMA mask.  Returns 0
 * otherwise, and always 0 without CONFIG_HIGHMEM.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct device *pdev;
	int i;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	if (!(PCI_DMA_BUS_IS_PHYS))
		return 0;

	pdev = dev->dev.parent;
	if (!pdev)
		return 0;

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
		dma_addr_t addr = page_to_phys(skb_frag_page(frag));

		if (!pdev->dma_mask ||
		    addr + PAGE_SIZE - 1 > *pdev->dma_mask)
			return 1;
	}
#endif
	return 0;
}
2520
2521/* If MPLS offload request, verify we are testing hardware MPLS features
2522 * instead of standard features for the netdev.
2523 */
2524#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2525static netdev_features_t net_mpls_features(struct sk_buff *skb,
2526					   netdev_features_t features,
2527					   __be16 type)
2528{
2529	if (eth_p_mpls(type))
2530		features &= skb->dev->mpls_features;
2531
2532	return features;
2533}
2534#else
2535static netdev_features_t net_mpls_features(struct sk_buff *skb,
2536					   netdev_features_t features,
2537					   __be16 type)
2538{
2539	return features;
2540}
2541#endif
2542
2543static netdev_features_t harmonize_features(struct sk_buff *skb,
2544	netdev_features_t features)
2545{
2546	int tmp;
2547	__be16 type;
2548
2549	type = skb_network_protocol(skb, &tmp);
2550	features = net_mpls_features(skb, features, type);
2551
2552	if (skb->ip_summed != CHECKSUM_NONE &&
2553	    !can_checksum_protocol(features, type)) {
2554		features &= ~NETIF_F_ALL_CSUM;
2555	} else if (illegal_highdma(skb->dev, skb)) {
2556		features &= ~NETIF_F_SG;
2557	}
2558
2559	return features;
2560}
2561
/*
 * Compute the set of device features usable for transmitting @skb on
 * skb->dev.  Starts from dev->features and narrows it according to the GSO
 * segment count, encapsulation, VLAN tagging, the driver's
 * ndo_features_check() hook and finally harmonize_features().
 */
netdev_features_t netif_skb_features(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	netdev_features_t features = dev->features;
	u16 gso_segs = skb_shinfo(skb)->gso_segs;
	__be16 protocol = skb->protocol;

	/* Segment count outside the device's limits: no GSO offload. */
	if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
		features &= ~NETIF_F_GSO_MASK;

	/* If encapsulation offload request, verify we are testing
	 * hardware encapsulation features instead of standard
	 * features for the netdev
	 */
	if (skb->encapsulation)
		features &= dev->hw_enc_features;

	if (!skb_vlan_tag_present(skb)) {
		if (unlikely(protocol == htons(ETH_P_8021Q) ||
			     protocol == htons(ETH_P_8021AD))) {
			/* In-band VLAN header: look at the encapsulated protocol. */
			struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
			protocol = veh->h_vlan_encapsulated_proto;
		} else {
			/* Not a VLAN packet at all: skip the VLAN restrictions. */
			goto finalize;
		}
	}

	/* VLAN packet: restrict to vlan_features plus the VLAN TX offloads. */
	features = netdev_intersect_features(features,
					     dev->vlan_features |
					     NETIF_F_HW_VLAN_CTAG_TX |
					     NETIF_F_HW_VLAN_STAG_TX);

	/* Still a VLAN protocol after the first tag: restrict further. */
	if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
		features = netdev_intersect_features(features,
						     NETIF_F_SG |
						     NETIF_F_HIGHDMA |
						     NETIF_F_FRAGLIST |
						     NETIF_F_GEN_CSUM |
						     NETIF_F_HW_VLAN_CTAG_TX |
						     NETIF_F_HW_VLAN_STAG_TX);

finalize:
	/* Let the driver veto features for this particular skb. */
	if (dev->netdev_ops->ndo_features_check)
		features &= dev->netdev_ops->ndo_features_check(skb, dev,
								features);

	return harmonize_features(skb, features);
}
EXPORT_SYMBOL(netif_skb_features);
2611
2612static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2613		    struct netdev_queue *txq, bool more)
2614{
2615	unsigned int len;
2616	int rc;
2617
2618	if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2619		dev_queue_xmit_nit(skb, dev);
2620
2621	len = skb->len;
2622	trace_net_dev_start_xmit(skb, dev);
2623	rc = netdev_start_xmit(skb, dev, txq, more);
2624	trace_net_dev_xmit(skb, rc, dev, len);
2625
2626	return rc;
2627}
2628
2629struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2630				    struct netdev_queue *txq, int *ret)
2631{
2632	struct sk_buff *skb = first;
2633	int rc = NETDEV_TX_OK;
2634
2635	while (skb) {
2636		struct sk_buff *next = skb->next;
2637
2638		skb->next = NULL;
2639		rc = xmit_one(skb, dev, txq, next != NULL);
2640		if (unlikely(!dev_xmit_complete(rc))) {
2641			skb->next = next;
2642			goto out;
2643		}
2644
2645		skb = next;
2646		if (netif_xmit_stopped(txq) && skb) {
2647			rc = NETDEV_TX_BUSY;
2648			break;
2649		}
2650	}
2651
2652out:
2653	*ret = rc;
2654	return skb;
2655}
2656
2657static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2658					  netdev_features_t features)
2659{
2660	if (skb_vlan_tag_present(skb) &&
2661	    !vlan_hw_offload_capable(features, skb->vlan_proto))
2662		skb = __vlan_hwaccel_push_inside(skb);
2663	return skb;
2664}
2665
/*
 * Prepare @skb for transmission on @dev: resolve VLAN tags the device
 * cannot offload, software-segment GSO packets the device cannot handle,
 * linearize if required and complete the checksum in software when the
 * device offers no checksum offload.
 *
 * Returns the skb (or segment list) to transmit, or NULL if the skb was
 * freed on error.  An skb that already has ->next set is returned
 * untouched.
 */
static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
{
	netdev_features_t features;

	if (skb->next)
		return skb;

	features = netif_skb_features(skb);
	skb = validate_xmit_vlan(skb, features);
	/* NULL here goes to out_null without a kfree_skb() call. */
	if (unlikely(!skb))
		goto out_null;

	if (netif_needs_gso(dev, skb, features)) {
		struct sk_buff *segs;

		segs = skb_gso_segment(skb, features);
		if (IS_ERR(segs)) {
			goto out_kfree_skb;
		} else if (segs) {
			/* Segmentation produced a list: release the original. */
			consume_skb(skb);
			skb = segs;
		}
	} else {
		if (skb_needs_linearize(skb, features) &&
		    __skb_linearize(skb))
			goto out_kfree_skb;

		/* If packet is not checksummed and device does not
		 * support checksumming for this protocol, complete
		 * checksumming here.
		 */
		if (skb->ip_summed == CHECKSUM_PARTIAL) {
			if (skb->encapsulation)
				skb_set_inner_transport_header(skb,
							       skb_checksum_start_offset(skb));
			else
				skb_set_transport_header(skb,
							 skb_checksum_start_offset(skb));
			if (!(features & NETIF_F_ALL_CSUM) &&
			    skb_checksum_help(skb))
				goto out_kfree_skb;
		}
	}

	return skb;

out_kfree_skb:
	kfree_skb(skb);
out_null:
	return NULL;
}
2717
2718struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2719{
2720	struct sk_buff *next, *head = NULL, *tail;
2721
2722	for (; skb != NULL; skb = next) {
2723		next = skb->next;
2724		skb->next = NULL;
2725
2726		/* in case skb wont be segmented, point to itself */
2727		skb->prev = skb;
2728
2729		skb = validate_xmit_skb(skb, dev);
2730		if (!skb)
2731			continue;
2732
2733		if (!head)
2734			head = skb;
2735		else
2736			tail->next = skb;
2737		/* If skb was segmented, skb->prev points to
2738		 * the last segment. If not, it still contains skb.
2739		 */
2740		tail = skb->prev;
2741	}
2742	return head;
2743}
2744
2745static void qdisc_pkt_len_init(struct sk_buff *skb)
2746{
2747	const struct skb_shared_info *shinfo = skb_shinfo(skb);
2748
2749	qdisc_skb_cb(skb)->pkt_len = skb->len;
2750
2751	/* To get more precise estimation of bytes sent on wire,
2752	 * we add to pkt_len the headers size of all segments
2753	 */
2754	if (shinfo->gso_size)  {
2755		unsigned int hdr_len;
2756		u16 gso_segs = shinfo->gso_segs;
2757
2758		/* mac layer + network layer */
2759		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2760
2761		/* + transport layer */
2762		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2763			hdr_len += tcp_hdrlen(skb);
2764		else
2765			hdr_len += sizeof(struct udphdr);
2766
2767		if (shinfo->gso_type & SKB_GSO_DODGY)
2768			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2769						shinfo->gso_size);
2770
2771		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2772	}
2773}
2774
/*
 * Send @skb through qdisc @q of @txq on @dev.
 *
 * Under the qdisc root lock the skb is dropped if the qdisc is
 * deactivated, transmitted directly if the qdisc can be bypassed, is empty
 * and not running, or otherwise enqueued (and the qdisc run if this caller
 * becomes its owner).  Returns NET_XMIT_DROP, NET_XMIT_SUCCESS or the
 * masked result of q->enqueue().
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	bool contended;
	int rc;

	qdisc_pkt_len_init(skb);
	qdisc_calculate_pkt_len(skb, q);
	/*
	 * Heuristic to force contended enqueues to serialize on a
	 * separate lock before trying to get qdisc main lock.
	 * This permits __QDISC___STATE_RUNNING owner to get the lock more
	 * often and dequeue packets faster.
	 */
	contended = qdisc_is_running(q);
	if (unlikely(contended))
		spin_lock(&q->busylock);

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		kfree_skb(skb);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   qdisc_run_begin(q)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */

		qdisc_bstats_update(q, skb);

		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
			/* Release busylock before running the qdisc. */
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		} else
			qdisc_run_end(q);

		rc = NET_XMIT_SUCCESS;
	} else {
		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
		if (qdisc_run_begin(q)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		}
	}
	spin_unlock(root_lock);
	/* busylock is still held if none of the paths above released it. */
	if (unlikely(contended))
		spin_unlock(&q->busylock);
	return rc;
}
2834
2835#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2836static void skb_update_prio(struct sk_buff *skb)
2837{
2838	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2839
2840	if (!skb->priority && skb->sk && map) {
2841		unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2842
2843		if (prioidx < map->priomap_len)
2844			skb->priority = map->priomap[prioidx];
2845	}
2846}
2847#else
2848#define skb_update_prio(skb)
2849#endif
2850
/* Per-CPU counter incremented around dev_hard_start_xmit() in
 * __dev_queue_xmit() on the queue-less device path.
 */
DEFINE_PER_CPU(int, xmit_recursion);
EXPORT_SYMBOL(xmit_recursion);

/* __dev_queue_xmit() drops the packet once xmit_recursion exceeds this. */
#define RECURSION_LIMIT 10
2855
/**
 *	dev_loopback_xmit - loop back @skb
 *	@skb: buffer to transmit
 *
 *	Marks @skb as PACKET_LOOPBACK with CHECKSUM_UNNECESSARY and hands
 *	it to netif_rx_ni().  Warns if the skb has no dst.  Always
 *	returns 0.
 */
int dev_loopback_xmit(struct sk_buff *skb)
{
	skb_reset_mac_header(skb);
	/* Drop everything in front of the network header. */
	__skb_pull(skb, skb_network_offset(skb));
	skb->pkt_type = PACKET_LOOPBACK;
	skb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(skb));
	skb_dst_force(skb);
	netif_rx_ni(skb);
	return 0;
}
EXPORT_SYMBOL(dev_loopback_xmit);
2872
/**
 *	__dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *	@accel_priv: private data used for L2 forwarding offload
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 */
static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;

	skb_reset_mac_header(skb);

	/* Report a SCM_TSTAMP_SCHED timestamp if the skb asked for one. */
	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	skb_update_prio(skb);

	/* If device/qdisc don't need skb->dst, release it right now while
	 * its hot in this cpu cache.
	 */
	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
		skb_dst_drop(skb);
	else
		skb_dst_force(skb);

	txq = netdev_pick_tx(dev, skb, accel_priv);
	q = rcu_dereference_bh(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
	trace_net_dev_queue(skb);
	/* A qdisc with an enqueue method takes the skb from here. */
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all the sorts of tunnels...

	   Really, it is unlikely that netif_tx_lock protection is necessary
	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	   counters.)
	   However, it is possible, that they rely on protection
	   made by us here.

	   Check this and shot the lock. It is not prone from deadlocks.
	   Either shot noqueue qdisc, it is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		/* Same owner means this CPU is already inside this queue's xmit. */
		if (txq->xmit_lock_owner != cpu) {

			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
				goto recursion_alert;

			/* On NULL, rc still holds its initial -ENOMEM. */
			skb = validate_xmit_skb(skb, dev);
			if (!skb)
				goto drop;

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_xmit_stopped(txq)) {
				__this_cpu_inc(xmit_recursion);
				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
				__this_cpu_dec(xmit_recursion);
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
					     dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
recursion_alert:
			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
					     dev->name);
		}
	}

	/* Device down, stopped queue, incomplete xmit or recursion: drop. */
	rc = -ENETDOWN;
drop:
	rcu_read_unlock_bh();

	atomic_long_inc(&dev->tx_dropped);
	kfree_skb_list(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
2997
/*
 * Queue @skb for transmission: __dev_queue_xmit() with a NULL accel_priv.
 * Returns whatever __dev_queue_xmit() returns.
 */
int dev_queue_xmit(struct sk_buff *skb)
{
	return __dev_queue_xmit(skb, NULL);
}
EXPORT_SYMBOL(dev_queue_xmit);
3003
/*
 * Queue @skb for transmission, passing @accel_priv (private data used for
 * L2 forwarding offload) through to __dev_queue_xmit().  Returns its result.
 */
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
	return __dev_queue_xmit(skb, accel_priv);
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
3009
3010
3011/*=======================================================================
3012			Receiver routines
3013  =======================================================================*/
3014
/* Receive-path tunables and their defaults.
 * NOTE(review): the code consuming them lies outside this section, so
 * their exact semantics are not restated here.
 */
int netdev_max_backlog __read_mostly = 1000;
EXPORT_SYMBOL(netdev_max_backlog);

int netdev_tstamp_prequeue __read_mostly = 1;
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */
3021
/* Called with irq disabled.
 * Appends @napi to the tail of @sd's poll_list and raises NET_RX_SOFTIRQ.
 */
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
3029
3030#ifdef CONFIG_RPS
3031
/* One global table that all flow-based protocols share. */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
/* Mask used by get_rps_cpu() to split a sock flow table entry into its
 * CPU bits (entry & mask) and the hash bits compared against the skb hash.
 */
u32 rps_cpu_mask __read_mostly;
EXPORT_SYMBOL(rps_cpu_mask);

/* NOTE(review): not referenced in this section; presumably a static key
 * flagging that RPS is in use — confirm against its users.
 */
struct static_key rps_needed __read_mostly;
3039
/*
 * Point the flow described by @rflow at @next_cpu and return the flow
 * entry now in use.
 *
 * With CONFIG_RFS_ACCEL, and when the device supports it, the driver is
 * asked through ndo_rx_flow_steer() to steer the flow to the RX queue that
 * cpu_rmap_lookup_index() returns for @next_cpu; on success the returned
 * entry is the one in that queue's flow table.  Unless @next_cpu is
 * RPS_NO_CPU, last_qtail is set to the target CPU's current
 * input_queue_head.
 */
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
	    struct rps_dev_flow *rflow, u16 next_cpu)
{
	if (next_cpu != RPS_NO_CPU) {
#ifdef CONFIG_RFS_ACCEL
		struct netdev_rx_queue *rxqueue;
		struct rps_dev_flow_table *flow_table;
		struct rps_dev_flow *old_rflow;
		u32 flow_id;
		u16 rxq_index;
		int rc;

		/* Should we steer this flow to a different hardware queue? */
		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
		    !(dev->features & NETIF_F_NTUPLE))
			goto out;
		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
		/* Already arriving on the right queue: nothing to steer. */
		if (rxq_index == skb_get_rx_queue(skb))
			goto out;

		rxqueue = dev->_rx + rxq_index;
		flow_table = rcu_dereference(rxqueue->rps_flow_table);
		if (!flow_table)
			goto out;
		flow_id = skb_get_hash(skb) & flow_table->mask;
		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
							rxq_index, flow_id);
		if (rc < 0)
			goto out;
		/* Switch to the entry in the target queue's table; a
		 * non-negative rc is stored as the filter id.
		 */
		old_rflow = rflow;
		rflow = &flow_table->flows[flow_id];
		rflow->filter = rc;
		/* The old entry no longer owns this filter id. */
		if (old_rflow->filter == rflow->filter)
			old_rflow->filter = RPS_NO_FILTER;
	out:
#endif
		rflow->last_qtail =
			per_cpu(softnet_data, next_cpu).input_queue_head;
	}

	rflow->cpu = next_cpu;
	return rflow;
}
3084
/*
 * get_rps_cpu is called from netif_receive_skb and returns the target
 * CPU from the RPS map of the receiving queue for a given skb.
 * rcu_read_lock must be held on entry.
 *
 * Returns -1 when no CPU is selected.  *@rflowp is only written when the
 * CPU comes from the flow tables, not when it comes from the RPS map.
 */
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
		       struct rps_dev_flow **rflowp)
{
	const struct rps_sock_flow_table *sock_flow_table;
	struct netdev_rx_queue *rxqueue = dev->_rx;
	struct rps_dev_flow_table *flow_table;
	struct rps_map *map;
	int cpu = -1;
	u32 tcpu;
	u32 hash;

	/* Use the recorded RX queue if valid, else the first queue. */
	if (skb_rx_queue_recorded(skb)) {
		u16 index = skb_get_rx_queue(skb);

		if (unlikely(index >= dev->real_num_rx_queues)) {
			WARN_ONCE(dev->real_num_rx_queues > 1,
				  "%s received packet on queue %u, but number "
				  "of RX queues is %u\n",
				  dev->name, index, dev->real_num_rx_queues);
			goto done;
		}
		rxqueue += index;
	}

	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */

	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	map = rcu_dereference(rxqueue->rps_map);
	if (!flow_table && !map)
		goto done;

	skb_reset_network_header(skb);
	hash = skb_get_hash(skb);
	/* A zero hash is treated as "no hash": no CPU is selected. */
	if (!hash)
		goto done;

	sock_flow_table = rcu_dereference(rps_sock_flow_table);
	if (flow_table && sock_flow_table) {
		struct rps_dev_flow *rflow;
		u32 next_cpu;
		u32 ident;

		/* First check into global flow table if there is a match */
		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
		/* Bits outside rps_cpu_mask must equal the hash's bits. */
		if ((ident ^ hash) & ~rps_cpu_mask)
			goto try_rps;

		next_cpu = ident & rps_cpu_mask;

		/* OK, now we know there is a match,
		 * we can look at the local (per receive queue) flow table
		 */
		rflow = &flow_table->flows[hash & flow_table->mask];
		tcpu = rflow->cpu;

		/*
		 * If the desired CPU (where last recvmsg was done) is
		 * different from current CPU (one in the rx-queue flow
		 * table entry), switch if one of the following holds:
		 *   - Current CPU is unset (equal to RPS_NO_CPU).
		 *   - Current CPU is offline.
		 *   - The current CPU's queue tail has advanced beyond the
		 *     last packet that was enqueued using this table entry.
		 *     This guarantees that all previous packets for the flow
		 *     have been dequeued, thus preserving in order delivery.
		 */
		if (unlikely(tcpu != next_cpu) &&
		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
		      rflow->last_qtail)) >= 0)) {
			tcpu = next_cpu;
			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
		}

		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
			*rflowp = rflow;
			cpu = tcpu;
			goto done;
		}
	}

try_rps:

	/* Fall back to the RPS map: scale the hash onto its CPU array. */
	if (map) {
		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
		if (cpu_online(tcpu)) {
			cpu = tcpu;
			goto done;
		}
	}

done:
	return cpu;
}
3184
3185#ifdef CONFIG_RFS_ACCEL
3186
3187/**
3188 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3189 * @dev: Device on which the filter was set
3190 * @rxq_index: RX queue index
3191 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3192 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3193 *
3194 * Drivers that implement ndo_rx_flow_steer() should periodically call
3195 * this function for each installed filter and remove the filters for
3196 * which it returns %true.
3197 */
3198bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3199			 u32 flow_id, u16 filter_id)
3200{
3201	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3202	struct rps_dev_flow_table *flow_table;
3203	struct rps_dev_flow *rflow;
3204	bool expire = true;
3205	int cpu;
3206
3207	rcu_read_lock();
3208	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3209	if (flow_table && flow_id <= flow_table->mask) {
3210		rflow = &flow_table->flows[flow_id];
3211		cpu = ACCESS_ONCE(rflow->cpu);
3212		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3213		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3214			   rflow->last_qtail) <
3215		     (int)(10 * flow_table->mask)))
3216			expire = false;
3217	}
3218	rcu_read_unlock();
3219	return expire;
3220}
3221EXPORT_SYMBOL(rps_may_expire_flow);
3222
3223#endif /* CONFIG_RFS_ACCEL */
3224
3225/* Called from hardirq (IPI) context */
3226static void rps_trigger_softirq(void *data)
3227{
3228	struct softnet_data *sd = data;
3229
3230	____napi_schedule(sd, &sd->backlog);
3231	sd->received_rps++;
3232}
3233
3234#endif /* CONFIG_RPS */
3235
/*
 * Check whether @sd belongs to a CPU other than the current one.
 * If it does, link it onto the current CPU's pending-IPI list, raise
 * NET_RX_SOFTIRQ and return 1.
 * Otherwise (or when RPS is not configured) return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local = this_cpu_ptr(&softnet_data);

	if (sd == local)
		return 0;

	/* push sd on the head of our singly linked IPI list */
	sd->rps_ipi_next = local->rps_ipi_list;
	local->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
3256
3257#ifdef CONFIG_NET_FLOW_LIMIT
3258int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3259#endif
3260
3261static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3262{
3263#ifdef CONFIG_NET_FLOW_LIMIT
3264	struct sd_flow_limit *fl;
3265	struct softnet_data *sd;
3266	unsigned int old_flow, new_flow;
3267
3268	if (qlen < (netdev_max_backlog >> 1))
3269		return false;
3270
3271	sd = this_cpu_ptr(&softnet_data);
3272
3273	rcu_read_lock();
3274	fl = rcu_dereference(sd->flow_limit);
3275	if (fl) {
3276		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3277		old_flow = fl->history[fl->history_head];
3278		fl->history[fl->history_head] = new_flow;
3279
3280		fl->history_head++;
3281		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3282
3283		if (likely(fl->buckets[old_flow]))
3284			fl->buckets[old_flow]--;
3285
3286		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3287			fl->count++;
3288			rcu_read_unlock();
3289			return true;
3290		}
3291	}
3292	rcu_read_unlock();
3293#endif
3294	return false;
3295}
3296
3297/*
3298 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3299 * queue (may be a remote CPU queue).
3300 */
3301static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3302			      unsigned int *qtail)
3303{
3304	struct softnet_data *sd;
3305	unsigned long flags;
3306	unsigned int qlen;
3307
3308	sd = &per_cpu(softnet_data, cpu);
3309
3310	local_irq_save(flags);
3311
3312	rps_lock(sd);
3313	qlen = skb_queue_len(&sd->input_pkt_queue);
3314	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3315		if (qlen) {
3316enqueue:
3317			__skb_queue_tail(&sd->input_pkt_queue, skb);
3318			input_queue_tail_incr_save(sd, qtail);
3319			rps_unlock(sd);
3320			local_irq_restore(flags);
3321			return NET_RX_SUCCESS;
3322		}
3323
3324		/* Schedule NAPI for backlog device
3325		 * We can use non atomic operation since we own the queue lock
3326		 */
3327		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3328			if (!rps_ipi_queued(sd))
3329				____napi_schedule(sd, &sd->backlog);
3330		}
3331		goto enqueue;
3332	}
3333
3334	sd->dropped++;
3335	rps_unlock(sd);
3336
3337	local_irq_restore(flags);
3338
3339	atomic_long_inc(&skb->dev->rx_dropped);
3340	kfree_skb(skb);
3341	return NET_RX_DROP;
3342}
3343
3344static int netif_rx_internal(struct sk_buff *skb)
3345{
3346	int ret;
3347
3348	net_timestamp_check(netdev_tstamp_prequeue, skb);
3349
3350	trace_netif_rx(skb);
3351#ifdef CONFIG_RPS
3352	if (static_key_false(&rps_needed)) {
3353		struct rps_dev_flow voidflow, *rflow = &voidflow;
3354		int cpu;
3355
3356		preempt_disable();
3357		rcu_read_lock();
3358
3359		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3360		if (cpu < 0)
3361			cpu = smp_processor_id();
3362
3363		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3364
3365		rcu_read_unlock();
3366		preempt_enable();
3367	} else
3368#endif
3369	{
3370		unsigned int qtail;
3371		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3372		put_cpu();
3373	}
3374	return ret;
3375}
3376
/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	This function receives a packet from a device driver and queues it
 *	on a per CPU backlog for the upper (protocol) levels to process.
 *	The buffer may be dropped during processing for congestion control
 *	or by the protocol layers.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
	int ret;

	trace_netif_rx_entry(skb);
	ret = netif_rx_internal(skb);

	return ret;
}
EXPORT_SYMBOL(netif_rx);
3399
/*
 * Variant of netif_rx() that, with preemption disabled, queues the skb
 * and then runs do_softirq() if any softirq is pending on this CPU.
 * Returns the result of netif_rx_internal().
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();

	ret = netif_rx_internal(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
3415
3416static void net_tx_action(struct softirq_action *h)
3417{
3418	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3419
3420	if (sd->completion_queue) {
3421		struct sk_buff *clist;
3422
3423		local_irq_disable();
3424		clist = sd->completion_queue;
3425		sd->completion_queue = NULL;
3426		local_irq_enable();
3427
3428		while (clist) {
3429			struct sk_buff *skb = clist;
3430			clist = clist->next;
3431
3432			WARN_ON(atomic_read(&skb->users));
3433			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3434				trace_consume_skb(skb);
3435			else
3436				trace_kfree_skb(skb, net_tx_action);
3437			__kfree_skb(skb);
3438		}
3439	}
3440
3441	if (sd->output_queue) {
3442		struct Qdisc *head;
3443
3444		local_irq_disable();
3445		head = sd->output_queue;
3446		sd->output_queue = NULL;
3447		sd->output_queue_tailp = &sd->output_queue;
3448		local_irq_enable();
3449
3450		while (head) {
3451			struct Qdisc *q = head;
3452			spinlock_t *root_lock;
3453
3454			head = head->next_sched;
3455
3456			root_lock = qdisc_lock(q);
3457			if (spin_trylock(root_lock)) {
3458				smp_mb__before_atomic();
3459				clear_bit(__QDISC_STATE_SCHED,
3460					  &q->state);
3461				qdisc_run(q);
3462				spin_unlock(root_lock);
3463			} else {
3464				if (!test_bit(__QDISC_STATE_DEACTIVATED,
3465					      &q->state)) {
3466					__netif_reschedule(q);
3467				} else {
3468					smp_mb__before_atomic();
3469					clear_bit(__QDISC_STATE_SCHED,
3470						  &q->state);
3471				}
3472			}
3473		}
3474	}
3475}
3476
3477#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3478    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3479/* This hook is defined here for ATM LANE */
3480int (*br_fdb_test_addr_hook)(struct net_device *dev,
3481			     unsigned char *addr) __read_mostly;
3482EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3483#endif
3484
3485#ifdef CONFIG_NET_CLS_ACT
3486/* TODO: Maybe we should just force sch_ingress to be compiled in
3487 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3488 * a compare and 2 stores extra right now if we dont have it on
3489 * but have CONFIG_NET_CLS_ACT
3490 * NOTE: This doesn't stop any functionality; if you dont have
3491 * the ingress scheduler, you just can't add policies on ingress.
3492 *
3493 */
3494static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3495{
3496	struct net_device *dev = skb->dev;
3497	u32 ttl = G_TC_RTTL(skb->tc_verd);
3498	int result = TC_ACT_OK;
3499	struct Qdisc *q;
3500
3501	if (unlikely(MAX_RED_LOOP < ttl++)) {
3502		net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3503				     skb->skb_iif, dev->ifindex);
3504		return TC_ACT_SHOT;
3505	}
3506
3507	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3508	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3509
3510	q = rcu_dereference(rxq->qdisc);
3511	if (q != &noop_qdisc) {
3512		spin_lock(qdisc_lock(q));
3513		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3514			result = qdisc_enqueue_root(skb, q);
3515		spin_unlock(qdisc_lock(q));
3516	}
3517
3518	return result;
3519}
3520
3521static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3522					 struct packet_type **pt_prev,
3523					 int *ret, struct net_device *orig_dev)
3524{
3525	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3526
3527	if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
3528		goto out;
3529
3530	if (*pt_prev) {
3531		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3532		*pt_prev = NULL;
3533	}
3534
3535	switch (ing_filter(skb, rxq)) {
3536	case TC_ACT_SHOT:
3537	case TC_ACT_STOLEN:
3538		kfree_skb(skb);
3539		return NULL;
3540	}
3541
3542out:
3543	skb->tc_verd = 0;
3544	return skb;
3545}
3546#endif
3547
3548/**
3549 *	netdev_rx_handler_register - register receive handler
3550 *	@dev: device to register a handler for
3551 *	@rx_handler: receive handler to register
3552 *	@rx_handler_data: data pointer that is used by rx handler
3553 *
3554 *	Register a receive handler for a device. This handler will then be
3555 *	called from __netif_receive_skb. A negative errno code is returned
3556 *	on a failure.
3557 *
3558 *	The caller must hold the rtnl_mutex.
3559 *
3560 *	For a general description of rx_handler, see enum rx_handler_result.
3561 */
3562int netdev_rx_handler_register(struct net_device *dev,
3563			       rx_handler_func_t *rx_handler,
3564			       void *rx_handler_data)
3565{
3566	ASSERT_RTNL();
3567
3568	if (dev->rx_handler)
3569		return -EBUSY;
3570
3571	/* Note: rx_handler_data must be set before rx_handler */
3572	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3573	rcu_assign_pointer(dev->rx_handler, rx_handler);
3574
3575	return 0;
3576}
3577EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3578
3579/**
3580 *	netdev_rx_handler_unregister - unregister receive handler
3581 *	@dev: device to unregister a handler from
3582 *
3583 *	Unregister a receive handler from a device.
3584 *
3585 *	The caller must hold the rtnl_mutex.
3586 */
3587void netdev_rx_handler_unregister(struct net_device *dev)
3588{
3589
3590	ASSERT_RTNL();
3591	RCU_INIT_POINTER(dev->rx_handler, NULL);
3592	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3593	 * section has a guarantee to see a non NULL rx_handler_data
3594	 * as well.
3595	 */
3596	synchronize_net();
3597	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3598}
3599EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3600
3601/*
3602 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3603 * the special handling of PFMEMALLOC skbs.
3604 */
3605static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3606{
3607	switch (skb->protocol) {
3608	case htons(ETH_P_ARP):
3609	case htons(ETH_P_IP):
3610	case htons(ETH_P_IPV6):
3611	case htons(ETH_P_8021Q):
3612	case htons(ETH_P_8021AD):
3613		return true;
3614	default:
3615		return false;
3616	}
3617}
3618
/*
 * Core of the receive path for a single skb.
 *
 * In order, the skb is: untagged if it carries a VLAN ethertype, shown
 * to the ptype_all taps (skipped when @pfmemalloc is true), passed
 * through the ingress filter (CONFIG_NET_CLS_ACT), handed to
 * vlan_do_receive() if a VLAN tag is present, offered to the device's
 * rx_handler, and finally matched against the protocol handler lists.
 *
 * Delivery is deferred by one handler: pt_prev remembers the last match
 * and is only delivered through deliver_skb() once another match (or a
 * stage that needs the packet flushed) comes along; the final match is
 * invoked directly through pt_prev->func().
 *
 * Returns the result of the last handler invoked, NET_RX_SUCCESS when an
 * rx_handler consumed the skb, or NET_RX_DROP when the skb was dropped.
 */
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct net_device *orig_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;

	net_timestamp_check(!netdev_tstamp_prequeue, skb);

	trace_netif_receive_skb(skb);

	/* remember the device the skb arrived on; skb->dev may change below */
	orig_dev = skb->dev;

	skb_reset_network_header(skb);
	if (!skb_transport_header_was_set(skb))
		skb_reset_transport_header(skb);
	skb_reset_mac_len(skb);

	pt_prev = NULL;

	rcu_read_lock();

	/* re-entered whenever VLAN or rx_handler processing asks for it */
another_round:
	skb->skb_iif = skb->dev->ifindex;

	__this_cpu_inc(softnet_data.processed);

	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
		skb = skb_vlan_untag(skb);
		if (unlikely(!skb))
			goto unlock;
	}

#ifdef CONFIG_NET_CLS_ACT
	/* TC_NCLS set: clear it and skip both taps and ingress filtering */
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	if (pfmemalloc)
		goto skip_taps;

	/* global taps, then taps bound to the current device */
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (pt_prev)
			ret = deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}

	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
		if (pt_prev)
			ret = deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}

skip_taps:
#ifdef CONFIG_NET_CLS_ACT
	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto unlock;
ncls:
#endif

	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
		goto drop;

	if (skb_vlan_tag_present(skb)) {
		/* flush the pending handler before vlan_do_receive() runs */
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		if (vlan_do_receive(&skb))
			goto another_round;
		else if (unlikely(!skb))
			goto unlock;
	}

	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (rx_handler) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		switch (rx_handler(&skb)) {
		case RX_HANDLER_CONSUMED:
			ret = NET_RX_SUCCESS;
			goto unlock;
		case RX_HANDLER_ANOTHER:
			goto another_round;
		case RX_HANDLER_EXACT:
			deliver_exact = true;
			/* fall through */
		case RX_HANDLER_PASS:
			break;
		default:
			BUG();
		}
	}

	/* a VLAN tag is still present after VLAN and rx_handler processing */
	if (unlikely(skb_vlan_tag_present(skb))) {
		if (skb_vlan_tag_get_id(skb))
			skb->pkt_type = PACKET_OTHERHOST;
		/* Note: we might in the future use prio bits
		 * and set skb->priority like in vlan_do_receive()
		 * For the time being, just ignore Priority Code Point
		 */
		skb->vlan_tci = 0;
	}

	type = skb->protocol;

	/* deliver only exact match when indicated */
	if (likely(!deliver_exact)) {
		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
				       &ptype_base[ntohs(type) &
						   PTYPE_HASH_MASK]);
	}

	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
			       &orig_dev->ptype_specific);

	/* also try handlers bound to the device the skb was moved to */
	if (unlikely(skb->dev != orig_dev)) {
		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
				       &skb->dev->ptype_specific);
	}

	if (pt_prev) {
		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
			goto drop;
		else
			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
drop:
		/* reached by fall-through (no handler) or via goto drop */
		atomic_long_inc(&skb->dev->rx_dropped);
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

unlock:
	rcu_read_unlock();
	return ret;
}
3766
3767static int __netif_receive_skb(struct sk_buff *skb)
3768{
3769	int ret;
3770
3771	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3772		unsigned long pflags = current->flags;
3773
3774		/*
3775		 * PFMEMALLOC skbs are special, they should
3776		 * - be delivered to SOCK_MEMALLOC sockets only
3777		 * - stay away from userspace
3778		 * - have bounded memory usage
3779		 *
3780		 * Use PF_MEMALLOC as this saves us from propagating the allocation
3781		 * context down to all allocation sites.
3782		 */
3783		current->flags |= PF_MEMALLOC;
3784		ret = __netif_receive_skb_core(skb, true);
3785		tsk_restore_flags(current, pflags, PF_MEMALLOC);
3786	} else
3787		ret = __netif_receive_skb_core(skb, false);
3788
3789	return ret;
3790}
3791
3792static int netif_receive_skb_internal(struct sk_buff *skb)
3793{
3794	net_timestamp_check(netdev_tstamp_prequeue, skb);
3795
3796	if (skb_defer_rx_timestamp(skb))
3797		return NET_RX_SUCCESS;
3798
3799#ifdef CONFIG_RPS
3800	if (static_key_false(&rps_needed)) {
3801		struct rps_dev_flow voidflow, *rflow = &voidflow;
3802		int cpu, ret;
3803
3804		rcu_read_lock();
3805
3806		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3807
3808		if (cpu >= 0) {
3809			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3810			rcu_read_unlock();
3811			return ret;
3812		}
3813		rcu_read_unlock();
3814	}
3815#endif
3816	return __netif_receive_skb(skb);
3817}
3818
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	netif_receive_skb() is the main receive data processing function.
 *	The buffer may be dropped during processing for congestion control
 *	or by the protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	int ret;

	trace_netif_receive_skb_entry(skb);
	ret = netif_receive_skb_internal(skb);

	return ret;
}
EXPORT_SYMBOL(netif_receive_skb);
3841
3842/* Network device is going away, flush any packets still pending
3843 * Called with irqs disabled.
3844 */
3845static void flush_backlog(void *arg)
3846{
3847	struct net_device *dev = arg;
3848	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3849	struct sk_buff *skb, *tmp;
3850
3851	rps_lock(sd);
3852	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3853		if (skb->dev == dev) {
3854			__skb_unlink(skb, &sd->input_pkt_queue);
3855			kfree_skb(skb);
3856			input_queue_head_incr(sd);
3857		}
3858	}
3859	rps_unlock(sd);
3860
3861	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3862		if (skb->dev == dev) {
3863			__skb_unlink(skb, &sd->process_queue);
3864			kfree_skb(skb);
3865			input_queue_head_incr(sd);
3866		}
3867	}
3868}
3869
3870static int napi_gro_complete(struct sk_buff *skb)
3871{
3872	struct packet_offload *ptype;
3873	__be16 type = skb->protocol;
3874	struct list_head *head = &offload_base;
3875	int err = -ENOENT;
3876
3877	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3878
3879	if (NAPI_GRO_CB(skb)->count == 1) {
3880		skb_shinfo(skb)->gso_size = 0;
3881		goto out;
3882	}
3883
3884	rcu_read_lock();
3885	list_for_each_entry_rcu(ptype, head, list) {
3886		if (ptype->type != type || !ptype->callbacks.gro_complete)
3887			continue;
3888
3889		err = ptype->callbacks.gro_complete(skb, 0);
3890		break;
3891	}
3892	rcu_read_unlock();
3893
3894	if (err) {
3895		WARN_ON(&ptype->list == head);
3896		kfree_skb(skb);
3897		return NET_RX_SUCCESS;
3898	}
3899
3900out:
3901	return netif_receive_skb_internal(skb);
3902}
3903
3904/* napi->gro_list contains packets ordered by age.
3905 * youngest packets at the head of it.
3906 * Complete skbs in reverse order to reduce latencies.
3907 */
3908void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3909{
3910	struct sk_buff *skb, *prev = NULL;
3911
3912	/* scan list and build reverse chain */
3913	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3914		skb->prev = prev;
3915		prev = skb;
3916	}
3917
3918	for (skb = prev; skb; skb = prev) {
3919		skb->next = NULL;
3920
3921		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3922			return;
3923
3924		prev = skb->prev;
3925		napi_gro_complete(skb);
3926		napi->gro_count--;
3927	}
3928
3929	napi->gro_list = NULL;
3930}
3931EXPORT_SYMBOL(napi_gro_flush);
3932
3933static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3934{
3935	struct sk_buff *p;
3936	unsigned int maclen = skb->dev->hard_header_len;
3937	u32 hash = skb_get_hash_raw(skb);
3938
3939	for (p = napi->gro_list; p; p = p->next) {
3940		unsigned long diffs;
3941
3942		NAPI_GRO_CB(p)->flush = 0;
3943
3944		if (hash != skb_get_hash_raw(p)) {
3945			NAPI_GRO_CB(p)->same_flow = 0;
3946			continue;
3947		}
3948
3949		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3950		diffs |= p->vlan_tci ^ skb->vlan_tci;
3951		if (maclen == ETH_HLEN)
3952			diffs |= compare_ether_header(skb_mac_header(p),
3953						      skb_mac_header(skb));
3954		else if (!diffs)
3955			diffs = memcmp(skb_mac_header(p),
3956				       skb_mac_header(skb),
3957				       maclen);
3958		NAPI_GRO_CB(p)->same_flow = !diffs;
3959	}
3960}
3961
3962static void skb_gro_reset_offset(struct sk_buff *skb)
3963{
3964	const struct skb_shared_info *pinfo = skb_shinfo(skb);
3965	const skb_frag_t *frag0 = &pinfo->frags[0];
3966
3967	NAPI_GRO_CB(skb)->data_offset = 0;
3968	NAPI_GRO_CB(skb)->frag0 = NULL;
3969	NAPI_GRO_CB(skb)->frag0_len = 0;
3970
3971	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3972	    pinfo->nr_frags &&
3973	    !PageHighMem(skb_frag_page(frag0))) {
3974		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3975		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3976	}
3977}
3978
3979static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3980{
3981	struct skb_shared_info *pinfo = skb_shinfo(skb);
3982
3983	BUG_ON(skb->end - skb->tail < grow);
3984
3985	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3986
3987	skb->data_len -= grow;
3988	skb->tail += grow;
3989
3990	pinfo->frags[0].page_offset += grow;
3991	skb_frag_size_sub(&pinfo->frags[0], grow);
3992
3993	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
3994		skb_frag_unref(skb, 0);
3995		memmove(pinfo->frags, pinfo->frags + 1,
3996			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
3997	}
3998}
3999
/*
 * Try to merge @skb into one of the skbs held on napi->gro_list.
 *
 * Returns GRO_NORMAL when GRO does not apply (feature off, skb already
 * GSO / has a frag list / bad checksum, no matching offload, or the
 * offload asked for a flush), GRO_MERGED or GRO_MERGED_FREE when the
 * skb was merged into a held flow, and GRO_HELD when the skb was put on
 * the list as the start of a new flow.
 */
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct packet_offload *ptype;
	__be16 type = skb->protocol;
	struct list_head *head = &offload_base;
	int same_flow;
	enum gro_result ret;
	int grow;

	if (!(skb->dev->features & NETIF_F_GRO))
		goto normal;

	if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
		goto normal;

	gro_list_prepare(napi, skb);

	rcu_read_lock();
	/* only the first offload matching the protocol is tried */
	list_for_each_entry_rcu(ptype, head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_receive)
			continue;

		skb_set_network_header(skb, skb_gro_offset(skb));
		skb_reset_mac_len(skb);
		NAPI_GRO_CB(skb)->same_flow = 0;
		NAPI_GRO_CB(skb)->flush = 0;
		NAPI_GRO_CB(skb)->free = 0;
		NAPI_GRO_CB(skb)->udp_mark = 0;
		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;

		/* Setup for GRO checksum validation */
		switch (skb->ip_summed) {
		case CHECKSUM_COMPLETE:
			NAPI_GRO_CB(skb)->csum = skb->csum;
			NAPI_GRO_CB(skb)->csum_valid = 1;
			NAPI_GRO_CB(skb)->csum_cnt = 0;
			break;
		case CHECKSUM_UNNECESSARY:
			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
			NAPI_GRO_CB(skb)->csum_valid = 0;
			break;
		default:
			NAPI_GRO_CB(skb)->csum_cnt = 0;
			NAPI_GRO_CB(skb)->csum_valid = 0;
		}

		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
		break;
	}
	rcu_read_unlock();

	/* the loop ran to completion: no offload handles this protocol */
	if (&ptype->list == head)
		goto normal;

	same_flow = NAPI_GRO_CB(skb)->same_flow;
	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

	/* a non-NULL pp points at the list link of a held skb that must
	 * be unlinked and completed now
	 */
	if (pp) {
		struct sk_buff *nskb = *pp;

		*pp = nskb->next;
		nskb->next = NULL;
		napi_gro_complete(nskb);
		napi->gro_count--;
	}

	if (same_flow)
		goto ok;

	if (NAPI_GRO_CB(skb)->flush)
		goto normal;

	/* new flow: make room by completing the oldest one if the list is full */
	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
		struct sk_buff *nskb = napi->gro_list;

		/* locate the end of the list to select the 'oldest' flow */
		while (nskb->next) {
			pp = &nskb->next;
			nskb = *pp;
		}
		*pp = NULL;
		nskb->next = NULL;
		napi_gro_complete(nskb);
	} else {
		napi->gro_count++;
	}
	NAPI_GRO_CB(skb)->count = 1;
	NAPI_GRO_CB(skb)->age = jiffies;
	NAPI_GRO_CB(skb)->last = skb;
	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
	/* youngest flow goes to the head of the list */
	skb->next = napi->gro_list;
	napi->gro_list = skb;
	ret = GRO_HELD;

pull:
	/* headers parsed beyond the linear area must be copied into it */
	grow = skb_gro_offset(skb) - skb_headlen(skb);
	if (grow > 0)
		gro_pull_from_frag0(skb, grow);
ok:
	return ret;

normal:
	ret = GRO_NORMAL;
	goto pull;
}
4106
4107struct packet_offload *gro_find_receive_by_type(__be16 type)
4108{
4109	struct list_head *offload_head = &offload_base;
4110	struct packet_offload *ptype;
4111
4112	list_for_each_entry_rcu(ptype, offload_head, list) {
4113		if (ptype->type != type || !ptype->callbacks.gro_receive)
4114			continue;
4115		return ptype;
4116	}
4117	return NULL;
4118}
4119EXPORT_SYMBOL(gro_find_receive_by_type);
4120
4121struct packet_offload *gro_find_complete_by_type(__be16 type)
4122{
4123	struct list_head *offload_head = &offload_base;
4124	struct packet_offload *ptype;
4125
4126	list_for_each_entry_rcu(ptype, offload_head, list) {
4127		if (ptype->type != type || !ptype->callbacks.gro_complete)
4128			continue;
4129		return ptype;
4130	}
4131	return NULL;
4132}
4133EXPORT_SYMBOL(gro_find_complete_by_type);
4134
4135static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4136{
4137	switch (ret) {
4138	case GRO_NORMAL:
4139		if (netif_receive_skb_internal(skb))
4140			ret = GRO_DROP;
4141		break;
4142
4143	case GRO_DROP:
4144		kfree_skb(skb);
4145		break;
4146
4147	case GRO_MERGED_FREE:
4148		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4149			kmem_cache_free(skbuff_head_cache, skb);
4150		else
4151			__kfree_skb(skb);
4152		break;
4153
4154	case GRO_HELD:
4155	case GRO_MERGED:
4156		break;
4157	}
4158
4159	return ret;
4160}
4161
4162gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4163{
4164	trace_napi_gro_receive_entry(skb);
4165
4166	skb_gro_reset_offset(skb);
4167
4168	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4169}
4170EXPORT_SYMBOL(napi_gro_receive);
4171
4172static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4173{
4174	if (unlikely(skb->pfmemalloc)) {
4175		consume_skb(skb);
4176		return;
4177	}
4178	__skb_pull(skb, skb_headlen(skb));
4179	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
4180	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4181	skb->vlan_tci = 0;
4182	skb->dev = napi->dev;
4183	skb->skb_iif = 0;
4184	skb->encapsulation = 0;
4185	skb_shinfo(skb)->gso_type = 0;
4186	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4187
4188	napi->skb = skb;
4189}
4190
4191struct sk_buff *napi_get_frags(struct napi_struct *napi)
4192{
4193	struct sk_buff *skb = napi->skb;
4194
4195	if (!skb) {
4196		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4197		napi->skb = skb;
4198	}
4199	return skb;
4200}
4201EXPORT_SYMBOL(napi_get_frags);
4202
4203static gro_result_t napi_frags_finish(struct napi_struct *napi,
4204				      struct sk_buff *skb,
4205				      gro_result_t ret)
4206{
4207	switch (ret) {
4208	case GRO_NORMAL:
4209	case GRO_HELD:
4210		__skb_push(skb, ETH_HLEN);
4211		skb->protocol = eth_type_trans(skb, skb->dev);
4212		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4213			ret = GRO_DROP;
4214		break;
4215
4216	case GRO_DROP:
4217	case GRO_MERGED_FREE:
4218		napi_reuse_skb(napi, skb);
4219		break;
4220
4221	case GRO_MERGED:
4222		break;
4223	}
4224
4225	return ret;
4226}
4227
4228/* Upper GRO stack assumes network header starts at gro_offset=0
4229 * Drivers could call both napi_gro_frags() and napi_gro_receive()
4230 * We copy ethernet header into skb->data to have a common layout.
4231 */
4232static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4233{
4234	struct sk_buff *skb = napi->skb;
4235	const struct ethhdr *eth;
4236	unsigned int hlen = sizeof(*eth);
4237
4238	napi->skb = NULL;
4239
4240	skb_reset_mac_header(skb);
4241	skb_gro_reset_offset(skb);
4242
4243	eth = skb_gro_header_fast(skb, 0);
4244	if (unlikely(skb_gro_header_hard(skb, hlen))) {
4245		eth = skb_gro_header_slow(skb, hlen, 0);
4246		if (unlikely(!eth)) {
4247			napi_reuse_skb(napi, skb);
4248			return NULL;
4249		}
4250	} else {
4251		gro_pull_from_frag0(skb, hlen);
4252		NAPI_GRO_CB(skb)->frag0 += hlen;
4253		NAPI_GRO_CB(skb)->frag0_len -= hlen;
4254	}
4255	__skb_pull(skb, hlen);
4256
4257	/*
4258	 * This works because the only protocols we care about don't require
4259	 * special handling.
4260	 * We'll fix it up properly in napi_frags_finish()
4261	 */
4262	skb->protocol = eth->h_proto;
4263
4264	return skb;
4265}
4266
4267gro_result_t napi_gro_frags(struct napi_struct *napi)
4268{
4269	struct sk_buff *skb = napi_frags_skb(napi);
4270
4271	if (!skb)
4272		return GRO_DROP;
4273
4274	trace_napi_gro_frags_entry(skb);
4275
4276	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4277}
4278EXPORT_SYMBOL(napi_gro_frags);
4279
4280/* Compute the checksum from gro_offset and return the folded value
4281 * after adding in any pseudo checksum.
4282 */
4283__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4284{
4285	__wsum wsum;
4286	__sum16 sum;
4287
4288	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4289
4290	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4291	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4292	if (likely(!sum)) {
4293		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4294		    !skb->csum_complete_sw)
4295			netdev_rx_csum_fault(skb->dev);
4296	}
4297
4298	NAPI_GRO_CB(skb)->csum = wsum;
4299	NAPI_GRO_CB(skb)->csum_valid = 1;
4300
4301	return sum;
4302}
4303EXPORT_SYMBOL(__skb_gro_checksum_complete);
4304
/*
 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
 * Note: called with local irqs disabled, but exits with local irqs
 * enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remote = sd->rps_ipi_list;

	/* detach the list while irqs are still off */
	if (remote)
		sd->rps_ipi_list = NULL;

	local_irq_enable();

	/* Send pending IPI's to kick RPS processing on remote cpus. */
	while (remote) {
		struct softnet_data *next = remote->rps_ipi_next;

		if (cpu_online(remote->cpu))
			smp_call_function_single_async(remote->cpu,
						       &remote->csd);
		remote = next;
	}
#else
	local_irq_enable();
#endif
}
4332
4333static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4334{
4335#ifdef CONFIG_RPS
4336	return sd->rps_ipi_list != NULL;
4337#else
4338	return false;
4339#endif
4340}
4341
/*
 * NAPI poll function of the per CPU backlog device.
 *
 * Processes up to @quota skbs from sd->process_queue, refilling it from
 * sd->input_pkt_queue (under rps_lock) whenever it runs empty. When the
 * input queue is empty too, the backlog NAPI state is cleared and
 * polling stops. Returns the number of skbs processed.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
	int work = 0;
	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);

	/* Check if we have pending ipi, its better to send them now,
	 * not waiting net_rx_action() end.
	 */
	if (sd_has_rps_ipi_waiting(sd)) {
		local_irq_disable();
		net_rps_action_and_irq_enable(sd);
	}

	napi->weight = weight_p;
	local_irq_disable();
	while (1) {
		struct sk_buff *skb;

		/* irqs are off while dequeuing, on while the skb is processed */
		while ((skb = __skb_dequeue(&sd->process_queue))) {
			local_irq_enable();
			__netif_receive_skb(skb);
			local_irq_disable();
			input_queue_head_incr(sd);
			if (++work >= quota) {
				local_irq_enable();
				return work;
			}
		}

		rps_lock(sd);
		if (skb_queue_empty(&sd->input_pkt_queue)) {
			/*
			 * Inline a custom version of __napi_complete().
			 * only current cpu owns and manipulates this napi,
			 * and NAPI_STATE_SCHED is the only possible flag set
			 * on backlog.
			 * We can use a plain write instead of clear_bit(),
			 * and we dont need an smp_mb() memory barrier.
			 */
			napi->state = 0;
			rps_unlock(sd);

			break;
		}

		/* move everything queued so far to the private process queue */
		skb_queue_splice_tail_init(&sd->input_pkt_queue,
					   &sd->process_queue);
		rps_unlock(sd);
	}
	local_irq_enable();

	return work;
}
4395
4396/**
4397 * __napi_schedule - schedule for receive
4398 * @n: entry to schedule
4399 *
4400 * The entry's receive function will be scheduled to run.
4401 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4402 */
4403void __napi_schedule(struct napi_struct *n)
4404{
4405	unsigned long flags;
4406
4407	local_irq_save(flags);
4408	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4409	local_irq_restore(flags);
4410}
4411EXPORT_SYMBOL(__napi_schedule);
4412
4413/**
4414 * __napi_schedule_irqoff - schedule for receive
4415 * @n: entry to schedule
4416 *
4417 * Variant of __napi_schedule() assuming hard irqs are masked
4418 */
4419void __napi_schedule_irqoff(struct napi_struct *n)
4420{
4421	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4422}
4423EXPORT_SYMBOL(__napi_schedule_irqoff);
4424
4425void __napi_complete(struct napi_struct *n)
4426{
4427	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4428
4429	list_del_init(&n->poll_list);
4430	smp_mb__before_atomic();
4431	clear_bit(NAPI_STATE_SCHED, &n->state);
4432}
4433EXPORT_SYMBOL(__napi_complete);
4434
4435void napi_complete_done(struct napi_struct *n, int work_done)
4436{
4437	unsigned long flags;
4438
4439	/*
4440	 * don't let napi dequeue from the cpu poll list
4441	 * just in case its running on a different cpu
4442	 */
4443	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4444		return;
4445
4446	if (n->gro_list) {
4447		unsigned long timeout = 0;
4448
4449		if (work_done)
4450			timeout = n->dev->gro_flush_timeout;
4451
4452		if (timeout)
4453			hrtimer_start(&n->timer, ns_to_ktime(timeout),
4454				      HRTIMER_MODE_REL_PINNED);
4455		else
4456			napi_gro_flush(n, false);
4457	}
4458	if (likely(list_empty(&n->poll_list))) {
4459		WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4460	} else {
4461		/* If n->poll_list is not empty, we need to mask irqs */
4462		local_irq_save(flags);
4463		__napi_complete(n);
4464		local_irq_restore(flags);
4465	}
4466}
4467EXPORT_SYMBOL(napi_complete_done);
4468
4469/* must be called under rcu_read_lock(), as we dont take a reference */
4470struct napi_struct *napi_by_id(unsigned int napi_id)
4471{
4472	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4473	struct napi_struct *napi;
4474
4475	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4476		if (napi->napi_id == napi_id)
4477			return napi;
4478
4479	return NULL;
4480}
4481EXPORT_SYMBOL_GPL(napi_by_id);
4482
4483void napi_hash_add(struct napi_struct *napi)
4484{
4485	if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4486
4487		spin_lock(&napi_hash_lock);
4488
4489		/* 0 is not a valid id, we also skip an id that is taken
4490		 * we expect both events to be extremely rare
4491		 */
4492		napi->napi_id = 0;
4493		while (!napi->napi_id) {
4494			napi->napi_id = ++napi_gen_id;
4495			if (napi_by_id(napi->napi_id))
4496				napi->napi_id = 0;
4497		}
4498
4499		hlist_add_head_rcu(&napi->napi_hash_node,
4500			&napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4501
4502		spin_unlock(&napi_hash_lock);
4503	}
4504}
4505EXPORT_SYMBOL_GPL(napi_hash_add);
4506
4507/* Warning : caller is responsible to make sure rcu grace period
4508 * is respected before freeing memory containing @napi
4509 */
4510void napi_hash_del(struct napi_struct *napi)
4511{
4512	spin_lock(&napi_hash_lock);
4513
4514	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4515		hlist_del_rcu(&napi->napi_hash_node);
4516
4517	spin_unlock(&napi_hash_lock);
4518}
4519EXPORT_SYMBOL_GPL(napi_hash_del);
4520
4521static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4522{
4523	struct napi_struct *napi;
4524
4525	napi = container_of(timer, struct napi_struct, timer);
4526	if (napi->gro_list)
4527		napi_schedule(napi);
4528
4529	return HRTIMER_NORESTART;
4530}
4531
4532void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4533		    int (*poll)(struct napi_struct *, int), int weight)
4534{
4535	INIT_LIST_HEAD(&napi->poll_list);
4536	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4537	napi->timer.function = napi_watchdog;
4538	napi->gro_count = 0;
4539	napi->gro_list = NULL;
4540	napi->skb = NULL;
4541	napi->poll = poll;
4542	if (weight > NAPI_POLL_WEIGHT)
4543		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4544			    weight, dev->name);
4545	napi->weight = weight;
4546	list_add(&napi->dev_list, &dev->napi_list);
4547	napi->dev = dev;
4548#ifdef CONFIG_NETPOLL
4549	spin_lock_init(&napi->poll_lock);
4550	napi->poll_owner = -1;
4551#endif
4552	set_bit(NAPI_STATE_SCHED, &napi->state);
4553}
4554EXPORT_SYMBOL(netif_napi_add);
4555
4556void napi_disable(struct napi_struct *n)
4557{
4558	might_sleep();
4559	set_bit(NAPI_STATE_DISABLE, &n->state);
4560
4561	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4562		msleep(1);
4563
4564	hrtimer_cancel(&n->timer);
4565
4566	clear_bit(NAPI_STATE_DISABLE, &n->state);
4567}
4568EXPORT_SYMBOL(napi_disable);
4569
4570void netif_napi_del(struct napi_struct *napi)
4571{
4572	list_del_init(&napi->dev_list);
4573	napi_free_frags(napi);
4574
4575	kfree_skb_list(napi->gro_list);
4576	napi->gro_list = NULL;
4577	napi->gro_count = 0;
4578}
4579EXPORT_SYMBOL(netif_napi_del);
4580
4581static int napi_poll(struct napi_struct *n, struct list_head *repoll)
4582{
4583	void *have;
4584	int work, weight;
4585
4586	list_del_init(&n->poll_list);
4587
4588	have = netpoll_poll_lock(n);
4589
4590	weight = n->weight;
4591
4592	/* This NAPI_STATE_SCHED test is for avoiding a race
4593	 * with netpoll's poll_napi().  Only the entity which
4594	 * obtains the lock and sees NAPI_STATE_SCHED set will
4595	 * actually make the ->poll() call.  Therefore we avoid
4596	 * accidentally calling ->poll() when NAPI is not scheduled.
4597	 */
4598	work = 0;
4599	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4600		work = n->poll(n, weight);
4601		trace_napi_poll(n);
4602	}
4603
4604	WARN_ON_ONCE(work > weight);
4605
4606	if (likely(work < weight))
4607		goto out_unlock;
4608
4609	/* Drivers must not modify the NAPI state if they
4610	 * consume the entire weight.  In such cases this code
4611	 * still "owns" the NAPI instance and therefore can
4612	 * move the instance around on the list at-will.
4613	 */
4614	if (unlikely(napi_disable_pending(n))) {
4615		napi_complete(n);
4616		goto out_unlock;
4617	}
4618
4619	if (n->gro_list) {
4620		/* flush too old packets
4621		 * If HZ < 1000, flush all packets.
4622		 */
4623		napi_gro_flush(n, HZ >= 1000);
4624	}
4625
4626	/* Some drivers may have called napi_schedule
4627	 * prior to exhausting their budget.
4628	 */
4629	if (unlikely(!list_empty(&n->poll_list))) {
4630		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
4631			     n->dev ? n->dev->name : "backlog");
4632		goto out_unlock;
4633	}
4634
4635	list_add_tail(&n->poll_list, repoll);
4636
4637out_unlock:
4638	netpoll_poll_unlock(have);
4639
4640	return work;
4641}
4642
4643static void net_rx_action(struct softirq_action *h)
4644{
4645	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4646	unsigned long time_limit = jiffies + 2;
4647	int budget = netdev_budget;
4648	LIST_HEAD(list);
4649	LIST_HEAD(repoll);
4650
4651	local_irq_disable();
4652	list_splice_init(&sd->poll_list, &list);
4653	local_irq_enable();
4654
4655	for (;;) {
4656		struct napi_struct *n;
4657
4658		if (list_empty(&list)) {
4659			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
4660				return;
4661			break;
4662		}
4663
4664		n = list_first_entry(&list, struct napi_struct, poll_list);
4665		budget -= napi_poll(n, &repoll);
4666
4667		/* If softirq window is exhausted then punt.
4668		 * Allow this to run for 2 jiffies since which will allow
4669		 * an average latency of 1.5/HZ.
4670		 */
4671		if (unlikely(budget <= 0 ||
4672			     time_after_eq(jiffies, time_limit))) {
4673			sd->time_squeeze++;
4674			break;
4675		}
4676	}
4677
4678	local_irq_disable();
4679
4680	list_splice_tail_init(&sd->poll_list, &list);
4681	list_splice_tail(&repoll, &list);
4682	list_splice(&list, &sd->poll_list);
4683	if (!list_empty(&sd->poll_list))
4684		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
4685
4686	net_rps_action_and_irq_enable(sd);
4687}
4688
4689struct netdev_adjacent {
4690	struct net_device *dev;
4691
4692	/* upper master flag, there can only be one master device per list */
4693	bool master;
4694
4695	/* counter for the number of times this device was added to us */
4696	u16 ref_nr;
4697
4698	/* private field for the users */
4699	void *private;
4700
4701	struct list_head list;
4702	struct rcu_head rcu;
4703};
4704
4705static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4706						 struct net_device *adj_dev,
4707						 struct list_head *adj_list)
4708{
4709	struct netdev_adjacent *adj;
4710
4711	list_for_each_entry(adj, adj_list, list) {
4712		if (adj->dev == adj_dev)
4713			return adj;
4714	}
4715	return NULL;
4716}
4717
4718/**
4719 * netdev_has_upper_dev - Check if device is linked to an upper device
4720 * @dev: device
4721 * @upper_dev: upper device to check
4722 *
4723 * Find out if a device is linked to specified upper device and return true
4724 * in case it is. Note that this checks only immediate upper device,
4725 * not through a complete stack of devices. The caller must hold the RTNL lock.
4726 */
4727bool netdev_has_upper_dev(struct net_device *dev,
4728			  struct net_device *upper_dev)
4729{
4730	ASSERT_RTNL();
4731
4732	return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4733}
4734EXPORT_SYMBOL(netdev_has_upper_dev);
4735
4736/**
4737 * netdev_has_any_upper_dev - Check if device is linked to some device
4738 * @dev: device
4739 *
4740 * Find out if a device is linked to an upper device and return true in case
4741 * it is. The caller must hold the RTNL lock.
4742 */
4743static bool netdev_has_any_upper_dev(struct net_device *dev)
4744{
4745	ASSERT_RTNL();
4746
4747	return !list_empty(&dev->all_adj_list.upper);
4748}
4749
4750/**
4751 * netdev_master_upper_dev_get - Get master upper device
4752 * @dev: device
4753 *
4754 * Find a master upper device and return pointer to it or NULL in case
4755 * it's not there. The caller must hold the RTNL lock.
4756 */
4757struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4758{
4759	struct netdev_adjacent *upper;
4760
4761	ASSERT_RTNL();
4762
4763	if (list_empty(&dev->adj_list.upper))
4764		return NULL;
4765
4766	upper = list_first_entry(&dev->adj_list.upper,
4767				 struct netdev_adjacent, list);
4768	if (likely(upper->master))
4769		return upper->dev;
4770	return NULL;
4771}
4772EXPORT_SYMBOL(netdev_master_upper_dev_get);
4773
4774void *netdev_adjacent_get_private(struct list_head *adj_list)
4775{
4776	struct netdev_adjacent *adj;
4777
4778	adj = list_entry(adj_list, struct netdev_adjacent, list);
4779
4780	return adj->private;
4781}
4782EXPORT_SYMBOL(netdev_adjacent_get_private);
4783
4784/**
4785 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4786 * @dev: device
4787 * @iter: list_head ** of the current position
4788 *
4789 * Gets the next device from the dev's upper list, starting from iter
4790 * position. The caller must hold RCU read lock.
4791 */
4792struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4793						 struct list_head **iter)
4794{
4795	struct netdev_adjacent *upper;
4796
4797	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4798
4799	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4800
4801	if (&upper->list == &dev->adj_list.upper)
4802		return NULL;
4803
4804	*iter = &upper->list;
4805
4806	return upper->dev;
4807}
4808EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4809
4810/**
4811 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4812 * @dev: device
4813 * @iter: list_head ** of the current position
4814 *
4815 * Gets the next device from the dev's upper list, starting from iter
4816 * position. The caller must hold RCU read lock.
4817 */
4818struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4819						     struct list_head **iter)
4820{
4821	struct netdev_adjacent *upper;
4822
4823	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4824
4825	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4826
4827	if (&upper->list == &dev->all_adj_list.upper)
4828		return NULL;
4829
4830	*iter = &upper->list;
4831
4832	return upper->dev;
4833}
4834EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4835
4836/**
4837 * netdev_lower_get_next_private - Get the next ->private from the
4838 *				   lower neighbour list
4839 * @dev: device
4840 * @iter: list_head ** of the current position
4841 *
4842 * Gets the next netdev_adjacent->private from the dev's lower neighbour
4843 * list, starting from iter position. The caller must hold either hold the
4844 * RTNL lock or its own locking that guarantees that the neighbour lower
4845 * list will remain unchainged.
4846 */
4847void *netdev_lower_get_next_private(struct net_device *dev,
4848				    struct list_head **iter)
4849{
4850	struct netdev_adjacent *lower;
4851
4852	lower = list_entry(*iter, struct netdev_adjacent, list);
4853
4854	if (&lower->list == &dev->adj_list.lower)
4855		return NULL;
4856
4857	*iter = lower->list.next;
4858
4859	return lower->private;
4860}
4861EXPORT_SYMBOL(netdev_lower_get_next_private);
4862
4863/**
4864 * netdev_lower_get_next_private_rcu - Get the next ->private from the
4865 *				       lower neighbour list, RCU
4866 *				       variant
4867 * @dev: device
4868 * @iter: list_head ** of the current position
4869 *
4870 * Gets the next netdev_adjacent->private from the dev's lower neighbour
4871 * list, starting from iter position. The caller must hold RCU read lock.
4872 */
4873void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4874					struct list_head **iter)
4875{
4876	struct netdev_adjacent *lower;
4877
4878	WARN_ON_ONCE(!rcu_read_lock_held());
4879
4880	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4881
4882	if (&lower->list == &dev->adj_list.lower)
4883		return NULL;
4884
4885	*iter = &lower->list;
4886
4887	return lower->private;
4888}
4889EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4890
4891/**
4892 * netdev_lower_get_next - Get the next device from the lower neighbour
4893 *                         list
4894 * @dev: device
4895 * @iter: list_head ** of the current position
4896 *
4897 * Gets the next netdev_adjacent from the dev's lower neighbour
4898 * list, starting from iter position. The caller must hold RTNL lock or
4899 * its own locking that guarantees that the neighbour lower
4900 * list will remain unchainged.
4901 */
4902void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4903{
4904	struct netdev_adjacent *lower;
4905
4906	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4907
4908	if (&lower->list == &dev->adj_list.lower)
4909		return NULL;
4910
4911	*iter = &lower->list;
4912
4913	return lower->dev;
4914}
4915EXPORT_SYMBOL(netdev_lower_get_next);
4916
4917/**
4918 * netdev_lower_get_first_private_rcu - Get the first ->private from the
4919 *				       lower neighbour list, RCU
4920 *				       variant
4921 * @dev: device
4922 *
4923 * Gets the first netdev_adjacent->private from the dev's lower neighbour
4924 * list. The caller must hold RCU read lock.
4925 */
4926void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4927{
4928	struct netdev_adjacent *lower;
4929
4930	lower = list_first_or_null_rcu(&dev->adj_list.lower,
4931			struct netdev_adjacent, list);
4932	if (lower)
4933		return lower->private;
4934	return NULL;
4935}
4936EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4937
4938/**
4939 * netdev_master_upper_dev_get_rcu - Get master upper device
4940 * @dev: device
4941 *
4942 * Find a master upper device and return pointer to it or NULL in case
4943 * it's not there. The caller must hold the RCU read lock.
4944 */
4945struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4946{
4947	struct netdev_adjacent *upper;
4948
4949	upper = list_first_or_null_rcu(&dev->adj_list.upper,
4950				       struct netdev_adjacent, list);
4951	if (upper && likely(upper->master))
4952		return upper->dev;
4953	return NULL;
4954}
4955EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4956
4957static int netdev_adjacent_sysfs_add(struct net_device *dev,
4958			      struct net_device *adj_dev,
4959			      struct list_head *dev_list)
4960{
4961	char linkname[IFNAMSIZ+7];
4962	sprintf(linkname, dev_list == &dev->adj_list.upper ?
4963		"upper_%s" : "lower_%s", adj_dev->name);
4964	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4965				 linkname);
4966}
4967static void netdev_adjacent_sysfs_del(struct net_device *dev,
4968			       char *name,
4969			       struct list_head *dev_list)
4970{
4971	char linkname[IFNAMSIZ+7];
4972	sprintf(linkname, dev_list == &dev->adj_list.upper ?
4973		"upper_%s" : "lower_%s", name);
4974	sysfs_remove_link(&(dev->dev.kobj), linkname);
4975}
4976
4977static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
4978						 struct net_device *adj_dev,
4979						 struct list_head *dev_list)
4980{
4981	return (dev_list == &dev->adj_list.upper ||
4982		dev_list == &dev->adj_list.lower) &&
4983		net_eq(dev_net(dev), dev_net(adj_dev));
4984}
4985
4986static int __netdev_adjacent_dev_insert(struct net_device *dev,
4987					struct net_device *adj_dev,
4988					struct list_head *dev_list,
4989					void *private, bool master)
4990{
4991	struct netdev_adjacent *adj;
4992	int ret;
4993
4994	adj = __netdev_find_adj(dev, adj_dev, dev_list);
4995
4996	if (adj) {
4997		adj->ref_nr++;
4998		return 0;
4999	}
5000
5001	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5002	if (!adj)
5003		return -ENOMEM;
5004
5005	adj->dev = adj_dev;
5006	adj->master = master;
5007	adj->ref_nr = 1;
5008	adj->private = private;
5009	dev_hold(adj_dev);
5010
5011	pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5012		 adj_dev->name, dev->name, adj_dev->name);
5013
5014	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5015		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5016		if (ret)
5017			goto free_adj;
5018	}
5019
5020	/* Ensure that master link is always the first item in list. */
5021	if (master) {
5022		ret = sysfs_create_link(&(dev->dev.kobj),
5023					&(adj_dev->dev.kobj), "master");
5024		if (ret)
5025			goto remove_symlinks;
5026
5027		list_add_rcu(&adj->list, dev_list);
5028	} else {
5029		list_add_tail_rcu(&adj->list, dev_list);
5030	}
5031
5032	return 0;
5033
5034remove_symlinks:
5035	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5036		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5037free_adj:
5038	kfree(adj);
5039	dev_put(adj_dev);
5040
5041	return ret;
5042}
5043
5044static void __netdev_adjacent_dev_remove(struct net_device *dev,
5045					 struct net_device *adj_dev,
5046					 struct list_head *dev_list)
5047{
5048	struct netdev_adjacent *adj;
5049
5050	adj = __netdev_find_adj(dev, adj_dev, dev_list);
5051
5052	if (!adj) {
5053		pr_err("tried to remove device %s from %s\n",
5054		       dev->name, adj_dev->name);
5055		BUG();
5056	}
5057
5058	if (adj->ref_nr > 1) {
5059		pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5060			 adj->ref_nr-1);
5061		adj->ref_nr--;
5062		return;
5063	}
5064
5065	if (adj->master)
5066		sysfs_remove_link(&(dev->dev.kobj), "master");
5067
5068	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5069		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5070
5071	list_del_rcu(&adj->list);
5072	pr_debug("dev_put for %s, because link removed from %s to %s\n",
5073		 adj_dev->name, dev->name, adj_dev->name);
5074	dev_put(adj_dev);
5075	kfree_rcu(adj, rcu);
5076}
5077
5078static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5079					    struct net_device *upper_dev,
5080					    struct list_head *up_list,
5081					    struct list_head *down_list,
5082					    void *private, bool master)
5083{
5084	int ret;
5085
5086	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5087					   master);
5088	if (ret)
5089		return ret;
5090
5091	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5092					   false);
5093	if (ret) {
5094		__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5095		return ret;
5096	}
5097
5098	return 0;
5099}
5100
5101static int __netdev_adjacent_dev_link(struct net_device *dev,
5102				      struct net_device *upper_dev)
5103{
5104	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5105						&dev->all_adj_list.upper,
5106						&upper_dev->all_adj_list.lower,
5107						NULL, false);
5108}
5109
/*
 * Drop the pair in both directions: @upper_dev from @up_list of @dev
 * first, then @dev from @down_list of @upper_dev.
 */
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
					       struct net_device *upper_dev,
					       struct list_head *up_list,
					       struct list_head *down_list)
{
	/* upward direction */
	__netdev_adjacent_dev_remove(dev, upper_dev, up_list);

	/* downward direction */
	__netdev_adjacent_dev_remove(upper_dev, dev, down_list);
}
5118
5119static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5120					 struct net_device *upper_dev)
5121{
5122	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5123					   &dev->all_adj_list.upper,
5124					   &upper_dev->all_adj_list.lower);
5125}
5126
5127static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5128						struct net_device *upper_dev,
5129						void *private, bool master)
5130{
5131	int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5132
5133	if (ret)
5134		return ret;
5135
5136	ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5137					       &dev->adj_list.upper,
5138					       &upper_dev->adj_list.lower,
5139					       private, master);
5140	if (ret) {
5141		__netdev_adjacent_dev_unlink(dev, upper_dev);
5142		return ret;
5143	}
5144
5145	return 0;
5146}
5147
5148static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5149						   struct net_device *upper_dev)
5150{
5151	__netdev_adjacent_dev_unlink(dev, upper_dev);
5152	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5153					   &dev->adj_list.upper,
5154					   &upper_dev->adj_list.lower);
5155}
5156
5157static int __netdev_upper_dev_link(struct net_device *dev,
5158				   struct net_device *upper_dev, bool master,
5159				   void *private)
5160{
5161	struct netdev_adjacent *i, *j, *to_i, *to_j;
5162	int ret = 0;
5163
5164	ASSERT_RTNL();
5165
5166	if (dev == upper_dev)
5167		return -EBUSY;
5168
5169	/* To prevent loops, check if dev is not upper device to upper_dev. */
5170	if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
5171		return -EBUSY;
5172
5173	if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
5174		return -EEXIST;
5175
5176	if (master && netdev_master_upper_dev_get(dev))
5177		return -EBUSY;
5178
5179	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5180						   master);
5181	if (ret)
5182		return ret;
5183
5184	/* Now that we linked these devs, make all the upper_dev's
5185	 * all_adj_list.upper visible to every dev's all_adj_list.lower an
5186	 * versa, and don't forget the devices itself. All of these
5187	 * links are non-neighbours.
5188	 */
5189	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5190		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5191			pr_debug("Interlinking %s with %s, non-neighbour\n",
5192				 i->dev->name, j->dev->name);
5193			ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5194			if (ret)
5195				goto rollback_mesh;
5196		}
5197	}
5198
5199	/* add dev to every upper_dev's upper device */
5200	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5201		pr_debug("linking %s's upper device %s with %s\n",
5202			 upper_dev->name, i->dev->name, dev->name);
5203		ret = __netdev_adjacent_dev_link(dev, i->dev);
5204		if (ret)
5205			goto rollback_upper_mesh;
5206	}
5207
5208	/* add upper_dev to every dev's lower device */
5209	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5210		pr_debug("linking %s's lower device %s with %s\n", dev->name,
5211			 i->dev->name, upper_dev->name);
5212		ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5213		if (ret)
5214			goto rollback_lower_mesh;
5215	}
5216
5217	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5218	return 0;
5219
5220rollback_lower_mesh:
5221	to_i = i;
5222	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5223		if (i == to_i)
5224			break;
5225		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
5226	}
5227
5228	i = NULL;
5229
5230rollback_upper_mesh:
5231	to_i = i;
5232	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5233		if (i == to_i)
5234			break;
5235		__netdev_adjacent_dev_unlink(dev, i->dev);
5236	}
5237
5238	i = j = NULL;
5239
5240rollback_mesh:
5241	to_i = i;
5242	to_j = j;
5243	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5244		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5245			if (i == to_i && j == to_j)
5246				break;
5247			__netdev_adjacent_dev_unlink(i->dev, j->dev);
5248		}
5249		if (i == to_i)
5250			break;
5251	}
5252
5253	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5254
5255	return ret;
5256}
5257
5258/**
5259 * netdev_upper_dev_link - Add a link to the upper device
5260 * @dev: device
5261 * @upper_dev: new upper device
5262 *
5263 * Adds a link to device which is upper to this one. The caller must hold
5264 * the RTNL lock. On a failure a negative errno code is returned.
5265 * On success the reference counts are adjusted and the function
5266 * returns zero.
5267 */
5268int netdev_upper_dev_link(struct net_device *dev,
5269			  struct net_device *upper_dev)
5270{
5271	return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5272}
5273EXPORT_SYMBOL(netdev_upper_dev_link);
5274
5275/**
5276 * netdev_master_upper_dev_link - Add a master link to the upper device
5277 * @dev: device
5278 * @upper_dev: new upper device
5279 *
5280 * Adds a link to device which is upper to this one. In this case, only
5281 * one master upper device can be linked, although other non-master devices
5282 * might be linked as well. The caller must hold the RTNL lock.
5283 * On a failure a negative errno code is returned. On success the reference
5284 * counts are adjusted and the function returns zero.
5285 */
5286int netdev_master_upper_dev_link(struct net_device *dev,
5287				 struct net_device *upper_dev)
5288{
5289	return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5290}
5291EXPORT_SYMBOL(netdev_master_upper_dev_link);
5292
5293int netdev_master_upper_dev_link_private(struct net_device *dev,
5294					 struct net_device *upper_dev,
5295					 void *private)
5296{
5297	return __netdev_upper_dev_link(dev, upper_dev, true, private);
5298}
5299EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5300
5301/**
5302 * netdev_upper_dev_unlink - Removes a link to upper device
5303 * @dev: device
5304 * @upper_dev: new upper device
5305 *
5306 * Removes a link to device which is upper to this one. The caller must hold
5307 * the RTNL lock.
5308 */
5309void netdev_upper_dev_unlink(struct net_device *dev,
5310			     struct net_device *upper_dev)
5311{
5312	struct netdev_adjacent *i, *j;
5313	ASSERT_RTNL();
5314
5315	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5316
5317	/* Here is the tricky part. We must remove all dev's lower
5318	 * devices from all upper_dev's upper devices and vice
5319	 * versa, to maintain the graph relationship.
5320	 */
5321	list_for_each_entry(i, &dev->all_adj_list.lower, list)
5322		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5323			__netdev_adjacent_dev_unlink(i->dev, j->dev);
5324
5325	/* remove also the devices itself from lower/upper device
5326	 * list
5327	 */
5328	list_for_each_entry(i, &dev->all_adj_list.lower, list)
5329		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
5330
5331	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5332		__netdev_adjacent_dev_unlink(dev, i->dev);
5333
5334	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5335}
5336EXPORT_SYMBOL(netdev_upper_dev_unlink);
5337
5338/**
5339 * netdev_bonding_info_change - Dispatch event about slave change
5340 * @dev: device
5341 * @bonding_info: info to dispatch
5342 *
5343 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5344 * The caller must hold the RTNL lock.
5345 */
5346void netdev_bonding_info_change(struct net_device *dev,
5347				struct netdev_bonding_info *bonding_info)
5348{
5349	struct netdev_notifier_bonding_info	info;
5350
5351	memcpy(&info.bonding_info, bonding_info,
5352	       sizeof(struct netdev_bonding_info));
5353	call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5354				      &info.info);
5355}
5356EXPORT_SYMBOL(netdev_bonding_info_change);
5357
5358static void netdev_adjacent_add_links(struct net_device *dev)
5359{
5360	struct netdev_adjacent *iter;
5361
5362	struct net *net = dev_net(dev);
5363
5364	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5365		if (!net_eq(net,dev_net(iter->dev)))
5366			continue;
5367		netdev_adjacent_sysfs_add(iter->dev, dev,
5368					  &iter->dev->adj_list.lower);
5369		netdev_adjacent_sysfs_add(dev, iter->dev,
5370					  &dev->adj_list.upper);
5371	}
5372
5373	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5374		if (!net_eq(net,dev_net(iter->dev)))
5375			continue;
5376		netdev_adjacent_sysfs_add(iter->dev, dev,
5377					  &iter->dev->adj_list.upper);
5378		netdev_adjacent_sysfs_add(dev, iter->dev,
5379					  &dev->adj_list.lower);
5380	}
5381}
5382
5383static void netdev_adjacent_del_links(struct net_device *dev)
5384{
5385	struct netdev_adjacent *iter;
5386
5387	struct net *net = dev_net(dev);
5388
5389	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5390		if (!net_eq(net,dev_net(iter->dev)))
5391			continue;
5392		netdev_adjacent_sysfs_del(iter->dev, dev->name,
5393					  &iter->dev->adj_list.lower);
5394		netdev_adjacent_sysfs_del(dev, iter->dev->name,
5395					  &dev->adj_list.upper);
5396	}
5397
5398	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5399		if (!net_eq(net,dev_net(iter->dev)))
5400			continue;
5401		netdev_adjacent_sysfs_del(iter->dev, dev->name,
5402					  &iter->dev->adj_list.upper);
5403		netdev_adjacent_sysfs_del(dev, iter->dev->name,
5404					  &dev->adj_list.lower);
5405	}
5406}
5407
5408void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5409{
5410	struct netdev_adjacent *iter;
5411
5412	struct net *net = dev_net(dev);
5413
5414	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5415		if (!net_eq(net,dev_net(iter->dev)))
5416			continue;
5417		netdev_adjacent_sysfs_del(iter->dev, oldname,
5418					  &iter->dev->adj_list.lower);
5419		netdev_adjacent_sysfs_add(iter->dev, dev,
5420					  &iter->dev->adj_list.lower);
5421	}
5422
5423	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5424		if (!net_eq(net,dev_net(iter->dev)))
5425			continue;
5426		netdev_adjacent_sysfs_del(iter->dev, oldname,
5427					  &iter->dev->adj_list.upper);
5428		netdev_adjacent_sysfs_add(iter->dev, dev,
5429					  &iter->dev->adj_list.upper);
5430	}
5431}
5432
5433void *netdev_lower_dev_get_private(struct net_device *dev,
5434				   struct net_device *lower_dev)
5435{
5436	struct netdev_adjacent *lower;
5437
5438	if (!lower_dev)
5439		return NULL;
5440	lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5441	if (!lower)
5442		return NULL;
5443
5444	return lower->private;
5445}
5446EXPORT_SYMBOL(netdev_lower_dev_get_private);
5447
5448
5449int dev_get_nest_level(struct net_device *dev,
5450		       bool (*type_check)(struct net_device *dev))
5451{
5452	struct net_device *lower = NULL;
5453	struct list_head *iter;
5454	int max_nest = -1;
5455	int nest;
5456
5457	ASSERT_RTNL();
5458
5459	netdev_for_each_lower_dev(dev, lower, iter) {
5460		nest = dev_get_nest_level(lower, type_check);
5461		if (max_nest < nest)
5462			max_nest = nest;
5463	}
5464
5465	if (type_check(dev))
5466		max_nest++;
5467
5468	return max_nest;
5469}
5470EXPORT_SYMBOL(dev_get_nest_level);
5471
5472static void dev_change_rx_flags(struct net_device *dev, int flags)
5473{
5474	const struct net_device_ops *ops = dev->netdev_ops;
5475
5476	if (ops->ndo_change_rx_flags)
5477		ops->ndo_change_rx_flags(dev, flags);
5478}
5479
/*
 * __dev_set_promiscuity - adjust the promiscuity counter and IFF_PROMISC
 * @dev: device
 * @inc: value added to dev->promiscuity (negative to drop a reference)
 * @notify: when true, __dev_notify_flags() is called for IFF_PROMISC
 *
 * Caller must hold the RTNL.  Returns 0 on success, or -EOVERFLOW when a
 * non-negative @inc leaves the counter at zero; in that case the counter is
 * restored before returning.
 */
static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
{
	unsigned int old_flags = dev->flags;
	kuid_t uid;
	kgid_t gid;

	ASSERT_RTNL();

	/* Set the flag optimistically; it is cleared below if the count hits 0. */
	dev->flags |= IFF_PROMISC;
	dev->promiscuity += inc;
	if (dev->promiscuity == 0) {
		/*
		 * Avoid overflow.
		 * If inc causes overflow, untouch promisc and return error.
		 */
		if (inc < 0)
			dev->flags &= ~IFF_PROMISC;
		else {
			dev->promiscuity -= inc;
			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
				dev->name);
			return -EOVERFLOW;
		}
	}
	/* Log, audit and tell the driver only when the flag actually flipped. */
	if (dev->flags != old_flags) {
		pr_info("device %s %s promiscuous mode\n",
			dev->name,
			dev->flags & IFF_PROMISC ? "entered" : "left");
		if (audit_enabled) {
			current_uid_gid(&uid, &gid);
			audit_log(current->audit_context, GFP_ATOMIC,
				AUDIT_ANOM_PROMISCUOUS,
				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
				dev->name, (dev->flags & IFF_PROMISC),
				(old_flags & IFF_PROMISC),
				from_kuid(&init_user_ns, audit_get_loginuid(current)),
				from_kuid(&init_user_ns, uid),
				from_kgid(&init_user_ns, gid),
				audit_get_sessionid(current));
		}

		dev_change_rx_flags(dev, IFF_PROMISC);
	}
	/* The notification is issued even if dev->flags ended up unchanged. */
	if (notify)
		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
	return 0;
}
5527
5528/**
5529 *	dev_set_promiscuity	- update promiscuity count on a device
5530 *	@dev: device
5531 *	@inc: modifier
5532 *
5533 *	Add or remove promiscuity from a device. While the count in the device
5534 *	remains above zero the interface remains promiscuous. Once it hits zero
5535 *	the device reverts back to normal filtering operation. A negative inc
5536 *	value is used to drop promiscuity on the device.
5537 *	Return 0 if successful or a negative errno code on error.
5538 */
5539int dev_set_promiscuity(struct net_device *dev, int inc)
5540{
5541	unsigned int old_flags = dev->flags;
5542	int err;
5543
5544	err = __dev_set_promiscuity(dev, inc, true);
5545	if (err < 0)
5546		return err;
5547	if (dev->flags != old_flags)
5548		dev_set_rx_mode(dev);
5549	return err;
5550}
5551EXPORT_SYMBOL(dev_set_promiscuity);
5552
5553static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5554{
5555	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5556
5557	ASSERT_RTNL();
5558
5559	dev->flags |= IFF_ALLMULTI;
5560	dev->allmulti += inc;
5561	if (dev->allmulti == 0) {
5562		/*
5563		 * Avoid overflow.
5564		 * If inc causes overflow, untouch allmulti and return error.
5565		 */
5566		if (inc < 0)
5567			dev->flags &= ~IFF_ALLMULTI;
5568		else {
5569			dev->allmulti -= inc;
5570			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5571				dev->name);
5572			return -EOVERFLOW;
5573		}
5574	}
5575	if (dev->flags ^ old_flags) {
5576		dev_change_rx_flags(dev, IFF_ALLMULTI);
5577		dev_set_rx_mode(dev);
5578		if (notify)
5579			__dev_notify_flags(dev, old_flags,
5580					   dev->gflags ^ old_gflags);
5581	}
5582	return 0;
5583}
5584
5585/**
5586 *	dev_set_allmulti	- update allmulti count on a device
5587 *	@dev: device
5588 *	@inc: modifier
5589 *
5590 *	Add or remove reception of all multicast frames to a device. While the
5591 *	count in the device remains above zero the interface remains listening
5592 *	to all interfaces. Once it hits zero the device reverts back to normal
5593 *	filtering operation. A negative @inc value is used to drop the counter
5594 *	when releasing a resource needing all multicasts.
5595 *	Return 0 if successful or a negative errno code on error.
5596 */
5597
5598int dev_set_allmulti(struct net_device *dev, int inc)
5599{
5600	return __dev_set_allmulti(dev, inc, true);
5601}
5602EXPORT_SYMBOL(dev_set_allmulti);
5603
5604/*
5605 *	Upload unicast and multicast address lists to device and
5606 *	configure RX filtering. When the device doesn't support unicast
5607 *	filtering it is put in promiscuous mode while unicast addresses
5608 *	are present.
5609 */
5610void __dev_set_rx_mode(struct net_device *dev)
5611{
5612	const struct net_device_ops *ops = dev->netdev_ops;
5613
5614	/* dev_open will call this function so the list will stay sane. */
5615	if (!(dev->flags&IFF_UP))
5616		return;
5617
5618	if (!netif_device_present(dev))
5619		return;
5620
5621	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5622		/* Unicast addresses changes may only happen under the rtnl,
5623		 * therefore calling __dev_set_promiscuity here is safe.
5624		 */
5625		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5626			__dev_set_promiscuity(dev, 1, false);
5627			dev->uc_promisc = true;
5628		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5629			__dev_set_promiscuity(dev, -1, false);
5630			dev->uc_promisc = false;
5631		}
5632	}
5633
5634	if (ops->ndo_set_rx_mode)
5635		ops->ndo_set_rx_mode(dev);
5636}
5637
/* Run __dev_set_rx_mode() on @dev with the address-list lock held (BH off). */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
5644
5645/**
5646 *	dev_get_flags - get flags reported to userspace
5647 *	@dev: device
5648 *
5649 *	Get the combination of flag bits exported through APIs to userspace.
5650 */
5651unsigned int dev_get_flags(const struct net_device *dev)
5652{
5653	unsigned int flags;
5654
5655	flags = (dev->flags & ~(IFF_PROMISC |
5656				IFF_ALLMULTI |
5657				IFF_RUNNING |
5658				IFF_LOWER_UP |
5659				IFF_DORMANT)) |
5660		(dev->gflags & (IFF_PROMISC |
5661				IFF_ALLMULTI));
5662
5663	if (netif_running(dev)) {
5664		if (netif_oper_up(dev))
5665			flags |= IFF_RUNNING;
5666		if (netif_carrier_ok(dev))
5667			flags |= IFF_LOWER_UP;
5668		if (netif_dormant(dev))
5669			flags |= IFF_DORMANT;
5670	}
5671
5672	return flags;
5673}
5674EXPORT_SYMBOL(dev_get_flags);
5675
/*
 * __dev_change_flags - apply a userspace-format flag word to a device
 * @dev: device
 * @flags: requested flags
 *
 * Updates dev->flags and dev->gflags, opens or closes the device when
 * IFF_UP is toggled, and synchronises the promiscuity/allmulti counters
 * with the requested IFF_PROMISC/IFF_ALLMULTI bits.  No notifications are
 * sent from here (the helpers are called with notify == false).
 * Caller must hold the RTNL.
 *
 * Returns the result of __dev_open()/__dev_close() when IFF_UP changed,
 * 0 otherwise.  Failures of the promiscuity/allmulti updates are not
 * reflected in the return value.
 */
int __dev_change_flags(struct net_device *dev, unsigned int flags)
{
	unsigned int old_flags = dev->flags;
	int ret;

	ASSERT_RTNL();

	/*
	 *	Set the flags on our device.
	 *	Only the first group of bits is taken from the caller; IFF_UP,
	 *	IFF_VOLATILE, IFF_PROMISC and IFF_ALLMULTI keep their current
	 *	value here and are handled separately below.
	 */

	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
			       IFF_AUTOMEDIA)) |
		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
				    IFF_ALLMULTI));

	/*
	 *	Load in the correct multicast list now the flags have changed.
	 */

	if ((old_flags ^ flags) & IFF_MULTICAST)
		dev_change_rx_flags(dev, IFF_MULTICAST);

	dev_set_rx_mode(dev);

	/*
	 *	Have we downed the interface. We handle IFF_UP ourselves
	 *	according to user attempts to set it, rather than blindly
	 *	setting it.
	 */

	ret = 0;
	if ((old_flags ^ flags) & IFF_UP)
		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);

	/* dev->gflags tracks the user's own request for IFF_PROMISC. */
	if ((flags ^ dev->gflags) & IFF_PROMISC) {
		int inc = (flags & IFF_PROMISC) ? 1 : -1;
		/* Shadows the outer old_flags: snapshot taken after the
		 * updates above, used only to detect a change made by
		 * __dev_set_promiscuity().
		 */
		unsigned int old_flags = dev->flags;

		dev->gflags ^= IFF_PROMISC;

		if (__dev_set_promiscuity(dev, inc, false) >= 0)
			if (dev->flags != old_flags)
				dev_set_rx_mode(dev);
	}

	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
	   is important. Some (broken) drivers set IFF_PROMISC, when
	   IFF_ALLMULTI is requested not asking us and not reporting.
	 */
	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;

		dev->gflags ^= IFF_ALLMULTI;
		__dev_set_allmulti(dev, inc, false);
	}

	return ret;
}
5736
5737void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5738			unsigned int gchanges)
5739{
5740	unsigned int changes = dev->flags ^ old_flags;
5741
5742	if (gchanges)
5743		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5744
5745	if (changes & IFF_UP) {
5746		if (dev->flags & IFF_UP)
5747			call_netdevice_notifiers(NETDEV_UP, dev);
5748		else
5749			call_netdevice_notifiers(NETDEV_DOWN, dev);
5750	}
5751
5752	if (dev->flags & IFF_UP &&
5753	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5754		struct netdev_notifier_change_info change_info;
5755
5756		change_info.flags_changed = changes;
5757		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5758					      &change_info.info);
5759	}
5760}
5761
5762/**
5763 *	dev_change_flags - change device settings
5764 *	@dev: device
5765 *	@flags: device state flags
5766 *
5767 *	Change settings on device based state flags. The flags are
5768 *	in the userspace exported format.
5769 */
5770int dev_change_flags(struct net_device *dev, unsigned int flags)
5771{
5772	int ret;
5773	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5774
5775	ret = __dev_change_flags(dev, flags);
5776	if (ret < 0)
5777		return ret;
5778
5779	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5780	__dev_notify_flags(dev, old_flags, changes);
5781	return ret;
5782}
5783EXPORT_SYMBOL(dev_change_flags);
5784
5785static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5786{
5787	const struct net_device_ops *ops = dev->netdev_ops;
5788
5789	if (ops->ndo_change_mtu)
5790		return ops->ndo_change_mtu(dev, new_mtu);
5791
5792	dev->mtu = new_mtu;
5793	return 0;
5794}
5795
5796/**
5797 *	dev_set_mtu - Change maximum transfer unit
5798 *	@dev: device
5799 *	@new_mtu: new transfer unit
5800 *
5801 *	Change the maximum transfer size of the network device.
5802 */
5803int dev_set_mtu(struct net_device *dev, int new_mtu)
5804{
5805	int err, orig_mtu;
5806
5807	if (new_mtu == dev->mtu)
5808		return 0;
5809
5810	/*	MTU must be positive.	 */
5811	if (new_mtu < 0)
5812		return -EINVAL;
5813
5814	if (!netif_device_present(dev))
5815		return -ENODEV;
5816
5817	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5818	err = notifier_to_errno(err);
5819	if (err)
5820		return err;
5821
5822	orig_mtu = dev->mtu;
5823	err = __dev_set_mtu(dev, new_mtu);
5824
5825	if (!err) {
5826		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5827		err = notifier_to_errno(err);
5828		if (err) {
5829			/* setting mtu back and notifying everyone again,
5830			 * so that they have a chance to revert changes.
5831			 */
5832			__dev_set_mtu(dev, orig_mtu);
5833			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5834		}
5835	}
5836	return err;
5837}
5838EXPORT_SYMBOL(dev_set_mtu);
5839
5840/**
5841 *	dev_set_group - Change group this device belongs to
5842 *	@dev: device
5843 *	@new_group: group this device should belong to
5844 */
5845void dev_set_group(struct net_device *dev, int new_group)
5846{
5847	dev->group = new_group;
5848}
5849EXPORT_SYMBOL(dev_set_group);
5850
5851/**
5852 *	dev_set_mac_address - Change Media Access Control Address
5853 *	@dev: device
5854 *	@sa: new address
5855 *
5856 *	Change the hardware (MAC) address of the device
5857 */
5858int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5859{
5860	const struct net_device_ops *ops = dev->netdev_ops;
5861	int err;
5862
5863	if (!ops->ndo_set_mac_address)
5864		return -EOPNOTSUPP;
5865	if (sa->sa_family != dev->type)
5866		return -EINVAL;
5867	if (!netif_device_present(dev))
5868		return -ENODEV;
5869	err = ops->ndo_set_mac_address(dev, sa);
5870	if (err)
5871		return err;
5872	dev->addr_assign_type = NET_ADDR_SET;
5873	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5874	add_device_randomness(dev->dev_addr, dev->addr_len);
5875	return 0;
5876}
5877EXPORT_SYMBOL(dev_set_mac_address);
5878
5879/**
5880 *	dev_change_carrier - Change device carrier
5881 *	@dev: device
5882 *	@new_carrier: new value
5883 *
5884 *	Change device carrier
5885 */
5886int dev_change_carrier(struct net_device *dev, bool new_carrier)
5887{
5888	const struct net_device_ops *ops = dev->netdev_ops;
5889
5890	if (!ops->ndo_change_carrier)
5891		return -EOPNOTSUPP;
5892	if (!netif_device_present(dev))
5893		return -ENODEV;
5894	return ops->ndo_change_carrier(dev, new_carrier);
5895}
5896EXPORT_SYMBOL(dev_change_carrier);
5897
5898/**
5899 *	dev_get_phys_port_id - Get device physical port ID
5900 *	@dev: device
5901 *	@ppid: port ID
5902 *
5903 *	Get device physical port ID
5904 */
5905int dev_get_phys_port_id(struct net_device *dev,
5906			 struct netdev_phys_item_id *ppid)
5907{
5908	const struct net_device_ops *ops = dev->netdev_ops;
5909
5910	if (!ops->ndo_get_phys_port_id)
5911		return -EOPNOTSUPP;
5912	return ops->ndo_get_phys_port_id(dev, ppid);
5913}
5914EXPORT_SYMBOL(dev_get_phys_port_id);
5915
5916/**
5917 *	dev_new_index	-	allocate an ifindex
5918 *	@net: the applicable net namespace
5919 *
5920 *	Returns a suitable unique value for a new device interface
5921 *	number.  The caller must hold the rtnl semaphore or the
5922 *	dev_base_lock to be sure it remains unique.
5923 */
5924static int dev_new_index(struct net *net)
5925{
5926	int ifindex = net->ifindex;
5927	for (;;) {
5928		if (++ifindex <= 0)
5929			ifindex = 1;
5930		if (!__dev_get_by_index(net, ifindex))
5931			return net->ifindex = ifindex;
5932	}
5933}
5934
/* Delayed registration/unregistration */
/* Devices queued by net_set_todo(); netdev_run_todo() takes the list over. */
static LIST_HEAD(net_todo_list);
/* NOTE(review): no waiter or waker is visible in this part of the file;
 * presumably used to wait for pending unregistrations -- confirm.
 */
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5938
5939static void net_set_todo(struct net_device *dev)
5940{
5941	list_add_tail(&dev->todo_list, &net_todo_list);
5942	dev_net(dev)->dev_unreg_count++;
5943}
5944
/*
 * rollback_registered_many - tear down a batch of registered devices
 * @head: list of devices, linked through their ->unreg_list member
 *
 * Must run with the RTNL held and after boot-phase initialisation.
 * The teardown proceeds in phases over the whole list:
 *   1. drop never-registered devices from @head, mark the rest as being
 *      dismantled;
 *   2. close all devices with dev_close_many();
 *   3. unlist each device and move it to NETREG_UNREGISTERING;
 *   4. after synchronize_net(): shut down the qdisc, send
 *      NETDEV_UNREGISTER, flush address lists, call ndo_uninit(), send
 *      RTM_DELLINK, remove the kobject and XPS state;
 *   5. after another synchronize_net(): dev_put() every device.
 */
static void rollback_registered_many(struct list_head *head)
{
	struct net_device *dev, *tmp;
	LIST_HEAD(close_head);

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
		/* Some devices call without registering
		 * for initialization unwind. Remove those
		 * devices and proceed with the remaining.
		 */
		if (dev->reg_state == NETREG_UNINITIALIZED) {
			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
				 dev->name, dev);

			WARN_ON(1);
			list_del(&dev->unreg_list);
			continue;
		}
		dev->dismantle = true;
		BUG_ON(dev->reg_state != NETREG_REGISTERED);
	}

	/* If device is running, close it first. */
	list_for_each_entry(dev, head, unreg_list)
		list_add_tail(&dev->close_list, &close_head);
	dev_close_many(&close_head);

	list_for_each_entry(dev, head, unreg_list) {
		/* And unlink it from device chain. */
		unlist_netdevice(dev);

		dev->reg_state = NETREG_UNREGISTERING;
	}

	synchronize_net();

	list_for_each_entry(dev, head, unreg_list) {
		struct sk_buff *skb = NULL;

		/* Shutdown queueing discipline. */
		dev_shutdown(dev);


		/* Notify protocols, that we are about to destroy
		   this device. They should clean all the things.
		*/
		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

		/* Build the RTM_DELLINK message now; it is only sent after
		 * ndo_uninit() below.
		 */
		if (!dev->rtnl_link_ops ||
		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
						     GFP_KERNEL);

		/*
		 *	Flush the unicast and multicast chains
		 */
		dev_uc_flush(dev);
		dev_mc_flush(dev);

		if (dev->netdev_ops->ndo_uninit)
			dev->netdev_ops->ndo_uninit(dev);

		if (skb)
			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);

		/* Notifier chain MUST detach us all upper devices. */
		WARN_ON(netdev_has_any_upper_dev(dev));

		/* Remove entries from kobject tree */
		netdev_unregister_kobject(dev);
#ifdef CONFIG_XPS
		/* Remove XPS queueing entries */
		netif_reset_xps_queues_gt(dev, 0);
#endif
	}

	synchronize_net();

	/* Drop one reference per device (register_netdevice() in this file
	 * takes one with dev_hold()).
	 */
	list_for_each_entry(dev, head, unreg_list)
		dev_put(dev);
}
6029
6030static void rollback_registered(struct net_device *dev)
6031{
6032	LIST_HEAD(single);
6033
6034	list_add(&dev->unreg_list, &single);
6035	rollback_registered_many(&single);
6036	list_del(&single);
6037}
6038
6039static netdev_features_t netdev_fix_features(struct net_device *dev,
6040	netdev_features_t features)
6041{
6042	/* Fix illegal checksum combinations */
6043	if ((features & NETIF_F_HW_CSUM) &&
6044	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6045		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6046		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6047	}
6048
6049	/* TSO requires that SG is present as well. */
6050	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6051		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6052		features &= ~NETIF_F_ALL_TSO;
6053	}
6054
6055	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6056					!(features & NETIF_F_IP_CSUM)) {
6057		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6058		features &= ~NETIF_F_TSO;
6059		features &= ~NETIF_F_TSO_ECN;
6060	}
6061
6062	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6063					 !(features & NETIF_F_IPV6_CSUM)) {
6064		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6065		features &= ~NETIF_F_TSO6;
6066	}
6067
6068	/* TSO ECN requires that TSO is present as well. */
6069	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6070		features &= ~NETIF_F_TSO_ECN;
6071
6072	/* Software GSO depends on SG. */
6073	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6074		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6075		features &= ~NETIF_F_GSO;
6076	}
6077
6078	/* UFO needs SG and checksumming */
6079	if (features & NETIF_F_UFO) {
6080		/* maybe split UFO into V4 and V6? */
6081		if (!((features & NETIF_F_GEN_CSUM) ||
6082		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
6083			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6084			netdev_dbg(dev,
6085				"Dropping NETIF_F_UFO since no checksum offload features.\n");
6086			features &= ~NETIF_F_UFO;
6087		}
6088
6089		if (!(features & NETIF_F_SG)) {
6090			netdev_dbg(dev,
6091				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6092			features &= ~NETIF_F_UFO;
6093		}
6094	}
6095
6096#ifdef CONFIG_NET_RX_BUSY_POLL
6097	if (dev->netdev_ops->ndo_busy_poll)
6098		features |= NETIF_F_BUSY_POLL;
6099	else
6100#endif
6101		features &= ~NETIF_F_BUSY_POLL;
6102
6103	return features;
6104}
6105
6106int __netdev_update_features(struct net_device *dev)
6107{
6108	netdev_features_t features;
6109	int err = 0;
6110
6111	ASSERT_RTNL();
6112
6113	features = netdev_get_wanted_features(dev);
6114
6115	if (dev->netdev_ops->ndo_fix_features)
6116		features = dev->netdev_ops->ndo_fix_features(dev, features);
6117
6118	/* driver might be less strict about feature dependencies */
6119	features = netdev_fix_features(dev, features);
6120
6121	if (dev->features == features)
6122		return 0;
6123
6124	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6125		&dev->features, &features);
6126
6127	if (dev->netdev_ops->ndo_set_features)
6128		err = dev->netdev_ops->ndo_set_features(dev, features);
6129
6130	if (unlikely(err < 0)) {
6131		netdev_err(dev,
6132			"set_features() failed (%d); wanted %pNF, left %pNF\n",
6133			err, &features, &dev->features);
6134		return -1;
6135	}
6136
6137	if (!err)
6138		dev->features = features;
6139
6140	return 1;
6141}
6142
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recompute dev->features and call netdev_features_change()
 *	whenever __netdev_update_features() returns a non-zero value.
 *	Meant to be called after driver or hardware dependent conditions
 *	that influence the features might have changed.
 */
void netdev_update_features(struct net_device *dev)
{
	if (__netdev_update_features(dev) != 0)
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
6157
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recompute dev->features and call netdev_features_change()
 *	unconditionally, i.e. even when the feature set is unchanged.
 *	Use this instead of netdev_update_features() when
 *	dev->vlan_features might have changed as well, so the change
 *	can be propagated to stacked VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* The result is deliberately ignored: we notify either way. */
	__netdev_update_features(dev);

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
6174
6175/**
6176 *	netif_stacked_transfer_operstate -	transfer operstate
6177 *	@rootdev: the root or lower level device to transfer state from
6178 *	@dev: the device to transfer operstate to
6179 *
6180 *	Transfer operational state from root to device. This is normally
6181 *	called when a stacking relationship exists between the root
6182 *	device and the device(a leaf device).
6183 */
6184void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6185					struct net_device *dev)
6186{
6187	if (rootdev->operstate == IF_OPER_DORMANT)
6188		netif_dormant_on(dev);
6189	else
6190		netif_dormant_off(dev);
6191
6192	if (netif_carrier_ok(rootdev)) {
6193		if (!netif_carrier_ok(dev))
6194			netif_carrier_on(dev);
6195	} else {
6196		if (netif_carrier_ok(dev))
6197			netif_carrier_off(dev);
6198	}
6199}
6200EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6201
#ifdef CONFIG_SYSFS
/*
 * Allocate the zeroed array of dev->num_rx_queues RX queue structures,
 * store it in dev->_rx and point every entry back at @dev.  kzalloc() is
 * tried first, with vzalloc() as fallback.  Returns 0 or -ENOMEM.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int nr = dev->num_rx_queues;
	struct netdev_rx_queue *queues, *q;
	size_t bytes = nr * sizeof(*queues);

	BUG_ON(nr < 1);

	queues = kzalloc(bytes, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!queues)
		queues = vzalloc(bytes);
	if (!queues)
		return -ENOMEM;

	dev->_rx = queues;
	for (q = queues; q < queues + nr; q++)
		q->dev = dev;

	return 0;
}
#endif
6224
6225static void netdev_init_one_queue(struct net_device *dev,
6226				  struct netdev_queue *queue, void *_unused)
6227{
6228	/* Initialize queue lock */
6229	spin_lock_init(&queue->_xmit_lock);
6230	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6231	queue->xmit_lock_owner = -1;
6232	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6233	queue->dev = dev;
6234#ifdef CONFIG_BQL
6235	dql_init(&queue->dql, HZ);
6236#endif
6237}
6238
6239static void netif_free_tx_queues(struct net_device *dev)
6240{
6241	kvfree(dev->_tx);
6242}
6243
6244static int netif_alloc_netdev_queues(struct net_device *dev)
6245{
6246	unsigned int count = dev->num_tx_queues;
6247	struct netdev_queue *tx;
6248	size_t sz = count * sizeof(*tx);
6249
6250	BUG_ON(count < 1 || count > 0xffff);
6251
6252	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6253	if (!tx) {
6254		tx = vzalloc(sz);
6255		if (!tx)
6256			return -ENOMEM;
6257	}
6258	dev->_tx = tx;
6259
6260	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6261	spin_lock_init(&dev->tx_global_lock);
6262
6263	return 0;
6264}
6265
6266/**
6267 *	register_netdevice	- register a network device
6268 *	@dev: device to register
6269 *
6270 *	Take a completed network device structure and add it to the kernel
6271 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6272 *	chain. 0 is returned on success. A negative errno code is returned
6273 *	on a failure to set up the device, or if the name is a duplicate.
6274 *
6275 *	Callers must hold the rtnl semaphore. You may want
6276 *	register_netdev() instead of this.
6277 *
6278 *	BUGS:
6279 *	The locking appears insufficient to guarantee two parallel registers
6280 *	will not get the same name.
6281 */
6282
6283int register_netdevice(struct net_device *dev)
6284{
6285	int ret;
6286	struct net *net = dev_net(dev);
6287
6288	BUG_ON(dev_boot_phase);
6289	ASSERT_RTNL();
6290
6291	might_sleep();
6292
6293	/* When net_device's are persistent, this will be fatal. */
6294	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6295	BUG_ON(!net);
6296
6297	spin_lock_init(&dev->addr_list_lock);
6298	netdev_set_addr_lockdep_class(dev);
6299
6300	dev->iflink = -1;
6301
6302	ret = dev_get_valid_name(net, dev, dev->name);
6303	if (ret < 0)
6304		goto out;
6305
6306	/* Init, if this function is available */
6307	if (dev->netdev_ops->ndo_init) {
6308		ret = dev->netdev_ops->ndo_init(dev);
6309		if (ret) {
6310			if (ret > 0)
6311				ret = -EIO;
6312			goto out;
6313		}
6314	}
6315
6316	if (((dev->hw_features | dev->features) &
6317	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
6318	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6319	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6320		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6321		ret = -EINVAL;
6322		goto err_uninit;
6323	}
6324
6325	ret = -EBUSY;
6326	if (!dev->ifindex)
6327		dev->ifindex = dev_new_index(net);
6328	else if (__dev_get_by_index(net, dev->ifindex))
6329		goto err_uninit;
6330
6331	if (dev->iflink == -1)
6332		dev->iflink = dev->ifindex;
6333
6334	/* Transfer changeable features to wanted_features and enable
6335	 * software offloads (GSO and GRO).
6336	 */
6337	dev->hw_features |= NETIF_F_SOFT_FEATURES;
6338	dev->features |= NETIF_F_SOFT_FEATURES;
6339	dev->wanted_features = dev->features & dev->hw_features;
6340
6341	if (!(dev->flags & IFF_LOOPBACK)) {
6342		dev->hw_features |= NETIF_F_NOCACHE_COPY;
6343	}
6344
6345	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6346	 */
6347	dev->vlan_features |= NETIF_F_HIGHDMA;
6348
6349	/* Make NETIF_F_SG inheritable to tunnel devices.
6350	 */
6351	dev->hw_enc_features |= NETIF_F_SG;
6352
6353	/* Make NETIF_F_SG inheritable to MPLS.
6354	 */
6355	dev->mpls_features |= NETIF_F_SG;
6356
6357	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6358	ret = notifier_to_errno(ret);
6359	if (ret)
6360		goto err_uninit;
6361
6362	ret = netdev_register_kobject(dev);
6363	if (ret)
6364		goto err_uninit;
6365	dev->reg_state = NETREG_REGISTERED;
6366
6367	__netdev_update_features(dev);
6368
6369	/*
6370	 *	Default initial state at registry is that the
6371	 *	device is present.
6372	 */
6373
6374	set_bit(__LINK_STATE_PRESENT, &dev->state);
6375
6376	linkwatch_init_dev(dev);
6377
6378	dev_init_scheduler(dev);
6379	dev_hold(dev);
6380	list_netdevice(dev);
6381	add_device_randomness(dev->dev_addr, dev->addr_len);
6382
6383	/* If the device has permanent device address, driver should
6384	 * set dev_addr and also addr_assign_type should be set to
6385	 * NET_ADDR_PERM (default value).
6386	 */
6387	if (dev->addr_assign_type == NET_ADDR_PERM)
6388		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6389
6390	/* Notify protocols, that a new device appeared. */
6391	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6392	ret = notifier_to_errno(ret);
6393	if (ret) {
6394		rollback_registered(dev);
6395		dev->reg_state = NETREG_UNREGISTERED;
6396	}
6397	/*
6398	 *	Prevent userspace races by waiting until the network
6399	 *	device is fully setup before sending notifications.
6400	 */
6401	if (!dev->rtnl_link_ops ||
6402	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6403		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6404
6405out:
6406	return ret;
6407
6408err_uninit:
6409	if (dev->netdev_ops->ndo_uninit)
6410		dev->netdev_ops->ndo_uninit(dev);
6411	goto out;
6412}
6413EXPORT_SYMBOL(register_netdevice);
6414
6415/**
6416 *	init_dummy_netdev	- init a dummy network device for NAPI
6417 *	@dev: device to init
6418 *
6419 *	This takes a network device structure and initialize the minimum
6420 *	amount of fields so it can be used to schedule NAPI polls without
6421 *	registering a full blown interface. This is to be used by drivers
6422 *	that need to tie several hardware interfaces to a single NAPI
6423 *	poll scheduler due to HW limitations.
6424 */
6425int init_dummy_netdev(struct net_device *dev)
6426{
6427	/* Clear everything. Note we don't initialize spinlocks
6428	 * are they aren't supposed to be taken by any of the
6429	 * NAPI code and this dummy netdev is supposed to be
6430	 * only ever used for NAPI polls
6431	 */
6432	memset(dev, 0, sizeof(struct net_device));
6433
6434	/* make sure we BUG if trying to hit standard
6435	 * register/unregister code path
6436	 */
6437	dev->reg_state = NETREG_DUMMY;
6438
6439	/* NAPI wants this */
6440	INIT_LIST_HEAD(&dev->napi_list);
6441
6442	/* a dummy interface is started by default */
6443	set_bit(__LINK_STATE_PRESENT, &dev->state);
6444	set_bit(__LINK_STATE_START, &dev->state);
6445
6446	/* Note : We dont allocate pcpu_refcnt for dummy devices,
6447	 * because users of this 'device' dont need to change
6448	 * its refcount.
6449	 */
6450
6451	return 0;
6452}
6453EXPORT_SYMBOL_GPL(init_dummy_netdev);
6454
6455
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Locked front end for register_netdevice(): takes the rtnl
 *	semaphore, registers @dev and releases the semaphore again.
 *	A %NETDEV_REGISTER message is sent to the netdev notifier chain.
 *	0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	The device name is expanded if a format string was passed to
 *	alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int rc;

	rtnl_lock();
	rc = register_netdevice(dev);
	rtnl_unlock();

	return rc;
}
EXPORT_SYMBOL(register_netdev);
6479
6480int netdev_refcnt_read(const struct net_device *dev)
6481{
6482	int i, refcnt = 0;
6483
6484	for_each_possible_cpu(i)
6485		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6486	return refcnt;
6487}
6488EXPORT_SYMBOL(netdev_refcnt_read);
6489
/**
 * netdev_wait_allrefs - wait until all references are gone.
 * @dev: target net_device
 *
 * This is called when unregistering network devices.
 *
 * Any protocol or device that holds a reference should register
 * for netdevice notification, and cleanup and put back the
 * reference if they receive an UNREGISTER event.
 * We can get stuck here if buggy protocols don't correctly
 * call dev_put.
 *
 * The loop polls the reference count every 250 ms.  Once more than a
 * second has passed since the last rebroadcast it re-sends
 * NETDEV_UNREGISTER and NETDEV_UNREGISTER_FINAL, and once more than ten
 * seconds have passed since the last warning it logs the remaining count.
 * It takes the RTNL itself, so it must be called without it held.
 */
static void netdev_wait_allrefs(struct net_device *dev)
{
	unsigned long rebroadcast_time, warning_time;
	int refcnt;

	linkwatch_forget_dev(dev);

	rebroadcast_time = warning_time = jiffies;
	refcnt = netdev_refcnt_read(dev);

	while (refcnt != 0) {
		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
			rtnl_lock();

			/* Rebroadcast unregister notification */
			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

			/* Drop the RTNL around rcu_barrier(), then retake
			 * it for the FINAL notification.
			 */
			__rtnl_unlock();
			rcu_barrier();
			rtnl_lock();

			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
				     &dev->state)) {
				/* We must not have linkwatch events
				 * pending on unregister. If this
				 * happens, we simply run the queue
				 * unscheduled, resulting in a noop
				 * for this device.
				 */
				linkwatch_run_queue();
			}

			__rtnl_unlock();

			rebroadcast_time = jiffies;
		}

		msleep(250);

		refcnt = netdev_refcnt_read(dev);

		/* Logged even if refcnt has just reached zero. */
		if (time_after(jiffies, warning_time + 10 * HZ)) {
			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
				 dev->name, refcnt);
			warning_time = jiffies;
		}
	}
}
6551
6552/* The sequence is:
6553 *
6554 *	rtnl_lock();
6555 *	...
6556 *	register_netdevice(x1);
6557 *	register_netdevice(x2);
6558 *	...
6559 *	unregister_netdevice(y1);
6560 *	unregister_netdevice(y2);
6561 *      ...
6562 *	rtnl_unlock();
6563 *	free_netdev(y1);
6564 *	free_netdev(y2);
6565 *
6566 * We are invoked by rtnl_unlock().
6567 * This allows us to deal with problems:
6568 * 1) We can delete sysfs objects which invoke hotplug
6569 *    without deadlocking with linkwatch via keventd.
6570 * 2) Since we run with the RTNL semaphore not held, we can sleep
6571 *    safely in order to wait for the netdev refcnt to drop to zero.
6572 *
6573 * We must not return until all unregister events added during
6574 * the interval the lock was held have been completed.
6575 */
6576void netdev_run_todo(void)
6577{
6578	struct list_head list;
6579
6580	/* Snapshot list, allow later requests */
6581	list_replace_init(&net_todo_list, &list);
6582
6583	__rtnl_unlock();
6584
6585
6586	/* Wait for rcu callbacks to finish before next phase */
6587	if (!list_empty(&list))
6588		rcu_barrier();
6589
6590	while (!list_empty(&list)) {
6591		struct net_device *dev
6592			= list_first_entry(&list, struct net_device, todo_list);
6593		list_del(&dev->todo_list);
6594
6595		rtnl_lock();
6596		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6597		__rtnl_unlock();
6598
6599		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6600			pr_err("network todo '%s' but state %d\n",
6601			       dev->name, dev->reg_state);
6602			dump_stack();
6603			continue;
6604		}
6605
6606		dev->reg_state = NETREG_UNREGISTERED;
6607
6608		on_each_cpu(flush_backlog, dev, 1);
6609
6610		netdev_wait_allrefs(dev);
6611
6612		/* paranoia */
6613		BUG_ON(netdev_refcnt_read(dev));
6614		BUG_ON(!list_empty(&dev->ptype_all));
6615		BUG_ON(!list_empty(&dev->ptype_specific));
6616		WARN_ON(rcu_access_pointer(dev->ip_ptr));
6617		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6618		WARN_ON(dev->dn_ptr);
6619
6620		if (dev->destructor)
6621			dev->destructor(dev);
6622
6623		/* Report a network device has been unregistered */
6624		rtnl_lock();
6625		dev_net(dev)->dev_unreg_count--;
6626		__rtnl_unlock();
6627		wake_up(&netdev_unregistering_wq);
6628
6629		/* Free network device */
6630		kobject_put(&dev->dev.kobj);
6631	}
6632}
6633
6634/* Convert net_device_stats to rtnl_link_stats64.  They have the same
6635 * fields in the same order, with only the type differing.
6636 */
6637void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6638			     const struct net_device_stats *netdev_stats)
6639{
6640#if BITS_PER_LONG == 64
6641	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6642	memcpy(stats64, netdev_stats, sizeof(*stats64));
6643#else
6644	size_t i, n = sizeof(*stats64) / sizeof(u64);
6645	const unsigned long *src = (const unsigned long *)netdev_stats;
6646	u64 *dst = (u64 *)stats64;
6647
6648	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6649		     sizeof(*stats64) / sizeof(u64));
6650	for (i = 0; i < n; i++)
6651		dst[i] = src[i];
6652#endif
6653}
6654EXPORT_SYMBOL(netdev_stats_to_stats64);
6655
6656/**
6657 *	dev_get_stats	- get network device statistics
6658 *	@dev: device to get statistics from
6659 *	@storage: place to store stats
6660 *
6661 *	Get network statistics from device. Return @storage.
6662 *	The device driver may provide its own method by setting
6663 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6664 *	otherwise the internal statistics structure is used.
6665 */
6666struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6667					struct rtnl_link_stats64 *storage)
6668{
6669	const struct net_device_ops *ops = dev->netdev_ops;
6670
6671	if (ops->ndo_get_stats64) {
6672		memset(storage, 0, sizeof(*storage));
6673		ops->ndo_get_stats64(dev, storage);
6674	} else if (ops->ndo_get_stats) {
6675		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6676	} else {
6677		netdev_stats_to_stats64(storage, &dev->stats);
6678	}
6679	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6680	storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6681	return storage;
6682}
6683EXPORT_SYMBOL(dev_get_stats);
6684
/* Return @dev's ingress queue.  With CONFIG_NET_CLS_ACT, a missing
 * queue is allocated, attached to noop_qdisc and published in
 * dev->ingress_queue; NULL is returned if that allocation fails.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			RCU_INIT_POINTER(ingress->qdisc, &noop_qdisc);
			ingress->qdisc_sleeping = &noop_qdisc;
			/* Publish only once fully initialised */
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
6702
6703static const struct ethtool_ops default_ethtool_ops;
6704
6705void netdev_set_default_ethtool_ops(struct net_device *dev,
6706				    const struct ethtool_ops *ops)
6707{
6708	if (dev->ethtool_ops == &default_ethtool_ops)
6709		dev->ethtool_ops = ops;
6710}
6711EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6712
6713void netdev_freemem(struct net_device *dev)
6714{
6715	char *addr = (char *)dev - dev->padded;
6716
6717	kvfree(addr);
6718}
6719
6720/**
6721 *	alloc_netdev_mqs - allocate network device
6722 *	@sizeof_priv:		size of private data to allocate space for
6723 *	@name:			device name format string
6724 *	@name_assign_type: 	origin of device name
6725 *	@setup:			callback to initialize device
6726 *	@txqs:			the number of TX subqueues to allocate
6727 *	@rxqs:			the number of RX subqueues to allocate
6728 *
6729 *	Allocates a struct net_device with private data area for driver use
6730 *	and performs basic initialization.  Also allocates subqueue structs
6731 *	for each queue on the device.
6732 */
6733struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6734		unsigned char name_assign_type,
6735		void (*setup)(struct net_device *),
6736		unsigned int txqs, unsigned int rxqs)
6737{
6738	struct net_device *dev;
6739	size_t alloc_size;
6740	struct net_device *p;
6741
6742	BUG_ON(strlen(name) >= sizeof(dev->name));
6743
6744	if (txqs < 1) {
6745		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6746		return NULL;
6747	}
6748
6749#ifdef CONFIG_SYSFS
6750	if (rxqs < 1) {
6751		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6752		return NULL;
6753	}
6754#endif
6755
6756	alloc_size = sizeof(struct net_device);
6757	if (sizeof_priv) {
6758		/* ensure 32-byte alignment of private area */
6759		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6760		alloc_size += sizeof_priv;
6761	}
6762	/* ensure 32-byte alignment of whole construct */
6763	alloc_size += NETDEV_ALIGN - 1;
6764
6765	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6766	if (!p)
6767		p = vzalloc(alloc_size);
6768	if (!p)
6769		return NULL;
6770
6771	dev = PTR_ALIGN(p, NETDEV_ALIGN);
6772	dev->padded = (char *)dev - (char *)p;
6773
6774	dev->pcpu_refcnt = alloc_percpu(int);
6775	if (!dev->pcpu_refcnt)
6776		goto free_dev;
6777
6778	if (dev_addr_init(dev))
6779		goto free_pcpu;
6780
6781	dev_mc_init(dev);
6782	dev_uc_init(dev);
6783
6784	dev_net_set(dev, &init_net);
6785
6786	dev->gso_max_size = GSO_MAX_SIZE;
6787	dev->gso_max_segs = GSO_MAX_SEGS;
6788	dev->gso_min_segs = 0;
6789
6790	INIT_LIST_HEAD(&dev->napi_list);
6791	INIT_LIST_HEAD(&dev->unreg_list);
6792	INIT_LIST_HEAD(&dev->close_list);
6793	INIT_LIST_HEAD(&dev->link_watch_list);
6794	INIT_LIST_HEAD(&dev->adj_list.upper);
6795	INIT_LIST_HEAD(&dev->adj_list.lower);
6796	INIT_LIST_HEAD(&dev->all_adj_list.upper);
6797	INIT_LIST_HEAD(&dev->all_adj_list.lower);
6798	INIT_LIST_HEAD(&dev->ptype_all);
6799	INIT_LIST_HEAD(&dev->ptype_specific);
6800	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
6801	setup(dev);
6802
6803	dev->num_tx_queues = txqs;
6804	dev->real_num_tx_queues = txqs;
6805	if (netif_alloc_netdev_queues(dev))
6806		goto free_all;
6807
6808#ifdef CONFIG_SYSFS
6809	dev->num_rx_queues = rxqs;
6810	dev->real_num_rx_queues = rxqs;
6811	if (netif_alloc_rx_queues(dev))
6812		goto free_all;
6813#endif
6814
6815	strcpy(dev->name, name);
6816	dev->name_assign_type = name_assign_type;
6817	dev->group = INIT_NETDEV_GROUP;
6818	if (!dev->ethtool_ops)
6819		dev->ethtool_ops = &default_ethtool_ops;
6820	return dev;
6821
6822free_all:
6823	free_netdev(dev);
6824	return NULL;
6825
6826free_pcpu:
6827	free_percpu(dev->pcpu_refcnt);
6828free_dev:
6829	netdev_freemem(dev);
6830	return NULL;
6831}
6832EXPORT_SYMBOL(alloc_netdev_mqs);
6833
6834/**
6835 *	free_netdev - free network device
6836 *	@dev: device
6837 *
6838 *	This function does the last stage of destroying an allocated device
6839 * 	interface. The reference to the device object is released.
6840 *	If this is the last reference then it will be freed.
6841 */
6842void free_netdev(struct net_device *dev)
6843{
6844	struct napi_struct *p, *n;
6845
6846	release_net(dev_net(dev));
6847
6848	netif_free_tx_queues(dev);
6849#ifdef CONFIG_SYSFS
6850	kvfree(dev->_rx);
6851#endif
6852
6853	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6854
6855	/* Flush device addresses */
6856	dev_addr_flush(dev);
6857
6858	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6859		netif_napi_del(p);
6860
6861	free_percpu(dev->pcpu_refcnt);
6862	dev->pcpu_refcnt = NULL;
6863
6864	/*  Compatibility with error handling in drivers */
6865	if (dev->reg_state == NETREG_UNINITIALIZED) {
6866		netdev_freemem(dev);
6867		return;
6868	}
6869
6870	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6871	dev->reg_state = NETREG_RELEASED;
6872
6873	/* will free via device release */
6874	put_device(&dev->dev);
6875}
6876EXPORT_SYMBOL(free_netdev);
6877
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	May sleep.  The expedited RCU grace period is used when the RTNL
 *	lock is held, the normal one otherwise.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6893
6894/**
6895 *	unregister_netdevice_queue - remove device from the kernel
6896 *	@dev: device
6897 *	@head: list
6898 *
6899 *	This function shuts down a device interface and removes it
6900 *	from the kernel tables.
6901 *	If head not NULL, device is queued to be unregistered later.
6902 *
6903 *	Callers must hold the rtnl semaphore.  You may want
6904 *	unregister_netdev() instead of this.
6905 */
6906
6907void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6908{
6909	ASSERT_RTNL();
6910
6911	if (head) {
6912		list_move_tail(&dev->unreg_list, head);
6913	} else {
6914		rollback_registered(dev);
6915		/* Finish processing unregister after unlock */
6916		net_set_todo(dev);
6917	}
6918}
6919EXPORT_SYMBOL(unregister_netdevice_queue);
6920
6921/**
6922 *	unregister_netdevice_many - unregister many devices
6923 *	@head: list of devices
6924 *
6925 *  Note: As most callers use a stack allocated list_head,
6926 *  we force a list_del() to make sure stack wont be corrupted later.
6927 */
6928void unregister_netdevice_many(struct list_head *head)
6929{
6930	struct net_device *dev;
6931
6932	if (!list_empty(head)) {
6933		rollback_registered_many(head);
6934		list_for_each_entry(dev, head, unreg_list)
6935			net_set_todo(dev);
6936		list_del(head);
6937	}
6938}
6939EXPORT_SYMBOL(unregister_netdevice_many);
6940
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shuts down a device interface and removes it from the kernel
 *	tables.
 *
 *	Thin wrapper that takes the rtnl semaphore around
 *	unregister_netdevice().  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6959
6960/**
6961 *	dev_change_net_namespace - move device to different nethost namespace
6962 *	@dev: device
6963 *	@net: network namespace
6964 *	@pat: If not NULL name pattern to try if the current device name
6965 *	      is already taken in the destination network namespace.
6966 *
6967 *	This function shuts down a device interface and moves it
6968 *	to a new network namespace. On success 0 is returned, on
6969 *	a failure a netagive errno code is returned.
6970 *
6971 *	Callers must hold the rtnl semaphore.
6972 */
6973
6974int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6975{
6976	int err;
6977
6978	ASSERT_RTNL();
6979
6980	/* Don't allow namespace local devices to be moved. */
6981	err = -EINVAL;
6982	if (dev->features & NETIF_F_NETNS_LOCAL)
6983		goto out;
6984
6985	/* Ensure the device has been registrered */
6986	if (dev->reg_state != NETREG_REGISTERED)
6987		goto out;
6988
6989	/* Get out if there is nothing todo */
6990	err = 0;
6991	if (net_eq(dev_net(dev), net))
6992		goto out;
6993
6994	/* Pick the destination device name, and ensure
6995	 * we can use it in the destination network namespace.
6996	 */
6997	err = -EEXIST;
6998	if (__dev_get_by_name(net, dev->name)) {
6999		/* We get here if we can't use the current device name */
7000		if (!pat)
7001			goto out;
7002		if (dev_get_valid_name(net, dev, pat) < 0)
7003			goto out;
7004	}
7005
7006	/*
7007	 * And now a mini version of register_netdevice unregister_netdevice.
7008	 */
7009
7010	/* If device is running close it first. */
7011	dev_close(dev);
7012
7013	/* And unlink it from device chain */
7014	err = -ENODEV;
7015	unlist_netdevice(dev);
7016
7017	synchronize_net();
7018
7019	/* Shutdown queueing discipline. */
7020	dev_shutdown(dev);
7021
7022	/* Notify protocols, that we are about to destroy
7023	   this device. They should clean all the things.
7024
7025	   Note that dev->reg_state stays at NETREG_REGISTERED.
7026	   This is wanted because this way 8021q and macvlan know
7027	   the device is just moving and can keep their slaves up.
7028	*/
7029	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7030	rcu_barrier();
7031	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7032	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7033
7034	/*
7035	 *	Flush the unicast and multicast chains
7036	 */
7037	dev_uc_flush(dev);
7038	dev_mc_flush(dev);
7039
7040	/* Send a netdev-removed uevent to the old namespace */
7041	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7042	netdev_adjacent_del_links(dev);
7043
7044	/* Actually switch the network namespace */
7045	dev_net_set(dev, net);
7046
7047	/* If there is an ifindex conflict assign a new one */
7048	if (__dev_get_by_index(net, dev->ifindex)) {
7049		int iflink = (dev->iflink == dev->ifindex);
7050		dev->ifindex = dev_new_index(net);
7051		if (iflink)
7052			dev->iflink = dev->ifindex;
7053	}
7054
7055	/* Send a netdev-add uevent to the new namespace */
7056	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7057	netdev_adjacent_add_links(dev);
7058
7059	/* Fixup kobjects */
7060	err = device_rename(&dev->dev, dev->name);
7061	WARN_ON(err);
7062
7063	/* Add the device back in the hashes */
7064	list_netdevice(dev);
7065
7066	/* Notify protocols, that a new device appeared. */
7067	call_netdevice_notifiers(NETDEV_REGISTER, dev);
7068
7069	/*
7070	 *	Prevent userspace races by waiting until the network
7071	 *	device is fully setup before sending notifications.
7072	 */
7073	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7074
7075	synchronize_net();
7076	err = 0;
7077out:
7078	return err;
7079}
7080EXPORT_SYMBOL_GPL(dev_change_net_namespace);
7081
7082static int dev_cpu_callback(struct notifier_block *nfb,
7083			    unsigned long action,
7084			    void *ocpu)
7085{
7086	struct sk_buff **list_skb;
7087	struct sk_buff *skb;
7088	unsigned int cpu, oldcpu = (unsigned long)ocpu;
7089	struct softnet_data *sd, *oldsd;
7090
7091	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7092		return NOTIFY_OK;
7093
7094	local_irq_disable();
7095	cpu = smp_processor_id();
7096	sd = &per_cpu(softnet_data, cpu);
7097	oldsd = &per_cpu(softnet_data, oldcpu);
7098
7099	/* Find end of our completion_queue. */
7100	list_skb = &sd->completion_queue;
7101	while (*list_skb)
7102		list_skb = &(*list_skb)->next;
7103	/* Append completion queue from offline CPU. */
7104	*list_skb = oldsd->completion_queue;
7105	oldsd->completion_queue = NULL;
7106
7107	/* Append output queue from offline CPU. */
7108	if (oldsd->output_queue) {
7109		*sd->output_queue_tailp = oldsd->output_queue;
7110		sd->output_queue_tailp = oldsd->output_queue_tailp;
7111		oldsd->output_queue = NULL;
7112		oldsd->output_queue_tailp = &oldsd->output_queue;
7113	}
7114	/* Append NAPI poll list from offline CPU, with one exception :
7115	 * process_backlog() must be called by cpu owning percpu backlog.
7116	 * We properly handle process_queue & input_pkt_queue later.
7117	 */
7118	while (!list_empty(&oldsd->poll_list)) {
7119		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7120							    struct napi_struct,
7121							    poll_list);
7122
7123		list_del_init(&napi->poll_list);
7124		if (napi->poll == process_backlog)
7125			napi->state = 0;
7126		else
7127			____napi_schedule(sd, napi);
7128	}
7129
7130	raise_softirq_irqoff(NET_TX_SOFTIRQ);
7131	local_irq_enable();
7132
7133	/* Process offline CPU's input_pkt_queue */
7134	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7135		netif_rx_ni(skb);
7136		input_queue_head_incr(oldsd);
7137	}
7138	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7139		netif_rx_ni(skb);
7140		input_queue_head_incr(oldsd);
7141	}
7142
7143	return NOTIFY_OK;
7144}
7145
7146
7147/**
7148 *	netdev_increment_features - increment feature set by one
7149 *	@all: current feature set
7150 *	@one: new feature set
7151 *	@mask: mask feature set
7152 *
7153 *	Computes a new feature set after adding a device with feature set
7154 *	@one to the master device with current feature set @all.  Will not
7155 *	enable anything that is off in @mask. Returns the new feature set.
7156 */
7157netdev_features_t netdev_increment_features(netdev_features_t all,
7158	netdev_features_t one, netdev_features_t mask)
7159{
7160	if (mask & NETIF_F_GEN_CSUM)
7161		mask |= NETIF_F_ALL_CSUM;
7162	mask |= NETIF_F_VLAN_CHALLENGED;
7163
7164	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7165	all &= one | ~NETIF_F_ALL_FOR_ALL;
7166
7167	/* If one device supports hw checksumming, set for all. */
7168	if (all & NETIF_F_GEN_CSUM)
7169		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7170
7171	return all;
7172}
7173EXPORT_SYMBOL(netdev_increment_features);
7174
7175static struct hlist_head * __net_init netdev_create_hash(void)
7176{
7177	int i;
7178	struct hlist_head *hash;
7179
7180	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7181	if (hash != NULL)
7182		for (i = 0; i < NETDEV_HASHENTRIES; i++)
7183			INIT_HLIST_HEAD(&hash[i]);
7184
7185	return hash;
7186}
7187
7188/* Initialize per network namespace state */
7189static int __net_init netdev_init(struct net *net)
7190{
7191	if (net != &init_net)
7192		INIT_LIST_HEAD(&net->dev_base_head);
7193
7194	net->dev_name_head = netdev_create_hash();
7195	if (net->dev_name_head == NULL)
7196		goto err_name;
7197
7198	net->dev_index_head = netdev_create_hash();
7199	if (net->dev_index_head == NULL)
7200		goto err_idx;
7201
7202	return 0;
7203
7204err_idx:
7205	kfree(net->dev_name_head);
7206err_name:
7207	return -ENOMEM;
7208}
7209
7210/**
7211 *	netdev_drivername - network driver for the device
7212 *	@dev: network device
7213 *
7214 *	Determine network driver for device.
7215 */
7216const char *netdev_drivername(const struct net_device *dev)
7217{
7218	const struct device_driver *driver;
7219	const struct device *parent;
7220	const char *empty = "";
7221
7222	parent = dev->dev.parent;
7223	if (!parent)
7224		return empty;
7225
7226	driver = parent->driver;
7227	if (driver && driver->name)
7228		return driver->name;
7229	return empty;
7230}
7231
7232static void __netdev_printk(const char *level, const struct net_device *dev,
7233			    struct va_format *vaf)
7234{
7235	if (dev && dev->dev.parent) {
7236		dev_printk_emit(level[1] - '0',
7237				dev->dev.parent,
7238				"%s %s %s%s: %pV",
7239				dev_driver_string(dev->dev.parent),
7240				dev_name(dev->dev.parent),
7241				netdev_name(dev), netdev_reg_state(dev),
7242				vaf);
7243	} else if (dev) {
7244		printk("%s%s%s: %pV",
7245		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
7246	} else {
7247		printk("%s(NULL net_device): %pV", level, vaf);
7248	}
7249}
7250
7251void netdev_printk(const char *level, const struct net_device *dev,
7252		   const char *format, ...)
7253{
7254	struct va_format vaf;
7255	va_list args;
7256
7257	va_start(args, format);
7258
7259	vaf.fmt = format;
7260	vaf.va = &args;
7261
7262	__netdev_printk(level, dev, &vaf);
7263
7264	va_end(args);
7265}
7266EXPORT_SYMBOL(netdev_printk);
7267
7268#define define_netdev_printk_level(func, level)			\
7269void func(const struct net_device *dev, const char *fmt, ...)	\
7270{								\
7271	struct va_format vaf;					\
7272	va_list args;						\
7273								\
7274	va_start(args, fmt);					\
7275								\
7276	vaf.fmt = fmt;						\
7277	vaf.va = &args;						\
7278								\
7279	__netdev_printk(level, dev, &vaf);			\
7280								\
7281	va_end(args);						\
7282}								\
7283EXPORT_SYMBOL(func);
7284
7285define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7286define_netdev_printk_level(netdev_alert, KERN_ALERT);
7287define_netdev_printk_level(netdev_crit, KERN_CRIT);
7288define_netdev_printk_level(netdev_err, KERN_ERR);
7289define_netdev_printk_level(netdev_warn, KERN_WARNING);
7290define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7291define_netdev_printk_level(netdev_info, KERN_INFO);
7292
7293static void __net_exit netdev_exit(struct net *net)
7294{
7295	kfree(net->dev_name_head);
7296	kfree(net->dev_index_head);
7297}
7298
7299static struct pernet_operations __net_initdata netdev_net_ops = {
7300	.init = netdev_init,
7301	.exit = netdev_exit,
7302};
7303
7304static void __net_exit default_device_exit(struct net *net)
7305{
7306	struct net_device *dev, *aux;
7307	/*
7308	 * Push all migratable network devices back to the
7309	 * initial network namespace
7310	 */
7311	rtnl_lock();
7312	for_each_netdev_safe(net, dev, aux) {
7313		int err;
7314		char fb_name[IFNAMSIZ];
7315
7316		/* Ignore unmoveable devices (i.e. loopback) */
7317		if (dev->features & NETIF_F_NETNS_LOCAL)
7318			continue;
7319
7320		/* Leave virtual devices for the generic cleanup */
7321		if (dev->rtnl_link_ops)
7322			continue;
7323
7324		/* Push remaining network devices to init_net */
7325		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7326		err = dev_change_net_namespace(dev, &init_net, fb_name);
7327		if (err) {
7328			pr_emerg("%s: failed to move %s to init_net: %d\n",
7329				 __func__, dev->name, err);
7330			BUG();
7331		}
7332	}
7333	rtnl_unlock();
7334}
7335
7336static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7337{
7338	/* Return with the rtnl_lock held when there are no network
7339	 * devices unregistering in any network namespace in net_list.
7340	 */
7341	struct net *net;
7342	bool unregistering;
7343	DEFINE_WAIT_FUNC(wait, woken_wake_function);
7344
7345	add_wait_queue(&netdev_unregistering_wq, &wait);
7346	for (;;) {
7347		unregistering = false;
7348		rtnl_lock();
7349		list_for_each_entry(net, net_list, exit_list) {
7350			if (net->dev_unreg_count > 0) {
7351				unregistering = true;
7352				break;
7353			}
7354		}
7355		if (!unregistering)
7356			break;
7357		__rtnl_unlock();
7358
7359		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
7360	}
7361	remove_wait_queue(&netdev_unregistering_wq, &wait);
7362}
7363
7364static void __net_exit default_device_exit_batch(struct list_head *net_list)
7365{
7366	/* At exit all network devices most be removed from a network
7367	 * namespace.  Do this in the reverse order of registration.
7368	 * Do this across as many network namespaces as possible to
7369	 * improve batching efficiency.
7370	 */
7371	struct net_device *dev;
7372	struct net *net;
7373	LIST_HEAD(dev_kill_list);
7374
7375	/* To prevent network device cleanup code from dereferencing
7376	 * loopback devices or network devices that have been freed
7377	 * wait here for all pending unregistrations to complete,
7378	 * before unregistring the loopback device and allowing the
7379	 * network namespace be freed.
7380	 *
7381	 * The netdev todo list containing all network devices
7382	 * unregistrations that happen in default_device_exit_batch
7383	 * will run in the rtnl_unlock() at the end of
7384	 * default_device_exit_batch.
7385	 */
7386	rtnl_lock_unregistering(net_list);
7387	list_for_each_entry(net, net_list, exit_list) {
7388		for_each_netdev_reverse(net, dev) {
7389			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7390				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7391			else
7392				unregister_netdevice_queue(dev, &dev_kill_list);
7393		}
7394	}
7395	unregister_netdevice_many(&dev_kill_list);
7396	rtnl_unlock();
7397}
7398
7399static struct pernet_operations __net_initdata default_device_ops = {
7400	.exit = default_device_exit,
7401	.exit_batch = default_device_exit_batch,
7402};
7403
7404/*
7405 *	Initialize the DEV module. At boot time this walks the device list and
7406 *	unhooks any devices that fail to initialise (normally hardware not
7407 *	present) and leaves us with a valid list of present and active devices.
7408 *
7409 */
7410
7411/*
7412 *       This is called single threaded during boot, so no need
7413 *       to take the rtnl semaphore.
7414 */
7415static int __init net_dev_init(void)
7416{
7417	int i, rc = -ENOMEM;
7418
7419	BUG_ON(!dev_boot_phase);
7420
7421	if (dev_proc_init())
7422		goto out;
7423
7424	if (netdev_kobject_init())
7425		goto out;
7426
7427	INIT_LIST_HEAD(&ptype_all);
7428	for (i = 0; i < PTYPE_HASH_SIZE; i++)
7429		INIT_LIST_HEAD(&ptype_base[i]);
7430
7431	INIT_LIST_HEAD(&offload_base);
7432
7433	if (register_pernet_subsys(&netdev_net_ops))
7434		goto out;
7435
7436	/*
7437	 *	Initialise the packet receive queues.
7438	 */
7439
7440	for_each_possible_cpu(i) {
7441		struct softnet_data *sd = &per_cpu(softnet_data, i);
7442
7443		skb_queue_head_init(&sd->input_pkt_queue);
7444		skb_queue_head_init(&sd->process_queue);
7445		INIT_LIST_HEAD(&sd->poll_list);
7446		sd->output_queue_tailp = &sd->output_queue;
7447#ifdef CONFIG_RPS
7448		sd->csd.func = rps_trigger_softirq;
7449		sd->csd.info = sd;
7450		sd->cpu = i;
7451#endif
7452
7453		sd->backlog.poll = process_backlog;
7454		sd->backlog.weight = weight_p;
7455	}
7456
7457	dev_boot_phase = 0;
7458
7459	/* The loopback device is special if any other network devices
7460	 * is present in a network namespace the loopback device must
7461	 * be present. Since we now dynamically allocate and free the
7462	 * loopback device ensure this invariant is maintained by
7463	 * keeping the loopback device as the first device on the
7464	 * list of network devices.  Ensuring the loopback devices
7465	 * is the first device that appears and the last network device
7466	 * that disappears.
7467	 */
7468	if (register_pernet_device(&loopback_net_ops))
7469		goto out;
7470
7471	if (register_pernet_device(&default_device_ops))
7472		goto out;
7473
7474	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7475	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7476
7477	hotcpu_notifier(dev_cpu_callback, 0);
7478	dst_init();
7479	rc = 0;
7480out:
7481	return rc;
7482}
7483
7484subsys_initcall(net_dev_init);