/*
 * NET3	Protocol independent device support routines.
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *	Authors:	Ross Biro
 *			Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *			Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *		Thomas Bogendoerfer:	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *		Michael Chastain:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *		Paul Rusty Russell:	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */

#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/skbuff.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/busy_poll.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_ingress.h>
#include <linux/crash_dump.h>
#include <linux/sctp.h>
#include <net/udp_tunnel.h>
#include <linux/net_namespace.h>
#include <linux/indirect_call_wrapper.h>
#include <net/devlink.h>

#include "net-sysfs.h"

#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;	/* Taps */
static struct list_head offload_base __read_mostly;

static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
					 struct netdev_notifier_info *info);
static int call_netdevice_notifiers_extack(unsigned long val,
					   struct net_device *dev,
					   struct netlink_ext_ack *extack);
static struct napi_struct *napi_by_id(unsigned int napi_id);

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * For example usages, see register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

static DEFINE_MUTEX(ifalias_mutex);

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);

static seqcount_t devnet_rename_seq;

static inline void dev_base_seq_inc(struct net *net)
{
	while (++net->dev_base_seq == 0)
		;
}

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));

	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(net);
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(dev_net(dev));
}

/*
 * Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 * Device drivers call our routines to queue packets here. We empty the
 * queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] = {
	ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] = {
	"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	"_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	"_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	"_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	"_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	"_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	"_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	"_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	"_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	"_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	"_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	"_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	"_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	"_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	"_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************
 *
 *		Protocol management and registration routines
 *
 *******************************************************************************/


/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if a protocol handler that mangles packets
 *	were first on the list, it would have no way to sense that the
 *	packet is cloned and should be copied-on-write, so it would
 *	modify the clone and subsequent readers would get a broken packet.
 *							--ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
	else
		return pt->dev ? &pt->dev->ptype_specific :
				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep and therefore cannot guarantee that all
 *	CPUs that are in the middle of receiving packets will see the new
 *	packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);
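
/* Illustrative sketch (not part of the original file): a typical caller
 * registers a struct packet_type and later removes it. The handler and
 * variable names (my_ipv4_rcv, my_pt) are hypothetical.
 *
 *	static int my_ipv4_rcv(struct sk_buff *skb, struct net_device *dev,
 *			       struct packet_type *pt,
 *			       struct net_device *orig_dev)
 *	{
 *		// skb is our own reference; consume it when done
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_pt __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_IP),
 *		.func = my_ipv4_rcv,
 *	};
 *
 *	dev_add_pack(&my_pt);
 *	...
 *	dev_remove_pack(&my_pt);	// sleeps; see below
 */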

/**
 *	__dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all CPUs have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	pr_warn("dev_remove_pack: %p not found\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *	dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);

/**
 *	dev_add_offload - register offload handlers
 *	@po: protocol offload declaration
 *
 *	Add protocol offload handlers to the networking stack. The passed
 *	&proto_offload is linked into kernel lists and may not be freed until
 *	it has been removed from the kernel lists.
 *
 *	This call does not sleep and therefore cannot guarantee that all
 *	CPUs that are in the middle of receiving packets will see the new
 *	offload handlers (until the next received packet).
 */
void dev_add_offload(struct packet_offload *po)
{
	struct packet_offload *elem;

	spin_lock(&offload_lock);
	list_for_each_entry(elem, &offload_base, list) {
		if (po->priority < elem->priority)
			break;
	}
	list_add_rcu(&po->list, elem->list.prev);
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);
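
/* Illustrative sketch (hypothetical my_offload): the offload list is kept
 * sorted by ascending ->priority, so lower values run earlier; an entry
 * with an equal priority is inserted after the existing ones.
 *
 *	static struct packet_offload my_offload __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_IP),
 *		.priority = 10,
 *	};
 *
 *	dev_add_offload(&my_offload);
 *	...
 *	dev_remove_offload(&my_offload);
 */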

/**
 *	__dev_remove_offload - remove offload handler
 *	@po: packet offload declaration
 *
 *	Remove a protocol offload handler that was previously added to the
 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 *	is removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all CPUs have gone
 *	through a quiescent state.
 */
static void __dev_remove_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;
	struct packet_offload *po1;

	spin_lock(&offload_lock);

	list_for_each_entry(po1, head, list) {
		if (po == po1) {
			list_del_rcu(&po->list);
			goto out;
		}
	}

	pr_warn("dev_remove_offload: %p not found\n", po);
out:
	spin_unlock(&offload_lock);
}

/**
 *	dev_remove_offload - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Remove a packet offload handler that was previously added to the kernel
 *	offload handlers by dev_add_offload(). The passed &offload_type is
 *	removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	__dev_remove_offload(po);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);

/******************************************************************************
 *
 *		      Device Boot-time Settings Routines
 *
 ******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add - add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine for
 *	all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *	netdev_boot_setup_check - check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings are found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq = s[i].map.irq;
			dev->base_addr = s[i].map.base_addr;
			dev->mem_start = s[i].map.mem_start;
			dev->mem_end = s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);

/**
 *	netdev_boot_base - get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings are found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves the settings configured at boot time for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);
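
/* Illustrative example (format inferred from the parser above: up to four
 * integers followed by the device name): a kernel command line of
 *
 *	netdev=5,0x300,0,0,eth0
 *
 * stores irq 5 and I/O base 0x300 for "eth0"; a later call to
 * netdev_boot_setup_check() on the probing device picks them up.
 */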

/*******************************************************************************
 *
 *			    Device Interface Subroutines
 *
 *******************************************************************************/

/**
 *	dev_get_iflink - get 'iflink' value of an interface
 *	@dev: targeted interface
 *
 *	Indicates the ifindex the interface is linked to.
 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
 */

int dev_get_iflink(const struct net_device *dev)
{
	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
		return dev->netdev_ops->ndo_get_iflink(dev);

	return dev->ifindex;
}
EXPORT_SYMBOL(dev_get_iflink);

/**
 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
 *	@dev: targeted interface
 *	@skb: The packet.
 *
 *	For better visibility of tunnel traffic OVS needs to retrieve
 *	egress tunnel information for a packet. The following API allows
 *	the user to get this info.
 */
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct ip_tunnel_info *info;

	if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
		return -EINVAL;

	info = skb_tunnel_info_unclone(skb);
	if (!info)
		return -ENOMEM;
	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
		return -EINVAL;

	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
}
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);

/**
 *	__dev_get_by_name - find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *	dev_get_by_name_rcu - find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);
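
/* Illustrative sketch: the usual RCU lookup pattern for the _rcu variants.
 * The device pointer is only valid inside the read-side critical section
 * unless a reference is taken with dev_hold().
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_name_rcu(net, "eth0");
 *	if (dev)
 *		...use dev here, or dev_hold(dev) to keep it longer...
 *	rcu_read_unlock();
 */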

/**
 *	dev_get_by_name	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);

/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns a pointer to the device
 *	if it is found, %NULL otherwise. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns a pointer to the device
 *	if it is found, %NULL otherwise. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);

/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns a pointer to the device
 *	if it is found, NULL otherwise. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);
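
/* Illustrative sketch: the refcounted variants can be called from any
 * context; the caller then owns a reference and must release it.
 *
 *	dev = dev_get_by_index(net, ifindex);
 *	if (dev) {
 *		...safe to use dev across sleeps...
 *		dev_put(dev);
 *	}
 */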

/**
 *	dev_get_by_napi_id - find a device by napi_id
 *	@napi_id: ID of the NAPI struct
 *
 *	Search for an interface by NAPI ID. Returns a pointer to the device
 *	if it is found, %NULL otherwise. The device has not had
 *	its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_napi_id(unsigned int napi_id)
{
	struct napi_struct *napi;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (napi_id < MIN_NAPI_ID)
		return NULL;

	napi = napi_by_id(napi_id);

	return napi ? napi->dev : NULL;
}
EXPORT_SYMBOL(dev_get_by_napi_id);

/**
 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 *	@net: network namespace
 *	@name: a pointer to the buffer where the name will be stored.
 *	@ifindex: the ifindex of the interface to get the name from.
 *
 *	The use of raw_seqcount_begin() and cond_resched() before
 *	retrying is required as we want to give the writers a chance
 *	to complete when CONFIG_PREEMPT is not set.
 */
int netdev_get_name(struct net *net, char *name, int ifindex)
{
	struct net_device *dev;
	unsigned int seq;

retry:
	seq = raw_seqcount_begin(&devnet_rename_seq);
	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (!dev) {
		rcu_read_unlock();
		return -ENODEV;
	}

	strcpy(name, dev->name);
	rcu_read_unlock();
	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
		cond_resched();
		goto retry;
	}

	return 0;
}

/**
 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns a pointer to the
 *	device if it is found, NULL otherwise.
 *	The caller must hold RCU or RTNL.
 *	The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking.
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
				       const char *ha)
{
	struct net_device *dev;

	for_each_netdev_rcu(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *	__dev_get_by_flags - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns a pointer to
 *	the first matching device, or NULL if none is found. Must be called
 *	inside rtnl_lock(), and result refcount is unchanged.
 */

struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
				      unsigned short mask)
{
	struct net_device *dev, *ret;

	ASSERT_RTNL();

	ret = NULL;
	for_each_netdev(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			ret = dev;
			break;
		}
	}
	return ret;
}
EXPORT_SYMBOL(__dev_get_by_flags);

/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
bool dev_valid_name(const char *name)
{
	if (*name == '\0')
		return false;
	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
		return false;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	while (*name) {
		if (*name == '/' || *name == ':' || isspace(*name))
			return false;
		name++;
	}
	return true;
}
EXPORT_SYMBOL(dev_valid_name);
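
/* Illustrative examples of what dev_valid_name() accepts:
 *
 *	dev_valid_name("eth0")	-> true
 *	dev_valid_name("")	-> false (empty)
 *	dev_valid_name(".")	-> false (reserved)
 *	dev_valid_name("a/b")	-> false ('/' forbidden, as is ':')
 *	dev_valid_name("a b")	-> false (whitespace)
 *
 * Names of IFNAMSIZ (16) or more characters are rejected, too.
 */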

/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" - it will try to find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (i.e. 32K on most
 *	platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	if (!dev_valid_name(name))
		return -EINVAL;

	p = strchr(name, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be exactly one "%d" and no other
		 * "%" characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/* avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}

static int dev_alloc_name_ns(struct net *net,
			     struct net_device *dev,
			     const char *name)
{
	char buf[IFNAMSIZ];
	int ret;

	BUG_ON(!net);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}

/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg "lt%d" - it will try to find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (i.e. 32K on most
 *	platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	return dev_alloc_name_ns(dev_net(dev), dev, name);
}
EXPORT_SYMBOL(dev_alloc_name);
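
/* Illustrative sketch: with "eth0" and "eth2" already present in the
 * namespace, a request for the pattern "eth%d" picks the lowest free
 * unit number:
 *
 *	err = dev_alloc_name(dev, "eth%d");	// dev->name becomes "eth1"
 */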

int dev_get_valid_name(struct net *net, struct net_device *dev,
		       const char *name)
{
	BUG_ON(!net);

	if (!dev_valid_name(name))
		return -EINVAL;

	if (strchr(name, '%'))
		return dev_alloc_name_ns(net, dev, name);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (dev->name != name)
		strlcpy(dev->name, name, IFNAMSIZ);

	return 0;
}
EXPORT_SYMBOL(dev_get_valid_name);

/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change the name of a device. Format strings such as "eth%d"
 *	can be passed for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	unsigned char old_assign_type;
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);

	/* Some auto-enslaved devices e.g. failover slaves are
	 * special, as userspace might rename the device after
	 * the interface had been brought up and running since
	 * the point kernel initiated auto-enslavement. Allow
	 * live name change even when these slave devices are
	 * up and running.
	 *
	 * Typically, users of these auto-enslaving devices
	 * don't actually care about slave name change, as
	 * they are supposed to operate on master interface
	 * directly.
	 */
	if (dev->flags & IFF_UP &&
	    likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
		return -EBUSY;

	write_seqcount_begin(&devnet_rename_seq);

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		write_seqcount_end(&devnet_rename_seq);
		return 0;
	}

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		write_seqcount_end(&devnet_rename_seq);
		return err;
	}

	if (oldname[0] && !strchr(oldname, '%'))
		netdev_info(dev, "renamed from %s\n", oldname);

	old_assign_type = dev->name_assign_type;
	dev->name_assign_type = NET_NAME_RENAMED;

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		dev->name_assign_type = old_assign_type;
		write_seqcount_end(&devnet_rename_seq);
		return ret;
	}

	write_seqcount_end(&devnet_rename_seq);

	netdev_adjacent_rename_links(dev, oldname);

	write_lock_bh(&dev_base_lock);
	hlist_del_rcu(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			write_seqcount_begin(&devnet_rename_seq);
			memcpy(dev->name, oldname, IFNAMSIZ);
			memcpy(oldname, newname, IFNAMSIZ);
			dev->name_assign_type = old_assign_type;
			old_assign_type = NET_NAME_RENAMED;
			goto rollback;
		} else {
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}

/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set the ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	struct dev_ifalias *new_alias = NULL;

	if (len >= IFALIASZ)
		return -EINVAL;

	if (len) {
		new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
		if (!new_alias)
			return -ENOMEM;

		memcpy(new_alias->ifalias, alias, len);
		new_alias->ifalias[len] = 0;
	}

	mutex_lock(&ifalias_mutex);
	rcu_swap_protected(dev->ifalias, new_alias,
			   mutex_is_locked(&ifalias_mutex));
	mutex_unlock(&ifalias_mutex);

	if (new_alias)
		kfree_rcu(new_alias, rcuhead);

	return len;
}
EXPORT_SYMBOL(dev_set_alias);

/**
 *	dev_get_alias - get ifalias of a device
 *	@dev: device
 *	@name: buffer to store name of ifalias
 *	@len: size of buffer
 *
 *	Get the ifalias for a device. The caller must make sure dev
 *	cannot go away, e.g. by holding the RCU read lock or owning a
 *	reference on the device.
 */
int dev_get_alias(const struct net_device *dev, char *name, size_t len)
{
	const struct dev_ifalias *alias;
	int ret = 0;

	rcu_read_lock();
	alias = rcu_dereference(dev->ifalias);
	if (alias)
		ret = snprintf(name, len, "%s", alias->ifalias);
	rcu_read_unlock();

	return ret;
}

/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		struct netdev_notifier_change_info change_info = {
			.info.dev = dev,
		};

		call_netdevice_notifiers_info(NETDEV_CHANGE,
					      &change_info.info);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
	}
}
EXPORT_SYMBOL(netdev_state_change);

/**
 *	netdev_notify_peers - notify network peers about existence of @dev
 *	@dev: network device
 *
 *	Generate traffic such that interested network peers are aware of
 *	@dev, such as by generating a gratuitous ARP. This may be used when
 *	a device wants to inform the rest of the network about some sort of
 *	reconfiguration such as a failover event or virtual machine
 *	migration.
 */
void netdev_notify_peers(struct net_device *dev)
{
	rtnl_lock();
	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
	call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);

static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	ASSERT_RTNL();

	if (!netif_device_present(dev))
		return -ENODEV;

	/* Block netpoll from trying to do any rx path servicing.
	 * If we don't do this there is a chance ndo_poll_controller
	 * or ndo_poll may be running while we open the device
	 */
	netpoll_poll_disable(dev);

	ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	netpoll_poll_enable(dev);

	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		dev->flags |= IFF_UP;
		dev_set_rx_mode(dev);
		dev_activate(dev);
		add_device_randomness(dev->dev_addr, dev->addr_len);
	}

	return ret;
}

/**
 *	dev_open - prepare an interface for use.
 *	@dev: device to open
 *	@extack: netlink extended ack
 *
 *	Takes a device from down to up state. The device's private open
 *	function is invoked and then the multicast lists are loaded. Finally
 *	the device is moved into the up state and a %NETDEV_UP message is
 *	sent to the netdev notifier chain.
 *
 *	Calling this function on an active interface is a nop. On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
	int ret;

	if (dev->flags & IFF_UP)
		return 0;

	ret = __dev_open(dev, extack);
	if (ret < 0)
		return ret;

	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);
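
/* Illustrative sketch: bringing an interface up from kernel code. The
 * caller must hold the RTNL; "slave" is a hypothetical net_device
 * pointer, and NULL is passed when no extack is available.
 *
 *	rtnl_lock();
 *	err = dev_open(slave, NULL);
 *	rtnl_unlock();
 */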

static void __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	ASSERT_RTNL();
	might_sleep();

	list_for_each_entry(dev, head, close_list) {
		/* Temporarily disable netpoll until the interface is down */
		netpoll_poll_disable(dev);

		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * may even be on a different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of its
		 * napi_struct instances on this device.
		 */
		smp_mb__after_atomic(); /* Commit netif_running(). */
	}

	dev_deactivate_many(head);

	list_for_each_entry(dev, head, close_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 *	Call the device specific close. This cannot fail.
		 *	Only if device is UP
		 *
		 *	We allow it to be called even after a DETACH hot-plug
		 *	event.
		 */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		dev->flags &= ~IFF_UP;
		netpoll_poll_enable(dev);
	}
}

static void __dev_close(struct net_device *dev)
{
	LIST_HEAD(single);

	list_add(&dev->close_list, &single);
	__dev_close_many(&single);
	list_del(&single);
}

void dev_close_many(struct list_head *head, bool unlink)
{
	struct net_device *dev, *tmp;

	/* Remove the devices that don't need to be closed */
	list_for_each_entry_safe(dev, tmp, head, close_list)
		if (!(dev->flags & IFF_UP))
			list_del_init(&dev->close_list);

	__dev_close_many(head);

	list_for_each_entry_safe(dev, tmp, head, close_list) {
		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
		call_netdevice_notifiers(NETDEV_DOWN, dev);
		if (unlink)
			list_del_init(&dev->close_list);
	}
}
EXPORT_SYMBOL(dev_close_many);

/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 */
void dev_close(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		LIST_HEAD(single);

		list_add(&dev->close_list, &single);
		dev_close_many(&single, true);
		list_del(&single);
	}
}
EXPORT_SYMBOL(dev_close);

/**
 *	dev_disable_lro - disable Large Receive Offload on a device
 *	@dev: device
 *
 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 *	called under RTNL.  This is needed if received packets may be
 *	forwarded to another interface.
 */
void dev_disable_lro(struct net_device *dev)
{
	struct net_device *lower_dev;
	struct list_head *iter;

	dev->wanted_features &= ~NETIF_F_LRO;
	netdev_update_features(dev);

	if (unlikely(dev->features & NETIF_F_LRO))
		netdev_WARN(dev, "failed to disable LRO!\n");

	netdev_for_each_lower_dev(dev, lower_dev, iter)
		dev_disable_lro(lower_dev);
}
EXPORT_SYMBOL(dev_disable_lro);

/**
 *	dev_disable_gro_hw - disable HW Generic Receive Offload on a device
 *	@dev: device
 *
 *	Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
 *	called under RTNL.  This is needed if Generic XDP is installed on
 *	the device.
 */
static void dev_disable_gro_hw(struct net_device *dev)
{
	dev->wanted_features &= ~NETIF_F_GRO_HW;
	netdev_update_features(dev);

	if (unlikely(dev->features & NETIF_F_GRO_HW))
		netdev_WARN(dev, "failed to disable GRO_HW!\n");
}

const char *netdev_cmd_to_name(enum netdev_cmd cmd)
{
#define N(val)						\
	case NETDEV_##val:				\
		return "NETDEV_" __stringify(val);
	switch (cmd) {
	N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
	N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
	N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
	N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
	N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
	N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
	N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
	N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
	N(PRE_CHANGEADDR)
	}
#undef N
	return "UNKNOWN_NETDEV_EVENT";
}
EXPORT_SYMBOL_GPL(netdev_cmd_to_name);

static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
				   struct net_device *dev)
{
	struct netdev_notifier_info info = {
		.dev = dev,
	};

	return nb->notifier_call(nb, val, &info);
}

static int dev_boot_phase = 1;

/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered. A negative errno code
 *	is returned on a failure.
 *
 *	When registered, all registration and up events are replayed to the
 *	new notifier so that it has a race-free view of the network
 *	device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	/* Close race with setup_net() and cleanup_net() */
	down_write(&pernet_ops_rwsem);
	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			call_netdevice_notifier(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	up_write(&pernet_ops_rwsem);
	return err;

rollback:
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				goto outroll;

			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}

outroll:
	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
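
/* Illustrative sketch (hypothetical names my_netdev_event/my_nb): a
 * minimal notifier that logs devices coming up. Existing devices are
 * replayed as NETDEV_REGISTER/NETDEV_UP at registration time.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UP)
 *			pr_info("%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	err = register_netdevice_notifier(&my_nb);
 */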

/**
 *	unregister_netdevice_notifier - unregister a network notifier block
 *	@nb: notifier
 *
 *	Unregister a notifier previously registered by
 *	register_netdevice_notifier(). The notifier is unlinked from the
 *	kernel structures and may then be reused. A negative errno code
 *	is returned on a failure.
 *
 *	After unregistering, unregister and down device events are synthesized
 *	for all devices on the device list to the removed notifier to remove
 *	the need for special case cleanup code.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net *net;
	int err;

	/* Close race with setup_net() and cleanup_net() */
	down_write(&pernet_ops_rwsem);
	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	if (err)
		goto unlock;

	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}
unlock:
	rtnl_unlock();
	up_write(&pernet_ops_rwsem);
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);

/**
 *	call_netdevice_notifiers_info - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@info: notifier information data
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

static int call_netdevice_notifiers_info(unsigned long val,
					 struct netdev_notifier_info *info)
{
	ASSERT_RTNL();
	return raw_notifier_call_chain(&netdev_chain, val, info);
}

static int call_netdevice_notifiers_extack(unsigned long val,
					   struct net_device *dev,
					   struct netlink_ext_ack *extack)
{
	struct netdev_notifier_info info = {
		.dev = dev,
		.extack = extack,
	};

	return call_netdevice_notifiers_info(val, &info);
}

/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	return call_netdevice_notifiers_extack(val, dev, NULL);
}
EXPORT_SYMBOL(call_netdevice_notifiers);

/**
 *	call_netdevice_notifiers_mtu - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *	@arg: additional u32 argument passed to the notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */
static int call_netdevice_notifiers_mtu(unsigned long val,
					struct net_device *dev, u32 arg)
{
	struct netdev_notifier_info_ext info = {
		.info.dev = dev,
		.ext.mtu = arg,
	};

	BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);

	return call_netdevice_notifiers_info(val, &info.info);
}

1805#ifdef CONFIG_NET_INGRESS
1806static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
1807
1808void net_inc_ingress_queue(void)
1809{
1810 static_branch_inc(&ingress_needed_key);
1811}
1812EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1813
1814void net_dec_ingress_queue(void)
1815{
1816 static_branch_dec(&ingress_needed_key);
1817}
1818EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1819#endif
1820
1821#ifdef CONFIG_NET_EGRESS
1822static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
1823
1824void net_inc_egress_queue(void)
1825{
1826 static_branch_inc(&egress_needed_key);
1827}
1828EXPORT_SYMBOL_GPL(net_inc_egress_queue);
1829
1830void net_dec_egress_queue(void)
1831{
1832 static_branch_dec(&egress_needed_key);
1833}
1834EXPORT_SYMBOL_GPL(net_dec_egress_queue);
1835#endif
1836
1837static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
1838#ifdef CONFIG_JUMP_LABEL
1839static atomic_t netstamp_needed_deferred;
1840static atomic_t netstamp_wanted;
1841static void netstamp_clear(struct work_struct *work)
1842{
1843 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1844 int wanted;
1845
1846 wanted = atomic_add_return(deferred, &netstamp_wanted);
1847 if (wanted > 0)
1848 static_branch_enable(&netstamp_needed_key);
1849 else
1850 static_branch_disable(&netstamp_needed_key);
1851}
1852static DECLARE_WORK(netstamp_work, netstamp_clear);
1853#endif
1854
1855void net_enable_timestamp(void)
1856{
1857#ifdef CONFIG_JUMP_LABEL
1858 int wanted;
1859
1860 while (1) {
1861 wanted = atomic_read(&netstamp_wanted);
1862 if (wanted <= 0)
1863 break;
1864 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
1865 return;
1866 }
1867 atomic_inc(&netstamp_needed_deferred);
1868 schedule_work(&netstamp_work);
1869#else
1870 static_branch_inc(&netstamp_needed_key);
1871#endif
1872}
1873EXPORT_SYMBOL(net_enable_timestamp);
1874
1875void net_disable_timestamp(void)
1876{
1877#ifdef CONFIG_JUMP_LABEL
1878 int wanted;
1879
1880 while (1) {
1881 wanted = atomic_read(&netstamp_wanted);
1882 if (wanted <= 1)
1883 break;
1884 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
1885 return;
1886 }
1887 atomic_dec(&netstamp_needed_deferred);
1888 schedule_work(&netstamp_work);
1889#else
1890 static_branch_dec(&netstamp_needed_key);
1891#endif
1892}
1893EXPORT_SYMBOL(net_disable_timestamp);
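
/*
 * Illustrative sketch (kept disabled): these calls must be paired,
 * typically around the lifetime of a socket or feature that asked for
 * packet timestamps.  The example_* names are hypothetical.
 */
#if 0
static void example_timestamps_on(void)
{
	net_enable_timestamp();		/* net_timestamp_set() now stamps */
}

static void example_timestamps_off(void)
{
	net_disable_timestamp();	/* drop our reference again */
}
#endif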
1894
1895static inline void net_timestamp_set(struct sk_buff *skb)
1896{
1897 skb->tstamp = 0;
1898 if (static_branch_unlikely(&netstamp_needed_key))
1899 __net_timestamp(skb);
1900}
1901
1902#define net_timestamp_check(COND, SKB) \
1903 if (static_branch_unlikely(&netstamp_needed_key)) { \
1904 if ((COND) && !(SKB)->tstamp) \
1905 __net_timestamp(SKB); \
1906	}
1907
1908bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1909{
1910 unsigned int len;
1911
1912 if (!(dev->flags & IFF_UP))
1913 return false;
1914
1915 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1916 if (skb->len <= len)
1917 return true;
1918
1919 /* if TSO is enabled, we don't care about the length as the packet
1920	 * could be forwarded without being segmented first
1921 */
1922 if (skb_is_gso(skb))
1923 return true;
1924
1925 return false;
1926}
1927EXPORT_SYMBOL_GPL(is_skb_forwardable);
1928
1929int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1930{
1931 int ret = ____dev_forward_skb(dev, skb);
1932
1933 if (likely(!ret)) {
1934 skb->protocol = eth_type_trans(skb, dev);
1935 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1936 }
1937
1938 return ret;
1939}
1940EXPORT_SYMBOL_GPL(__dev_forward_skb);
1941
1942/**
1943 * dev_forward_skb - loopback an skb to another netif
1944 *
1945 * @dev: destination network device
1946 * @skb: buffer to forward
1947 *
1948 * return values:
1949 * NET_RX_SUCCESS (no congestion)
1950 * NET_RX_DROP (packet was dropped, but freed)
1951 *
1952 * dev_forward_skb can be used for injecting an skb from the
1953 * start_xmit function of one device into the receive queue
1954 * of another device.
1955 *
1956 * The receiving device may be in another namespace, so
1957 * we have to clear all information in the skb that could
1958 * impact namespace isolation.
1959 */
1960int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1961{
1962 return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1963}
1964EXPORT_SYMBOL_GPL(dev_forward_skb);
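
/*
 * Illustrative sketch (kept disabled): a veth-style driver injecting its
 * TX frames into a peer's RX path.  struct example_priv and its ->peer
 * member are hypothetical; dev_forward_skb() is the real entry point.
 */
#if 0
struct example_priv {
	struct net_device __rcu *peer;
};

static netdev_tx_t example_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct example_priv *priv = netdev_priv(dev);
	struct net_device *peer;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	if (!peer) {
		kfree_skb(skb);
		dev->stats.tx_dropped++;
	} else if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS) {
		dev->stats.tx_dropped++;
	}
	rcu_read_unlock();
	return NETDEV_TX_OK;
}
#endif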
1965
1966static inline int deliver_skb(struct sk_buff *skb,
1967 struct packet_type *pt_prev,
1968 struct net_device *orig_dev)
1969{
1970 if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
1971 return -ENOMEM;
1972 refcount_inc(&skb->users);
1973 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1974}
1975
1976static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1977 struct packet_type **pt,
1978 struct net_device *orig_dev,
1979 __be16 type,
1980 struct list_head *ptype_list)
1981{
1982 struct packet_type *ptype, *pt_prev = *pt;
1983
1984 list_for_each_entry_rcu(ptype, ptype_list, list) {
1985 if (ptype->type != type)
1986 continue;
1987 if (pt_prev)
1988 deliver_skb(skb, pt_prev, orig_dev);
1989 pt_prev = ptype;
1990 }
1991 *pt = pt_prev;
1992}
1993
1994static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1995{
1996 if (!ptype->af_packet_priv || !skb->sk)
1997 return false;
1998
1999 if (ptype->id_match)
2000 return ptype->id_match(ptype, skb->sk);
2001 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
2002 return true;
2003
2004 return false;
2005}
2006
2007/**
2008 * dev_nit_active - return true if any network interface taps are in use
2009 *
2010 * @dev: network device to check for the presence of taps
2011 */
2012bool dev_nit_active(struct net_device *dev)
2013{
2014 return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
2015}
2016EXPORT_SYMBOL_GPL(dev_nit_active);
2017
2018/*
2019 * Support routine. Sends outgoing frames to any network
2020 * taps currently in use.
2021 */
2022
2023void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
2024{
2025 struct packet_type *ptype;
2026 struct sk_buff *skb2 = NULL;
2027 struct packet_type *pt_prev = NULL;
2028 struct list_head *ptype_list = &ptype_all;
2029
2030 rcu_read_lock();
2031again:
2032 list_for_each_entry_rcu(ptype, ptype_list, list) {
2033 if (ptype->ignore_outgoing)
2034 continue;
2035
2036 /* Never send packets back to the socket
2037 * they originated from - MvS (miquels@drinkel.ow.org)
2038 */
2039 if (skb_loop_sk(ptype, skb))
2040 continue;
2041
2042 if (pt_prev) {
2043 deliver_skb(skb2, pt_prev, skb->dev);
2044 pt_prev = ptype;
2045 continue;
2046 }
2047
2048 /* need to clone skb, done only once */
2049 skb2 = skb_clone(skb, GFP_ATOMIC);
2050 if (!skb2)
2051 goto out_unlock;
2052
2053 net_timestamp_set(skb2);
2054
2055 /* skb->nh should be correctly
2056 * set by sender, so that the second statement is
2057 * just protection against buggy protocols.
2058 */
2059 skb_reset_mac_header(skb2);
2060
2061 if (skb_network_header(skb2) < skb2->data ||
2062 skb_network_header(skb2) > skb_tail_pointer(skb2)) {
2063 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
2064 ntohs(skb2->protocol),
2065 dev->name);
2066 skb_reset_network_header(skb2);
2067 }
2068
2069 skb2->transport_header = skb2->network_header;
2070 skb2->pkt_type = PACKET_OUTGOING;
2071 pt_prev = ptype;
2072 }
2073
2074 if (ptype_list == &ptype_all) {
2075 ptype_list = &dev->ptype_all;
2076 goto again;
2077 }
2078out_unlock:
2079 if (pt_prev) {
2080 if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
2081 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
2082 else
2083 kfree_skb(skb2);
2084 }
2085 rcu_read_unlock();
2086}
2087EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
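
/*
 * Illustrative sketch (kept disabled): the "taps" above are packet_type
 * entries registered without a protocol filter.  example_tap_rcv and
 * example_tap are hypothetical; dev_add_pack() is the real interface.
 */
#if 0
static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	/* sees every frame, including the PACKET_OUTGOING clones made above */
	kfree_skb(skb);
	return 0;
}

static struct packet_type example_tap __read_mostly = {
	.type = htons(ETH_P_ALL),	/* no filter: lands on ptype_all */
	.func = example_tap_rcv,
};
/*	dev_add_pack(&example_tap); */
#endif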
2088
2089/**
2090 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
2091 * @dev: Network device
2092 * @txq: number of queues available
2093 *
2094 * If real_num_tx_queues is changed the tc mappings may no longer be
2095 * valid. To resolve this verify each tc mapping remains valid and, if
2096 * not, zero the mapping. With no priorities mapping to this
2097 * offset/count pair it will no longer be used. In the worst case, if
2098 * TC0 is invalid, nothing can be done, so disable priority mappings. It
2099 * is expected that drivers will fix this mapping if they can before
2100 * calling netif_set_real_num_tx_queues.
2101 */
2102static void netif_setup_tc(struct net_device *dev, unsigned int txq)
2103{
2104 int i;
2105 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2106
2107 /* If TC0 is invalidated disable TC mapping */
2108 if (tc->offset + tc->count > txq) {
2109 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
2110 dev->num_tc = 0;
2111 return;
2112 }
2113
2114 /* Invalidated prio to tc mappings set to TC0 */
2115 for (i = 1; i < TC_BITMASK + 1; i++) {
2116 int q = netdev_get_prio_tc_map(dev, i);
2117
2118 tc = &dev->tc_to_txq[q];
2119 if (tc->offset + tc->count > txq) {
2120 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
2121 i, q);
2122 netdev_set_prio_tc_map(dev, i, 0);
2123 }
2124 }
2125}
2126
2127int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
2128{
2129 if (dev->num_tc) {
2130 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2131 int i;
2132
2133 /* walk through the TCs and see if it falls into any of them */
2134 for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
2135 if ((txq - tc->offset) < tc->count)
2136 return i;
2137 }
2138
2139 /* didn't find it, just return -1 to indicate no match */
2140 return -1;
2141 }
2142
2143 return 0;
2144}
2145EXPORT_SYMBOL(netdev_txq_to_tc);
2146
2147#ifdef CONFIG_XPS
2148struct static_key xps_needed __read_mostly;
2149EXPORT_SYMBOL(xps_needed);
2150struct static_key xps_rxqs_needed __read_mostly;
2151EXPORT_SYMBOL(xps_rxqs_needed);
2152static DEFINE_MUTEX(xps_map_mutex);
2153#define xmap_dereference(P) \
2154 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
2155
2156static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
2157 int tci, u16 index)
2158{
2159 struct xps_map *map = NULL;
2160 int pos;
2161
2162 if (dev_maps)
2163 map = xmap_dereference(dev_maps->attr_map[tci]);
2164 if (!map)
2165 return false;
2166
2167 for (pos = map->len; pos--;) {
2168 if (map->queues[pos] != index)
2169 continue;
2170
2171 if (map->len > 1) {
2172 map->queues[pos] = map->queues[--map->len];
2173 break;
2174 }
2175
2176 RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2177 kfree_rcu(map, rcu);
2178 return false;
2179 }
2180
2181 return true;
2182}
2183
2184static bool remove_xps_queue_cpu(struct net_device *dev,
2185 struct xps_dev_maps *dev_maps,
2186 int cpu, u16 offset, u16 count)
2187{
2188 int num_tc = dev->num_tc ? : 1;
2189 bool active = false;
2190 int tci;
2191
2192 for (tci = cpu * num_tc; num_tc--; tci++) {
2193 int i, j;
2194
2195 for (i = count, j = offset; i--; j++) {
2196 if (!remove_xps_queue(dev_maps, tci, j))
2197 break;
2198 }
2199
2200 active |= i < 0;
2201 }
2202
2203 return active;
2204}
2205
2206static void reset_xps_maps(struct net_device *dev,
2207 struct xps_dev_maps *dev_maps,
2208 bool is_rxqs_map)
2209{
2210 if (is_rxqs_map) {
2211 static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
2212 RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
2213 } else {
2214 RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
2215 }
2216 static_key_slow_dec_cpuslocked(&xps_needed);
2217 kfree_rcu(dev_maps, rcu);
2218}
2219
2220static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
2221 struct xps_dev_maps *dev_maps, unsigned int nr_ids,
2222 u16 offset, u16 count, bool is_rxqs_map)
2223{
2224 bool active = false;
2225 int i, j;
2226
2227 for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
2228 j < nr_ids;)
2229 active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
2230 count);
2231 if (!active)
2232 reset_xps_maps(dev, dev_maps, is_rxqs_map);
2233
2234 if (!is_rxqs_map) {
2235 for (i = offset + (count - 1); count--; i--) {
2236 netdev_queue_numa_node_write(
2237 netdev_get_tx_queue(dev, i),
2238 NUMA_NO_NODE);
2239 }
2240 }
2241}
2242
2243static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2244 u16 count)
2245{
2246 const unsigned long *possible_mask = NULL;
2247 struct xps_dev_maps *dev_maps;
2248 unsigned int nr_ids;
2249
2250 if (!static_key_false(&xps_needed))
2251 return;
2252
2253 cpus_read_lock();
2254 mutex_lock(&xps_map_mutex);
2255
2256 if (static_key_false(&xps_rxqs_needed)) {
2257 dev_maps = xmap_dereference(dev->xps_rxqs_map);
2258 if (dev_maps) {
2259 nr_ids = dev->num_rx_queues;
2260 clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
2261 offset, count, true);
2262 }
2263 }
2264
2265 dev_maps = xmap_dereference(dev->xps_cpus_map);
2266 if (!dev_maps)
2267 goto out_no_maps;
2268
2269 if (num_possible_cpus() > 1)
2270 possible_mask = cpumask_bits(cpu_possible_mask);
2271 nr_ids = nr_cpu_ids;
2272 clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
2273 false);
2274
2275out_no_maps:
2276 mutex_unlock(&xps_map_mutex);
2277 cpus_read_unlock();
2278}
2279
2280static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2281{
2282 netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2283}
2284
2285static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
2286 u16 index, bool is_rxqs_map)
2287{
2288 struct xps_map *new_map;
2289 int alloc_len = XPS_MIN_MAP_ALLOC;
2290 int i, pos;
2291
2292 for (pos = 0; map && pos < map->len; pos++) {
2293 if (map->queues[pos] != index)
2294 continue;
2295 return map;
2296 }
2297
2298 /* Need to add tx-queue to this CPU's/rx-queue's existing map */
2299 if (map) {
2300 if (pos < map->alloc_len)
2301 return map;
2302
2303 alloc_len = map->alloc_len * 2;
2304 }
2305
2306 /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
2307 * map
2308 */
2309 if (is_rxqs_map)
2310 new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
2311 else
2312 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2313 cpu_to_node(attr_index));
2314 if (!new_map)
2315 return NULL;
2316
2317 for (i = 0; i < pos; i++)
2318 new_map->queues[i] = map->queues[i];
2319 new_map->alloc_len = alloc_len;
2320 new_map->len = pos;
2321
2322 return new_map;
2323}
2324
2325/* Must be called under cpus_read_lock */
2326int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
2327 u16 index, bool is_rxqs_map)
2328{
2329 const unsigned long *online_mask = NULL, *possible_mask = NULL;
2330 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2331 int i, j, tci, numa_node_id = -2;
2332 int maps_sz, num_tc = 1, tc = 0;
2333 struct xps_map *map, *new_map;
2334 bool active = false;
2335 unsigned int nr_ids;
2336
2337 if (dev->num_tc) {
2338 /* Do not allow XPS on subordinate device directly */
2339 num_tc = dev->num_tc;
2340 if (num_tc < 0)
2341 return -EINVAL;
2342
2343 /* If queue belongs to subordinate dev use its map */
2344 dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
2345
2346 tc = netdev_txq_to_tc(dev, index);
2347 if (tc < 0)
2348 return -EINVAL;
2349 }
2350
2351 mutex_lock(&xps_map_mutex);
2352 if (is_rxqs_map) {
2353 maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
2354 dev_maps = xmap_dereference(dev->xps_rxqs_map);
2355 nr_ids = dev->num_rx_queues;
2356 } else {
2357 maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
2358 if (num_possible_cpus() > 1) {
2359 online_mask = cpumask_bits(cpu_online_mask);
2360 possible_mask = cpumask_bits(cpu_possible_mask);
2361 }
2362 dev_maps = xmap_dereference(dev->xps_cpus_map);
2363 nr_ids = nr_cpu_ids;
2364 }
2365
2366 if (maps_sz < L1_CACHE_BYTES)
2367 maps_sz = L1_CACHE_BYTES;
2368
2369 /* allocate memory for queue storage */
2370 for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
2371 j < nr_ids;) {
2372 if (!new_dev_maps)
2373 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2374 if (!new_dev_maps) {
2375 mutex_unlock(&xps_map_mutex);
2376 return -ENOMEM;
2377 }
2378
2379 tci = j * num_tc + tc;
2380 map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
2381 NULL;
2382
2383 map = expand_xps_map(map, j, index, is_rxqs_map);
2384 if (!map)
2385 goto error;
2386
2387 RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2388 }
2389
2390 if (!new_dev_maps)
2391 goto out_no_new_maps;
2392
2393 if (!dev_maps) {
2394 /* Increment static keys at most once per type */
2395 static_key_slow_inc_cpuslocked(&xps_needed);
2396 if (is_rxqs_map)
2397 static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
2398 }
2399
2400 for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2401 j < nr_ids;) {
2402 /* copy maps belonging to foreign traffic classes */
2403 for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
2404 /* fill in the new device map from the old device map */
2405 map = xmap_dereference(dev_maps->attr_map[tci]);
2406 RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2407 }
2408
2409		/* We need to explicitly update tci as the previous loop
2410 * could break out early if dev_maps is NULL.
2411 */
2412 tci = j * num_tc + tc;
2413
2414 if (netif_attr_test_mask(j, mask, nr_ids) &&
2415 netif_attr_test_online(j, online_mask, nr_ids)) {
2416 /* add tx-queue to CPU/rx-queue maps */
2417 int pos = 0;
2418
2419 map = xmap_dereference(new_dev_maps->attr_map[tci]);
2420 while ((pos < map->len) && (map->queues[pos] != index))
2421 pos++;
2422
2423 if (pos == map->len)
2424 map->queues[map->len++] = index;
2425#ifdef CONFIG_NUMA
2426 if (!is_rxqs_map) {
2427 if (numa_node_id == -2)
2428 numa_node_id = cpu_to_node(j);
2429 else if (numa_node_id != cpu_to_node(j))
2430 numa_node_id = -1;
2431 }
2432#endif
2433 } else if (dev_maps) {
2434 /* fill in the new device map from the old device map */
2435 map = xmap_dereference(dev_maps->attr_map[tci]);
2436 RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2437 }
2438
2439 /* copy maps belonging to foreign traffic classes */
2440 for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
2441 /* fill in the new device map from the old device map */
2442 map = xmap_dereference(dev_maps->attr_map[tci]);
2443 RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2444 }
2445 }
2446
2447 if (is_rxqs_map)
2448 rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
2449 else
2450 rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
2451
2452 /* Cleanup old maps */
2453 if (!dev_maps)
2454 goto out_no_old_maps;
2455
2456 for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2457 j < nr_ids;) {
2458 for (i = num_tc, tci = j * num_tc; i--; tci++) {
2459 new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2460 map = xmap_dereference(dev_maps->attr_map[tci]);
2461 if (map && map != new_map)
2462 kfree_rcu(map, rcu);
2463 }
2464 }
2465
2466 kfree_rcu(dev_maps, rcu);
2467
2468out_no_old_maps:
2469 dev_maps = new_dev_maps;
2470 active = true;
2471
2472out_no_new_maps:
2473 if (!is_rxqs_map) {
2474 /* update Tx queue numa node */
2475 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2476 (numa_node_id >= 0) ?
2477 numa_node_id : NUMA_NO_NODE);
2478 }
2479
2480 if (!dev_maps)
2481 goto out_no_maps;
2482
2483 /* removes tx-queue from unused CPUs/rx-queues */
2484 for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2485 j < nr_ids;) {
2486 for (i = tc, tci = j * num_tc; i--; tci++)
2487 active |= remove_xps_queue(dev_maps, tci, index);
2488 if (!netif_attr_test_mask(j, mask, nr_ids) ||
2489 !netif_attr_test_online(j, online_mask, nr_ids))
2490 active |= remove_xps_queue(dev_maps, tci, index);
2491 for (i = num_tc - tc, tci++; --i; tci++)
2492 active |= remove_xps_queue(dev_maps, tci, index);
2493 }
2494
2495 /* free map if not active */
2496 if (!active)
2497 reset_xps_maps(dev, dev_maps, is_rxqs_map);
2498
2499out_no_maps:
2500 mutex_unlock(&xps_map_mutex);
2501
2502 return 0;
2503error:
2504 /* remove any maps that we added */
2505 for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2506 j < nr_ids;) {
2507 for (i = num_tc, tci = j * num_tc; i--; tci++) {
2508 new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2509 map = dev_maps ?
2510 xmap_dereference(dev_maps->attr_map[tci]) :
2511 NULL;
2512 if (new_map && new_map != map)
2513 kfree(new_map);
2514 }
2515 }
2516
2517 mutex_unlock(&xps_map_mutex);
2518
2519 kfree(new_dev_maps);
2520 return -ENOMEM;
2521}
2522EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
2523
2524int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2525 u16 index)
2526{
2527 int ret;
2528
2529 cpus_read_lock();
2530 ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
2531 cpus_read_unlock();
2532
2533 return ret;
2534}
2535EXPORT_SYMBOL(netif_set_xps_queue);
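
/*
 * Illustrative sketch (kept disabled): a driver pinning each TX queue to
 * one online CPU at setup time.  example_setup_xps is hypothetical;
 * netif_set_xps_queue() and cpumask_of() are the real interfaces.
 */
#if 0
static void example_setup_xps(struct net_device *dev)
{
	int qid;

	for (qid = 0; qid < dev->real_num_tx_queues; qid++)
		netif_set_xps_queue(dev, cpumask_of(qid % num_online_cpus()),
				    qid);
}
#endif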
2536
2537#endif
2538static void netdev_unbind_all_sb_channels(struct net_device *dev)
2539{
2540 struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2541
2542 /* Unbind any subordinate channels */
2543 while (txq-- != &dev->_tx[0]) {
2544 if (txq->sb_dev)
2545 netdev_unbind_sb_channel(dev, txq->sb_dev);
2546 }
2547}
2548
2549void netdev_reset_tc(struct net_device *dev)
2550{
2551#ifdef CONFIG_XPS
2552 netif_reset_xps_queues_gt(dev, 0);
2553#endif
2554 netdev_unbind_all_sb_channels(dev);
2555
2556 /* Reset TC configuration of device */
2557 dev->num_tc = 0;
2558 memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2559 memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2560}
2561EXPORT_SYMBOL(netdev_reset_tc);
2562
2563int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2564{
2565 if (tc >= dev->num_tc)
2566 return -EINVAL;
2567
2568#ifdef CONFIG_XPS
2569 netif_reset_xps_queues(dev, offset, count);
2570#endif
2571 dev->tc_to_txq[tc].count = count;
2572 dev->tc_to_txq[tc].offset = offset;
2573 return 0;
2574}
2575EXPORT_SYMBOL(netdev_set_tc_queue);
2576
2577int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2578{
2579 if (num_tc > TC_MAX_QUEUE)
2580 return -EINVAL;
2581
2582#ifdef CONFIG_XPS
2583 netif_reset_xps_queues_gt(dev, 0);
2584#endif
2585 netdev_unbind_all_sb_channels(dev);
2586
2587 dev->num_tc = num_tc;
2588 return 0;
2589}
2590EXPORT_SYMBOL(netdev_set_num_tc);
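
/*
 * Illustrative sketch (kept disabled): carving 8 TX queues into two
 * traffic classes and mapping priorities 0-3 to TC0, the rest to TC1.
 * With this layout netdev_txq_to_tc(dev, 5) reports TC 1.
 * example_setup_tc is hypothetical; the netdev_set_* calls are real.
 */
#if 0
static int example_setup_tc(struct net_device *dev)
{
	int prio, err;

	err = netdev_set_num_tc(dev, 2);
	if (err)
		return err;

	netdev_set_tc_queue(dev, 0, 4, 0);	/* TC0: queues 0-3 */
	netdev_set_tc_queue(dev, 1, 4, 4);	/* TC1: queues 4-7 */

	for (prio = 0; prio <= TC_BITMASK; prio++)
		netdev_set_prio_tc_map(dev, prio, prio < 4 ? 0 : 1);
	return 0;
}
#endif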
2591
2592void netdev_unbind_sb_channel(struct net_device *dev,
2593 struct net_device *sb_dev)
2594{
2595 struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2596
2597#ifdef CONFIG_XPS
2598 netif_reset_xps_queues_gt(sb_dev, 0);
2599#endif
2600 memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
2601 memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
2602
2603 while (txq-- != &dev->_tx[0]) {
2604 if (txq->sb_dev == sb_dev)
2605 txq->sb_dev = NULL;
2606 }
2607}
2608EXPORT_SYMBOL(netdev_unbind_sb_channel);
2609
2610int netdev_bind_sb_channel_queue(struct net_device *dev,
2611 struct net_device *sb_dev,
2612 u8 tc, u16 count, u16 offset)
2613{
2614 /* Make certain the sb_dev and dev are already configured */
2615 if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
2616 return -EINVAL;
2617
2618 /* We cannot hand out queues we don't have */
2619 if ((offset + count) > dev->real_num_tx_queues)
2620 return -EINVAL;
2621
2622 /* Record the mapping */
2623 sb_dev->tc_to_txq[tc].count = count;
2624 sb_dev->tc_to_txq[tc].offset = offset;
2625
2626 /* Provide a way for Tx queue to find the tc_to_txq map or
2627 * XPS map for itself.
2628 */
2629 while (count--)
2630 netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
2631
2632 return 0;
2633}
2634EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
2635
2636int netdev_set_sb_channel(struct net_device *dev, u16 channel)
2637{
2638 /* Do not use a multiqueue device to represent a subordinate channel */
2639 if (netif_is_multiqueue(dev))
2640 return -ENODEV;
2641
2642 /* We allow channels 1 - 32767 to be used for subordinate channels.
2643 * Channel 0 is meant to be "native" mode and used only to represent
2644 * the main root device. We allow writing 0 to reset the device back
2645 * to normal mode after being used as a subordinate channel.
2646 */
2647 if (channel > S16_MAX)
2648 return -EINVAL;
2649
2650 dev->num_tc = -channel;
2651
2652 return 0;
2653}
2654EXPORT_SYMBOL(netdev_set_sb_channel);
2655
2656/*
2657 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2658 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2659 */
2660int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2661{
2662 bool disabling;
2663 int rc;
2664
2665 disabling = txq < dev->real_num_tx_queues;
2666
2667 if (txq < 1 || txq > dev->num_tx_queues)
2668 return -EINVAL;
2669
2670 if (dev->reg_state == NETREG_REGISTERED ||
2671 dev->reg_state == NETREG_UNREGISTERING) {
2672 ASSERT_RTNL();
2673
2674 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2675 txq);
2676 if (rc)
2677 return rc;
2678
2679 if (dev->num_tc)
2680 netif_setup_tc(dev, txq);
2681
2682 dev->real_num_tx_queues = txq;
2683
2684 if (disabling) {
2685 synchronize_net();
2686 qdisc_reset_all_tx_gt(dev, txq);
2687#ifdef CONFIG_XPS
2688 netif_reset_xps_queues_gt(dev, txq);
2689#endif
2690 }
2691 } else {
2692 dev->real_num_tx_queues = txq;
2693 }
2694
2695 return 0;
2696}
2697EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2698
2699#ifdef CONFIG_SYSFS
2700/**
2701 * netif_set_real_num_rx_queues - set actual number of RX queues used
2702 * @dev: Network device
2703 * @rxq: Actual number of RX queues
2704 *
2705 * This must be called either with the rtnl_lock held or before
2706 * registration of the net device. Returns 0 on success, or a
2707 * negative error code. If called before registration, it always
2708 * succeeds.
2709 */
2710int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2711{
2712 int rc;
2713
2714 if (rxq < 1 || rxq > dev->num_rx_queues)
2715 return -EINVAL;
2716
2717 if (dev->reg_state == NETREG_REGISTERED) {
2718 ASSERT_RTNL();
2719
2720 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2721 rxq);
2722 if (rc)
2723 return rc;
2724 }
2725
2726 dev->real_num_rx_queues = rxq;
2727 return 0;
2728}
2729EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2730#endif
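
/*
 * Illustrative sketch (kept disabled): an ethtool-style channel handler
 * resizing both queue sets under RTNL.  example_set_channels is
 * hypothetical; the two netif_set_real_num_* calls are the real API.
 */
#if 0
static int example_set_channels(struct net_device *dev, unsigned int count)
{
	int err;

	ASSERT_RTNL();

	err = netif_set_real_num_tx_queues(dev, count);
	if (err)
		return err;
	return netif_set_real_num_rx_queues(dev, count);
}
#endif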
2731
2732/**
2733 * netif_get_num_default_rss_queues - default number of RSS queues
2734 *
2735 * This routine should set an upper limit on the number of RSS queues
2736 * used by default by multiqueue devices.
2737 */
2738int netif_get_num_default_rss_queues(void)
2739{
2740 return is_kdump_kernel() ?
2741 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2742}
2743EXPORT_SYMBOL(netif_get_num_default_rss_queues);
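
/*
 * Illustrative sketch (kept disabled): typical probe-time use, clamping
 * the default against a hardware limit.  EXAMPLE_HW_MAX_QUEUES is
 * hypothetical.
 */
#if 0
#define EXAMPLE_HW_MAX_QUEUES	16

static unsigned int example_pick_queue_count(void)
{
	return min_t(unsigned int, EXAMPLE_HW_MAX_QUEUES,
		     netif_get_num_default_rss_queues());
}
#endif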
2744
2745static void __netif_reschedule(struct Qdisc *q)
2746{
2747 struct softnet_data *sd;
2748 unsigned long flags;
2749
2750 local_irq_save(flags);
2751 sd = this_cpu_ptr(&softnet_data);
2752 q->next_sched = NULL;
2753 *sd->output_queue_tailp = q;
2754 sd->output_queue_tailp = &q->next_sched;
2755 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2756 local_irq_restore(flags);
2757}
2758
2759void __netif_schedule(struct Qdisc *q)
2760{
2761 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2762 __netif_reschedule(q);
2763}
2764EXPORT_SYMBOL(__netif_schedule);
2765
2766struct dev_kfree_skb_cb {
2767 enum skb_free_reason reason;
2768};
2769
2770static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2771{
2772 return (struct dev_kfree_skb_cb *)skb->cb;
2773}
2774
2775void netif_schedule_queue(struct netdev_queue *txq)
2776{
2777 rcu_read_lock();
2778 if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2779 struct Qdisc *q = rcu_dereference(txq->qdisc);
2780
2781 __netif_schedule(q);
2782 }
2783 rcu_read_unlock();
2784}
2785EXPORT_SYMBOL(netif_schedule_queue);
2786
2787void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2788{
2789 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2790 struct Qdisc *q;
2791
2792 rcu_read_lock();
2793 q = rcu_dereference(dev_queue->qdisc);
2794 __netif_schedule(q);
2795 rcu_read_unlock();
2796 }
2797}
2798EXPORT_SYMBOL(netif_tx_wake_queue);
2799
2800void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2801{
2802 unsigned long flags;
2803
2804 if (unlikely(!skb))
2805 return;
2806
2807 if (likely(refcount_read(&skb->users) == 1)) {
2808 smp_rmb();
2809 refcount_set(&skb->users, 0);
2810 } else if (likely(!refcount_dec_and_test(&skb->users))) {
2811 return;
2812 }
2813 get_kfree_skb_cb(skb)->reason = reason;
2814 local_irq_save(flags);
2815 skb->next = __this_cpu_read(softnet_data.completion_queue);
2816 __this_cpu_write(softnet_data.completion_queue, skb);
2817 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2818 local_irq_restore(flags);
2819}
2820EXPORT_SYMBOL(__dev_kfree_skb_irq);
2821
2822void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2823{
2824 if (in_irq() || irqs_disabled())
2825 __dev_kfree_skb_irq(skb, reason);
2826 else
2827 dev_kfree_skb(skb);
2828}
2829EXPORT_SYMBOL(__dev_kfree_skb_any);
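
/*
 * Illustrative sketch (kept disabled): a TX completion handler that may
 * run in hard IRQ context, so it must use the _any variants above (via
 * the dev_consume_skb_any()/dev_kfree_skb_any() wrappers).
 * example_tx_clean is hypothetical.
 */
#if 0
static void example_tx_clean(struct sk_buff *skb, bool tx_ok)
{
	if (tx_ok)
		dev_consume_skb_any(skb);	/* normal completion */
	else
		dev_kfree_skb_any(skb);		/* counts as a drop */
}
#endif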
2830
2831
2832/**
2833 * netif_device_detach - mark device as removed
2834 * @dev: network device
2835 *
2836 * Mark device as removed from system and therefore no longer available.
2837 */
2838void netif_device_detach(struct net_device *dev)
2839{
2840 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2841 netif_running(dev)) {
2842 netif_tx_stop_all_queues(dev);
2843 }
2844}
2845EXPORT_SYMBOL(netif_device_detach);
2846
2847/**
2848 * netif_device_attach - mark device as attached
2849 * @dev: network device
2850 *
2851 * Mark device as attached to the system and restart if needed.
2852 */
2853void netif_device_attach(struct net_device *dev)
2854{
2855 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2856 netif_running(dev)) {
2857 netif_tx_wake_all_queues(dev);
2858 __netdev_watchdog_up(dev);
2859 }
2860}
2861EXPORT_SYMBOL(netif_device_attach);
2862
2863/*
2864 * Returns a Tx hash based on the given packet descriptor and a Tx queue
2865 * count to be used as a distribution range.
2866 */
2867static u16 skb_tx_hash(const struct net_device *dev,
2868 const struct net_device *sb_dev,
2869 struct sk_buff *skb)
2870{
2871 u32 hash;
2872 u16 qoffset = 0;
2873 u16 qcount = dev->real_num_tx_queues;
2874
2875 if (dev->num_tc) {
2876 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2877
2878 qoffset = sb_dev->tc_to_txq[tc].offset;
2879 qcount = sb_dev->tc_to_txq[tc].count;
2880 }
2881
2882 if (skb_rx_queue_recorded(skb)) {
2883 hash = skb_get_rx_queue(skb);
2884 while (unlikely(hash >= qcount))
2885 hash -= qcount;
2886 return hash + qoffset;
2887 }
2888
2889 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2890}
2891
2892static void skb_warn_bad_offload(const struct sk_buff *skb)
2893{
2894 static const netdev_features_t null_features;
2895 struct net_device *dev = skb->dev;
2896 const char *name = "";
2897
2898 if (!net_ratelimit())
2899 return;
2900
2901 if (dev) {
2902 if (dev->dev.parent)
2903 name = dev_driver_string(dev->dev.parent);
2904 else
2905 name = netdev_name(dev);
2906 }
2907 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2908 "gso_type=%d ip_summed=%d\n",
2909 name, dev ? &dev->features : &null_features,
2910 skb->sk ? &skb->sk->sk_route_caps : &null_features,
2911 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2912 skb_shinfo(skb)->gso_type, skb->ip_summed);
2913}
2914
2915/*
2916 * Invalidate hardware checksum when packet is to be mangled, and
2917 * complete checksum manually on outgoing path.
2918 */
2919int skb_checksum_help(struct sk_buff *skb)
2920{
2921 __wsum csum;
2922 int ret = 0, offset;
2923
2924 if (skb->ip_summed == CHECKSUM_COMPLETE)
2925 goto out_set_summed;
2926
2927 if (unlikely(skb_shinfo(skb)->gso_size)) {
2928 skb_warn_bad_offload(skb);
2929 return -EINVAL;
2930 }
2931
2932 /* Before computing a checksum, we should make sure no frag could
2933	 * be modified by an external entity: the checksum could be wrong.
2934 */
2935 if (skb_has_shared_frag(skb)) {
2936 ret = __skb_linearize(skb);
2937 if (ret)
2938 goto out;
2939 }
2940
2941 offset = skb_checksum_start_offset(skb);
2942 BUG_ON(offset >= skb_headlen(skb));
2943 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2944
2945 offset += skb->csum_offset;
2946 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2947
2948 if (skb_cloned(skb) &&
2949 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2950 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2951 if (ret)
2952 goto out;
2953 }
2954
2955 *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2956out_set_summed:
2957 skb->ip_summed = CHECKSUM_NONE;
2958out:
2959 return ret;
2960}
2961EXPORT_SYMBOL(skb_checksum_help);
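
/*
 * Illustrative sketch (kept disabled): the classic driver fallback when
 * the hardware cannot checksum a given packet.  Both example_* names are
 * hypothetical; the CHECKSUM_PARTIAL test and helper call are the usual
 * idiom.
 */
#if 0
/* hypothetical capability test: pretend the NIC checksums nothing */
static bool example_hw_can_csum(const struct sk_buff *skb)
{
	return false;
}

static int example_tx_csum(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL && !example_hw_can_csum(skb))
		return skb_checksum_help(skb);	/* compute it in software */
	return 0;
}
#endif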
2962
2963int skb_crc32c_csum_help(struct sk_buff *skb)
2964{
2965 __le32 crc32c_csum;
2966 int ret = 0, offset, start;
2967
2968 if (skb->ip_summed != CHECKSUM_PARTIAL)
2969 goto out;
2970
2971 if (unlikely(skb_is_gso(skb)))
2972 goto out;
2973
2974 /* Before computing a checksum, we should make sure no frag could
2975	 * be modified by an external entity: the checksum could be wrong.
2976 */
2977 if (unlikely(skb_has_shared_frag(skb))) {
2978 ret = __skb_linearize(skb);
2979 if (ret)
2980 goto out;
2981 }
2982 start = skb_checksum_start_offset(skb);
2983 offset = start + offsetof(struct sctphdr, checksum);
2984 if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
2985 ret = -EINVAL;
2986 goto out;
2987 }
2988 if (skb_cloned(skb) &&
2989 !skb_clone_writable(skb, offset + sizeof(__le32))) {
2990 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2991 if (ret)
2992 goto out;
2993 }
2994 crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
2995 skb->len - start, ~(__u32)0,
2996 crc32c_csum_stub));
2997 *(__le32 *)(skb->data + offset) = crc32c_csum;
2998 skb->ip_summed = CHECKSUM_NONE;
2999 skb->csum_not_inet = 0;
3000out:
3001 return ret;
3002}
3003
3004__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
3005{
3006 __be16 type = skb->protocol;
3007
3008 /* Tunnel gso handlers can set protocol to ethernet. */
3009 if (type == htons(ETH_P_TEB)) {
3010 struct ethhdr *eth;
3011
3012 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
3013 return 0;
3014
3015 eth = (struct ethhdr *)skb->data;
3016 type = eth->h_proto;
3017 }
3018
3019 return __vlan_get_protocol(skb, type, depth);
3020}
3021
3022/**
3023 * skb_mac_gso_segment - mac layer segmentation handler.
3024 * @skb: buffer to segment
3025 * @features: features for the output path (see dev->features)
3026 */
3027struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
3028 netdev_features_t features)
3029{
3030 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
3031 struct packet_offload *ptype;
3032 int vlan_depth = skb->mac_len;
3033 __be16 type = skb_network_protocol(skb, &vlan_depth);
3034
3035 if (unlikely(!type))
3036 return ERR_PTR(-EINVAL);
3037
3038 __skb_pull(skb, vlan_depth);
3039
3040 rcu_read_lock();
3041 list_for_each_entry_rcu(ptype, &offload_base, list) {
3042 if (ptype->type == type && ptype->callbacks.gso_segment) {
3043 segs = ptype->callbacks.gso_segment(skb, features);
3044 break;
3045 }
3046 }
3047 rcu_read_unlock();
3048
3049 __skb_push(skb, skb->data - skb_mac_header(skb));
3050
3051 return segs;
3052}
3053EXPORT_SYMBOL(skb_mac_gso_segment);
3054
3055
3056/* openvswitch calls this on rx path, so we need a different check.
3057 */
3058static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
3059{
3060 if (tx_path)
3061 return skb->ip_summed != CHECKSUM_PARTIAL &&
3062 skb->ip_summed != CHECKSUM_UNNECESSARY;
3063
3064 return skb->ip_summed == CHECKSUM_NONE;
3065}
3066
3067/**
3068 * __skb_gso_segment - Perform segmentation on skb.
3069 * @skb: buffer to segment
3070 * @features: features for the output path (see dev->features)
3071 * @tx_path: whether it is called in TX path
3072 *
3073 * This function segments the given skb and returns a list of segments.
3074 *
3075 * It may return NULL if the skb requires no segmentation. This is
3076 * only possible when GSO is used for verifying header integrity.
3077 *
3078 * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
3079 */
3080struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
3081 netdev_features_t features, bool tx_path)
3082{
3083 struct sk_buff *segs;
3084
3085 if (unlikely(skb_needs_check(skb, tx_path))) {
3086 int err;
3087
3088 /* We're going to init ->check field in TCP or UDP header */
3089 err = skb_cow_head(skb, 0);
3090 if (err < 0)
3091 return ERR_PTR(err);
3092 }
3093
3094 /* Only report GSO partial support if it will enable us to
3095 * support segmentation on this frame without needing additional
3096 * work.
3097 */
3098 if (features & NETIF_F_GSO_PARTIAL) {
3099 netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
3100 struct net_device *dev = skb->dev;
3101
3102 partial_features |= dev->features & dev->gso_partial_features;
3103 if (!skb_gso_ok(skb, features | partial_features))
3104 features &= ~NETIF_F_GSO_PARTIAL;
3105 }
3106
3107 BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
3108 sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
3109
3110 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
3111 SKB_GSO_CB(skb)->encap_level = 0;
3112
3113 skb_reset_mac_header(skb);
3114 skb_reset_mac_len(skb);
3115
3116 segs = skb_mac_gso_segment(skb, features);
3117
3118 if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
3119 skb_warn_bad_offload(skb);
3120
3121 return segs;
3122}
3123EXPORT_SYMBOL(__skb_gso_segment);
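
/*
 * Illustrative sketch (kept disabled): forcing software segmentation by
 * masking out the GSO feature bits, then walking the resulting list.
 * example_xmit_one is a hypothetical stand-in for a per-segment send.
 */
#if 0
static int example_xmit_one(struct sk_buff *skb)
{
	kfree_skb(skb);		/* stand-in: a real driver would transmit */
	return 0;
}

static int example_gso_fallback(struct sk_buff *skb,
				netdev_features_t features)
{
	struct sk_buff *segs, *next;

	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR(segs))
		return PTR_ERR(segs);
	if (!segs)
		return example_xmit_one(skb);	/* nothing to segment */

	consume_skb(skb);
	for (; segs; segs = next) {
		next = segs->next;
		skb_mark_not_on_list(segs);
		example_xmit_one(segs);
	}
	return 0;
}
#endif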
3124
3125/* Take action when hardware reception checksum errors are detected. */
3126#ifdef CONFIG_BUG
3127void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3128{
3129 if (net_ratelimit()) {
3130 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
3131 if (dev)
3132 pr_err("dev features: %pNF\n", &dev->features);
3133 pr_err("skb len=%u data_len=%u pkt_type=%u gso_size=%u gso_type=%u nr_frags=%u ip_summed=%u csum=%x csum_complete_sw=%d csum_valid=%d csum_level=%u\n",
3134 skb->len, skb->data_len, skb->pkt_type,
3135 skb_shinfo(skb)->gso_size, skb_shinfo(skb)->gso_type,
3136 skb_shinfo(skb)->nr_frags, skb->ip_summed, skb->csum,
3137 skb->csum_complete_sw, skb->csum_valid, skb->csum_level);
3138 dump_stack();
3139 }
3140}
3141EXPORT_SYMBOL(netdev_rx_csum_fault);
3142#endif
3143
3144/* XXX: check that highmem exists at all on the given machine. */
3145static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
3146{
3147#ifdef CONFIG_HIGHMEM
3148 int i;
3149
3150 if (!(dev->features & NETIF_F_HIGHDMA)) {
3151 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
3152 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
3153
3154 if (PageHighMem(skb_frag_page(frag)))
3155 return 1;
3156 }
3157 }
3158#endif
3159 return 0;
3160}
3161
3162/* If MPLS offload request, verify we are testing hardware MPLS features
3163 * instead of standard features for the netdev.
3164 */
3165#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
3166static netdev_features_t net_mpls_features(struct sk_buff *skb,
3167 netdev_features_t features,
3168 __be16 type)
3169{
3170 if (eth_p_mpls(type))
3171 features &= skb->dev->mpls_features;
3172
3173 return features;
3174}
3175#else
3176static netdev_features_t net_mpls_features(struct sk_buff *skb,
3177 netdev_features_t features,
3178 __be16 type)
3179{
3180 return features;
3181}
3182#endif
3183
3184static netdev_features_t harmonize_features(struct sk_buff *skb,
3185 netdev_features_t features)
3186{
3187 int tmp;
3188 __be16 type;
3189
3190 type = skb_network_protocol(skb, &tmp);
3191 features = net_mpls_features(skb, features, type);
3192
3193 if (skb->ip_summed != CHECKSUM_NONE &&
3194 !can_checksum_protocol(features, type)) {
3195 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
3196 }
3197 if (illegal_highdma(skb->dev, skb))
3198 features &= ~NETIF_F_SG;
3199
3200 return features;
3201}
3202
3203netdev_features_t passthru_features_check(struct sk_buff *skb,
3204 struct net_device *dev,
3205 netdev_features_t features)
3206{
3207 return features;
3208}
3209EXPORT_SYMBOL(passthru_features_check);
3210
3211static netdev_features_t dflt_features_check(struct sk_buff *skb,
3212 struct net_device *dev,
3213 netdev_features_t features)
3214{
3215 return vlan_features_check(skb, features);
3216}
3217
3218static netdev_features_t gso_features_check(const struct sk_buff *skb,
3219 struct net_device *dev,
3220 netdev_features_t features)
3221{
3222 u16 gso_segs = skb_shinfo(skb)->gso_segs;
3223
3224 if (gso_segs > dev->gso_max_segs)
3225 return features & ~NETIF_F_GSO_MASK;
3226
3227	/* Support for GSO partial features requires software
3228	 * intervention before we can actually process the packets,
3229	 * so we need to strip support for any partial features now;
3230	 * we can pull them back in after we have partially
3231	 * segmented the frame.
3232 */
3233 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
3234 features &= ~dev->gso_partial_features;
3235
3236 /* Make sure to clear the IPv4 ID mangling feature if the
3237 * IPv4 header has the potential to be fragmented.
3238 */
3239 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
3240 struct iphdr *iph = skb->encapsulation ?
3241 inner_ip_hdr(skb) : ip_hdr(skb);
3242
3243 if (!(iph->frag_off & htons(IP_DF)))
3244 features &= ~NETIF_F_TSO_MANGLEID;
3245 }
3246
3247 return features;
3248}
3249
3250netdev_features_t netif_skb_features(struct sk_buff *skb)
3251{
3252 struct net_device *dev = skb->dev;
3253 netdev_features_t features = dev->features;
3254
3255 if (skb_is_gso(skb))
3256 features = gso_features_check(skb, dev, features);
3257
3258 /* If encapsulation offload request, verify we are testing
3259 * hardware encapsulation features instead of standard
3260 * features for the netdev
3261 */
3262 if (skb->encapsulation)
3263 features &= dev->hw_enc_features;
3264
3265 if (skb_vlan_tagged(skb))
3266 features = netdev_intersect_features(features,
3267 dev->vlan_features |
3268 NETIF_F_HW_VLAN_CTAG_TX |
3269 NETIF_F_HW_VLAN_STAG_TX);
3270
3271 if (dev->netdev_ops->ndo_features_check)
3272 features &= dev->netdev_ops->ndo_features_check(skb, dev,
3273 features);
3274 else
3275 features &= dflt_features_check(skb, dev, features);
3276
3277 return harmonize_features(skb, features);
3278}
3279EXPORT_SYMBOL(netif_skb_features);
3280
3281static int xmit_one(struct sk_buff *skb, struct net_device *dev,
3282 struct netdev_queue *txq, bool more)
3283{
3284 unsigned int len;
3285 int rc;
3286
3287 if (dev_nit_active(dev))
3288 dev_queue_xmit_nit(skb, dev);
3289
3290 len = skb->len;
3291 trace_net_dev_start_xmit(skb, dev);
3292 rc = netdev_start_xmit(skb, dev, txq, more);
3293 trace_net_dev_xmit(skb, rc, dev, len);
3294
3295 return rc;
3296}
3297
3298struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
3299 struct netdev_queue *txq, int *ret)
3300{
3301 struct sk_buff *skb = first;
3302 int rc = NETDEV_TX_OK;
3303
3304 while (skb) {
3305 struct sk_buff *next = skb->next;
3306
3307 skb_mark_not_on_list(skb);
3308 rc = xmit_one(skb, dev, txq, next != NULL);
3309 if (unlikely(!dev_xmit_complete(rc))) {
3310 skb->next = next;
3311 goto out;
3312 }
3313
3314 skb = next;
3315 if (netif_tx_queue_stopped(txq) && skb) {
3316 rc = NETDEV_TX_BUSY;
3317 break;
3318 }
3319 }
3320
3321out:
3322 *ret = rc;
3323 return skb;
3324}
3325
3326static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
3327 netdev_features_t features)
3328{
3329 if (skb_vlan_tag_present(skb) &&
3330 !vlan_hw_offload_capable(features, skb->vlan_proto))
3331 skb = __vlan_hwaccel_push_inside(skb);
3332 return skb;
3333}
3334
3335int skb_csum_hwoffload_help(struct sk_buff *skb,
3336 const netdev_features_t features)
3337{
3338 if (unlikely(skb->csum_not_inet))
3339 return !!(features & NETIF_F_SCTP_CRC) ? 0 :
3340 skb_crc32c_csum_help(skb);
3341
3342 return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
3343}
3344EXPORT_SYMBOL(skb_csum_hwoffload_help);
3345
3346static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
3347{
3348 netdev_features_t features;
3349
3350 features = netif_skb_features(skb);
3351 skb = validate_xmit_vlan(skb, features);
3352 if (unlikely(!skb))
3353 goto out_null;
3354
3355 skb = sk_validate_xmit_skb(skb, dev);
3356 if (unlikely(!skb))
3357 goto out_null;
3358
3359 if (netif_needs_gso(skb, features)) {
3360 struct sk_buff *segs;
3361
3362 segs = skb_gso_segment(skb, features);
3363 if (IS_ERR(segs)) {
3364 goto out_kfree_skb;
3365 } else if (segs) {
3366 consume_skb(skb);
3367 skb = segs;
3368 }
3369 } else {
3370 if (skb_needs_linearize(skb, features) &&
3371 __skb_linearize(skb))
3372 goto out_kfree_skb;
3373
3374 /* If packet is not checksummed and device does not
3375 * support checksumming for this protocol, complete
3376 * checksumming here.
3377 */
3378 if (skb->ip_summed == CHECKSUM_PARTIAL) {
3379 if (skb->encapsulation)
3380 skb_set_inner_transport_header(skb,
3381 skb_checksum_start_offset(skb));
3382 else
3383 skb_set_transport_header(skb,
3384 skb_checksum_start_offset(skb));
3385 if (skb_csum_hwoffload_help(skb, features))
3386 goto out_kfree_skb;
3387 }
3388 }
3389
3390 skb = validate_xmit_xfrm(skb, features, again);
3391
3392 return skb;
3393
3394out_kfree_skb:
3395 kfree_skb(skb);
3396out_null:
3397 atomic_long_inc(&dev->tx_dropped);
3398 return NULL;
3399}
3400
3401struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
3402{
3403 struct sk_buff *next, *head = NULL, *tail;
3404
3405 for (; skb != NULL; skb = next) {
3406 next = skb->next;
3407 skb_mark_not_on_list(skb);
3408
3409		/* in case skb won't be segmented, point to itself */
3410 skb->prev = skb;
3411
3412 skb = validate_xmit_skb(skb, dev, again);
3413 if (!skb)
3414 continue;
3415
3416 if (!head)
3417 head = skb;
3418 else
3419 tail->next = skb;
3420 /* If skb was segmented, skb->prev points to
3421		 * the last segment. If not, it still points to skb.
3422 */
3423 tail = skb->prev;
3424 }
3425 return head;
3426}
3427EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3428
3429static void qdisc_pkt_len_init(struct sk_buff *skb)
3430{
3431 const struct skb_shared_info *shinfo = skb_shinfo(skb);
3432
3433 qdisc_skb_cb(skb)->pkt_len = skb->len;
3434
3435	/* To get a more precise estimate of bytes sent on the wire,
3436	 * we add to pkt_len the header size of all segments
3437 */
3438 if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
3439 unsigned int hdr_len;
3440 u16 gso_segs = shinfo->gso_segs;
3441
3442 /* mac layer + network layer */
3443 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3444
3445 /* + transport layer */
3446 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
3447 const struct tcphdr *th;
3448 struct tcphdr _tcphdr;
3449
3450 th = skb_header_pointer(skb, skb_transport_offset(skb),
3451 sizeof(_tcphdr), &_tcphdr);
3452 if (likely(th))
3453 hdr_len += __tcp_hdrlen(th);
3454 } else {
3455 struct udphdr _udphdr;
3456
3457 if (skb_header_pointer(skb, skb_transport_offset(skb),
3458 sizeof(_udphdr), &_udphdr))
3459 hdr_len += sizeof(struct udphdr);
3460 }
3461
3462 if (shinfo->gso_type & SKB_GSO_DODGY)
3463 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3464 shinfo->gso_size);
3465
3466 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3467 }
3468}
3469
3470static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3471 struct net_device *dev,
3472 struct netdev_queue *txq)
3473{
3474 spinlock_t *root_lock = qdisc_lock(q);
3475 struct sk_buff *to_free = NULL;
3476 bool contended;
3477 int rc;
3478
3479 qdisc_calculate_pkt_len(skb, q);
3480
3481 if (q->flags & TCQ_F_NOLOCK) {
3482 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3483 __qdisc_drop(skb, &to_free);
3484 rc = NET_XMIT_DROP;
3485 } else if ((q->flags & TCQ_F_CAN_BYPASS) && q->empty &&
3486 qdisc_run_begin(q)) {
3487 qdisc_bstats_cpu_update(q, skb);
3488
3489 if (sch_direct_xmit(skb, q, dev, txq, NULL, true))
3490 __qdisc_run(q);
3491
3492 qdisc_run_end(q);
3493 rc = NET_XMIT_SUCCESS;
3494 } else {
3495 rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3496 qdisc_run(q);
3497 }
3498
3499 if (unlikely(to_free))
3500 kfree_skb_list(to_free);
3501 return rc;
3502 }
3503
3504 /*
3505 * Heuristic to force contended enqueues to serialize on a
3506	 * separate lock before trying to get the qdisc main lock.
3507	 * This permits the qdisc->running owner to get the lock more
3508 * often and dequeue packets faster.
3509 */
3510 contended = qdisc_is_running(q);
3511 if (unlikely(contended))
3512 spin_lock(&q->busylock);
3513
3514 spin_lock(root_lock);
3515 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3516 __qdisc_drop(skb, &to_free);
3517 rc = NET_XMIT_DROP;
3518 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3519 qdisc_run_begin(q)) {
3520 /*
3521 * This is a work-conserving queue; there are no old skbs
3522 * waiting to be sent out; and the qdisc is not running -
3523 * xmit the skb directly.
3524 */
3525
3526 qdisc_bstats_update(q, skb);
3527
3528 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3529 if (unlikely(contended)) {
3530 spin_unlock(&q->busylock);
3531 contended = false;
3532 }
3533 __qdisc_run(q);
3534 }
3535
3536 qdisc_run_end(q);
3537 rc = NET_XMIT_SUCCESS;
3538 } else {
3539 rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3540 if (qdisc_run_begin(q)) {
3541 if (unlikely(contended)) {
3542 spin_unlock(&q->busylock);
3543 contended = false;
3544 }
3545 __qdisc_run(q);
3546 qdisc_run_end(q);
3547 }
3548 }
3549 spin_unlock(root_lock);
3550 if (unlikely(to_free))
3551 kfree_skb_list(to_free);
3552 if (unlikely(contended))
3553 spin_unlock(&q->busylock);
3554 return rc;
3555}
3556
3557#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3558static void skb_update_prio(struct sk_buff *skb)
3559{
3560 const struct netprio_map *map;
3561 const struct sock *sk;
3562 unsigned int prioidx;
3563
3564 if (skb->priority)
3565 return;
3566 map = rcu_dereference_bh(skb->dev->priomap);
3567 if (!map)
3568 return;
3569 sk = skb_to_full_sk(skb);
3570 if (!sk)
3571 return;
3572
3573 prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
3574
3575 if (prioidx < map->priomap_len)
3576 skb->priority = map->priomap[prioidx];
3577}
3578#else
3579#define skb_update_prio(skb)
3580#endif
3581
3582/**
3583 * dev_loopback_xmit - loop back @skb
3584 * @net: network namespace this loopback is happening in
3585 * @sk: sk needed to be a netfilter okfn
3586 * @skb: buffer to transmit
3587 */
3588int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3589{
3590 skb_reset_mac_header(skb);
3591 __skb_pull(skb, skb_network_offset(skb));
3592 skb->pkt_type = PACKET_LOOPBACK;
3593 skb->ip_summed = CHECKSUM_UNNECESSARY;
3594 WARN_ON(!skb_dst(skb));
3595 skb_dst_force(skb);
3596 netif_rx_ni(skb);
3597 return 0;
3598}
3599EXPORT_SYMBOL(dev_loopback_xmit);
3600
3601#ifdef CONFIG_NET_EGRESS
3602static struct sk_buff *
3603sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3604{
3605 struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
3606 struct tcf_result cl_res;
3607
3608 if (!miniq)
3609 return skb;
3610
3611 /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
3612 mini_qdisc_bstats_cpu_update(miniq, skb);
3613
3614 switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
3615 case TC_ACT_OK:
3616 case TC_ACT_RECLASSIFY:
3617 skb->tc_index = TC_H_MIN(cl_res.classid);
3618 break;
3619 case TC_ACT_SHOT:
3620 mini_qdisc_qstats_cpu_drop(miniq);
3621 *ret = NET_XMIT_DROP;
3622 kfree_skb(skb);
3623 return NULL;
3624 case TC_ACT_STOLEN:
3625 case TC_ACT_QUEUED:
3626 case TC_ACT_TRAP:
3627 *ret = NET_XMIT_SUCCESS;
3628 consume_skb(skb);
3629 return NULL;
3630 case TC_ACT_REDIRECT:
3631 /* No need to push/pop skb's mac_header here on egress! */
3632 skb_do_redirect(skb);
3633 *ret = NET_XMIT_SUCCESS;
3634 return NULL;
3635 default:
3636 break;
3637 }
3638
3639 return skb;
3640}
3641#endif /* CONFIG_NET_EGRESS */
3642
3643#ifdef CONFIG_XPS
3644static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
3645 struct xps_dev_maps *dev_maps, unsigned int tci)
3646{
3647 struct xps_map *map;
3648 int queue_index = -1;
3649
3650 if (dev->num_tc) {
3651 tci *= dev->num_tc;
3652 tci += netdev_get_prio_tc_map(dev, skb->priority);
3653 }
3654
3655 map = rcu_dereference(dev_maps->attr_map[tci]);
3656 if (map) {
3657 if (map->len == 1)
3658 queue_index = map->queues[0];
3659 else
3660 queue_index = map->queues[reciprocal_scale(
3661 skb_get_hash(skb), map->len)];
3662 if (unlikely(queue_index >= dev->real_num_tx_queues))
3663 queue_index = -1;
3664 }
3665 return queue_index;
3666}
3667#endif
3668
3669static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
3670 struct sk_buff *skb)
3671{
3672#ifdef CONFIG_XPS
3673 struct xps_dev_maps *dev_maps;
3674 struct sock *sk = skb->sk;
3675 int queue_index = -1;
3676
3677 if (!static_key_false(&xps_needed))
3678 return -1;
3679
3680 rcu_read_lock();
3681 if (!static_key_false(&xps_rxqs_needed))
3682 goto get_cpus_map;
3683
3684 dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
3685 if (dev_maps) {
3686 int tci = sk_rx_queue_get(sk);
3687
3688 if (tci >= 0 && tci < dev->num_rx_queues)
3689 queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
3690 tci);
3691 }
3692
3693get_cpus_map:
3694 if (queue_index < 0) {
3695 dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
3696 if (dev_maps) {
3697 unsigned int tci = skb->sender_cpu - 1;
3698
3699 queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
3700 tci);
3701 }
3702 }
3703 rcu_read_unlock();
3704
3705 return queue_index;
3706#else
3707 return -1;
3708#endif
3709}
3710
3711u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
3712 struct net_device *sb_dev)
3713{
3714 return 0;
3715}
3716EXPORT_SYMBOL(dev_pick_tx_zero);
3717
3718u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
3719 struct net_device *sb_dev)
3720{
3721 return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
3722}
3723EXPORT_SYMBOL(dev_pick_tx_cpu_id);
3724
3725u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
3726 struct net_device *sb_dev)
3727{
3728 struct sock *sk = skb->sk;
3729 int queue_index = sk_tx_queue_get(sk);
3730
3731 sb_dev = sb_dev ? : dev;
3732
3733 if (queue_index < 0 || skb->ooo_okay ||
3734 queue_index >= dev->real_num_tx_queues) {
3735 int new_index = get_xps_queue(dev, sb_dev, skb);
3736
3737 if (new_index < 0)
3738 new_index = skb_tx_hash(dev, sb_dev, skb);
3739
3740 if (queue_index != new_index && sk &&
3741 sk_fullsock(sk) &&
3742 rcu_access_pointer(sk->sk_dst_cache))
3743 sk_tx_queue_set(sk, new_index);
3744
3745 queue_index = new_index;
3746 }
3747
3748 return queue_index;
3749}
3750EXPORT_SYMBOL(netdev_pick_tx);
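
/*
 * Illustrative sketch (kept disabled): a driver reserving its last queue
 * for control traffic and deferring to the core pick otherwise.
 * example_select_queue and the priority test are hypothetical.
 */
#if 0
static u16 example_select_queue(struct net_device *dev, struct sk_buff *skb,
				struct net_device *sb_dev)
{
	if (skb->priority == TC_PRIO_CONTROL)
		return dev->real_num_tx_queues - 1;
	return netdev_pick_tx(dev, skb, sb_dev);
}
#endif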
3751
3752struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
3753 struct sk_buff *skb,
3754 struct net_device *sb_dev)
3755{
3756 int queue_index = 0;
3757
3758#ifdef CONFIG_XPS
3759 u32 sender_cpu = skb->sender_cpu - 1;
3760
3761 if (sender_cpu >= (u32)NR_CPUS)
3762 skb->sender_cpu = raw_smp_processor_id() + 1;
3763#endif
3764
3765 if (dev->real_num_tx_queues != 1) {
3766 const struct net_device_ops *ops = dev->netdev_ops;
3767
3768 if (ops->ndo_select_queue)
3769 queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
3770 else
3771 queue_index = netdev_pick_tx(dev, skb, sb_dev);
3772
3773 queue_index = netdev_cap_txqueue(dev, queue_index);
3774 }
3775
3776 skb_set_queue_mapping(skb, queue_index);
3777 return netdev_get_tx_queue(dev, queue_index);
3778}
3779
3780/**
3781 * __dev_queue_xmit - transmit a buffer
3782 * @skb: buffer to transmit
3783 * @sb_dev: subordinate device used for L2 forwarding offload
3784 *
3785 * Queue a buffer for transmission to a network device. The caller must
3786 * have set the device and priority and built the buffer before calling
3787 * this function. The function can be called from an interrupt.
3788 *
3789 * A negative errno code is returned on a failure. A success does not
3790 * guarantee the frame will be transmitted as it may be dropped due
3791 * to congestion or traffic shaping.
3792 *
3793 * -----------------------------------------------------------------------------------
3794 * I notice this method can also return errors from the queue disciplines,
3795 * including NET_XMIT_DROP, which is a positive value. So, errors can also
3796 * be positive.
3797 *
3798 * Regardless of the return value, the skb is consumed, so it is currently
3799 * difficult to retry a send to this method. (You can bump the ref count
3800 * before sending to hold a reference for retry if you are careful.)
3801 *
3802 * When calling this method, interrupts MUST be enabled. This is because
3803 * the BH enable code must have IRQs enabled so that it will not deadlock.
3804 * --BLG
3805 */
3806static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
3807{
3808 struct net_device *dev = skb->dev;
3809 struct netdev_queue *txq;
3810 struct Qdisc *q;
3811 int rc = -ENOMEM;
3812 bool again = false;
3813
3814 skb_reset_mac_header(skb);
3815
3816 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3817 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3818
3819 /* Disable soft irqs for various locks below. Also
3820 * stops preemption for RCU.
3821 */
3822 rcu_read_lock_bh();
3823
3824 skb_update_prio(skb);
3825
3826 qdisc_pkt_len_init(skb);
3827#ifdef CONFIG_NET_CLS_ACT
3828 skb->tc_at_ingress = 0;
3829# ifdef CONFIG_NET_EGRESS
3830 if (static_branch_unlikely(&egress_needed_key)) {
3831 skb = sch_handle_egress(skb, &rc, dev);
3832 if (!skb)
3833 goto out;
3834 }
3835# endif
3836#endif
3837 /* If device/qdisc don't need skb->dst, release it right now while
3838	 * it's hot in this CPU's cache.
3839 */
3840 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3841 skb_dst_drop(skb);
3842 else
3843 skb_dst_force(skb);
3844
3845 txq = netdev_core_pick_tx(dev, skb, sb_dev);
3846 q = rcu_dereference_bh(txq->qdisc);
3847
3848 trace_net_dev_queue(skb);
3849 if (q->enqueue) {
3850 rc = __dev_xmit_skb(skb, q, dev, txq);
3851 goto out;
3852 }
3853
3854	/* The device has no queue. Common case for software devices:
3855	 * loopback, all the sorts of tunnels...
3856	 *
3857	 * Really, it is unlikely that netif_tx_lock protection is necessary
3858	 * here. (e.g. loopback and IP tunnels are clean, ignoring statistics
3859	 * counters.)
3860	 * However, it is possible that they rely on the protection
3861	 * made by us here.
3862	 *
3863	 * Check this and shoot the lock. It is not prone to deadlocks.
3864	 * Either way, the noqueue qdisc is even simpler 8)
3865	 */
3866 if (dev->flags & IFF_UP) {
3867 int cpu = smp_processor_id(); /* ok because BHs are off */
3868
3869 if (txq->xmit_lock_owner != cpu) {
3870 if (dev_xmit_recursion())
3871 goto recursion_alert;
3872
3873 skb = validate_xmit_skb(skb, dev, &again);
3874 if (!skb)
3875 goto out;
3876
3877 HARD_TX_LOCK(dev, txq, cpu);
3878
3879 if (!netif_xmit_stopped(txq)) {
3880 dev_xmit_recursion_inc();
3881 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3882 dev_xmit_recursion_dec();
3883 if (dev_xmit_complete(rc)) {
3884 HARD_TX_UNLOCK(dev, txq);
3885 goto out;
3886 }
3887 }
3888 HARD_TX_UNLOCK(dev, txq);
3889 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3890 dev->name);
3891 } else {
3892 /* Recursion is detected! It is possible,
3893 * unfortunately
3894 */
3895recursion_alert:
3896 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3897 dev->name);
3898 }
3899 }
3900
3901 rc = -ENETDOWN;
3902 rcu_read_unlock_bh();
3903
3904 atomic_long_inc(&dev->tx_dropped);
3905 kfree_skb_list(skb);
3906 return rc;
3907out:
3908 rcu_read_unlock_bh();
3909 return rc;
3910}
3911
3912int dev_queue_xmit(struct sk_buff *skb)
3913{
3914 return __dev_queue_xmit(skb, NULL);
3915}
3916EXPORT_SYMBOL(dev_queue_xmit);
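
/* Illustrative sketch (editorial example, not kernel code): the retry
 * pattern hinted at above __dev_queue_xmit(). Because the skb is consumed
 * on every path, a caller that wants to retry a failed send must hold its
 * own reference first. my_requeue_for_retry() is a hypothetical
 * caller-side helper:
 *
 *	skb_get(skb);
 *	rc = dev_queue_xmit(skb);
 *	if (dev_xmit_complete(rc))
 *		consume_skb(skb);
 *	else
 *		my_requeue_for_retry(skb);
 *
 * skb_get() takes the extra reference and consume_skb() drops it once the
 * send succeeded. Validation may have modified the skb, so retrying is
 * only safe "if you are careful", as the comment above warns.
 */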
3917
3918int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
3919{
3920 return __dev_queue_xmit(skb, sb_dev);
3921}
3922EXPORT_SYMBOL(dev_queue_xmit_accel);
3923
3924int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
3925{
3926 struct net_device *dev = skb->dev;
3927 struct sk_buff *orig_skb = skb;
3928 struct netdev_queue *txq;
3929 int ret = NETDEV_TX_BUSY;
3930 bool again = false;
3931
3932 if (unlikely(!netif_running(dev) ||
3933 !netif_carrier_ok(dev)))
3934 goto drop;
3935
3936 skb = validate_xmit_skb_list(skb, dev, &again);
3937 if (skb != orig_skb)
3938 goto drop;
3939
3940 skb_set_queue_mapping(skb, queue_id);
3941 txq = skb_get_tx_queue(dev, skb);
3942
3943 local_bh_disable();
3944
3945 HARD_TX_LOCK(dev, txq, smp_processor_id());
3946 if (!netif_xmit_frozen_or_drv_stopped(txq))
3947 ret = netdev_start_xmit(skb, dev, txq, false);
3948 HARD_TX_UNLOCK(dev, txq);
3949
3950 local_bh_enable();
3951
3952 if (!dev_xmit_complete(ret))
3953 kfree_skb(skb);
3954
3955 return ret;
3956drop:
3957 atomic_long_inc(&dev->tx_dropped);
3958 kfree_skb_list(skb);
3959 return NET_XMIT_DROP;
3960}
3961EXPORT_SYMBOL(dev_direct_xmit);
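
/* Illustrative sketch (editorial example): a caller that owns the skb and
 * wants it on one specific hardware queue, bypassing the qdisc layer.
 * queue_id is assumed to be below dev->real_num_tx_queues:
 *
 *	skb->dev = dev;
 *	ret = dev_direct_xmit(skb, queue_id);
 *
 * On every failure path the skb has already been freed inside
 * dev_direct_xmit(), so the caller only inspects ret.
 */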
3962
3963/*************************************************************************
3964 * Receiver routines
3965 *************************************************************************/
3966
3967int netdev_max_backlog __read_mostly = 1000;
3968EXPORT_SYMBOL(netdev_max_backlog);
3969
3970int netdev_tstamp_prequeue __read_mostly = 1;
3971int netdev_budget __read_mostly = 300;
3972unsigned int __read_mostly netdev_budget_usecs = 2000;
3973int weight_p __read_mostly = 64; /* old backlog weight */
3974int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */
3975int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */
3976int dev_rx_weight __read_mostly = 64;
3977int dev_tx_weight __read_mostly = 64;
3978
3979/* Called with irq disabled */
3980static inline void ____napi_schedule(struct softnet_data *sd,
3981 struct napi_struct *napi)
3982{
3983 list_add_tail(&napi->poll_list, &sd->poll_list);
3984 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3985}
3986
3987#ifdef CONFIG_RPS
3988
3989/* One global table that all flow-based protocols share. */
3990struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3991EXPORT_SYMBOL(rps_sock_flow_table);
3992u32 rps_cpu_mask __read_mostly;
3993EXPORT_SYMBOL(rps_cpu_mask);
3994
3995struct static_key_false rps_needed __read_mostly;
3996EXPORT_SYMBOL(rps_needed);
3997struct static_key_false rfs_needed __read_mostly;
3998EXPORT_SYMBOL(rfs_needed);
3999
4000static struct rps_dev_flow *
4001set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4002 struct rps_dev_flow *rflow, u16 next_cpu)
4003{
4004 if (next_cpu < nr_cpu_ids) {
4005#ifdef CONFIG_RFS_ACCEL
4006 struct netdev_rx_queue *rxqueue;
4007 struct rps_dev_flow_table *flow_table;
4008 struct rps_dev_flow *old_rflow;
4009 u32 flow_id;
4010 u16 rxq_index;
4011 int rc;
4012
4013 /* Should we steer this flow to a different hardware queue? */
4014 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
4015 !(dev->features & NETIF_F_NTUPLE))
4016 goto out;
4017 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
4018 if (rxq_index == skb_get_rx_queue(skb))
4019 goto out;
4020
4021 rxqueue = dev->_rx + rxq_index;
4022 flow_table = rcu_dereference(rxqueue->rps_flow_table);
4023 if (!flow_table)
4024 goto out;
4025 flow_id = skb_get_hash(skb) & flow_table->mask;
4026 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
4027 rxq_index, flow_id);
4028 if (rc < 0)
4029 goto out;
4030 old_rflow = rflow;
4031 rflow = &flow_table->flows[flow_id];
4032 rflow->filter = rc;
4033 if (old_rflow->filter == rflow->filter)
4034 old_rflow->filter = RPS_NO_FILTER;
4035 out:
4036#endif
4037 rflow->last_qtail =
4038 per_cpu(softnet_data, next_cpu).input_queue_head;
4039 }
4040
4041 rflow->cpu = next_cpu;
4042 return rflow;
4043}
4044
4045/*
4046 * get_rps_cpu is called from netif_receive_skb and returns the target
4047 * CPU from the RPS map of the receiving queue for a given skb.
4048 * rcu_read_lock must be held on entry.
4049 */
4050static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4051 struct rps_dev_flow **rflowp)
4052{
4053 const struct rps_sock_flow_table *sock_flow_table;
4054 struct netdev_rx_queue *rxqueue = dev->_rx;
4055 struct rps_dev_flow_table *flow_table;
4056 struct rps_map *map;
4057 int cpu = -1;
4058 u32 tcpu;
4059 u32 hash;
4060
4061 if (skb_rx_queue_recorded(skb)) {
4062 u16 index = skb_get_rx_queue(skb);
4063
4064 if (unlikely(index >= dev->real_num_rx_queues)) {
4065 WARN_ONCE(dev->real_num_rx_queues > 1,
4066 "%s received packet on queue %u, but number "
4067 "of RX queues is %u\n",
4068 dev->name, index, dev->real_num_rx_queues);
4069 goto done;
4070 }
4071 rxqueue += index;
4072 }
4073
4074 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
4075
4076 flow_table = rcu_dereference(rxqueue->rps_flow_table);
4077 map = rcu_dereference(rxqueue->rps_map);
4078 if (!flow_table && !map)
4079 goto done;
4080
4081 skb_reset_network_header(skb);
4082 hash = skb_get_hash(skb);
4083 if (!hash)
4084 goto done;
4085
4086 sock_flow_table = rcu_dereference(rps_sock_flow_table);
4087 if (flow_table && sock_flow_table) {
4088 struct rps_dev_flow *rflow;
4089 u32 next_cpu;
4090 u32 ident;
4091
4092 /* First check into global flow table if there is a match */
4093 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
4094 if ((ident ^ hash) & ~rps_cpu_mask)
4095 goto try_rps;
4096
4097 next_cpu = ident & rps_cpu_mask;
4098
4099 /* OK, now we know there is a match,
4100 * we can look at the local (per receive queue) flow table
4101 */
4102 rflow = &flow_table->flows[hash & flow_table->mask];
4103 tcpu = rflow->cpu;
4104
4105 /*
4106 * If the desired CPU (where last recvmsg was done) is
4107 * different from current CPU (one in the rx-queue flow
4108 * table entry), switch if one of the following holds:
4109 * - Current CPU is unset (>= nr_cpu_ids).
4110 * - Current CPU is offline.
4111 * - The current CPU's queue tail has advanced beyond the
4112 * last packet that was enqueued using this table entry.
4113 * This guarantees that all previous packets for the flow
4114 * have been dequeued, thus preserving in-order delivery.
4115 */
4116 if (unlikely(tcpu != next_cpu) &&
4117 (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
4118 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
4119 rflow->last_qtail)) >= 0)) {
4120 tcpu = next_cpu;
4121 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
4122 }
4123
4124 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
4125 *rflowp = rflow;
4126 cpu = tcpu;
4127 goto done;
4128 }
4129 }
4130
4131try_rps:
4132
4133 if (map) {
4134 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
4135 if (cpu_online(tcpu)) {
4136 cpu = tcpu;
4137 goto done;
4138 }
4139 }
4140
4141done:
4142 return cpu;
4143}
4144
4145#ifdef CONFIG_RFS_ACCEL
4146
4147/**
4148 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
4149 * @dev: Device on which the filter was set
4150 * @rxq_index: RX queue index
4151 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
4152 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
4153 *
4154 * Drivers that implement ndo_rx_flow_steer() should periodically call
4155 * this function for each installed filter and remove the filters for
4156 * which it returns %true.
4157 */
4158bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
4159 u32 flow_id, u16 filter_id)
4160{
4161 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
4162 struct rps_dev_flow_table *flow_table;
4163 struct rps_dev_flow *rflow;
4164 bool expire = true;
4165 unsigned int cpu;
4166
4167 rcu_read_lock();
4168 flow_table = rcu_dereference(rxqueue->rps_flow_table);
4169 if (flow_table && flow_id <= flow_table->mask) {
4170 rflow = &flow_table->flows[flow_id];
4171 cpu = READ_ONCE(rflow->cpu);
4172 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
4173 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
4174 rflow->last_qtail) <
4175 (int)(10 * flow_table->mask)))
4176 expire = false;
4177 }
4178 rcu_read_unlock();
4179 return expire;
4180}
4181EXPORT_SYMBOL(rps_may_expire_flow);
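
/* Illustrative sketch (editorial example): the periodic expiry scan that
 * the kernel-doc above asks drivers to run. my_nfilters, my_filter_active(),
 * my_flow_id() and my_remove_filter() are hypothetical driver-side names;
 * the filter id is assumed to equal the loop index here:
 *
 *	for (i = 0; i < my_nfilters; i++) {
 *		if (my_filter_active(i) &&
 *		    rps_may_expire_flow(netdev, rxq_index, my_flow_id(i), i))
 *			my_remove_filter(i);
 *	}
 */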
4182
4183#endif /* CONFIG_RFS_ACCEL */
4184
4185/* Called from hardirq (IPI) context */
4186static void rps_trigger_softirq(void *data)
4187{
4188 struct softnet_data *sd = data;
4189
4190 ____napi_schedule(sd, &sd->backlog);
4191 sd->received_rps++;
4192}
4193
4194#endif /* CONFIG_RPS */
4195
4196/*
4197 * Check whether this softnet_data structure belongs to another CPU.
4198 * If it does, queue it on our IPI list and return 1.
4199 * Otherwise return 0.
4200 */
4201static int rps_ipi_queued(struct softnet_data *sd)
4202{
4203#ifdef CONFIG_RPS
4204 struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
4205
4206 if (sd != mysd) {
4207 sd->rps_ipi_next = mysd->rps_ipi_list;
4208 mysd->rps_ipi_list = sd;
4209
4210 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4211 return 1;
4212 }
4213#endif /* CONFIG_RPS */
4214 return 0;
4215}
4216
4217#ifdef CONFIG_NET_FLOW_LIMIT
4218int netdev_flow_limit_table_len __read_mostly = (1 << 12);
4219#endif
4220
4221static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
4222{
4223#ifdef CONFIG_NET_FLOW_LIMIT
4224 struct sd_flow_limit *fl;
4225 struct softnet_data *sd;
4226 unsigned int old_flow, new_flow;
4227
4228 if (qlen < (netdev_max_backlog >> 1))
4229 return false;
4230
4231 sd = this_cpu_ptr(&softnet_data);
4232
4233 rcu_read_lock();
4234 fl = rcu_dereference(sd->flow_limit);
4235 if (fl) {
4236 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
4237 old_flow = fl->history[fl->history_head];
4238 fl->history[fl->history_head] = new_flow;
4239
4240 fl->history_head++;
4241 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
4242
4243 if (likely(fl->buckets[old_flow]))
4244 fl->buckets[old_flow]--;
4245
4246 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
4247 fl->count++;
4248 rcu_read_unlock();
4249 return true;
4250 }
4251 }
4252 rcu_read_unlock();
4253#endif
4254 return false;
4255}
4256
4257/*
4258 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
4259 * queue (may be a remote CPU queue).
4260 */
4261static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
4262 unsigned int *qtail)
4263{
4264 struct softnet_data *sd;
4265 unsigned long flags;
4266 unsigned int qlen;
4267
4268 sd = &per_cpu(softnet_data, cpu);
4269
4270 local_irq_save(flags);
4271
4272 rps_lock(sd);
4273 if (!netif_running(skb->dev))
4274 goto drop;
4275 qlen = skb_queue_len(&sd->input_pkt_queue);
4276 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
4277 if (qlen) {
4278enqueue:
4279 __skb_queue_tail(&sd->input_pkt_queue, skb);
4280 input_queue_tail_incr_save(sd, qtail);
4281 rps_unlock(sd);
4282 local_irq_restore(flags);
4283 return NET_RX_SUCCESS;
4284 }
4285
4286 /* Schedule NAPI for the backlog device.
4287 * We can use a non-atomic operation since we own the queue lock.
4288 */
4289 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
4290 if (!rps_ipi_queued(sd))
4291 ____napi_schedule(sd, &sd->backlog);
4292 }
4293 goto enqueue;
4294 }
4295
4296drop:
4297 sd->dropped++;
4298 rps_unlock(sd);
4299
4300 local_irq_restore(flags);
4301
4302 atomic_long_inc(&skb->dev->rx_dropped);
4303 kfree_skb(skb);
4304 return NET_RX_DROP;
4305}
4306
4307static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
4308{
4309 struct net_device *dev = skb->dev;
4310 struct netdev_rx_queue *rxqueue;
4311
4312 rxqueue = dev->_rx;
4313
4314 if (skb_rx_queue_recorded(skb)) {
4315 u16 index = skb_get_rx_queue(skb);
4316
4317 if (unlikely(index >= dev->real_num_rx_queues)) {
4318 WARN_ONCE(dev->real_num_rx_queues > 1,
4319 "%s received packet on queue %u, but number "
4320 "of RX queues is %u\n",
4321 dev->name, index, dev->real_num_rx_queues);
4322
4323 return rxqueue; /* Return first rxqueue */
4324 }
4325 rxqueue += index;
4326 }
4327 return rxqueue;
4328}
4329
4330static u32 netif_receive_generic_xdp(struct sk_buff *skb,
4331 struct xdp_buff *xdp,
4332 struct bpf_prog *xdp_prog)
4333{
4334 struct netdev_rx_queue *rxqueue;
4335 void *orig_data, *orig_data_end;
4336 u32 metalen, act = XDP_DROP;
4337 __be16 orig_eth_type;
4338 struct ethhdr *eth;
4339 bool orig_bcast;
4340 int hlen, off;
4341 u32 mac_len;
4342
4343 /* Reinjected packets coming from act_mirred or similar should
4344 * not get XDP generic processing.
4345 */
4346 if (skb_cloned(skb) || skb_is_tc_redirected(skb))
4347 return XDP_PASS;
4348
4349 /* XDP packets must be linear and must have sufficient headroom
4350 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
4351 * native XDP provides, thus we need to do it here as well.
4352 */
4353 if (skb_is_nonlinear(skb) ||
4354 skb_headroom(skb) < XDP_PACKET_HEADROOM) {
4355 int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
4356 int troom = skb->tail + skb->data_len - skb->end;
4357
4358 /* In case we have to go down the slow path and also linearize,
4359 * then let's do the pskb_expand_head() work just once here.
4360 */
4361 if (pskb_expand_head(skb,
4362 hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
4363 troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
4364 goto do_drop;
4365 if (skb_linearize(skb))
4366 goto do_drop;
4367 }
4368
4369 /* The XDP program wants to see the packet starting at the MAC
4370 * header.
4371 */
4372 mac_len = skb->data - skb_mac_header(skb);
4373 hlen = skb_headlen(skb) + mac_len;
4374 xdp->data = skb->data - mac_len;
4375 xdp->data_meta = xdp->data;
4376 xdp->data_end = xdp->data + hlen;
4377 xdp->data_hard_start = skb->data - skb_headroom(skb);
4378 orig_data_end = xdp->data_end;
4379 orig_data = xdp->data;
4380 eth = (struct ethhdr *)xdp->data;
4381 orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
4382 orig_eth_type = eth->h_proto;
4383
4384 rxqueue = netif_get_rxqueue(skb);
4385 xdp->rxq = &rxqueue->xdp_rxq;
4386
4387 act = bpf_prog_run_xdp(xdp_prog, xdp);
4388
4389 off = xdp->data - orig_data;
4390 if (off > 0)
4391 __skb_pull(skb, off);
4392 else if (off < 0)
4393 __skb_push(skb, -off);
4394 skb->mac_header += off;
4395
4396 /* Check if bpf_xdp_adjust_tail was used; it can only "shrink"
4397 * the packet.
4398 */
4399 off = orig_data_end - xdp->data_end;
4400 if (off != 0) {
4401 skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
4402 skb->len -= off;
4403
4404 }
4405
4406 /* Check if XDP changed the ethernet header such that the SKB needs an update */
4407 eth = (struct ethhdr *)xdp->data;
4408 if ((orig_eth_type != eth->h_proto) ||
4409 (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
4410 __skb_push(skb, ETH_HLEN);
4411 skb->protocol = eth_type_trans(skb, skb->dev);
4412 }
4413
4414 switch (act) {
4415 case XDP_REDIRECT:
4416 case XDP_TX:
4417 __skb_push(skb, mac_len);
4418 break;
4419 case XDP_PASS:
4420 metalen = xdp->data - xdp->data_meta;
4421 if (metalen)
4422 skb_metadata_set(skb, metalen);
4423 break;
4424 default:
4425 bpf_warn_invalid_xdp_action(act);
4426 /* fall through */
4427 case XDP_ABORTED:
4428 trace_xdp_exception(skb->dev, xdp_prog, act);
4429 /* fall through */
4430 case XDP_DROP:
4431 do_drop:
4432 kfree_skb(skb);
4433 break;
4434 }
4435
4436 return act;
4437}
4438
4439/* When doing generic XDP we have to bypass the qdisc layer and the
4440 * network taps in order to match in-driver-XDP behavior.
4441 */
4442void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
4443{
4444 struct net_device *dev = skb->dev;
4445 struct netdev_queue *txq;
4446 bool free_skb = true;
4447 int cpu, rc;
4448
4449 txq = netdev_core_pick_tx(dev, skb, NULL);
4450 cpu = smp_processor_id();
4451 HARD_TX_LOCK(dev, txq, cpu);
4452 if (!netif_xmit_stopped(txq)) {
4453 rc = netdev_start_xmit(skb, dev, txq, 0);
4454 if (dev_xmit_complete(rc))
4455 free_skb = false;
4456 }
4457 HARD_TX_UNLOCK(dev, txq);
4458 if (free_skb) {
4459 trace_xdp_exception(dev, xdp_prog, XDP_TX);
4460 kfree_skb(skb);
4461 }
4462}
4463EXPORT_SYMBOL_GPL(generic_xdp_tx);
4464
4465static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
4466
4467int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
4468{
4469 if (xdp_prog) {
4470 struct xdp_buff xdp;
4471 u32 act;
4472 int err;
4473
4474 act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
4475 if (act != XDP_PASS) {
4476 switch (act) {
4477 case XDP_REDIRECT:
4478 err = xdp_do_generic_redirect(skb->dev, skb,
4479 &xdp, xdp_prog);
4480 if (err)
4481 goto out_redir;
4482 break;
4483 case XDP_TX:
4484 generic_xdp_tx(skb, xdp_prog);
4485 break;
4486 }
4487 return XDP_DROP;
4488 }
4489 }
4490 return XDP_PASS;
4491out_redir:
4492 kfree_skb(skb);
4493 return XDP_DROP;
4494}
4495EXPORT_SYMBOL_GPL(do_xdp_generic);
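
/* Illustrative sketch (editorial example), mirroring how this file itself
 * calls do_xdp_generic() from netif_rx_internal(): the program pointer is
 * RCU-protected, and preemption stays disabled so the per-CPU redirect
 * state used by XDP_REDIRECT remains consistent:
 *
 *	preempt_disable();
 *	rcu_read_lock();
 *	ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
 *	rcu_read_unlock();
 *	preempt_enable();
 *	if (ret != XDP_PASS)
 *		return;
 *
 * Any verdict other than XDP_PASS means the skb has been consumed
 * (dropped, transmitted or redirected), so it must not be touched again.
 */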
4496
4497static int netif_rx_internal(struct sk_buff *skb)
4498{
4499 int ret;
4500
4501 net_timestamp_check(netdev_tstamp_prequeue, skb);
4502
4503 trace_netif_rx(skb);
4504
4505 if (static_branch_unlikely(&generic_xdp_needed_key)) {
4506 int ret;
4507
4508 preempt_disable();
4509 rcu_read_lock();
4510 ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
4511 rcu_read_unlock();
4512 preempt_enable();
4513
4514 /* Consider XDP consuming the packet a success from
4515 * the netdev point of view; we do not want to count
4516 * this as an error.
4517 */
4518 if (ret != XDP_PASS)
4519 return NET_RX_SUCCESS;
4520 }
4521
4522#ifdef CONFIG_RPS
4523 if (static_branch_unlikely(&rps_needed)) {
4524 struct rps_dev_flow voidflow, *rflow = &voidflow;
4525 int cpu;
4526
4527 preempt_disable();
4528 rcu_read_lock();
4529
4530 cpu = get_rps_cpu(skb->dev, skb, &rflow);
4531 if (cpu < 0)
4532 cpu = smp_processor_id();
4533
4534 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4535
4536 rcu_read_unlock();
4537 preempt_enable();
4538 } else
4539#endif
4540 {
4541 unsigned int qtail;
4542
4543 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
4544 put_cpu();
4545 }
4546 return ret;
4547}
4548
4549/**
4550 * netif_rx - post buffer to the network code
4551 * @skb: buffer to post
4552 *
4553 * This function receives a packet from a device driver and queues it for
4554 * the upper (protocol) levels to process. It always succeeds. The buffer
4555 * may be dropped during processing for congestion control or by the
4556 * protocol layers.
4557 *
4558 * return values:
4559 * NET_RX_SUCCESS (no congestion)
4560 * NET_RX_DROP (packet was dropped)
4561 *
4562 */
4563
4564int netif_rx(struct sk_buff *skb)
4565{
4566 int ret;
4567
4568 trace_netif_rx_entry(skb);
4569
4570 ret = netif_rx_internal(skb);
4571 trace_netif_rx_exit(ret);
4572
4573 return ret;
4574}
4575EXPORT_SYMBOL(netif_rx);
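
/* Illustrative sketch (editorial example): the classic non-NAPI receive
 * path feeding netif_rx() from a device interrupt handler. hw_buf and
 * pkt_len are hypothetical names for the device's receive buffer:
 *
 *	skb = netdev_alloc_skb(dev, pkt_len + NET_IP_ALIGN);
 *	if (!skb)
 *		return;
 *	skb_reserve(skb, NET_IP_ALIGN);
 *	memcpy(skb_put(skb, pkt_len), hw_buf, pkt_len);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 *
 * The NET_IP_ALIGN reserve keeps the IP header aligned after the 14-byte
 * ethernet header; eth_type_trans() sets skb->dev and skb->pkt_type.
 */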
4576
4577int netif_rx_ni(struct sk_buff *skb)
4578{
4579 int err;
4580
4581 trace_netif_rx_ni_entry(skb);
4582
4583 preempt_disable();
4584 err = netif_rx_internal(skb);
4585 if (local_softirq_pending())
4586 do_softirq();
4587 preempt_enable();
4588 trace_netif_rx_ni_exit(err);
4589
4590 return err;
4591}
4592EXPORT_SYMBOL(netif_rx_ni);
4593
4594static __latent_entropy void net_tx_action(struct softirq_action *h)
4595{
4596 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4597
4598 if (sd->completion_queue) {
4599 struct sk_buff *clist;
4600
4601 local_irq_disable();
4602 clist = sd->completion_queue;
4603 sd->completion_queue = NULL;
4604 local_irq_enable();
4605
4606 while (clist) {
4607 struct sk_buff *skb = clist;
4608
4609 clist = clist->next;
4610
4611 WARN_ON(refcount_read(&skb->users));
4612 if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
4613 trace_consume_skb(skb);
4614 else
4615 trace_kfree_skb(skb, net_tx_action);
4616
4617 if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
4618 __kfree_skb(skb);
4619 else
4620 __kfree_skb_defer(skb);
4621 }
4622
4623 __kfree_skb_flush();
4624 }
4625
4626 if (sd->output_queue) {
4627 struct Qdisc *head;
4628
4629 local_irq_disable();
4630 head = sd->output_queue;
4631 sd->output_queue = NULL;
4632 sd->output_queue_tailp = &sd->output_queue;
4633 local_irq_enable();
4634
4635 while (head) {
4636 struct Qdisc *q = head;
4637 spinlock_t *root_lock = NULL;
4638
4639 head = head->next_sched;
4640
4641 if (!(q->flags & TCQ_F_NOLOCK)) {
4642 root_lock = qdisc_lock(q);
4643 spin_lock(root_lock);
4644 }
4645 /* We need to make sure head->next_sched is read
4646 * before clearing __QDISC_STATE_SCHED
4647 */
4648 smp_mb__before_atomic();
4649 clear_bit(__QDISC_STATE_SCHED, &q->state);
4650 qdisc_run(q);
4651 if (root_lock)
4652 spin_unlock(root_lock);
4653 }
4654 }
4655
4656 xfrm_dev_backlog(sd);
4657}
4658
4659#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
4660/* This hook is defined here for ATM LANE */
4661int (*br_fdb_test_addr_hook)(struct net_device *dev,
4662 unsigned char *addr) __read_mostly;
4663EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
4664#endif
4665
4666static inline struct sk_buff *
4667sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
4668 struct net_device *orig_dev)
4669{
4670#ifdef CONFIG_NET_CLS_ACT
4671 struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
4672 struct tcf_result cl_res;
4673
4674 /* If there's at least one ingress qdisc present somewhere (so
4675 * we got here via the enabled static key), remaining devices
4676 * that are not configured with an ingress qdisc will bail
4677 * out here.
4678 */
4679 if (!miniq)
4680 return skb;
4681
4682 if (*pt_prev) {
4683 *ret = deliver_skb(skb, *pt_prev, orig_dev);
4684 *pt_prev = NULL;
4685 }
4686
4687 qdisc_skb_cb(skb)->pkt_len = skb->len;
4688 skb->tc_at_ingress = 1;
4689 mini_qdisc_bstats_cpu_update(miniq, skb);
4690
4691 switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
4692 case TC_ACT_OK:
4693 case TC_ACT_RECLASSIFY:
4694 skb->tc_index = TC_H_MIN(cl_res.classid);
4695 break;
4696 case TC_ACT_SHOT:
4697 mini_qdisc_qstats_cpu_drop(miniq);
4698 kfree_skb(skb);
4699 return NULL;
4700 case TC_ACT_STOLEN:
4701 case TC_ACT_QUEUED:
4702 case TC_ACT_TRAP:
4703 consume_skb(skb);
4704 return NULL;
4705 case TC_ACT_REDIRECT:
4706 /* skb_mac_header check was done by cls/act_bpf, so
4707 * we can safely push the L2 header back before
4708 * redirecting to another netdev
4709 */
4710 __skb_push(skb, skb->mac_len);
4711 skb_do_redirect(skb);
4712 return NULL;
4713 case TC_ACT_REINSERT:
4714 /* this does not scrub the packet, and updates stats on error */
4715 skb_tc_reinsert(skb, &cl_res);
4716 return NULL;
4717 default:
4718 break;
4719 }
4720#endif /* CONFIG_NET_CLS_ACT */
4721 return skb;
4722}
4723
4724/**
4725 * netdev_is_rx_handler_busy - check if receive handler is registered
4726 * @dev: device to check
4727 *
4728 * Check if a receive handler is already registered for a given device.
4729 * Return true if there is one.
4730 *
4731 * The caller must hold the rtnl_mutex.
4732 */
4733bool netdev_is_rx_handler_busy(struct net_device *dev)
4734{
4735 ASSERT_RTNL();
4736 return dev && rtnl_dereference(dev->rx_handler);
4737}
4738EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
4739
4740/**
4741 * netdev_rx_handler_register - register receive handler
4742 * @dev: device to register a handler for
4743 * @rx_handler: receive handler to register
4744 * @rx_handler_data: data pointer that is used by rx handler
4745 *
4746 * Register a receive handler for a device. This handler will then be
4747 * called from __netif_receive_skb. A negative errno code is returned
4748 * on a failure.
4749 *
4750 * The caller must hold the rtnl_mutex.
4751 *
4752 * For a general description of rx_handler, see enum rx_handler_result.
4753 */
4754int netdev_rx_handler_register(struct net_device *dev,
4755 rx_handler_func_t *rx_handler,
4756 void *rx_handler_data)
4757{
4758 if (netdev_is_rx_handler_busy(dev))
4759 return -EBUSY;
4760
4761 if (dev->priv_flags & IFF_NO_RX_HANDLER)
4762 return -EINVAL;
4763
4764 /* Note: rx_handler_data must be set before rx_handler */
4765 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
4766 rcu_assign_pointer(dev->rx_handler, rx_handler);
4767
4768 return 0;
4769}
4770EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
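
/* Illustrative sketch (editorial example): how an upper device such as a
 * bridge or bonding master claims a port's traffic. my_rx_handler(),
 * my_wants_packet(), my_deliver() and my_priv are hypothetical:
 *
 *	static rx_handler_result_t my_rx_handler(struct sk_buff **pskb)
 *	{
 *		struct sk_buff *skb = *pskb;
 *
 *		if (!my_wants_packet(skb))
 *			return RX_HANDLER_PASS;
 *		my_deliver(skb);
 *		return RX_HANDLER_CONSUMED;
 *	}
 *
 *	rtnl_lock();
 *	err = netdev_rx_handler_register(port_dev, my_rx_handler, my_priv);
 *	rtnl_unlock();
 *
 * Inside the handler, the registered data is reachable via
 * rcu_dereference(skb->dev->rx_handler_data).
 */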
4771
4772/**
4773 * netdev_rx_handler_unregister - unregister receive handler
4774 * @dev: device to unregister a handler from
4775 *
4776 * Unregister a receive handler from a device.
4777 *
4778 * The caller must hold the rtnl_mutex.
4779 */
4780void netdev_rx_handler_unregister(struct net_device *dev)
4781{
4782
4783 ASSERT_RTNL();
4784 RCU_INIT_POINTER(dev->rx_handler, NULL);
4785 /* A reader seeing a non-NULL rx_handler in an rcu_read_lock()
4786 * section is guaranteed to see a non-NULL rx_handler_data
4787 * as well.
4788 */
4789 synchronize_net();
4790 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4791}
4792EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
4793
4794/*
4795 * Limit the use of PFMEMALLOC reserves to those protocols that implement
4796 * the special handling of PFMEMALLOC skbs.
4797 */
4798static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4799{
4800 switch (skb->protocol) {
4801 case htons(ETH_P_ARP):
4802 case htons(ETH_P_IP):
4803 case htons(ETH_P_IPV6):
4804 case htons(ETH_P_8021Q):
4805 case htons(ETH_P_8021AD):
4806 return true;
4807 default:
4808 return false;
4809 }
4810}
4811
4812static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
4813 int *ret, struct net_device *orig_dev)
4814{
4815#ifdef CONFIG_NETFILTER_INGRESS
4816 if (nf_hook_ingress_active(skb)) {
4817 int ingress_retval;
4818
4819 if (*pt_prev) {
4820 *ret = deliver_skb(skb, *pt_prev, orig_dev);
4821 *pt_prev = NULL;
4822 }
4823
4824 rcu_read_lock();
4825 ingress_retval = nf_hook_ingress(skb);
4826 rcu_read_unlock();
4827 return ingress_retval;
4828 }
4829#endif /* CONFIG_NETFILTER_INGRESS */
4830 return 0;
4831}
4832
4833static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc,
4834 struct packet_type **ppt_prev)
4835{
4836 struct packet_type *ptype, *pt_prev;
4837 rx_handler_func_t *rx_handler;
4838 struct net_device *orig_dev;
4839 bool deliver_exact = false;
4840 int ret = NET_RX_DROP;
4841 __be16 type;
4842
4843 net_timestamp_check(!netdev_tstamp_prequeue, skb);
4844
4845 trace_netif_receive_skb(skb);
4846
4847 orig_dev = skb->dev;
4848
4849 skb_reset_network_header(skb);
4850 if (!skb_transport_header_was_set(skb))
4851 skb_reset_transport_header(skb);
4852 skb_reset_mac_len(skb);
4853
4854 pt_prev = NULL;
4855
4856another_round:
4857 skb->skb_iif = skb->dev->ifindex;
4858
4859 __this_cpu_inc(softnet_data.processed);
4860
4861 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4862 skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4863 skb = skb_vlan_untag(skb);
4864 if (unlikely(!skb))
4865 goto out;
4866 }
4867
4868 if (skb_skip_tc_classify(skb))
4869 goto skip_classify;
4870
4871 if (pfmemalloc)
4872 goto skip_taps;
4873
4874 list_for_each_entry_rcu(ptype, &ptype_all, list) {
4875 if (pt_prev)
4876 ret = deliver_skb(skb, pt_prev, orig_dev);
4877 pt_prev = ptype;
4878 }
4879
4880 list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4881 if (pt_prev)
4882 ret = deliver_skb(skb, pt_prev, orig_dev);
4883 pt_prev = ptype;
4884 }
4885
4886skip_taps:
4887#ifdef CONFIG_NET_INGRESS
4888 if (static_branch_unlikely(&ingress_needed_key)) {
4889 skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4890 if (!skb)
4891 goto out;
4892
4893 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4894 goto out;
4895 }
4896#endif
4897 skb_reset_tc(skb);
4898skip_classify:
4899 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4900 goto drop;
4901
4902 if (skb_vlan_tag_present(skb)) {
4903 if (pt_prev) {
4904 ret = deliver_skb(skb, pt_prev, orig_dev);
4905 pt_prev = NULL;
4906 }
4907 if (vlan_do_receive(&skb))
4908 goto another_round;
4909 else if (unlikely(!skb))
4910 goto out;
4911 }
4912
4913 rx_handler = rcu_dereference(skb->dev->rx_handler);
4914 if (rx_handler) {
4915 if (pt_prev) {
4916 ret = deliver_skb(skb, pt_prev, orig_dev);
4917 pt_prev = NULL;
4918 }
4919 switch (rx_handler(&skb)) {
4920 case RX_HANDLER_CONSUMED:
4921 ret = NET_RX_SUCCESS;
4922 goto out;
4923 case RX_HANDLER_ANOTHER:
4924 goto another_round;
4925 case RX_HANDLER_EXACT:
4926 deliver_exact = true; /* fall through */
4927 case RX_HANDLER_PASS:
4928 break;
4929 default:
4930 BUG();
4931 }
4932 }
4933
4934 if (unlikely(skb_vlan_tag_present(skb))) {
4935 if (skb_vlan_tag_get_id(skb))
4936 skb->pkt_type = PACKET_OTHERHOST;
4937 /* Note: we might in the future use prio bits
4938 * and set skb->priority like in vlan_do_receive().
4939 * For the time being, just ignore the Priority Code Point.
4940 */
4941 __vlan_hwaccel_clear_tag(skb);
4942 }
4943
4944 type = skb->protocol;
4945
4946 /* deliver only exact match when indicated */
4947 if (likely(!deliver_exact)) {
4948 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4949 &ptype_base[ntohs(type) &
4950 PTYPE_HASH_MASK]);
4951 }
4952
4953 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4954 &orig_dev->ptype_specific);
4955
4956 if (unlikely(skb->dev != orig_dev)) {
4957 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4958 &skb->dev->ptype_specific);
4959 }
4960
4961 if (pt_prev) {
4962 if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
4963 goto drop;
4964 *ppt_prev = pt_prev;
4965 } else {
4966drop:
4967 if (!deliver_exact)
4968 atomic_long_inc(&skb->dev->rx_dropped);
4969 else
4970 atomic_long_inc(&skb->dev->rx_nohandler);
4971 kfree_skb(skb);
4972 /* Jamal, now you will not be able to escape explaining
4973 * to me how you were going to use this. :-)
4974 */
4975 ret = NET_RX_DROP;
4976 }
4977
4978out:
4979 return ret;
4980}
4981
4982static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
4983{
4984 struct net_device *orig_dev = skb->dev;
4985 struct packet_type *pt_prev = NULL;
4986 int ret;
4987
4988 ret = __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
4989 if (pt_prev)
4990 ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
4991 skb->dev, pt_prev, orig_dev);
4992 return ret;
4993}
4994
4995/**
4996 * netif_receive_skb_core - special purpose version of netif_receive_skb
4997 * @skb: buffer to process
4998 *
4999 * More direct receive version of netif_receive_skb(). It should
5000 * only be used by callers that have a need to skip RPS and Generic XDP.
5001 * Callers must also take care of handling (page_is_)pfmemalloc skbs.
5002 *
5003 * This function may only be called from softirq context and interrupts
5004 * should be enabled.
5005 *
5006 * Return values (usually ignored):
5007 * NET_RX_SUCCESS: no congestion
5008 * NET_RX_DROP: packet was dropped
5009 */
5010int netif_receive_skb_core(struct sk_buff *skb)
5011{
5012 int ret;
5013
5014 rcu_read_lock();
5015 ret = __netif_receive_skb_one_core(skb, false);
5016 rcu_read_unlock();
5017
5018 return ret;
5019}
5020EXPORT_SYMBOL(netif_receive_skb_core);
5021
5022static inline void __netif_receive_skb_list_ptype(struct list_head *head,
5023 struct packet_type *pt_prev,
5024 struct net_device *orig_dev)
5025{
5026 struct sk_buff *skb, *next;
5027
5028 if (!pt_prev)
5029 return;
5030 if (list_empty(head))
5031 return;
5032 if (pt_prev->list_func != NULL)
5033 pt_prev->list_func(head, pt_prev, orig_dev);
5034 else
5035 list_for_each_entry_safe(skb, next, head, list) {
5036 skb_list_del_init(skb);
5037 INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
5038 skb->dev, pt_prev, orig_dev);
5039 }
5040}
5041
5042static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
5043{
5044 /* Fast-path assumptions:
5045 * - There is no RX handler.
5046 * - Only one packet_type matches.
5047 * If either of these fails, we will end up doing some per-packet
5048 * processing in-line, then handling the 'last ptype' for the whole
5049 * sublist. This can't cause out-of-order delivery to any single ptype,
5050 * because the 'last ptype' must be constant across the sublist, and all
5051 * other ptypes are handled per-packet.
5052 */
5053 /* Current (common) ptype of sublist */
5054 struct packet_type *pt_curr = NULL;
5055 /* Current (common) orig_dev of sublist */
5056 struct net_device *od_curr = NULL;
5057 struct list_head sublist;
5058 struct sk_buff *skb, *next;
5059
5060 INIT_LIST_HEAD(&sublist);
5061 list_for_each_entry_safe(skb, next, head, list) {
5062 struct net_device *orig_dev = skb->dev;
5063 struct packet_type *pt_prev = NULL;
5064
5065 skb_list_del_init(skb);
5066 __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
5067 if (!pt_prev)
5068 continue;
5069 if (pt_curr != pt_prev || od_curr != orig_dev) {
5070 /* dispatch old sublist */
5071 __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5072 /* start new sublist */
5073 INIT_LIST_HEAD(&sublist);
5074 pt_curr = pt_prev;
5075 od_curr = orig_dev;
5076 }
5077 list_add_tail(&skb->list, &sublist);
5078 }
5079
5080 /* dispatch final sublist */
5081 __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5082}
5083
5084static int __netif_receive_skb(struct sk_buff *skb)
5085{
5086 int ret;
5087
5088 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
5089 unsigned int noreclaim_flag;
5090
5091 /*
5092 * PFMEMALLOC skbs are special, they should
5093 * - be delivered to SOCK_MEMALLOC sockets only
5094 * - stay away from userspace
5095 * - have bounded memory usage
5096 *
5097 * Use PF_MEMALLOC as this saves us from propagating the allocation
5098 * context down to all allocation sites.
5099 */
5100 noreclaim_flag = memalloc_noreclaim_save();
5101 ret = __netif_receive_skb_one_core(skb, true);
5102 memalloc_noreclaim_restore(noreclaim_flag);
5103 } else
5104 ret = __netif_receive_skb_one_core(skb, false);
5105
5106 return ret;
5107}
5108
5109static void __netif_receive_skb_list(struct list_head *head)
5110{
5111 unsigned long noreclaim_flag = 0;
5112 struct sk_buff *skb, *next;
5113 bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
5114
5115 list_for_each_entry_safe(skb, next, head, list) {
5116 if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
5117 struct list_head sublist;
5118
5119 /* Handle the previous sublist */
5120 list_cut_before(&sublist, head, &skb->list);
5121 if (!list_empty(&sublist))
5122 __netif_receive_skb_list_core(&sublist, pfmemalloc);
5123 pfmemalloc = !pfmemalloc;
5124 /* See comments in __netif_receive_skb */
5125 if (pfmemalloc)
5126 noreclaim_flag = memalloc_noreclaim_save();
5127 else
5128 memalloc_noreclaim_restore(noreclaim_flag);
5129 }
5130 }
5131 /* Handle the remaining sublist */
5132 if (!list_empty(head))
5133 __netif_receive_skb_list_core(head, pfmemalloc);
5134 /* Restore pflags */
5135 if (pfmemalloc)
5136 memalloc_noreclaim_restore(noreclaim_flag);
5137}
5138
5139static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
5140{
5141 struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
5142 struct bpf_prog *new = xdp->prog;
5143 int ret = 0;
5144
5145 switch (xdp->command) {
5146 case XDP_SETUP_PROG:
5147 rcu_assign_pointer(dev->xdp_prog, new);
5148 if (old)
5149 bpf_prog_put(old);
5150
5151 if (old && !new) {
5152 static_branch_dec(&generic_xdp_needed_key);
5153 } else if (new && !old) {
5154 static_branch_inc(&generic_xdp_needed_key);
5155 dev_disable_lro(dev);
5156 dev_disable_gro_hw(dev);
5157 }
5158 break;
5159
5160 case XDP_QUERY_PROG:
5161 xdp->prog_id = old ? old->aux->id : 0;
5162 break;
5163
5164 default:
5165 ret = -EINVAL;
5166 break;
5167 }
5168
5169 return ret;
5170}
5171
5172static int netif_receive_skb_internal(struct sk_buff *skb)
5173{
5174 int ret;
5175
5176 net_timestamp_check(netdev_tstamp_prequeue, skb);
5177
5178 if (skb_defer_rx_timestamp(skb))
5179 return NET_RX_SUCCESS;
5180
5181 if (static_branch_unlikely(&generic_xdp_needed_key)) {
5182 int ret;
5183
5184 preempt_disable();
5185 rcu_read_lock();
5186 ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
5187 rcu_read_unlock();
5188 preempt_enable();
5189
5190 if (ret != XDP_PASS)
5191 return NET_RX_DROP;
5192 }
5193
5194 rcu_read_lock();
5195#ifdef CONFIG_RPS
5196 if (static_branch_unlikely(&rps_needed)) {
5197 struct rps_dev_flow voidflow, *rflow = &voidflow;
5198 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5199
5200 if (cpu >= 0) {
5201 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5202 rcu_read_unlock();
5203 return ret;
5204 }
5205 }
5206#endif
5207 ret = __netif_receive_skb(skb);
5208 rcu_read_unlock();
5209 return ret;
5210}
5211
5212static void netif_receive_skb_list_internal(struct list_head *head)
5213{
5214 struct bpf_prog *xdp_prog = NULL;
5215 struct sk_buff *skb, *next;
5216 struct list_head sublist;
5217
5218 INIT_LIST_HEAD(&sublist);
5219 list_for_each_entry_safe(skb, next, head, list) {
5220 net_timestamp_check(netdev_tstamp_prequeue, skb);
5221 skb_list_del_init(skb);
5222 if (!skb_defer_rx_timestamp(skb))
5223 list_add_tail(&skb->list, &sublist);
5224 }
5225 list_splice_init(&sublist, head);
5226
5227 if (static_branch_unlikely(&generic_xdp_needed_key)) {
5228 preempt_disable();
5229 rcu_read_lock();
5230 list_for_each_entry_safe(skb, next, head, list) {
5231 xdp_prog = rcu_dereference(skb->dev->xdp_prog);
5232 skb_list_del_init(skb);
5233 if (do_xdp_generic(xdp_prog, skb) == XDP_PASS)
5234 list_add_tail(&skb->list, &sublist);
5235 }
5236 rcu_read_unlock();
5237 preempt_enable();
5238 /* Put passed packets back on main list */
5239 list_splice_init(&sublist, head);
5240 }
5241
5242 rcu_read_lock();
5243#ifdef CONFIG_RPS
5244 if (static_branch_unlikely(&rps_needed)) {
5245 list_for_each_entry_safe(skb, next, head, list) {
5246 struct rps_dev_flow voidflow, *rflow = &voidflow;
5247 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5248
5249 if (cpu >= 0) {
5250 /* Will be handled, remove from list */
5251 skb_list_del_init(skb);
5252 enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5253 }
5254 }
5255 }
5256#endif
5257 __netif_receive_skb_list(head);
5258 rcu_read_unlock();
5259}
5260
5261/**
5262 * netif_receive_skb - process receive buffer from network
5263 * @skb: buffer to process
5264 *
5265 * netif_receive_skb() is the main receive data processing function.
5266 * It always succeeds. The buffer may be dropped during processing
5267 * for congestion control or by the protocol layers.
5268 *
5269 * This function may only be called from softirq context and interrupts
5270 * should be enabled.
5271 *
5272 * Return values (usually ignored):
5273 * NET_RX_SUCCESS: no congestion
5274 * NET_RX_DROP: packet was dropped
5275 */
5276int netif_receive_skb(struct sk_buff *skb)
5277{
5278 int ret;
5279
5280 trace_netif_receive_skb_entry(skb);
5281
5282 ret = netif_receive_skb_internal(skb);
5283 trace_netif_receive_skb_exit(ret);
5284
5285 return ret;
5286}
5287EXPORT_SYMBOL(netif_receive_skb);
5288
5289/**
5290 * netif_receive_skb_list - process many receive buffers from network
5291 * @head: list of skbs to process.
5292 *
5293 * Since the return value of netif_receive_skb() is normally ignored, and
5294 * wouldn't be meaningful for a list, this function returns void.
5295 *
5296 * This function may only be called from softirq context and interrupts
5297 * should be enabled.
5298 */
5299void netif_receive_skb_list(struct list_head *head)
5300{
5301 struct sk_buff *skb;
5302
5303 if (list_empty(head))
5304 return;
5305 if (trace_netif_receive_skb_list_entry_enabled()) {
5306 list_for_each_entry(skb, head, list)
5307 trace_netif_receive_skb_list_entry(skb);
5308 }
5309 netif_receive_skb_list_internal(head);
5310 trace_netif_receive_skb_list_exit(0);
5311}
5312EXPORT_SYMBOL(netif_receive_skb_list);
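
/* Illustrative sketch (editorial example): a driver batching completed
 * receive descriptors into one list so the stack can amortize per-packet
 * costs. my_fetch_rx() is a hypothetical ring-buffer helper that returns
 * a fully built skb or NULL:
 *
 *	LIST_HEAD(rx_list);
 *
 *	while ((skb = my_fetch_rx(ring)) != NULL)
 *		list_add_tail(&skb->list, &rx_list);
 *	netif_receive_skb_list(&rx_list);
 */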
5313
5314DEFINE_PER_CPU(struct work_struct, flush_works);
5315
5316/* Network device is going away, flush any packets still pending */
5317static void flush_backlog(struct work_struct *work)
5318{
5319 struct sk_buff *skb, *tmp;
5320 struct softnet_data *sd;
5321
5322 local_bh_disable();
5323 sd = this_cpu_ptr(&softnet_data);
5324
5325 local_irq_disable();
5326 rps_lock(sd);
5327 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
5328 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5329 __skb_unlink(skb, &sd->input_pkt_queue);
5330 kfree_skb(skb);
5331 input_queue_head_incr(sd);
5332 }
5333 }
5334 rps_unlock(sd);
5335 local_irq_enable();
5336
5337 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
5338 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5339 __skb_unlink(skb, &sd->process_queue);
5340 kfree_skb(skb);
5341 input_queue_head_incr(sd);
5342 }
5343 }
5344 local_bh_enable();
5345}
5346
5347static void flush_all_backlogs(void)
5348{
5349 unsigned int cpu;
5350
5351 get_online_cpus();
5352
5353 for_each_online_cpu(cpu)
5354 queue_work_on(cpu, system_highpri_wq,
5355 per_cpu_ptr(&flush_works, cpu));
5356
5357 for_each_online_cpu(cpu)
5358 flush_work(per_cpu_ptr(&flush_works, cpu));
5359
5360 put_online_cpus();
5361}
5362
5363INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int));
5364INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int));
5365static int napi_gro_complete(struct sk_buff *skb)
5366{
5367 struct packet_offload *ptype;
5368 __be16 type = skb->protocol;
5369 struct list_head *head = &offload_base;
5370 int err = -ENOENT;
5371
5372 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
5373
5374 if (NAPI_GRO_CB(skb)->count == 1) {
5375 skb_shinfo(skb)->gso_size = 0;
5376 goto out;
5377 }
5378
5379 rcu_read_lock();
5380 list_for_each_entry_rcu(ptype, head, list) {
5381 if (ptype->type != type || !ptype->callbacks.gro_complete)
5382 continue;
5383
5384 err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
5385 ipv6_gro_complete, inet_gro_complete,
5386 skb, 0);
5387 break;
5388 }
5389 rcu_read_unlock();
5390
5391 if (err) {
5392 WARN_ON(&ptype->list == head);
5393 kfree_skb(skb);
5394 return NET_RX_SUCCESS;
5395 }
5396
5397out:
5398 return netif_receive_skb_internal(skb);
5399}
5400
5401static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
5402 bool flush_old)
5403{
5404 struct list_head *head = &napi->gro_hash[index].list;
5405 struct sk_buff *skb, *p;
5406
5407 list_for_each_entry_safe_reverse(skb, p, head, list) {
5408 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
5409 return;
5410 skb_list_del_init(skb);
5411 napi_gro_complete(skb);
5412 napi->gro_hash[index].count--;
5413 }
5414
5415 if (!napi->gro_hash[index].count)
5416 __clear_bit(index, &napi->gro_bitmask);
5417}
5418
5419/* napi->gro_hash[].list contains packets ordered by age, with the
5420 * youngest packets at the head of the list.
5421 * Complete skbs in reverse order to reduce latencies.
5422 */
5423void napi_gro_flush(struct napi_struct *napi, bool flush_old)
5424{
5425 unsigned long bitmask = napi->gro_bitmask;
5426 unsigned int i, base = ~0U;
5427
5428 while ((i = ffs(bitmask)) != 0) {
5429 bitmask >>= i;
5430 base += i;
5431 __napi_gro_flush_chain(napi, base, flush_old);
5432 }
5433}
5434EXPORT_SYMBOL(napi_gro_flush);
5435
5436static struct list_head *gro_list_prepare(struct napi_struct *napi,
5437 struct sk_buff *skb)
5438{
5439 unsigned int maclen = skb->dev->hard_header_len;
5440 u32 hash = skb_get_hash_raw(skb);
5441 struct list_head *head;
5442 struct sk_buff *p;
5443
5444 head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
5445 list_for_each_entry(p, head, list) {
5446 unsigned long diffs;
5447
5448 NAPI_GRO_CB(p)->flush = 0;
5449
5450 if (hash != skb_get_hash_raw(p)) {
5451 NAPI_GRO_CB(p)->same_flow = 0;
5452 continue;
5453 }
5454
5455 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
5456 diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
5457 if (skb_vlan_tag_present(p))
5458 diffs |= p->vlan_tci ^ skb->vlan_tci;
5459 diffs |= skb_metadata_dst_cmp(p, skb);
5460 diffs |= skb_metadata_differs(p, skb);
5461 if (maclen == ETH_HLEN)
5462 diffs |= compare_ether_header(skb_mac_header(p),
5463 skb_mac_header(skb));
5464 else if (!diffs)
5465 diffs = memcmp(skb_mac_header(p),
5466 skb_mac_header(skb),
5467 maclen);
5468 NAPI_GRO_CB(p)->same_flow = !diffs;
5469 }
5470
5471 return head;
5472}
5473
5474static void skb_gro_reset_offset(struct sk_buff *skb)
5475{
5476 const struct skb_shared_info *pinfo = skb_shinfo(skb);
5477 const skb_frag_t *frag0 = &pinfo->frags[0];
5478
5479 NAPI_GRO_CB(skb)->data_offset = 0;
5480 NAPI_GRO_CB(skb)->frag0 = NULL;
5481 NAPI_GRO_CB(skb)->frag0_len = 0;
5482
5483 if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
5484 pinfo->nr_frags &&
5485 !PageHighMem(skb_frag_page(frag0))) {
5486 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
5487 NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
5488 skb_frag_size(frag0),
5489 skb->end - skb->tail);
5490 }
5491}
5492
5493static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
5494{
5495 struct skb_shared_info *pinfo = skb_shinfo(skb);
5496
5497 BUG_ON(skb->end - skb->tail < grow);
5498
5499 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
5500
5501 skb->data_len -= grow;
5502 skb->tail += grow;
5503
5504 pinfo->frags[0].page_offset += grow;
5505 skb_frag_size_sub(&pinfo->frags[0], grow);
5506
5507 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
5508 skb_frag_unref(skb, 0);
5509 memmove(pinfo->frags, pinfo->frags + 1,
5510 --pinfo->nr_frags * sizeof(pinfo->frags[0]));
5511 }
5512}
5513
5514static void gro_flush_oldest(struct list_head *head)
5515{
5516 struct sk_buff *oldest;
5517
5518 oldest = list_last_entry(head, struct sk_buff, list);
5519
5520 /* We are called with the list length >= MAX_GRO_SKBS, so an empty
5521 * list (and thus a NULL oldest) is impossible.
5522 */
5523 if (WARN_ON_ONCE(!oldest))
5524 return;
5525
5526 /* Do not adjust napi->gro_hash[].count, caller is adding a new
5527 * SKB to the chain.
5528 */
5529 skb_list_del_init(oldest);
5530 napi_gro_complete(oldest);
5531}
5532
5533INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *,
5534 struct sk_buff *));
5535INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *,
5536 struct sk_buff *));
5537static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5538{
5539 u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
5540 struct list_head *head = &offload_base;
5541 struct packet_offload *ptype;
5542 __be16 type = skb->protocol;
5543 struct list_head *gro_head;
5544 struct sk_buff *pp = NULL;
5545 enum gro_result ret;
5546 int same_flow;
5547 int grow;
5548
5549 if (netif_elide_gro(skb->dev))
5550 goto normal;
5551
5552 gro_head = gro_list_prepare(napi, skb);
5553
5554 rcu_read_lock();
5555 list_for_each_entry_rcu(ptype, head, list) {
5556 if (ptype->type != type || !ptype->callbacks.gro_receive)
5557 continue;
5558
5559 skb_set_network_header(skb, skb_gro_offset(skb));
5560 skb_reset_mac_len(skb);
5561 NAPI_GRO_CB(skb)->same_flow = 0;
5562 NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
5563 NAPI_GRO_CB(skb)->free = 0;
5564 NAPI_GRO_CB(skb)->encap_mark = 0;
5565 NAPI_GRO_CB(skb)->recursion_counter = 0;
5566 NAPI_GRO_CB(skb)->is_fou = 0;
5567 NAPI_GRO_CB(skb)->is_atomic = 1;
5568 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
5569
5570 /* Setup for GRO checksum validation */
5571 switch (skb->ip_summed) {
5572 case CHECKSUM_COMPLETE:
5573 NAPI_GRO_CB(skb)->csum = skb->csum;
5574 NAPI_GRO_CB(skb)->csum_valid = 1;
5575 NAPI_GRO_CB(skb)->csum_cnt = 0;
5576 break;
5577 case CHECKSUM_UNNECESSARY:
5578 NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
5579 NAPI_GRO_CB(skb)->csum_valid = 0;
5580 break;
5581 default:
5582 NAPI_GRO_CB(skb)->csum_cnt = 0;
5583 NAPI_GRO_CB(skb)->csum_valid = 0;
5584 }
5585
5586 pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
5587 ipv6_gro_receive, inet_gro_receive,
5588 gro_head, skb);
5589 break;
5590 }
5591 rcu_read_unlock();
5592
5593 if (&ptype->list == head)
5594 goto normal;
5595
5596 if (IS_ERR(pp) && PTR_ERR(pp) == -EINPROGRESS) {
5597 ret = GRO_CONSUMED;
5598 goto ok;
5599 }
5600
5601 same_flow = NAPI_GRO_CB(skb)->same_flow;
5602 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
5603
5604 if (pp) {
5605 skb_list_del_init(pp);
5606 napi_gro_complete(pp);
5607 napi->gro_hash[hash].count--;
5608 }
5609
5610 if (same_flow)
5611 goto ok;
5612
5613 if (NAPI_GRO_CB(skb)->flush)
5614 goto normal;
5615
5616 if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
5617 gro_flush_oldest(gro_head);
5618 } else {
5619 napi->gro_hash[hash].count++;
5620 }
5621 NAPI_GRO_CB(skb)->count = 1;
5622 NAPI_GRO_CB(skb)->age = jiffies;
5623 NAPI_GRO_CB(skb)->last = skb;
5624 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
5625 list_add(&skb->list, gro_head);
5626 ret = GRO_HELD;
5627
5628pull:
5629 grow = skb_gro_offset(skb) - skb_headlen(skb);
5630 if (grow > 0)
5631 gro_pull_from_frag0(skb, grow);
5632ok:
5633 if (napi->gro_hash[hash].count) {
5634 if (!test_bit(hash, &napi->gro_bitmask))
5635 __set_bit(hash, &napi->gro_bitmask);
5636 } else if (test_bit(hash, &napi->gro_bitmask)) {
5637 __clear_bit(hash, &napi->gro_bitmask);
5638 }
5639
5640 return ret;
5641
5642normal:
5643 ret = GRO_NORMAL;
5644 goto pull;
5645}
5646
5647struct packet_offload *gro_find_receive_by_type(__be16 type)
5648{
5649 struct list_head *offload_head = &offload_base;
5650 struct packet_offload *ptype;
5651
5652 list_for_each_entry_rcu(ptype, offload_head, list) {
5653 if (ptype->type != type || !ptype->callbacks.gro_receive)
5654 continue;
5655 return ptype;
5656 }
5657 return NULL;
5658}
5659EXPORT_SYMBOL(gro_find_receive_by_type);
5660
5661struct packet_offload *gro_find_complete_by_type(__be16 type)
5662{
5663 struct list_head *offload_head = &offload_base;
5664 struct packet_offload *ptype;
5665
5666 list_for_each_entry_rcu(ptype, offload_head, list) {
5667 if (ptype->type != type || !ptype->callbacks.gro_complete)
5668 continue;
5669 return ptype;
5670 }
5671 return NULL;
5672}
5673EXPORT_SYMBOL(gro_find_complete_by_type);
5674
5675static void napi_skb_free_stolen_head(struct sk_buff *skb)
5676{
5677 skb_dst_drop(skb);
5678 secpath_reset(skb);
5679 kmem_cache_free(skbuff_head_cache, skb);
5680}
5681
5682static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
5683{
5684 switch (ret) {
5685 case GRO_NORMAL:
5686 if (netif_receive_skb_internal(skb))
5687 ret = GRO_DROP;
5688 break;
5689
5690 case GRO_DROP:
5691 kfree_skb(skb);
5692 break;
5693
5694 case GRO_MERGED_FREE:
5695 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
5696 napi_skb_free_stolen_head(skb);
5697 else
5698 __kfree_skb(skb);
5699 break;
5700
5701 case GRO_HELD:
5702 case GRO_MERGED:
5703 case GRO_CONSUMED:
5704 break;
5705 }
5706
5707 return ret;
5708}
5709
5710gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5711{
5712 gro_result_t ret;
5713
5714 skb_mark_napi_id(skb, napi);
5715 trace_napi_gro_receive_entry(skb);
5716
5717 skb_gro_reset_offset(skb);
5718
5719 ret = napi_skb_finish(dev_gro_receive(napi, skb), skb);
5720 trace_napi_gro_receive_exit(ret);
5721
5722 return ret;
5723}
5724EXPORT_SYMBOL(napi_gro_receive);
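
/* Illustrative sketch (editorial example): the usual NAPI poll callback
 * feeding napi_gro_receive(). my_fetch_rx() is a hypothetical helper that
 * builds an skb from the next completed descriptor:
 *
 *	static int my_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct sk_buff *skb;
 *		int work = 0;
 *
 *		while (work < budget &&
 *		       (skb = my_fetch_rx(napi)) != NULL) {
 *			napi_gro_receive(napi, skb);
 *			work++;
 *		}
 *		if (work < budget)
 *			napi_complete_done(napi, work);
 *		return work;
 *	}
 */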
5725
5726static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
5727{
5728 if (unlikely(skb->pfmemalloc)) {
5729 consume_skb(skb);
5730 return;
5731 }
5732 __skb_pull(skb, skb_headlen(skb));
5733 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
5734 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
5735 __vlan_hwaccel_clear_tag(skb);
5736 skb->dev = napi->dev;
5737 skb->skb_iif = 0;
5738
5739 /* eth_type_trans() assumes pkt_type is PACKET_HOST */
5740 skb->pkt_type = PACKET_HOST;
5741
5742 skb->encapsulation = 0;
5743 skb_shinfo(skb)->gso_type = 0;
5744 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
5745 secpath_reset(skb);
5746
5747 napi->skb = skb;
5748}
5749
5750struct sk_buff *napi_get_frags(struct napi_struct *napi)
5751{
5752 struct sk_buff *skb = napi->skb;
5753
5754 if (!skb) {
5755 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
5756 if (skb) {
5757 napi->skb = skb;
5758 skb_mark_napi_id(skb, napi);
5759 }
5760 }
5761 return skb;
5762}
5763EXPORT_SYMBOL(napi_get_frags);
5764
5765static gro_result_t napi_frags_finish(struct napi_struct *napi,
5766 struct sk_buff *skb,
5767 gro_result_t ret)
5768{
5769 switch (ret) {
5770 case GRO_NORMAL:
5771 case GRO_HELD:
5772 __skb_push(skb, ETH_HLEN);
5773 skb->protocol = eth_type_trans(skb, skb->dev);
5774 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
5775 ret = GRO_DROP;
5776 break;
5777
5778 case GRO_DROP:
5779 napi_reuse_skb(napi, skb);
5780 break;
5781
5782 case GRO_MERGED_FREE:
5783 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
5784 napi_skb_free_stolen_head(skb);
5785 else
5786 napi_reuse_skb(napi, skb);
5787 break;
5788
5789 case GRO_MERGED:
5790 case GRO_CONSUMED:
5791 break;
5792 }
5793
5794 return ret;
5795}
5796
5797/* The upper GRO stack assumes the network header starts at gro_offset=0.
5798 * Drivers may call both napi_gro_frags() and napi_gro_receive(), so
5799 * we copy the ethernet header into skb->data to have a common layout.
5800 */
5801static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
5802{
5803 struct sk_buff *skb = napi->skb;
5804 const struct ethhdr *eth;
5805 unsigned int hlen = sizeof(*eth);
5806
5807 napi->skb = NULL;
5808
5809 skb_reset_mac_header(skb);
5810 skb_gro_reset_offset(skb);
5811
5812 eth = skb_gro_header_fast(skb, 0);
5813 if (unlikely(skb_gro_header_hard(skb, hlen))) {
5814 eth = skb_gro_header_slow(skb, hlen, 0);
5815 if (unlikely(!eth)) {
5816 net_warn_ratelimited("%s: dropping impossible skb from %s\n",
5817 __func__, napi->dev->name);
5818 napi_reuse_skb(napi, skb);
5819 return NULL;
5820 }
5821 } else {
5822 gro_pull_from_frag0(skb, hlen);
5823 NAPI_GRO_CB(skb)->frag0 += hlen;
5824 NAPI_GRO_CB(skb)->frag0_len -= hlen;
5825 }
5826 __skb_pull(skb, hlen);
5827
5828 /*
5829 * This works because the only protocols we care about don't require
5830 * special handling.
5831 * We'll fix it up properly in napi_frags_finish()
5832 */
5833 skb->protocol = eth->h_proto;
5834
5835 return skb;
5836}
5837
5838gro_result_t napi_gro_frags(struct napi_struct *napi)
5839{
5840 gro_result_t ret;
5841 struct sk_buff *skb = napi_frags_skb(napi);
5842
5843 if (!skb)
5844 return GRO_DROP;
5845
5846 trace_napi_gro_frags_entry(skb);
5847
5848 ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
5849 trace_napi_gro_frags_exit(ret);
5850
5851 return ret;
5852}
5853EXPORT_SYMBOL(napi_gro_frags);
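
/* Illustrative sketch (editorial example): the frag-based GRO path for
 * drivers that receive into pages rather than linear buffers. page,
 * offset and len come from the hypothetical RX descriptor:
 *
 *	skb = napi_get_frags(napi);
 *	if (!skb)
 *		return;
 *	skb_add_rx_frag(skb, 0, page, offset, len, PAGE_SIZE);
 *	napi_gro_frags(napi);
 *
 * napi_gro_frags() consumes napi->skb on every path, so the driver must
 * not touch the skb afterwards; the ethernet header is pulled out of
 * frag0 by napi_frags_skb() above.
 */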
5854
5855/* Compute the checksum from gro_offset and return the folded value
5856 * after adding in any pseudo checksum.
5857 */
5858__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
5859{
5860 __wsum wsum;
5861 __sum16 sum;
5862
5863 wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
5864
5865 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
5866 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
5867 /* See comments in __skb_checksum_complete(). */
5868 if (likely(!sum)) {
5869 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
5870 !skb->csum_complete_sw)
5871 netdev_rx_csum_fault(skb->dev, skb);
5872 }
5873
5874 NAPI_GRO_CB(skb)->csum = wsum;
5875 NAPI_GRO_CB(skb)->csum_valid = 1;
5876
5877 return sum;
5878}
5879EXPORT_SYMBOL(__skb_gro_checksum_complete);
5880
5881static void net_rps_send_ipi(struct softnet_data *remsd)
5882{
5883#ifdef CONFIG_RPS
5884 while (remsd) {
5885 struct softnet_data *next = remsd->rps_ipi_next;
5886
5887 if (cpu_online(remsd->cpu))
5888 smp_call_function_single_async(remsd->cpu, &remsd->csd);
5889 remsd = next;
5890 }
5891#endif
5892}
5893
5894/*
5895 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
5896 * Note: called with local irq disabled, but exits with local irq enabled.
5897 */
5898static void net_rps_action_and_irq_enable(struct softnet_data *sd)
5899{
5900#ifdef CONFIG_RPS
5901 struct softnet_data *remsd = sd->rps_ipi_list;
5902
5903 if (remsd) {
5904 sd->rps_ipi_list = NULL;
5905
5906 local_irq_enable();
5907
5908 /* Send pending IPIs to kick RPS processing on remote cpus. */
5909 net_rps_send_ipi(remsd);
5910 } else
5911#endif
5912 local_irq_enable();
5913}
5914
5915static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
5916{
5917#ifdef CONFIG_RPS
5918 return sd->rps_ipi_list != NULL;
5919#else
5920 return false;
5921#endif
5922}
5923
5924static int process_backlog(struct napi_struct *napi, int quota)
5925{
5926 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
5927 bool again = true;
5928 int work = 0;
5929
5930 /* Check if we have pending IPIs; it's better to send them now
5931 * rather than waiting for net_rx_action() to end.
5932 */
5933 if (sd_has_rps_ipi_waiting(sd)) {
5934 local_irq_disable();
5935 net_rps_action_and_irq_enable(sd);
5936 }
5937
5938 napi->weight = dev_rx_weight;
5939 while (again) {
5940 struct sk_buff *skb;
5941
5942 while ((skb = __skb_dequeue(&sd->process_queue))) {
5943 rcu_read_lock();
5944 __netif_receive_skb(skb);
5945 rcu_read_unlock();
5946 input_queue_head_incr(sd);
5947 if (++work >= quota)
5948 return work;
5949
5950 }
5951
5952 local_irq_disable();
5953 rps_lock(sd);
5954 if (skb_queue_empty(&sd->input_pkt_queue)) {
5955 /*
5956 * Inline a custom version of __napi_complete().
5957 * Only the current CPU owns and manipulates this NAPI,
5958 * and NAPI_STATE_SCHED is the only possible flag set
5959 * on the backlog.
5960 * We can use a plain write instead of clear_bit(),
5961 * and we don't need an smp_mb() memory barrier.
5962 */
5963 napi->state = 0;
5964 again = false;
5965 } else {
5966 skb_queue_splice_tail_init(&sd->input_pkt_queue,
5967 &sd->process_queue);
5968 }
5969 rps_unlock(sd);
5970 local_irq_enable();
5971 }
5972
5973 return work;
5974}
5975
5976/**
5977 * __napi_schedule - schedule for receive
5978 * @n: entry to schedule
5979 *
5980 * The entry's receive function will be scheduled to run.
5981 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
5982 */
5983void __napi_schedule(struct napi_struct *n)
5984{
5985 unsigned long flags;
5986
5987 local_irq_save(flags);
5988 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
5989 local_irq_restore(flags);
5990}
5991EXPORT_SYMBOL(__napi_schedule);
5992
5993/**
5994 * napi_schedule_prep - check if napi can be scheduled
5995 * @n: napi context
5996 *
5997 * Test if NAPI routine is already running, and if not mark
5998 * it as running. This is used as a condition variable to
5999 * ensure only one NAPI poll instance runs. We also make
6000 * sure there is no pending NAPI disable.
6001 */
6002bool napi_schedule_prep(struct napi_struct *n)
6003{
6004 unsigned long val, new;
6005
6006 do {
6007 val = READ_ONCE(n->state);
6008 if (unlikely(val & NAPIF_STATE_DISABLE))
6009 return false;
6010 new = val | NAPIF_STATE_SCHED;
6011
6012 /* Set the STATE_MISSED bit if STATE_SCHED was already set.
6013 * This was suggested by Alexander Duyck, as the compiler
6014 * emits better code than:
6015 * if (val & NAPIF_STATE_SCHED)
6016 * new |= NAPIF_STATE_MISSED;
6017 */
6018 new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
6019 NAPIF_STATE_MISSED;
6020 } while (cmpxchg(&n->state, val, new) != val);
6021
6022 return !(val & NAPIF_STATE_SCHED);
6023}
6024EXPORT_SYMBOL(napi_schedule_prep);
6025
6026/**
6027 * __napi_schedule_irqoff - schedule for receive
6028 * @n: entry to schedule
6029 *
6030 * Variant of __napi_schedule() assuming hard irqs are masked
6031 */
6032void __napi_schedule_irqoff(struct napi_struct *n)
6033{
6034 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
6035}
6036EXPORT_SYMBOL(__napi_schedule_irqoff);
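
/*
 * Usage sketch (illustrative only, not part of dev.c): a typical driver
 * hard-irq handler pairs napi_schedule_prep() with __napi_schedule_irqoff(),
 * masking the device's RX interrupts in between. my_rx_irq() and
 * my_mask_rx_irq() are hypothetical; struct my_adapter is the hypothetical
 * private struct from the napi_gro_frags() sketch above.
 */
#if 0
static irqreturn_t my_rx_irq(int irq, void *data)
{
	struct my_adapter *adap = data;

	if (napi_schedule_prep(&adap->napi)) {
		my_mask_rx_irq(adap);	/* stop further RX interrupts */
		__napi_schedule_irqoff(&adap->napi);
	}
	return IRQ_HANDLED;
}
#endif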
6037
6038bool napi_complete_done(struct napi_struct *n, int work_done)
6039{
6040 unsigned long flags, val, new;
6041
6042 /*
6043 * 1) Don't let NAPI dequeue from the CPU poll list
6044 * just in case it's running on a different CPU.
6045 * 2) If we are busy polling, do nothing here, we have
6046 * the guarantee we will be called later.
6047 */
6048 if (unlikely(n->state & (NAPIF_STATE_NPSVC |
6049 NAPIF_STATE_IN_BUSY_POLL)))
6050 return false;
6051
6052 if (n->gro_bitmask) {
6053 unsigned long timeout = 0;
6054
6055 if (work_done)
6056 timeout = n->dev->gro_flush_timeout;
6057
6058 /* When the NAPI instance uses a timeout and keeps postponing
6059 * it, we need to somehow bound the time packets are kept in
6060 * the GRO layer.
6061 */
6062 napi_gro_flush(n, !!timeout);
6063 if (timeout)
6064 hrtimer_start(&n->timer, ns_to_ktime(timeout),
6065 HRTIMER_MODE_REL_PINNED);
6066 }
6067 if (unlikely(!list_empty(&n->poll_list))) {
6068 /* If n->poll_list is not empty, we need to mask irqs */
6069 local_irq_save(flags);
6070 list_del_init(&n->poll_list);
6071 local_irq_restore(flags);
6072 }
6073
6074 do {
6075 val = READ_ONCE(n->state);
6076
6077 WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
6078
6079 new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
6080
6081 /* If STATE_MISSED was set, leave STATE_SCHED set,
6082 * because we will call napi->poll() one more time.
6083 * This C code was suggested by Alexander Duyck to help gcc.
6084 */
6085 new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
6086 NAPIF_STATE_SCHED;
6087 } while (cmpxchg(&n->state, val, new) != val);
6088
6089 if (unlikely(val & NAPIF_STATE_MISSED)) {
6090 __napi_schedule(n);
6091 return false;
6092 }
6093
6094 return true;
6095}
6096EXPORT_SYMBOL(napi_complete_done);
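
/*
 * Usage sketch (illustrative only, not part of dev.c): a driver poll
 * callback should only re-enable device interrupts when
 * napi_complete_done() returns true; a false return means the NAPI
 * instance was rescheduled (STATE_MISSED) or is being busy-polled.
 * my_clean_rx_ring() and my_unmask_rx_irq() are hypothetical.
 */
#if 0
static int my_poll(struct napi_struct *napi, int budget)
{
	struct my_adapter *adap = container_of(napi, struct my_adapter, napi);
	int work = my_clean_rx_ring(adap, budget);	/* packets processed */

	if (work < budget && napi_complete_done(napi, work))
		my_unmask_rx_irq(adap);	/* safe: we are really done */

	return work;
}
#endif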
6097
6098/* must be called under rcu_read_lock(), as we don't take a reference */
6099static struct napi_struct *napi_by_id(unsigned int napi_id)
6100{
6101 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
6102 struct napi_struct *napi;
6103
6104 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
6105 if (napi->napi_id == napi_id)
6106 return napi;
6107
6108 return NULL;
6109}
6110
6111#if defined(CONFIG_NET_RX_BUSY_POLL)
6112
6113#define BUSY_POLL_BUDGET 8
6114
6115static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
6116{
6117 int rc;
6118
6119 /* Busy polling means there is a high chance the device driver's hard irq
6120 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
6121 * set in napi_schedule_prep().
6122 * Since we are about to call napi->poll() once more, we can safely
6123 * clear NAPI_STATE_MISSED.
6124 *
6125 * Note: x86 could use a single "lock and ..." instruction
6126 * to perform these two clear_bit() operations.
6127 */
6128 clear_bit(NAPI_STATE_MISSED, &napi->state);
6129 clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
6130
6131 local_bh_disable();
6132
6133 /* All we really want here is to re-enable device interrupts.
6134 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
6135 */
6136 rc = napi->poll(napi, BUSY_POLL_BUDGET);
6137 trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
6138 netpoll_poll_unlock(have_poll_lock);
6139 if (rc == BUSY_POLL_BUDGET)
6140 __napi_schedule(napi);
6141 local_bh_enable();
6142}
6143
6144void napi_busy_loop(unsigned int napi_id,
6145 bool (*loop_end)(void *, unsigned long),
6146 void *loop_end_arg)
6147{
6148 unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
6149 int (*napi_poll)(struct napi_struct *napi, int budget);
6150 void *have_poll_lock = NULL;
6151 struct napi_struct *napi;
6152
6153restart:
6154 napi_poll = NULL;
6155
6156 rcu_read_lock();
6157
6158 napi = napi_by_id(napi_id);
6159 if (!napi)
6160 goto out;
6161
6162 preempt_disable();
6163 for (;;) {
6164 int work = 0;
6165
6166 local_bh_disable();
6167 if (!napi_poll) {
6168 unsigned long val = READ_ONCE(napi->state);
6169
6170 /* If multiple threads are competing for this napi,
6171 * we avoid dirtying napi->state as much as we can.
6172 */
6173 if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
6174 NAPIF_STATE_IN_BUSY_POLL))
6175 goto count;
6176 if (cmpxchg(&napi->state, val,
6177 val | NAPIF_STATE_IN_BUSY_POLL |
6178 NAPIF_STATE_SCHED) != val)
6179 goto count;
6180 have_poll_lock = netpoll_poll_lock(napi);
6181 napi_poll = napi->poll;
6182 }
6183 work = napi_poll(napi, BUSY_POLL_BUDGET);
6184 trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
6185count:
6186 if (work > 0)
6187 __NET_ADD_STATS(dev_net(napi->dev),
6188 LINUX_MIB_BUSYPOLLRXPACKETS, work);
6189 local_bh_enable();
6190
6191 if (!loop_end || loop_end(loop_end_arg, start_time))
6192 break;
6193
6194 if (unlikely(need_resched())) {
6195 if (napi_poll)
6196 busy_poll_stop(napi, have_poll_lock);
6197 preempt_enable();
6198 rcu_read_unlock();
6199 cond_resched();
6200 if (loop_end(loop_end_arg, start_time))
6201 return;
6202 goto restart;
6203 }
6204 cpu_relax();
6205 }
6206 if (napi_poll)
6207 busy_poll_stop(napi, have_poll_lock);
6208 preempt_enable();
6209out:
6210 rcu_read_unlock();
6211}
6212EXPORT_SYMBOL(napi_busy_loop);
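
/*
 * Usage sketch (illustrative only, not part of dev.c): a loop_end
 * callback bounds how long napi_busy_loop() spins. Note that
 * busy_loop_current_time() ticks in roughly 1 usec units
 * (local_clock() >> 10), so the 50 below is an assumed ~50 usec budget;
 * my_loop_end() and my_busy_poll() are hypothetical.
 */
#if 0
static bool my_loop_end(void *arg, unsigned long start_time)
{
	return time_after(busy_loop_current_time(), start_time + 50);
}

static void my_busy_poll(unsigned int napi_id)
{
	napi_busy_loop(napi_id, my_loop_end, NULL);
}
#endif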
6213
6214#endif /* CONFIG_NET_RX_BUSY_POLL */
6215
6216static void napi_hash_add(struct napi_struct *napi)
6217{
6218 if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
6219 test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
6220 return;
6221
6222 spin_lock(&napi_hash_lock);
6223
6224 /* 0..NR_CPUS range is reserved for sender_cpu use */
6225 do {
6226 if (unlikely(++napi_gen_id < MIN_NAPI_ID))
6227 napi_gen_id = MIN_NAPI_ID;
6228 } while (napi_by_id(napi_gen_id));
6229 napi->napi_id = napi_gen_id;
6230
6231 hlist_add_head_rcu(&napi->napi_hash_node,
6232 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
6233
6234 spin_unlock(&napi_hash_lock);
6235}
6236
6237/* Warning: the caller is responsible for making sure an RCU grace period
6238 * has elapsed before freeing the memory containing @napi
6239 */
6240bool napi_hash_del(struct napi_struct *napi)
6241{
6242 bool rcu_sync_needed = false;
6243
6244 spin_lock(&napi_hash_lock);
6245
6246 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
6247 rcu_sync_needed = true;
6248 hlist_del_rcu(&napi->napi_hash_node);
6249 }
6250 spin_unlock(&napi_hash_lock);
6251 return rcu_sync_needed;
6252}
6253EXPORT_SYMBOL_GPL(napi_hash_del);
6254
6255static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
6256{
6257 struct napi_struct *napi;
6258
6259 napi = container_of(timer, struct napi_struct, timer);
6260
6261 /* Note: we use a relaxed variant of napi_schedule_prep() that does not
6262 * set NAPI_STATE_MISSED, since we do not react to a device IRQ.
6263 */
6264 if (napi->gro_bitmask && !napi_disable_pending(napi) &&
6265 !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
6266 __napi_schedule_irqoff(napi);
6267
6268 return HRTIMER_NORESTART;
6269}
6270
6271static void init_gro_hash(struct napi_struct *napi)
6272{
6273 int i;
6274
6275 for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6276 INIT_LIST_HEAD(&napi->gro_hash[i].list);
6277 napi->gro_hash[i].count = 0;
6278 }
6279 napi->gro_bitmask = 0;
6280}
6281
6282void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
6283 int (*poll)(struct napi_struct *, int), int weight)
6284{
6285 INIT_LIST_HEAD(&napi->poll_list);
6286 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
6287 napi->timer.function = napi_watchdog;
6288 init_gro_hash(napi);
6289 napi->skb = NULL;
6290 napi->poll = poll;
6291 if (weight > NAPI_POLL_WEIGHT)
6292 netdev_err_once(dev, "%s() called with weight %d\n", __func__,
6293 weight);
6294 napi->weight = weight;
6295 list_add(&napi->dev_list, &dev->napi_list);
6296 napi->dev = dev;
6297#ifdef CONFIG_NETPOLL
6298 napi->poll_owner = -1;
6299#endif
6300 set_bit(NAPI_STATE_SCHED, &napi->state);
6301 napi_hash_add(napi);
6302}
6303EXPORT_SYMBOL(netif_napi_add);
6304
6305void napi_disable(struct napi_struct *n)
6306{
6307 might_sleep();
6308 set_bit(NAPI_STATE_DISABLE, &n->state);
6309
6310 while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
6311 msleep(1);
6312 while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
6313 msleep(1);
6314
6315 hrtimer_cancel(&n->timer);
6316
6317 clear_bit(NAPI_STATE_DISABLE, &n->state);
6318}
6319EXPORT_SYMBOL(napi_disable);
6320
6321static void flush_gro_hash(struct napi_struct *napi)
6322{
6323 int i;
6324
6325 for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6326 struct sk_buff *skb, *n;
6327
6328 list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
6329 kfree_skb(skb);
6330 napi->gro_hash[i].count = 0;
6331 }
6332}
6333
6334/* Must be called in process context */
6335void netif_napi_del(struct napi_struct *napi)
6336{
6337 might_sleep();
6338 if (napi_hash_del(napi))
6339 synchronize_net();
6340 list_del_init(&napi->dev_list);
6341 napi_free_frags(napi);
6342
6343 flush_gro_hash(napi);
6344 napi->gro_bitmask = 0;
6345}
6346EXPORT_SYMBOL(netif_napi_del);
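
/*
 * Lifecycle sketch (illustrative only, not part of dev.c): the usual
 * ordering is netif_napi_add() + napi_enable() at setup, then
 * napi_disable() + netif_napi_del() at teardown, the latter in process
 * context since both may sleep. my_setup_napi()/my_teardown_napi() are
 * hypothetical; my_poll() is the hypothetical poll callback sketched
 * after napi_complete_done() above.
 */
#if 0
static void my_setup_napi(struct my_adapter *adap)
{
	netif_napi_add(adap->netdev, &adap->napi, my_poll,
		       NAPI_POLL_WEIGHT);	/* weights > 64 warn above */
	napi_enable(&adap->napi);
}

static void my_teardown_napi(struct my_adapter *adap)
{
	napi_disable(&adap->napi);	/* waits for in-flight polls */
	netif_napi_del(&adap->napi);	/* may sleep (synchronize_net) */
}
#endif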
6347
6348static int napi_poll(struct napi_struct *n, struct list_head *repoll)
6349{
6350 void *have;
6351 int work, weight;
6352
6353 list_del_init(&n->poll_list);
6354
6355 have = netpoll_poll_lock(n);
6356
6357 weight = n->weight;
6358
6359 /* This NAPI_STATE_SCHED test is for avoiding a race
6360 * with netpoll's poll_napi(). Only the entity which
6361 * obtains the lock and sees NAPI_STATE_SCHED set will
6362 * actually make the ->poll() call. Therefore we avoid
6363 * accidentally calling ->poll() when NAPI is not scheduled.
6364 */
6365 work = 0;
6366 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
6367 work = n->poll(n, weight);
6368 trace_napi_poll(n, work, weight);
6369 }
6370
6371 WARN_ON_ONCE(work > weight);
6372
6373 if (likely(work < weight))
6374 goto out_unlock;
6375
6376 /* Drivers must not modify the NAPI state if they
6377 * consume the entire weight. In such cases this code
6378 * still "owns" the NAPI instance and therefore can
6379 * move the instance around on the list at-will.
6380 */
6381 if (unlikely(napi_disable_pending(n))) {
6382 napi_complete(n);
6383 goto out_unlock;
6384 }
6385
6386 if (n->gro_bitmask) {
6387 /* Flush packets that are too old.
6388 * If HZ < 1000, flush all packets.
6389 */
6390 napi_gro_flush(n, HZ >= 1000);
6391 }
6392
6393 /* Some drivers may have called napi_schedule
6394 * prior to exhausting their budget.
6395 */
6396 if (unlikely(!list_empty(&n->poll_list))) {
6397 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
6398 n->dev ? n->dev->name : "backlog");
6399 goto out_unlock;
6400 }
6401
6402 list_add_tail(&n->poll_list, repoll);
6403
6404out_unlock:
6405 netpoll_poll_unlock(have);
6406
6407 return work;
6408}
6409
6410static __latent_entropy void net_rx_action(struct softirq_action *h)
6411{
6412 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
6413 unsigned long time_limit = jiffies +
6414 usecs_to_jiffies(netdev_budget_usecs);
6415 int budget = netdev_budget;
6416 LIST_HEAD(list);
6417 LIST_HEAD(repoll);
6418
6419 local_irq_disable();
6420 list_splice_init(&sd->poll_list, &list);
6421 local_irq_enable();
6422
6423 for (;;) {
6424 struct napi_struct *n;
6425
6426 if (list_empty(&list)) {
6427 if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
6428 goto out;
6429 break;
6430 }
6431
6432 n = list_first_entry(&list, struct napi_struct, poll_list);
6433 budget -= napi_poll(n, &repoll);
6434
6435 /* If softirq window is exhausted then punt.
6436 * Allow this to run for 2 jiffies, which allows
6437 * an average latency of 1.5/HZ.
6438 */
6439 if (unlikely(budget <= 0 ||
6440 time_after_eq(jiffies, time_limit))) {
6441 sd->time_squeeze++;
6442 break;
6443 }
6444 }
6445
6446 local_irq_disable();
6447
6448 list_splice_tail_init(&sd->poll_list, &list);
6449 list_splice_tail(&repoll, &list);
6450 list_splice(&list, &sd->poll_list);
6451 if (!list_empty(&sd->poll_list))
6452 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
6453
6454 net_rps_action_and_irq_enable(sd);
6455out:
6456 __kfree_skb_flush();
6457}
6458
6459struct netdev_adjacent {
6460 struct net_device *dev;
6461
6462 /* upper master flag, there can only be one master device per list */
6463 bool master;
6464
6465 /* counter for the number of times this device was added to us */
6466 u16 ref_nr;
6467
6468 /* private field for the users */
6469 void *private;
6470
6471 struct list_head list;
6472 struct rcu_head rcu;
6473};
6474
6475static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
6476 struct list_head *adj_list)
6477{
6478 struct netdev_adjacent *adj;
6479
6480 list_for_each_entry(adj, adj_list, list) {
6481 if (adj->dev == adj_dev)
6482 return adj;
6483 }
6484 return NULL;
6485}
6486
6487static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
6488{
6489 struct net_device *dev = data;
6490
6491 return upper_dev == dev;
6492}
6493
6494/**
6495 * netdev_has_upper_dev - Check if device is linked to an upper device
6496 * @dev: device
6497 * @upper_dev: upper device to check
6498 *
6499 * Find out if a device is linked to the specified upper device and return true
6500 * in case it is. Note that this checks only immediate upper device,
6501 * not through a complete stack of devices. The caller must hold the RTNL lock.
6502 */
6503bool netdev_has_upper_dev(struct net_device *dev,
6504 struct net_device *upper_dev)
6505{
6506 ASSERT_RTNL();
6507
6508 return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
6509 upper_dev);
6510}
6511EXPORT_SYMBOL(netdev_has_upper_dev);
6512
6513/**
6514 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
6515 * @dev: device
6516 * @upper_dev: upper device to check
6517 *
6518 * Find out if a device is linked to the specified upper device and return true
6519 * in case it is. Note that this checks the entire upper device chain.
6520 * The caller must hold the RCU read lock.
6521 */
6522
6523bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
6524 struct net_device *upper_dev)
6525{
6526 return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
6527 upper_dev);
6528}
6529EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
6530
6531/**
6532 * netdev_has_any_upper_dev - Check if device is linked to some device
6533 * @dev: device
6534 *
6535 * Find out if a device is linked to an upper device and return true in case
6536 * it is. The caller must hold the RTNL lock.
6537 */
6538bool netdev_has_any_upper_dev(struct net_device *dev)
6539{
6540 ASSERT_RTNL();
6541
6542 return !list_empty(&dev->adj_list.upper);
6543}
6544EXPORT_SYMBOL(netdev_has_any_upper_dev);
6545
6546/**
6547 * netdev_master_upper_dev_get - Get master upper device
6548 * @dev: device
6549 *
6550 * Find a master upper device and return pointer to it or NULL in case
6551 * it's not there. The caller must hold the RTNL lock.
6552 */
6553struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
6554{
6555 struct netdev_adjacent *upper;
6556
6557 ASSERT_RTNL();
6558
6559 if (list_empty(&dev->adj_list.upper))
6560 return NULL;
6561
6562 upper = list_first_entry(&dev->adj_list.upper,
6563 struct netdev_adjacent, list);
6564 if (likely(upper->master))
6565 return upper->dev;
6566 return NULL;
6567}
6568EXPORT_SYMBOL(netdev_master_upper_dev_get);
6569
6570/**
6571 * netdev_has_any_lower_dev - Check if device is linked to some device
6572 * @dev: device
6573 *
6574 * Find out if a device is linked to a lower device and return true in case
6575 * it is. The caller must hold the RTNL lock.
6576 */
6577static bool netdev_has_any_lower_dev(struct net_device *dev)
6578{
6579 ASSERT_RTNL();
6580
6581 return !list_empty(&dev->adj_list.lower);
6582}
6583
6584void *netdev_adjacent_get_private(struct list_head *adj_list)
6585{
6586 struct netdev_adjacent *adj;
6587
6588 adj = list_entry(adj_list, struct netdev_adjacent, list);
6589
6590 return adj->private;
6591}
6592EXPORT_SYMBOL(netdev_adjacent_get_private);
6593
6594/**
6595 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
6596 * @dev: device
6597 * @iter: list_head ** of the current position
6598 *
6599 * Gets the next device from the dev's upper list, starting from iter
6600 * position. The caller must hold RCU read lock.
6601 */
6602struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
6603 struct list_head **iter)
6604{
6605 struct netdev_adjacent *upper;
6606
6607 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
6608
6609 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6610
6611 if (&upper->list == &dev->adj_list.upper)
6612 return NULL;
6613
6614 *iter = &upper->list;
6615
6616 return upper->dev;
6617}
6618EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
6619
6620static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
6621 struct list_head **iter)
6622{
6623 struct netdev_adjacent *upper;
6624
6625 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
6626
6627 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6628
6629 if (&upper->list == &dev->adj_list.upper)
6630 return NULL;
6631
6632 *iter = &upper->list;
6633
6634 return upper->dev;
6635}
6636
6637int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
6638 int (*fn)(struct net_device *dev,
6639 void *data),
6640 void *data)
6641{
6642 struct net_device *udev;
6643 struct list_head *iter;
6644 int ret;
6645
6646 for (iter = &dev->adj_list.upper,
6647 udev = netdev_next_upper_dev_rcu(dev, &iter);
6648 udev;
6649 udev = netdev_next_upper_dev_rcu(dev, &iter)) {
6650 /* first is the upper device itself */
6651 ret = fn(udev, data);
6652 if (ret)
6653 return ret;
6654
6655 /* then look at all of its upper devices */
6656 ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
6657 if (ret)
6658 return ret;
6659 }
6660
6661 return 0;
6662}
6663EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
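
/*
 * Usage sketch (illustrative only, not part of dev.c): the fn callback
 * is invoked for every device in the upper chain; returning non-zero
 * stops the walk and propagates that value. my_count_upper() and
 * my_upper_count() are hypothetical.
 */
#if 0
static int my_count_upper(struct net_device *upper, void *data)
{
	(*(int *)data)++;
	return 0;			/* keep walking */
}

static int my_upper_count(struct net_device *dev)
{
	int count = 0;

	rcu_read_lock();
	netdev_walk_all_upper_dev_rcu(dev, my_count_upper, &count);
	rcu_read_unlock();

	return count;
}
#endif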
6664
6665/**
6666 * netdev_lower_get_next_private - Get the next ->private from the
6667 * lower neighbour list
6668 * @dev: device
6669 * @iter: list_head ** of the current position
6670 *
6671 * Gets the next netdev_adjacent->private from the dev's lower neighbour
6672 * list, starting from iter position. The caller must either hold the
6673 * RTNL lock or its own locking that guarantees that the neighbour lower
6674 * list will remain unchanged.
6675 */
6676void *netdev_lower_get_next_private(struct net_device *dev,
6677 struct list_head **iter)
6678{
6679 struct netdev_adjacent *lower;
6680
6681 lower = list_entry(*iter, struct netdev_adjacent, list);
6682
6683 if (&lower->list == &dev->adj_list.lower)
6684 return NULL;
6685
6686 *iter = lower->list.next;
6687
6688 return lower->private;
6689}
6690EXPORT_SYMBOL(netdev_lower_get_next_private);
6691
6692/**
6693 * netdev_lower_get_next_private_rcu - Get the next ->private from the
6694 * lower neighbour list, RCU
6695 * variant
6696 * @dev: device
6697 * @iter: list_head ** of the current position
6698 *
6699 * Gets the next netdev_adjacent->private from the dev's lower neighbour
6700 * list, starting from iter position. The caller must hold RCU read lock.
6701 */
6702void *netdev_lower_get_next_private_rcu(struct net_device *dev,
6703 struct list_head **iter)
6704{
6705 struct netdev_adjacent *lower;
6706
6707 WARN_ON_ONCE(!rcu_read_lock_held());
6708
6709 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6710
6711 if (&lower->list == &dev->adj_list.lower)
6712 return NULL;
6713
6714 *iter = &lower->list;
6715
6716 return lower->private;
6717}
6718EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
6719
6720/**
6721 * netdev_lower_get_next - Get the next device from the lower neighbour
6722 * list
6723 * @dev: device
6724 * @iter: list_head ** of the current position
6725 *
6726 * Gets the next netdev_adjacent from the dev's lower neighbour
6727 * list, starting from iter position. The caller must hold the RTNL lock or
6728 * its own locking that guarantees that the neighbour lower
6729 * list will remain unchanged.
6730 */
6731void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
6732{
6733 struct netdev_adjacent *lower;
6734
6735 lower = list_entry(*iter, struct netdev_adjacent, list);
6736
6737 if (&lower->list == &dev->adj_list.lower)
6738 return NULL;
6739
6740 *iter = lower->list.next;
6741
6742 return lower->dev;
6743}
6744EXPORT_SYMBOL(netdev_lower_get_next);
6745
6746static struct net_device *netdev_next_lower_dev(struct net_device *dev,
6747 struct list_head **iter)
6748{
6749 struct netdev_adjacent *lower;
6750
6751 lower = list_entry((*iter)->next, struct netdev_adjacent, list);
6752
6753 if (&lower->list == &dev->adj_list.lower)
6754 return NULL;
6755
6756 *iter = &lower->list;
6757
6758 return lower->dev;
6759}
6760
6761int netdev_walk_all_lower_dev(struct net_device *dev,
6762 int (*fn)(struct net_device *dev,
6763 void *data),
6764 void *data)
6765{
6766 struct net_device *ldev;
6767 struct list_head *iter;
6768 int ret;
6769
6770 for (iter = &dev->adj_list.lower,
6771 ldev = netdev_next_lower_dev(dev, &iter);
6772 ldev;
6773 ldev = netdev_next_lower_dev(dev, &iter)) {
6774 /* first is the lower device itself */
6775 ret = fn(ldev, data);
6776 if (ret)
6777 return ret;
6778
6779 /* then look at all of its lower devices */
6780 ret = netdev_walk_all_lower_dev(ldev, fn, data);
6781 if (ret)
6782 return ret;
6783 }
6784
6785 return 0;
6786}
6787EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
6788
6789static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
6790 struct list_head **iter)
6791{
6792 struct netdev_adjacent *lower;
6793
6794 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6795 if (&lower->list == &dev->adj_list.lower)
6796 return NULL;
6797
6798 *iter = &lower->list;
6799
6800 return lower->dev;
6801}
6802
6803int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
6804 int (*fn)(struct net_device *dev,
6805 void *data),
6806 void *data)
6807{
6808 struct net_device *ldev;
6809 struct list_head *iter;
6810 int ret;
6811
6812 for (iter = &dev->adj_list.lower,
6813 ldev = netdev_next_lower_dev_rcu(dev, &iter);
6814 ldev;
6815 ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
6816 /* first is the lower device itself */
6817 ret = fn(ldev, data);
6818 if (ret)
6819 return ret;
6820
6821 /* then look at all of its lower devices */
6822 ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
6823 if (ret)
6824 return ret;
6825 }
6826
6827 return 0;
6828}
6829EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
6830
6831/**
6832 * netdev_lower_get_first_private_rcu - Get the first ->private from the
6833 * lower neighbour list, RCU
6834 * variant
6835 * @dev: device
6836 *
6837 * Gets the first netdev_adjacent->private from the dev's lower neighbour
6838 * list. The caller must hold RCU read lock.
6839 */
6840void *netdev_lower_get_first_private_rcu(struct net_device *dev)
6841{
6842 struct netdev_adjacent *lower;
6843
6844 lower = list_first_or_null_rcu(&dev->adj_list.lower,
6845 struct netdev_adjacent, list);
6846 if (lower)
6847 return lower->private;
6848 return NULL;
6849}
6850EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
6851
6852/**
6853 * netdev_master_upper_dev_get_rcu - Get master upper device
6854 * @dev: device
6855 *
6856 * Find a master upper device and return pointer to it or NULL in case
6857 * it's not there. The caller must hold the RCU read lock.
6858 */
6859struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
6860{
6861 struct netdev_adjacent *upper;
6862
6863 upper = list_first_or_null_rcu(&dev->adj_list.upper,
6864 struct netdev_adjacent, list);
6865 if (upper && likely(upper->master))
6866 return upper->dev;
6867 return NULL;
6868}
6869EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
6870
6871static int netdev_adjacent_sysfs_add(struct net_device *dev,
6872 struct net_device *adj_dev,
6873 struct list_head *dev_list)
6874{
6875 char linkname[IFNAMSIZ+7];
6876
6877 sprintf(linkname, dev_list == &dev->adj_list.upper ?
6878 "upper_%s" : "lower_%s", adj_dev->name);
6879 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
6880 linkname);
6881}
6882static void netdev_adjacent_sysfs_del(struct net_device *dev,
6883 char *name,
6884 struct list_head *dev_list)
6885{
6886 char linkname[IFNAMSIZ+7];
6887
6888 sprintf(linkname, dev_list == &dev->adj_list.upper ?
6889 "upper_%s" : "lower_%s", name);
6890 sysfs_remove_link(&(dev->dev.kobj), linkname);
6891}
6892
6893static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
6894 struct net_device *adj_dev,
6895 struct list_head *dev_list)
6896{
6897 return (dev_list == &dev->adj_list.upper ||
6898 dev_list == &dev->adj_list.lower) &&
6899 net_eq(dev_net(dev), dev_net(adj_dev));
6900}
6901
6902static int __netdev_adjacent_dev_insert(struct net_device *dev,
6903 struct net_device *adj_dev,
6904 struct list_head *dev_list,
6905 void *private, bool master)
6906{
6907 struct netdev_adjacent *adj;
6908 int ret;
6909
6910 adj = __netdev_find_adj(adj_dev, dev_list);
6911
6912 if (adj) {
6913 adj->ref_nr += 1;
6914 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
6915 dev->name, adj_dev->name, adj->ref_nr);
6916
6917 return 0;
6918 }
6919
6920 adj = kmalloc(sizeof(*adj), GFP_KERNEL);
6921 if (!adj)
6922 return -ENOMEM;
6923
6924 adj->dev = adj_dev;
6925 adj->master = master;
6926 adj->ref_nr = 1;
6927 adj->private = private;
6928 dev_hold(adj_dev);
6929
6930 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
6931 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
6932
6933 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
6934 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
6935 if (ret)
6936 goto free_adj;
6937 }
6938
6939 /* Ensure that the master link is always the first item in the list. */
6940 if (master) {
6941 ret = sysfs_create_link(&(dev->dev.kobj),
6942 &(adj_dev->dev.kobj), "master");
6943 if (ret)
6944 goto remove_symlinks;
6945
6946 list_add_rcu(&adj->list, dev_list);
6947 } else {
6948 list_add_tail_rcu(&adj->list, dev_list);
6949 }
6950
6951 return 0;
6952
6953remove_symlinks:
6954 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
6955 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
6956free_adj:
6957 kfree(adj);
6958 dev_put(adj_dev);
6959
6960 return ret;
6961}
6962
6963static void __netdev_adjacent_dev_remove(struct net_device *dev,
6964 struct net_device *adj_dev,
6965 u16 ref_nr,
6966 struct list_head *dev_list)
6967{
6968 struct netdev_adjacent *adj;
6969
6970 pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
6971 dev->name, adj_dev->name, ref_nr);
6972
6973 adj = __netdev_find_adj(adj_dev, dev_list);
6974
6975 if (!adj) {
6976 pr_err("Adjacency does not exist for device %s from %s\n",
6977 dev->name, adj_dev->name);
6978 WARN_ON(1);
6979 return;
6980 }
6981
6982 if (adj->ref_nr > ref_nr) {
6983 pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
6984 dev->name, adj_dev->name, ref_nr,
6985 adj->ref_nr - ref_nr);
6986 adj->ref_nr -= ref_nr;
6987 return;
6988 }
6989
6990 if (adj->master)
6991 sysfs_remove_link(&(dev->dev.kobj), "master");
6992
6993 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
6994 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
6995
6996 list_del_rcu(&adj->list);
6997 pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
6998 adj_dev->name, dev->name, adj_dev->name);
6999 dev_put(adj_dev);
7000 kfree_rcu(adj, rcu);
7001}
7002
7003static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
7004 struct net_device *upper_dev,
7005 struct list_head *up_list,
7006 struct list_head *down_list,
7007 void *private, bool master)
7008{
7009 int ret;
7010
7011 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
7012 private, master);
7013 if (ret)
7014 return ret;
7015
7016 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
7017 private, false);
7018 if (ret) {
7019 __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
7020 return ret;
7021 }
7022
7023 return 0;
7024}
7025
7026static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
7027 struct net_device *upper_dev,
7028 u16 ref_nr,
7029 struct list_head *up_list,
7030 struct list_head *down_list)
7031{
7032 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
7033 __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
7034}
7035
7036static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
7037 struct net_device *upper_dev,
7038 void *private, bool master)
7039{
7040 return __netdev_adjacent_dev_link_lists(dev, upper_dev,
7041 &dev->adj_list.upper,
7042 &upper_dev->adj_list.lower,
7043 private, master);
7044}
7045
7046static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
7047 struct net_device *upper_dev)
7048{
7049 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
7050 &dev->adj_list.upper,
7051 &upper_dev->adj_list.lower);
7052}
7053
7054static int __netdev_upper_dev_link(struct net_device *dev,
7055 struct net_device *upper_dev, bool master,
7056 void *upper_priv, void *upper_info,
7057 struct netlink_ext_ack *extack)
7058{
7059 struct netdev_notifier_changeupper_info changeupper_info = {
7060 .info = {
7061 .dev = dev,
7062 .extack = extack,
7063 },
7064 .upper_dev = upper_dev,
7065 .master = master,
7066 .linking = true,
7067 .upper_info = upper_info,
7068 };
7069 struct net_device *master_dev;
7070 int ret = 0;
7071
7072 ASSERT_RTNL();
7073
7074 if (dev == upper_dev)
7075 return -EBUSY;
7076
7077 /* To prevent loops, check that dev is not an upper device of upper_dev. */
7078 if (netdev_has_upper_dev(upper_dev, dev))
7079 return -EBUSY;
7080
7081 if (!master) {
7082 if (netdev_has_upper_dev(dev, upper_dev))
7083 return -EEXIST;
7084 } else {
7085 master_dev = netdev_master_upper_dev_get(dev);
7086 if (master_dev)
7087 return master_dev == upper_dev ? -EEXIST : -EBUSY;
7088 }
7089
7090 ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7091 &changeupper_info.info);
7092 ret = notifier_to_errno(ret);
7093 if (ret)
7094 return ret;
7095
7096 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
7097 master);
7098 if (ret)
7099 return ret;
7100
7101 ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7102 &changeupper_info.info);
7103 ret = notifier_to_errno(ret);
7104 if (ret)
7105 goto rollback;
7106
7107 return 0;
7108
7109rollback:
7110 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7111
7112 return ret;
7113}
7114
7115/**
7116 * netdev_upper_dev_link - Add a link to the upper device
7117 * @dev: device
7118 * @upper_dev: new upper device
7119 * @extack: netlink extended ack
7120 *
7121 * Adds a link to a device which is upper to this one. The caller must hold
7122 * the RTNL lock. On a failure a negative errno code is returned.
7123 * On success the reference counts are adjusted and the function
7124 * returns zero.
7125 */
7126int netdev_upper_dev_link(struct net_device *dev,
7127 struct net_device *upper_dev,
7128 struct netlink_ext_ack *extack)
7129{
7130 return __netdev_upper_dev_link(dev, upper_dev, false,
7131 NULL, NULL, extack);
7132}
7133EXPORT_SYMBOL(netdev_upper_dev_link);
7134
7135/**
7136 * netdev_master_upper_dev_link - Add a master link to the upper device
7137 * @dev: device
7138 * @upper_dev: new upper device
7139 * @upper_priv: upper device private
7140 * @upper_info: upper info to be passed down via notifier
7141 * @extack: netlink extended ack
7142 *
7143 * Adds a link to a device which is upper to this one. In this case, only
7144 * one master upper device can be linked, although other non-master devices
7145 * might be linked as well. The caller must hold the RTNL lock.
7146 * On a failure a negative errno code is returned. On success the reference
7147 * counts are adjusted and the function returns zero.
7148 */
7149int netdev_master_upper_dev_link(struct net_device *dev,
7150 struct net_device *upper_dev,
7151 void *upper_priv, void *upper_info,
7152 struct netlink_ext_ack *extack)
7153{
7154 return __netdev_upper_dev_link(dev, upper_dev, true,
7155 upper_priv, upper_info, extack);
7156}
7157EXPORT_SYMBOL(netdev_master_upper_dev_link);
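
/*
 * Usage sketch (illustrative only, not part of dev.c): an enslave path,
 * as in bonding or bridging, links the slave to its master; the call
 * fails with -EBUSY if the slave already has a different master.
 * my_enslave() is hypothetical.
 */
#if 0
static int my_enslave(struct net_device *master, struct net_device *slave,
		      struct netlink_ext_ack *extack)
{
	ASSERT_RTNL();
	return netdev_master_upper_dev_link(slave, master, NULL, NULL, extack);
}
#endif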
7158
7159/**
7160 * netdev_upper_dev_unlink - Removes a link to upper device
7161 * @dev: device
7162 * @upper_dev: upper device to remove
7163 *
7164 * Removes a link to a device which is upper to this one. The caller must hold
7165 * the RTNL lock.
7166 */
7167void netdev_upper_dev_unlink(struct net_device *dev,
7168 struct net_device *upper_dev)
7169{
7170 struct netdev_notifier_changeupper_info changeupper_info = {
7171 .info = {
7172 .dev = dev,
7173 },
7174 .upper_dev = upper_dev,
7175 .linking = false,
7176 };
7177
7178 ASSERT_RTNL();
7179
7180 changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
7181
7182 call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7183 &changeupper_info.info);
7184
7185 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7186
7187 call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7188 &changeupper_info.info);
7189}
7190EXPORT_SYMBOL(netdev_upper_dev_unlink);
7191
7192/**
7193 * netdev_bonding_info_change - Dispatch event about slave change
7194 * @dev: device
7195 * @bonding_info: info to dispatch
7196 *
7197 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
7198 * The caller must hold the RTNL lock.
7199 */
7200void netdev_bonding_info_change(struct net_device *dev,
7201 struct netdev_bonding_info *bonding_info)
7202{
7203 struct netdev_notifier_bonding_info info = {
7204 .info.dev = dev,
7205 };
7206
7207 memcpy(&info.bonding_info, bonding_info,
7208 sizeof(struct netdev_bonding_info));
7209 call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
7210 &info.info);
7211}
7212EXPORT_SYMBOL(netdev_bonding_info_change);
7213
7214static void netdev_adjacent_add_links(struct net_device *dev)
7215{
7216 struct netdev_adjacent *iter;
7217
7218 struct net *net = dev_net(dev);
7219
7220 list_for_each_entry(iter, &dev->adj_list.upper, list) {
7221 if (!net_eq(net, dev_net(iter->dev)))
7222 continue;
7223 netdev_adjacent_sysfs_add(iter->dev, dev,
7224 &iter->dev->adj_list.lower);
7225 netdev_adjacent_sysfs_add(dev, iter->dev,
7226 &dev->adj_list.upper);
7227 }
7228
7229 list_for_each_entry(iter, &dev->adj_list.lower, list) {
7230 if (!net_eq(net, dev_net(iter->dev)))
7231 continue;
7232 netdev_adjacent_sysfs_add(iter->dev, dev,
7233 &iter->dev->adj_list.upper);
7234 netdev_adjacent_sysfs_add(dev, iter->dev,
7235 &dev->adj_list.lower);
7236 }
7237}
7238
7239static void netdev_adjacent_del_links(struct net_device *dev)
7240{
7241 struct netdev_adjacent *iter;
7242
7243 struct net *net = dev_net(dev);
7244
7245 list_for_each_entry(iter, &dev->adj_list.upper, list) {
7246 if (!net_eq(net, dev_net(iter->dev)))
7247 continue;
7248 netdev_adjacent_sysfs_del(iter->dev, dev->name,
7249 &iter->dev->adj_list.lower);
7250 netdev_adjacent_sysfs_del(dev, iter->dev->name,
7251 &dev->adj_list.upper);
7252 }
7253
7254 list_for_each_entry(iter, &dev->adj_list.lower, list) {
7255 if (!net_eq(net, dev_net(iter->dev)))
7256 continue;
7257 netdev_adjacent_sysfs_del(iter->dev, dev->name,
7258 &iter->dev->adj_list.upper);
7259 netdev_adjacent_sysfs_del(dev, iter->dev->name,
7260 &dev->adj_list.lower);
7261 }
7262}
7263
7264void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
7265{
7266 struct netdev_adjacent *iter;
7267
7268 struct net *net = dev_net(dev);
7269
7270 list_for_each_entry(iter, &dev->adj_list.upper, list) {
7271 if (!net_eq(net, dev_net(iter->dev)))
7272 continue;
7273 netdev_adjacent_sysfs_del(iter->dev, oldname,
7274 &iter->dev->adj_list.lower);
7275 netdev_adjacent_sysfs_add(iter->dev, dev,
7276 &iter->dev->adj_list.lower);
7277 }
7278
7279 list_for_each_entry(iter, &dev->adj_list.lower, list) {
7280 if (!net_eq(net, dev_net(iter->dev)))
7281 continue;
7282 netdev_adjacent_sysfs_del(iter->dev, oldname,
7283 &iter->dev->adj_list.upper);
7284 netdev_adjacent_sysfs_add(iter->dev, dev,
7285 &iter->dev->adj_list.upper);
7286 }
7287}
7288
7289void *netdev_lower_dev_get_private(struct net_device *dev,
7290 struct net_device *lower_dev)
7291{
7292 struct netdev_adjacent *lower;
7293
7294 if (!lower_dev)
7295 return NULL;
7296 lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
7297 if (!lower)
7298 return NULL;
7299
7300 return lower->private;
7301}
7302EXPORT_SYMBOL(netdev_lower_dev_get_private);
7303
7304
7305int dev_get_nest_level(struct net_device *dev)
7306{
7307 struct net_device *lower = NULL;
7308 struct list_head *iter;
7309 int max_nest = -1;
7310 int nest;
7311
7312 ASSERT_RTNL();
7313
7314 netdev_for_each_lower_dev(dev, lower, iter) {
7315 nest = dev_get_nest_level(lower);
7316 if (max_nest < nest)
7317 max_nest = nest;
7318 }
7319
7320 return max_nest + 1;
7321}
7322EXPORT_SYMBOL(dev_get_nest_level);
7323
7324/**
7325 * netdev_lower_state_changed - Dispatch event about lower device state change
7326 * @lower_dev: device
7327 * @lower_state_info: state to dispatch
7328 *
7329 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
7330 * The caller must hold the RTNL lock.
7331 */
7332void netdev_lower_state_changed(struct net_device *lower_dev,
7333 void *lower_state_info)
7334{
7335 struct netdev_notifier_changelowerstate_info changelowerstate_info = {
7336 .info.dev = lower_dev,
7337 };
7338
7339 ASSERT_RTNL();
7340 changelowerstate_info.lower_state_info = lower_state_info;
7341 call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
7342 &changelowerstate_info.info);
7343}
7344EXPORT_SYMBOL(netdev_lower_state_changed);
7345
7346static void dev_change_rx_flags(struct net_device *dev, int flags)
7347{
7348 const struct net_device_ops *ops = dev->netdev_ops;
7349
7350 if (ops->ndo_change_rx_flags)
7351 ops->ndo_change_rx_flags(dev, flags);
7352}
7353
7354static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
7355{
7356 unsigned int old_flags = dev->flags;
7357 kuid_t uid;
7358 kgid_t gid;
7359
7360 ASSERT_RTNL();
7361
7362 dev->flags |= IFF_PROMISC;
7363 dev->promiscuity += inc;
7364 if (dev->promiscuity == 0) {
7365 /*
7366 * Avoid overflow.
7367 * If inc causes an overflow, leave promiscuity untouched and return an error.
7368 */
7369 if (inc < 0)
7370 dev->flags &= ~IFF_PROMISC;
7371 else {
7372 dev->promiscuity -= inc;
7373 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
7374 dev->name);
7375 return -EOVERFLOW;
7376 }
7377 }
7378 if (dev->flags != old_flags) {
7379 pr_info("device %s %s promiscuous mode\n",
7380 dev->name,
7381 dev->flags & IFF_PROMISC ? "entered" : "left");
7382 if (audit_enabled) {
7383 current_uid_gid(&uid, &gid);
7384 audit_log(audit_context(), GFP_ATOMIC,
7385 AUDIT_ANOM_PROMISCUOUS,
7386 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
7387 dev->name, (dev->flags & IFF_PROMISC),
7388 (old_flags & IFF_PROMISC),
7389 from_kuid(&init_user_ns, audit_get_loginuid(current)),
7390 from_kuid(&init_user_ns, uid),
7391 from_kgid(&init_user_ns, gid),
7392 audit_get_sessionid(current));
7393 }
7394
7395 dev_change_rx_flags(dev, IFF_PROMISC);
7396 }
7397 if (notify)
7398 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
7399 return 0;
7400}
7401
7402/**
7403 * dev_set_promiscuity - update promiscuity count on a device
7404 * @dev: device
7405 * @inc: modifier
7406 *
7407 * Add or remove promiscuity from a device. While the count in the device
7408 * remains above zero the interface remains promiscuous. Once it hits zero
7409 * the device reverts back to normal filtering operation. A negative inc
7410 * value is used to drop promiscuity on the device.
7411 * Return 0 if successful or a negative errno code on error.
7412 */
7413int dev_set_promiscuity(struct net_device *dev, int inc)
7414{
7415 unsigned int old_flags = dev->flags;
7416 int err;
7417
7418 err = __dev_set_promiscuity(dev, inc, true);
7419 if (err < 0)
7420 return err;
7421 if (dev->flags != old_flags)
7422 dev_set_rx_mode(dev);
7423 return err;
7424}
7425EXPORT_SYMBOL(dev_set_promiscuity);
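
/*
 * Usage sketch (illustrative only, not part of dev.c): because the
 * promiscuity count is reference-counted, a capture-like feature takes
 * one reference while active and drops it when done. my_start_capture()
 * and my_stop_capture() are hypothetical.
 */
#if 0
static int my_start_capture(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_promiscuity(dev, 1);	/* take one reference */
	rtnl_unlock();

	return err;
}

static void my_stop_capture(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, -1);		/* drop our reference */
	rtnl_unlock();
}
#endif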
7426
7427static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
7428{
7429 unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
7430
7431 ASSERT_RTNL();
7432
7433 dev->flags |= IFF_ALLMULTI;
7434 dev->allmulti += inc;
7435 if (dev->allmulti == 0) {
7436 /*
7437 * Avoid overflow.
7438 * If inc causes an overflow, leave allmulti untouched and return an error.
7439 */
7440 if (inc < 0)
7441 dev->flags &= ~IFF_ALLMULTI;
7442 else {
7443 dev->allmulti -= inc;
7444 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
7445 dev->name);
7446 return -EOVERFLOW;
7447 }
7448 }
7449 if (dev->flags ^ old_flags) {
7450 dev_change_rx_flags(dev, IFF_ALLMULTI);
7451 dev_set_rx_mode(dev);
7452 if (notify)
7453 __dev_notify_flags(dev, old_flags,
7454 dev->gflags ^ old_gflags);
7455 }
7456 return 0;
7457}
7458
7459/**
7460 * dev_set_allmulti - update allmulti count on a device
7461 * @dev: device
7462 * @inc: modifier
7463 *
7464 * Add or remove reception of all multicast frames to a device. While the
7465 * count in the device remains above zero the interface remains in
7466 * all-multicast mode. Once it hits zero the device reverts back to normal
7467 * filtering operation. A negative @inc value is used to drop the counter
7468 * when releasing a resource needing all multicasts.
7469 * Return 0 if successful or a negative errno code on error.
7470 */
7471
7472int dev_set_allmulti(struct net_device *dev, int inc)
7473{
7474 return __dev_set_allmulti(dev, inc, true);
7475}
7476EXPORT_SYMBOL(dev_set_allmulti);
7477
7478/*
7479 * Upload unicast and multicast address lists to device and
7480 * configure RX filtering. When the device doesn't support unicast
7481 * filtering it is put in promiscuous mode while unicast addresses
7482 * are present.
7483 */
7484void __dev_set_rx_mode(struct net_device *dev)
7485{
7486 const struct net_device_ops *ops = dev->netdev_ops;
7487
7488 /* dev_open will call this function so the list will stay sane. */
7489 if (!(dev->flags&IFF_UP))
7490 return;
7491
7492 if (!netif_device_present(dev))
7493 return;
7494
7495 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
7496 /* Unicast address changes may only happen under the RTNL lock,
7497 * therefore calling __dev_set_promiscuity here is safe.
7498 */
7499 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
7500 __dev_set_promiscuity(dev, 1, false);
7501 dev->uc_promisc = true;
7502 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
7503 __dev_set_promiscuity(dev, -1, false);
7504 dev->uc_promisc = false;
7505 }
7506 }
7507
7508 if (ops->ndo_set_rx_mode)
7509 ops->ndo_set_rx_mode(dev);
7510}
7511
7512void dev_set_rx_mode(struct net_device *dev)
7513{
7514 netif_addr_lock_bh(dev);
7515 __dev_set_rx_mode(dev);
7516 netif_addr_unlock_bh(dev);
7517}
7518
7519/**
7520 * dev_get_flags - get flags reported to userspace
7521 * @dev: device
7522 *
7523 * Get the combination of flag bits exported through APIs to userspace.
7524 */
7525unsigned int dev_get_flags(const struct net_device *dev)
7526{
7527 unsigned int flags;
7528
7529 flags = (dev->flags & ~(IFF_PROMISC |
7530 IFF_ALLMULTI |
7531 IFF_RUNNING |
7532 IFF_LOWER_UP |
7533 IFF_DORMANT)) |
7534 (dev->gflags & (IFF_PROMISC |
7535 IFF_ALLMULTI));
7536
7537 if (netif_running(dev)) {
7538 if (netif_oper_up(dev))
7539 flags |= IFF_RUNNING;
7540 if (netif_carrier_ok(dev))
7541 flags |= IFF_LOWER_UP;
7542 if (netif_dormant(dev))
7543 flags |= IFF_DORMANT;
7544 }
7545
7546 return flags;
7547}
7548EXPORT_SYMBOL(dev_get_flags);
7549
7550int __dev_change_flags(struct net_device *dev, unsigned int flags,
7551 struct netlink_ext_ack *extack)
7552{
7553 unsigned int old_flags = dev->flags;
7554 int ret;
7555
7556 ASSERT_RTNL();
7557
7558 /*
7559 * Set the flags on our device.
7560 */
7561
7562 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
7563 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
7564 IFF_AUTOMEDIA)) |
7565 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
7566 IFF_ALLMULTI));
7567
7568 /*
7569 * Load in the correct multicast list now the flags have changed.
7570 */
7571
7572 if ((old_flags ^ flags) & IFF_MULTICAST)
7573 dev_change_rx_flags(dev, IFF_MULTICAST);
7574
7575 dev_set_rx_mode(dev);
7576
7577 /*
7578 * Have we downed the interface? We handle IFF_UP ourselves
7579 * according to user attempts to set it, rather than blindly
7580 * setting it.
7581 */
7582
7583 ret = 0;
7584 if ((old_flags ^ flags) & IFF_UP) {
7585 if (old_flags & IFF_UP)
7586 __dev_close(dev);
7587 else
7588 ret = __dev_open(dev, extack);
7589 }
7590
7591 if ((flags ^ dev->gflags) & IFF_PROMISC) {
7592 int inc = (flags & IFF_PROMISC) ? 1 : -1;
7593 unsigned int old_flags = dev->flags;
7594
7595 dev->gflags ^= IFF_PROMISC;
7596
7597 if (__dev_set_promiscuity(dev, inc, false) >= 0)
7598 if (dev->flags != old_flags)
7599 dev_set_rx_mode(dev);
7600 }
7601
7602 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
7603 * is important. Some (broken) drivers set IFF_PROMISC when
7604 * IFF_ALLMULTI is requested, without asking us and without reporting.
7605 */
7606 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
7607 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
7608
7609 dev->gflags ^= IFF_ALLMULTI;
7610 __dev_set_allmulti(dev, inc, false);
7611 }
7612
7613 return ret;
7614}
7615
7616void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
7617 unsigned int gchanges)
7618{
7619 unsigned int changes = dev->flags ^ old_flags;
7620
7621 if (gchanges)
7622 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
7623
7624 if (changes & IFF_UP) {
7625 if (dev->flags & IFF_UP)
7626 call_netdevice_notifiers(NETDEV_UP, dev);
7627 else
7628 call_netdevice_notifiers(NETDEV_DOWN, dev);
7629 }
7630
7631 if (dev->flags & IFF_UP &&
7632 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
7633 struct netdev_notifier_change_info change_info = {
7634 .info = {
7635 .dev = dev,
7636 },
7637 .flags_changed = changes,
7638 };
7639
7640 call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
7641 }
7642}
7643
7644/**
7645 * dev_change_flags - change device settings
7646 * @dev: device
7647 * @flags: device state flags
7648 * @extack: netlink extended ack
7649 *
7650 * Change settings on a device based on state flags. The flags are
7651 * in the userspace exported format.
7652 */
7653int dev_change_flags(struct net_device *dev, unsigned int flags,
7654 struct netlink_ext_ack *extack)
7655{
7656 int ret;
7657 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
7658
7659 ret = __dev_change_flags(dev, flags, extack);
7660 if (ret < 0)
7661 return ret;
7662
7663 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
7664 __dev_notify_flags(dev, old_flags, changes);
7665 return ret;
7666}
7667EXPORT_SYMBOL(dev_change_flags);
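
/*
 * Usage sketch (illustrative only, not part of dev.c): the in-kernel
 * equivalent of "ip link set dev ... up" flips IFF_UP via
 * dev_change_flags() under the RTNL lock. my_bring_up() is hypothetical.
 */
#if 0
static int my_bring_up(struct net_device *dev)
{
	ASSERT_RTNL();
	return dev_change_flags(dev, dev->flags | IFF_UP, NULL);
}
#endif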
7668
7669int __dev_set_mtu(struct net_device *dev, int new_mtu)
7670{
7671 const struct net_device_ops *ops = dev->netdev_ops;
7672
7673 if (ops->ndo_change_mtu)
7674 return ops->ndo_change_mtu(dev, new_mtu);
7675
7676 dev->mtu = new_mtu;
7677 return 0;
7678}
7679EXPORT_SYMBOL(__dev_set_mtu);
7680
7681/**
7682 * dev_set_mtu_ext - Change maximum transfer unit
7683 * @dev: device
7684 * @new_mtu: new transfer unit
7685 * @extack: netlink extended ack
7686 *
7687 * Change the maximum transfer size of the network device.
7688 */
7689int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
7690 struct netlink_ext_ack *extack)
7691{
7692 int err, orig_mtu;
7693
7694 if (new_mtu == dev->mtu)
7695 return 0;
7696
7697 /* MTU must be positive, and in range */
7698 if (new_mtu < 0 || new_mtu < dev->min_mtu) {
7699 NL_SET_ERR_MSG(extack, "mtu less than device minimum");
7700 return -EINVAL;
7701 }
7702
7703 if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
7704 NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
7705 return -EINVAL;
7706 }
7707
7708 if (!netif_device_present(dev))
7709 return -ENODEV;
7710
7711 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
7712 err = notifier_to_errno(err);
7713 if (err)
7714 return err;
7715
7716 orig_mtu = dev->mtu;
7717 err = __dev_set_mtu(dev, new_mtu);
7718
7719 if (!err) {
7720 err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
7721 orig_mtu);
7722 err = notifier_to_errno(err);
7723 if (err) {
7724 /* Set the MTU back and notify everyone again,
7725 * so that they have a chance to revert the change.
7726 */
7727 __dev_set_mtu(dev, orig_mtu);
7728 call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
7729 new_mtu);
7730 }
7731 }
7732 return err;
7733}
7734
7735int dev_set_mtu(struct net_device *dev, int new_mtu)
7736{
7737 struct netlink_ext_ack extack;
7738 int err;
7739
7740 memset(&extack, 0, sizeof(extack));
7741 err = dev_set_mtu_ext(dev, new_mtu, &extack);
7742 if (err && extack._msg)
7743 net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
7744 return err;
7745}
7746EXPORT_SYMBOL(dev_set_mtu);
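
/*
 * Usage sketch (illustrative only, not part of dev.c): callers change
 * the MTU under the RTNL lock; the value is validated against the
 * device's min_mtu/max_mtu by dev_set_mtu_ext() above. The jumbo value
 * 9000 and my_set_jumbo_mtu() are hypothetical.
 */
#if 0
static int my_set_jumbo_mtu(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, 9000);
	rtnl_unlock();

	return err;
}
#endif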
7747
7748/**
7749 * dev_change_tx_queue_len - Change TX queue length of a netdevice
7750 * @dev: device
7751 * @new_len: new tx queue length
7752 */
7753int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
7754{
7755 unsigned int orig_len = dev->tx_queue_len;
7756 int res;
7757
7758 if (new_len != (unsigned int)new_len)
7759 return -ERANGE;
7760
7761 if (new_len != orig_len) {
7762 dev->tx_queue_len = new_len;
7763 res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
7764 res = notifier_to_errno(res);
7765 if (res)
7766 goto err_rollback;
7767 res = dev_qdisc_change_tx_queue_len(dev);
7768 if (res)
7769 goto err_rollback;
7770 }
7771
7772 return 0;
7773
7774err_rollback:
7775 netdev_err(dev, "refused to change device tx_queue_len\n");
7776 dev->tx_queue_len = orig_len;
7777 return res;
7778}
7779
7780/**
7781 * dev_set_group - Change group this device belongs to
7782 * @dev: device
7783 * @new_group: group this device should belong to
7784 */
7785void dev_set_group(struct net_device *dev, int new_group)
7786{
7787 dev->group = new_group;
7788}
7789EXPORT_SYMBOL(dev_set_group);
7790
7791/**
7792 * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
7793 * @dev: device
7794 * @addr: new address
7795 * @extack: netlink extended ack
7796 */
7797int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
7798 struct netlink_ext_ack *extack)
7799{
7800 struct netdev_notifier_pre_changeaddr_info info = {
7801 .info.dev = dev,
7802 .info.extack = extack,
7803 .dev_addr = addr,
7804 };
7805 int rc;
7806
7807 rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
7808 return notifier_to_errno(rc);
7809}
7810EXPORT_SYMBOL(dev_pre_changeaddr_notify);
7811
7812/**
7813 * dev_set_mac_address - Change Media Access Control Address
7814 * @dev: device
7815 * @sa: new address
7816 * @extack: netlink extended ack
7817 *
7818 * Change the hardware (MAC) address of the device
7819 */
7820int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
7821 struct netlink_ext_ack *extack)
7822{
7823 const struct net_device_ops *ops = dev->netdev_ops;
7824 int err;
7825
7826 if (!ops->ndo_set_mac_address)
7827 return -EOPNOTSUPP;
7828 if (sa->sa_family != dev->type)
7829 return -EINVAL;
7830 if (!netif_device_present(dev))
7831 return -ENODEV;
7832 err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
7833 if (err)
7834 return err;
7835 err = ops->ndo_set_mac_address(dev, sa);
7836 if (err)
7837 return err;
7838 dev->addr_assign_type = NET_ADDR_SET;
7839 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
7840 add_device_randomness(dev->dev_addr, dev->addr_len);
7841 return 0;
7842}
7843EXPORT_SYMBOL(dev_set_mac_address);
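
/*
 * Usage sketch (illustrative only, not part of dev.c): the sockaddr
 * passed to dev_set_mac_address() must carry the device's hardware type
 * in sa_family, e.g. ARPHRD_ETHER for Ethernet. my_set_mac() is
 * hypothetical.
 */
#if 0
static int my_set_mac(struct net_device *dev, const u8 *mac)
{
	struct sockaddr sa;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, mac, dev->addr_len);

	ASSERT_RTNL();
	return dev_set_mac_address(dev, &sa, NULL);
}
#endif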
7844
7845/**
7846 * dev_change_carrier - Change device carrier
7847 * @dev: device
7848 * @new_carrier: new value
7849 *
7850 * Change device carrier
7851 */
7852int dev_change_carrier(struct net_device *dev, bool new_carrier)
7853{
7854 const struct net_device_ops *ops = dev->netdev_ops;
7855
7856 if (!ops->ndo_change_carrier)
7857 return -EOPNOTSUPP;
7858 if (!netif_device_present(dev))
7859 return -ENODEV;
7860 return ops->ndo_change_carrier(dev, new_carrier);
7861}
7862EXPORT_SYMBOL(dev_change_carrier);
7863
7864/**
7865 * dev_get_phys_port_id - Get device physical port ID
7866 * @dev: device
7867 * @ppid: port ID
7868 *
7869 * Get device physical port ID
7870 */
7871int dev_get_phys_port_id(struct net_device *dev,
7872 struct netdev_phys_item_id *ppid)
7873{
7874 const struct net_device_ops *ops = dev->netdev_ops;
7875
7876 if (!ops->ndo_get_phys_port_id)
7877 return -EOPNOTSUPP;
7878 return ops->ndo_get_phys_port_id(dev, ppid);
7879}
7880EXPORT_SYMBOL(dev_get_phys_port_id);
7881
7882/**
7883 * dev_get_phys_port_name - Get device physical port name
7884 * @dev: device
7885 * @name: port name
7886 * @len: limit of bytes to copy to name
7887 *
7888 * Get device physical port name
7889 */
7890int dev_get_phys_port_name(struct net_device *dev,
7891 char *name, size_t len)
7892{
7893 const struct net_device_ops *ops = dev->netdev_ops;
7894 int err;
7895
7896 if (ops->ndo_get_phys_port_name) {
7897 err = ops->ndo_get_phys_port_name(dev, name, len);
7898 if (err != -EOPNOTSUPP)
7899 return err;
7900 }
7901 return devlink_compat_phys_port_name_get(dev, name, len);
7902}
7903EXPORT_SYMBOL(dev_get_phys_port_name);
7904
7905/**
7906 * dev_get_port_parent_id - Get the device's port parent identifier
7907 * @dev: network device
7908 * @ppid: pointer to storage for the port's parent identifier
7909 * @recurse: allow/disallow recursion to lower devices
7910 *
7911 * Get the device's port parent identifier
7912 */
7913int dev_get_port_parent_id(struct net_device *dev,
7914 struct netdev_phys_item_id *ppid,
7915 bool recurse)
7916{
7917 const struct net_device_ops *ops = dev->netdev_ops;
7918 struct netdev_phys_item_id first = { };
7919 struct net_device *lower_dev;
7920 struct list_head *iter;
7921 int err;
7922
7923 if (ops->ndo_get_port_parent_id) {
7924 err = ops->ndo_get_port_parent_id(dev, ppid);
7925 if (err != -EOPNOTSUPP)
7926 return err;
7927 }
7928
7929 err = devlink_compat_switch_id_get(dev, ppid);
7930 if (!err || err != -EOPNOTSUPP)
7931 return err;
7932
7933 if (!recurse)
7934 return -EOPNOTSUPP;
7935
7936 netdev_for_each_lower_dev(dev, lower_dev, iter) {
7937 err = dev_get_port_parent_id(lower_dev, ppid, recurse);
7938 if (err)
7939 break;
7940 if (!first.id_len)
7941 first = *ppid;
7942 else if (memcmp(&first, ppid, sizeof(*ppid)))
7943 return -ENODATA;
7944 }
7945
7946 return err;
7947}
7948EXPORT_SYMBOL(dev_get_port_parent_id);
7949
7950/**
7951 * netdev_port_same_parent_id - Indicate if two network devices have
7952 * the same port parent identifier
7953 * @a: first network device
7954 * @b: second network device
7955 */
7956bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
7957{
7958 struct netdev_phys_item_id a_id = { };
7959 struct netdev_phys_item_id b_id = { };
7960
7961 if (dev_get_port_parent_id(a, &a_id, true) ||
7962 dev_get_port_parent_id(b, &b_id, true))
7963 return false;
7964
7965 return netdev_phys_item_id_same(&a_id, &b_id);
7966}
7967EXPORT_SYMBOL(netdev_port_same_parent_id);
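
/*
 * Illustrative sketch (hypothetical switch-driver check, not from this
 * file): netdev_port_same_parent_id() is typically used to decide
 * whether two ports sit behind the same switch ASIC and can therefore
 * be forwarded between in hardware.
 *
 *	static bool example_same_switch(struct net_device *a,
 *					struct net_device *b)
 *	{
 *		return netdev_port_same_parent_id(a, b);
 *	}
 */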
7968
7969/**
7970 * dev_change_proto_down - update protocol port state information
7971 * @dev: device
7972 * @proto_down: new value
7973 *
7974 * This info can be used by switch drivers to set the phys state of the
7975 * port.
7976 */
7977int dev_change_proto_down(struct net_device *dev, bool proto_down)
7978{
7979 const struct net_device_ops *ops = dev->netdev_ops;
7980
7981 if (!ops->ndo_change_proto_down)
7982 return -EOPNOTSUPP;
7983 if (!netif_device_present(dev))
7984 return -ENODEV;
7985 return ops->ndo_change_proto_down(dev, proto_down);
7986}
7987EXPORT_SYMBOL(dev_change_proto_down);
7988
7989/**
7990 * dev_change_proto_down_generic - generic implementation for
7991 * ndo_change_proto_down that sets carrier according to
7992 * proto_down.
7993 *
7994 * @dev: device
7995 * @proto_down: new value
7996 */
7997int dev_change_proto_down_generic(struct net_device *dev, bool proto_down)
7998{
7999 if (proto_down)
8000 netif_carrier_off(dev);
8001 else
8002 netif_carrier_on(dev);
8003 dev->proto_down = proto_down;
8004 return 0;
8005}
8006EXPORT_SYMBOL(dev_change_proto_down_generic);
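
/*
 * Illustrative sketch (hypothetical driver): a driver with no special
 * proto_down handling can wire the generic helper above directly into
 * its netdev_ops, so that setting proto_down simply toggles carrier.
 * example_netdev_ops is a made-up name.
 *
 *	static const struct net_device_ops example_netdev_ops = {
 *		.ndo_change_proto_down	= dev_change_proto_down_generic,
 *	};
 */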
8007
8008u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
8009 enum bpf_netdev_command cmd)
8010{
8011 struct netdev_bpf xdp;
8012
8013 if (!bpf_op)
8014 return 0;
8015
8016 memset(&xdp, 0, sizeof(xdp));
8017 xdp.command = cmd;
8018
8019 /* Query must always succeed. */
8020 WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG);
8021
8022 return xdp.prog_id;
8023}
8024
8025static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
8026 struct netlink_ext_ack *extack, u32 flags,
8027 struct bpf_prog *prog)
8028{
8029 struct netdev_bpf xdp;
8030
8031 memset(&xdp, 0, sizeof(xdp));
8032 if (flags & XDP_FLAGS_HW_MODE)
8033 xdp.command = XDP_SETUP_PROG_HW;
8034 else
8035 xdp.command = XDP_SETUP_PROG;
8036 xdp.extack = extack;
8037 xdp.flags = flags;
8038 xdp.prog = prog;
8039
8040 return bpf_op(dev, &xdp);
8041}
8042
8043static void dev_xdp_uninstall(struct net_device *dev)
8044{
8045 struct netdev_bpf xdp;
8046 bpf_op_t ndo_bpf;
8047
8048 /* Remove generic XDP */
8049 WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL));
8050
8051 /* Remove from the driver */
8052 ndo_bpf = dev->netdev_ops->ndo_bpf;
8053 if (!ndo_bpf)
8054 return;
8055
8056 memset(&xdp, 0, sizeof(xdp));
8057 xdp.command = XDP_QUERY_PROG;
8058 WARN_ON(ndo_bpf(dev, &xdp));
8059 if (xdp.prog_id)
8060 WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
8061 NULL));
8062
8063 /* Remove HW offload */
8064 memset(&xdp, 0, sizeof(xdp));
8065 xdp.command = XDP_QUERY_PROG_HW;
8066 if (!ndo_bpf(dev, &xdp) && xdp.prog_id)
8067 WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
8068 NULL));
8069}
8070
8071/**
8072 * dev_change_xdp_fd - set or clear a bpf program for a device rx path
8073 * @dev: device
8074 * @extack: netlink extended ack
8075 * @fd: new program fd or negative value to clear
8076 * @flags: xdp-related flags
8077 *
8078 * Set or clear a bpf program for a device
8079 */
8080int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
8081 int fd, u32 flags)
8082{
8083 const struct net_device_ops *ops = dev->netdev_ops;
8084 enum bpf_netdev_command query;
8085 struct bpf_prog *prog = NULL;
8086 bpf_op_t bpf_op, bpf_chk;
8087 bool offload;
8088 int err;
8089
8090 ASSERT_RTNL();
8091
8092 offload = flags & XDP_FLAGS_HW_MODE;
8093 query = offload ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG;
8094
8095 bpf_op = bpf_chk = ops->ndo_bpf;
8096 if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) {
8097 NL_SET_ERR_MSG(extack, "underlying driver does not support XDP in native mode");
8098 return -EOPNOTSUPP;
8099 }
8100 if (!bpf_op || (flags & XDP_FLAGS_SKB_MODE))
8101 bpf_op = generic_xdp_install;
8102 if (bpf_op == bpf_chk)
8103 bpf_chk = generic_xdp_install;
8104
8105 if (fd >= 0) {
8106 if (!offload && __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG)) {
8107 NL_SET_ERR_MSG(extack, "native and generic XDP can't be active at the same time");
8108 return -EEXIST;
8109 }
8110 if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
8111 __dev_xdp_query(dev, bpf_op, query)) {
8112 NL_SET_ERR_MSG(extack, "XDP program already attached");
8113 return -EBUSY;
8114 }
8115
8116 prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
8117 bpf_op == ops->ndo_bpf);
8118 if (IS_ERR(prog))
8119 return PTR_ERR(prog);
8120
8121 if (!offload && bpf_prog_is_dev_bound(prog->aux)) {
8122 NL_SET_ERR_MSG(extack, "using device-bound program without HW_MODE flag is not supported");
8123 bpf_prog_put(prog);
8124 return -EINVAL;
8125 }
8126 }
8127
8128 err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
8129 if (err < 0 && prog)
8130 bpf_prog_put(prog);
8131
8132 return err;
8133}
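
/*
 * Illustrative sketch (assumption): dev_change_xdp_fd() is normally
 * invoked from rtnetlink on behalf of user space, with prog_fd coming
 * from a program loaded via bpf(BPF_PROG_LOAD). An equivalent
 * in-kernel call would be:
 *
 *	rtnl_lock();
 *	err = dev_change_xdp_fd(dev, NULL, prog_fd, XDP_FLAGS_DRV_MODE);
 *	rtnl_unlock();
 *
 * Passing a negative fd instead detaches the current program.
 */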
8134
8135/**
8136 * dev_new_index - allocate an ifindex
8137 * @net: the applicable net namespace
8138 *
8139 * Returns a suitable unique value for a new device interface
8140 * number. The caller must hold the rtnl semaphore or the
8141 * dev_base_lock to be sure it remains unique.
8142 */
8143static int dev_new_index(struct net *net)
8144{
8145 int ifindex = net->ifindex;
8146
8147 for (;;) {
8148 if (++ifindex <= 0)
8149 ifindex = 1;
8150 if (!__dev_get_by_index(net, ifindex))
8151 return net->ifindex = ifindex;
8152 }
8153}
8154
8155/* Delayed registration/unregistration */
8156static LIST_HEAD(net_todo_list);
8157DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
8158
8159static void net_set_todo(struct net_device *dev)
8160{
8161 list_add_tail(&dev->todo_list, &net_todo_list);
8162 dev_net(dev)->dev_unreg_count++;
8163}
8164
8165static void rollback_registered_many(struct list_head *head)
8166{
8167 struct net_device *dev, *tmp;
8168 LIST_HEAD(close_head);
8169
8170 BUG_ON(dev_boot_phase);
8171 ASSERT_RTNL();
8172
8173 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
8174 /* Some devices are unregistered without ever having been
8175 * registered, as part of initialization unwind. Remove those
8176 * devices and proceed with the remaining.
8177 */
8178 if (dev->reg_state == NETREG_UNINITIALIZED) {
8179 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
8180 dev->name, dev);
8181
8182 WARN_ON(1);
8183 list_del(&dev->unreg_list);
8184 continue;
8185 }
8186 dev->dismantle = true;
8187 BUG_ON(dev->reg_state != NETREG_REGISTERED);
8188 }
8189
8190 /* If device is running, close it first. */
8191 list_for_each_entry(dev, head, unreg_list)
8192 list_add_tail(&dev->close_list, &close_head);
8193 dev_close_many(&close_head, true);
8194
8195 list_for_each_entry(dev, head, unreg_list) {
8196 /* And unlink it from device chain. */
8197 unlist_netdevice(dev);
8198
8199 dev->reg_state = NETREG_UNREGISTERING;
8200 }
8201 flush_all_backlogs();
8202
8203 synchronize_net();
8204
8205 list_for_each_entry(dev, head, unreg_list) {
8206 struct sk_buff *skb = NULL;
8207
8208 /* Shutdown queueing discipline. */
8209 dev_shutdown(dev);
8210
8211 dev_xdp_uninstall(dev);
8212
8213 /* Notify protocols that we are about to destroy
8214 * this device. They should clean up all of their state.
8215 */
8216 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
8217
8218 if (!dev->rtnl_link_ops ||
8219 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
8220 skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
8221 GFP_KERNEL, NULL, 0);
8222
8223 /*
8224 * Flush the unicast and multicast chains
8225 */
8226 dev_uc_flush(dev);
8227 dev_mc_flush(dev);
8228
8229 if (dev->netdev_ops->ndo_uninit)
8230 dev->netdev_ops->ndo_uninit(dev);
8231
8232 if (skb)
8233 rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
8234
8235 /* The notifier chain MUST detach us from all upper devices. */
8236 WARN_ON(netdev_has_any_upper_dev(dev));
8237 WARN_ON(netdev_has_any_lower_dev(dev));
8238
8239 /* Remove entries from kobject tree */
8240 netdev_unregister_kobject(dev);
8241#ifdef CONFIG_XPS
8242 /* Remove XPS queueing entries */
8243 netif_reset_xps_queues_gt(dev, 0);
8244#endif
8245 }
8246
8247 synchronize_net();
8248
8249 list_for_each_entry(dev, head, unreg_list)
8250 dev_put(dev);
8251}
8252
8253static void rollback_registered(struct net_device *dev)
8254{
8255 LIST_HEAD(single);
8256
8257 list_add(&dev->unreg_list, &single);
8258 rollback_registered_many(&single);
8259 list_del(&single);
8260}
8261
8262static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
8263 struct net_device *upper, netdev_features_t features)
8264{
8265 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
8266 netdev_features_t feature;
8267 int feature_bit;
8268
8269 for_each_netdev_feature(upper_disables, feature_bit) {
8270 feature = __NETIF_F_BIT(feature_bit);
8271 if (!(upper->wanted_features & feature)
8272 && (features & feature)) {
8273 netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
8274 &feature, upper->name);
8275 features &= ~feature;
8276 }
8277 }
8278
8279 return features;
8280}
8281
8282static void netdev_sync_lower_features(struct net_device *upper,
8283 struct net_device *lower, netdev_features_t features)
8284{
8285 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
8286 netdev_features_t feature;
8287 int feature_bit;
8288
8289 for_each_netdev_feature(upper_disables, feature_bit) {
8290 feature = __NETIF_F_BIT(feature_bit);
8291 if (!(features & feature) && (lower->features & feature)) {
8292 netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
8293 &feature, lower->name);
8294 lower->wanted_features &= ~feature;
8295 netdev_update_features(lower);
8296
8297 if (unlikely(lower->features & feature))
8298 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
8299 &feature, lower->name);
8300 }
8301 }
8302}
8303
8304static netdev_features_t netdev_fix_features(struct net_device *dev,
8305 netdev_features_t features)
8306{
8307 /* Fix illegal checksum combinations */
8308 if ((features & NETIF_F_HW_CSUM) &&
8309 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
8310 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
8311 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
8312 }
8313
8314 /* TSO requires that SG is present as well. */
8315 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
8316 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
8317 features &= ~NETIF_F_ALL_TSO;
8318 }
8319
8320 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
8321 !(features & NETIF_F_IP_CSUM)) {
8322 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
8323 features &= ~NETIF_F_TSO;
8324 features &= ~NETIF_F_TSO_ECN;
8325 }
8326
8327 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
8328 !(features & NETIF_F_IPV6_CSUM)) {
8329 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
8330 features &= ~NETIF_F_TSO6;
8331 }
8332
8333 /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
8334 if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
8335 features &= ~NETIF_F_TSO_MANGLEID;
8336
8337 /* TSO ECN requires that TSO is present as well. */
8338 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
8339 features &= ~NETIF_F_TSO_ECN;
8340
8341 /* Software GSO depends on SG. */
8342 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
8343 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
8344 features &= ~NETIF_F_GSO;
8345 }
8346
8347 /* GSO partial features require GSO partial to be set */
8348 if ((features & dev->gso_partial_features) &&
8349 !(features & NETIF_F_GSO_PARTIAL)) {
8350 netdev_dbg(dev,
8351 "Dropping partially supported GSO features since no GSO partial.\n");
8352 features &= ~dev->gso_partial_features;
8353 }
8354
8355 if (!(features & NETIF_F_RXCSUM)) {
8356 /* NETIF_F_GRO_HW implies doing RXCSUM since every packet
8357 * successfully merged by hardware must also have the
8358 * checksum verified by hardware. If the user does not
8359 * want to enable RXCSUM, logically, we should disable GRO_HW.
8360 */
8361 if (features & NETIF_F_GRO_HW) {
8362 netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
8363 features &= ~NETIF_F_GRO_HW;
8364 }
8365 }
8366
8367 /* LRO/HW-GRO features cannot be combined with RX-FCS */
8368 if (features & NETIF_F_RXFCS) {
8369 if (features & NETIF_F_LRO) {
8370 netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
8371 features &= ~NETIF_F_LRO;
8372 }
8373
8374 if (features & NETIF_F_GRO_HW) {
8375 netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
8376 features &= ~NETIF_F_GRO_HW;
8377 }
8378 }
8379
8380 return features;
8381}
8382
8383int __netdev_update_features(struct net_device *dev)
8384{
8385 struct net_device *upper, *lower;
8386 netdev_features_t features;
8387 struct list_head *iter;
8388 int err = -1;
8389
8390 ASSERT_RTNL();
8391
8392 features = netdev_get_wanted_features(dev);
8393
8394 if (dev->netdev_ops->ndo_fix_features)
8395 features = dev->netdev_ops->ndo_fix_features(dev, features);
8396
8397 /* driver might be less strict about feature dependencies */
8398 features = netdev_fix_features(dev, features);
8399
8400 /* some features can't be enabled if they're off on an upper device */
8401 netdev_for_each_upper_dev_rcu(dev, upper, iter)
8402 features = netdev_sync_upper_features(dev, upper, features);
8403
8404 if (dev->features == features)
8405 goto sync_lower;
8406
8407 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
8408 &dev->features, &features);
8409
8410 if (dev->netdev_ops->ndo_set_features)
8411 err = dev->netdev_ops->ndo_set_features(dev, features);
8412 else
8413 err = 0;
8414
8415 if (unlikely(err < 0)) {
8416 netdev_err(dev,
8417 "set_features() failed (%d); wanted %pNF, left %pNF\n",
8418 err, &features, &dev->features);
8419 /* return non-0 since some features might have changed and
8420 * it's better to fire a spurious notification than miss it
8421 */
8422 return -1;
8423 }
8424
8425sync_lower:
8426 /* some features must be disabled on lower devices when disabled
8427 * on an upper device (think: bonding master or bridge)
8428 */
8429 netdev_for_each_lower_dev(dev, lower, iter)
8430 netdev_sync_lower_features(dev, lower, features);
8431
8432 if (!err) {
8433 netdev_features_t diff = features ^ dev->features;
8434
8435 if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
8436 /* udp_tunnel_{get,drop}_rx_info both need
8437 * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
8438 * device, or they won't do anything.
8439 * Thus we need to update dev->features
8440 * *before* calling udp_tunnel_get_rx_info,
8441 * but *after* calling udp_tunnel_drop_rx_info.
8442 */
8443 if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
8444 dev->features = features;
8445 udp_tunnel_get_rx_info(dev);
8446 } else {
8447 udp_tunnel_drop_rx_info(dev);
8448 }
8449 }
8450
8451 if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
8452 if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
8453 dev->features = features;
8454 err |= vlan_get_rx_ctag_filter_info(dev);
8455 } else {
8456 vlan_drop_rx_ctag_filter_info(dev);
8457 }
8458 }
8459
8460 if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
8461 if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
8462 dev->features = features;
8463 err |= vlan_get_rx_stag_filter_info(dev);
8464 } else {
8465 vlan_drop_rx_stag_filter_info(dev);
8466 }
8467 }
8468
8469 dev->features = features;
8470 }
8471
8472 return err < 0 ? 0 : 1;
8473}
8474
8475/**
8476 * netdev_update_features - recalculate device features
8477 * @dev: the device to check
8478 *
8479 * Recalculate dev->features set and send notifications if it
8480 * has changed. Should be called whenever driver- or hardware-dependent
8481 * conditions that influence the features may have changed.
8482 */
8483void netdev_update_features(struct net_device *dev)
8484{
8485 if (__netdev_update_features(dev))
8486 netdev_features_change(dev);
8487}
8488EXPORT_SYMBOL(netdev_update_features);
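
/*
 * Illustrative sketch (hypothetical driver, RTNL held): when driver
 * state that its ndo_fix_features() callback consults has changed,
 * for example after a reconfiguration that rules out a feature, the
 * driver asks the core to re-evaluate. priv and large_buffers are
 * made-up names.
 *
 *	priv->large_buffers = false;
 *	netdev_update_features(dev);
 */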
8489
8490/**
8491 * netdev_change_features - recalculate device features
8492 * @dev: the device to check
8493 *
8494 * Recalculate dev->features set and send notifications even
8495 * if they have not changed. Should be called instead of
8496 * netdev_update_features() if also dev->vlan_features might
8497 * have changed to allow the changes to be propagated to stacked
8498 * VLAN devices.
8499 */
8500void netdev_change_features(struct net_device *dev)
8501{
8502 __netdev_update_features(dev);
8503 netdev_features_change(dev);
8504}
8505EXPORT_SYMBOL(netdev_change_features);
8506
8507/**
8508 * netif_stacked_transfer_operstate - transfer operstate
8509 * @rootdev: the root or lower level device to transfer state from
8510 * @dev: the device to transfer operstate to
8511 *
8512 * Transfer operational state from root to device. This is normally
8513 * called when a stacking relationship exists between the root
8514 * device and the device (a leaf device).
8515 */
8516void netif_stacked_transfer_operstate(const struct net_device *rootdev,
8517 struct net_device *dev)
8518{
8519 if (rootdev->operstate == IF_OPER_DORMANT)
8520 netif_dormant_on(dev);
8521 else
8522 netif_dormant_off(dev);
8523
8524 if (netif_carrier_ok(rootdev))
8525 netif_carrier_on(dev);
8526 else
8527 netif_carrier_off(dev);
8528}
8529EXPORT_SYMBOL(netif_stacked_transfer_operstate);
8530
8531static int netif_alloc_rx_queues(struct net_device *dev)
8532{
8533 unsigned int i, count = dev->num_rx_queues;
8534 struct netdev_rx_queue *rx;
8535 size_t sz = count * sizeof(*rx);
8536 int err = 0;
8537
8538 BUG_ON(count < 1);
8539
8540 rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
8541 if (!rx)
8542 return -ENOMEM;
8543
8544 dev->_rx = rx;
8545
8546 for (i = 0; i < count; i++) {
8547 rx[i].dev = dev;
8548
8549 /* XDP RX-queue setup */
8550 err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
8551 if (err < 0)
8552 goto err_rxq_info;
8553 }
8554 return 0;
8555
8556err_rxq_info:
8557 /* Roll back successful registrations and free other resources */
8558 while (i--)
8559 xdp_rxq_info_unreg(&rx[i].xdp_rxq);
8560 kvfree(dev->_rx);
8561 dev->_rx = NULL;
8562 return err;
8563}
8564
8565static void netif_free_rx_queues(struct net_device *dev)
8566{
8567 unsigned int i, count = dev->num_rx_queues;
8568
8569 /* netif_alloc_rx_queues() failed; its resources have already been unregistered */
8570 if (!dev->_rx)
8571 return;
8572
8573 for (i = 0; i < count; i++)
8574 xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
8575
8576 kvfree(dev->_rx);
8577}
8578
8579static void netdev_init_one_queue(struct net_device *dev,
8580 struct netdev_queue *queue, void *_unused)
8581{
8582 /* Initialize queue lock */
8583 spin_lock_init(&queue->_xmit_lock);
8584 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
8585 queue->xmit_lock_owner = -1;
8586 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
8587 queue->dev = dev;
8588#ifdef CONFIG_BQL
8589 dql_init(&queue->dql, HZ);
8590#endif
8591}
8592
8593static void netif_free_tx_queues(struct net_device *dev)
8594{
8595 kvfree(dev->_tx);
8596}
8597
8598static int netif_alloc_netdev_queues(struct net_device *dev)
8599{
8600 unsigned int count = dev->num_tx_queues;
8601 struct netdev_queue *tx;
8602 size_t sz = count * sizeof(*tx);
8603
8604 if (count < 1 || count > 0xffff)
8605 return -EINVAL;
8606
8607 tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
8608 if (!tx)
8609 return -ENOMEM;
8610
8611 dev->_tx = tx;
8612
8613 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
8614 spin_lock_init(&dev->tx_global_lock);
8615
8616 return 0;
8617}
8618
8619void netif_tx_stop_all_queues(struct net_device *dev)
8620{
8621 unsigned int i;
8622
8623 for (i = 0; i < dev->num_tx_queues; i++) {
8624 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
8625
8626 netif_tx_stop_queue(txq);
8627 }
8628}
8629EXPORT_SYMBOL(netif_tx_stop_all_queues);
8630
8631/**
8632 * register_netdevice - register a network device
8633 * @dev: device to register
8634 *
8635 * Take a completed network device structure and add it to the kernel
8636 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
8637 * chain. 0 is returned on success. A negative errno code is returned
8638 * on a failure to set up the device, or if the name is a duplicate.
8639 *
8640 * Callers must hold the rtnl semaphore. You may want
8641 * register_netdev() instead of this.
8642 *
8643 * BUGS:
8644 * The locking appears insufficient to guarantee two parallel registers
8645 * will not get the same name.
8646 */
8647
8648int register_netdevice(struct net_device *dev)
8649{
8650 int ret;
8651 struct net *net = dev_net(dev);
8652
8653 BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
8654 NETDEV_FEATURE_COUNT);
8655 BUG_ON(dev_boot_phase);
8656 ASSERT_RTNL();
8657
8658 might_sleep();
8659
8660 /* When net_device structures are persistent, this will be fatal. */
8661 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
8662 BUG_ON(!net);
8663
8664 spin_lock_init(&dev->addr_list_lock);
8665 netdev_set_addr_lockdep_class(dev);
8666
8667 ret = dev_get_valid_name(net, dev, dev->name);
8668 if (ret < 0)
8669 goto out;
8670
8671 /* Init, if this function is available */
8672 if (dev->netdev_ops->ndo_init) {
8673 ret = dev->netdev_ops->ndo_init(dev);
8674 if (ret) {
8675 if (ret > 0)
8676 ret = -EIO;
8677 goto out;
8678 }
8679 }
8680
8681 if (((dev->hw_features | dev->features) &
8682 NETIF_F_HW_VLAN_CTAG_FILTER) &&
8683 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
8684 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
8685 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
8686 ret = -EINVAL;
8687 goto err_uninit;
8688 }
8689
8690 ret = -EBUSY;
8691 if (!dev->ifindex)
8692 dev->ifindex = dev_new_index(net);
8693 else if (__dev_get_by_index(net, dev->ifindex))
8694 goto err_uninit;
8695
8696 /* Transfer changeable features to wanted_features and enable
8697 * software offloads (GSO and GRO).
8698 */
8699 dev->hw_features |= NETIF_F_SOFT_FEATURES;
8700 dev->features |= NETIF_F_SOFT_FEATURES;
8701
8702 if (dev->netdev_ops->ndo_udp_tunnel_add) {
8703 dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
8704 dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
8705 }
8706
8707 dev->wanted_features = dev->features & dev->hw_features;
8708
8709 if (!(dev->flags & IFF_LOOPBACK))
8710 dev->hw_features |= NETIF_F_NOCACHE_COPY;
8711
8712 /* If IPv4 TCP segmentation offload is supported we should also
8713 * allow the device to enable segmenting the frame with the option
8714 * of ignoring a static IP ID value. This doesn't enable the
8715 * feature itself but allows the user to enable it later.
8716 */
8717 if (dev->hw_features & NETIF_F_TSO)
8718 dev->hw_features |= NETIF_F_TSO_MANGLEID;
8719 if (dev->vlan_features & NETIF_F_TSO)
8720 dev->vlan_features |= NETIF_F_TSO_MANGLEID;
8721 if (dev->mpls_features & NETIF_F_TSO)
8722 dev->mpls_features |= NETIF_F_TSO_MANGLEID;
8723 if (dev->hw_enc_features & NETIF_F_TSO)
8724 dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
8725
8726 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
8727 */
8728 dev->vlan_features |= NETIF_F_HIGHDMA;
8729
8730 /* Make NETIF_F_SG inheritable to tunnel devices.
8731 */
8732 dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
8733
8734 /* Make NETIF_F_SG inheritable to MPLS.
8735 */
8736 dev->mpls_features |= NETIF_F_SG;
8737
8738 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
8739 ret = notifier_to_errno(ret);
8740 if (ret)
8741 goto err_uninit;
8742
8743 ret = netdev_register_kobject(dev);
8744 if (ret)
8745 goto err_uninit;
8746 dev->reg_state = NETREG_REGISTERED;
8747
8748 __netdev_update_features(dev);
8749
8750 /*
8751 * Default initial state at registration is that the
8752 * device is present.
8753 */
8754
8755 set_bit(__LINK_STATE_PRESENT, &dev->state);
8756
8757 linkwatch_init_dev(dev);
8758
8759 dev_init_scheduler(dev);
8760 dev_hold(dev);
8761 list_netdevice(dev);
8762 add_device_randomness(dev->dev_addr, dev->addr_len);
8763
8764 /* If the device has a permanent device address, the driver should
8765 * set dev_addr and also set addr_assign_type to
8766 * NET_ADDR_PERM (the default value).
8767 */
8768 if (dev->addr_assign_type == NET_ADDR_PERM)
8769 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
8770
8771 /* Notify protocols that a new device appeared. */
8772 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
8773 ret = notifier_to_errno(ret);
8774 if (ret) {
8775 rollback_registered(dev);
8776 dev->reg_state = NETREG_UNREGISTERED;
8777 }
8778 /*
8779 * Prevent userspace races by waiting until the network
8780 * device is fully set up before sending notifications.
8781 */
8782 if (!dev->rtnl_link_ops ||
8783 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
8784 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
8785
8786out:
8787 return ret;
8788
8789err_uninit:
8790 if (dev->netdev_ops->ndo_uninit)
8791 dev->netdev_ops->ndo_uninit(dev);
8792 if (dev->priv_destructor)
8793 dev->priv_destructor(dev);
8794 goto out;
8795}
8796EXPORT_SYMBOL(register_netdevice);
8797
8798/**
8799 * init_dummy_netdev - init a dummy network device for NAPI
8800 * @dev: device to init
8801 *
8802 * This takes a network device structure and initializes the minimum
8803 * number of fields so it can be used to schedule NAPI polls without
8804 * registering a full blown interface. This is to be used by drivers
8805 * that need to tie several hardware interfaces to a single NAPI
8806 * poll scheduler due to HW limitations.
8807 */
8808int init_dummy_netdev(struct net_device *dev)
8809{
8810 /* Clear everything. Note we don't initialize spinlocks,
8811 * as they aren't supposed to be taken by any of the
8812 * NAPI code and this dummy netdev is supposed to be
8813 * only ever used for NAPI polls
8814 */
8815 memset(dev, 0, sizeof(struct net_device));
8816
8817 /* make sure we BUG if trying to hit standard
8818 * register/unregister code path
8819 */
8820 dev->reg_state = NETREG_DUMMY;
8821
8822 /* NAPI wants this */
8823 INIT_LIST_HEAD(&dev->napi_list);
8824
8825 /* a dummy interface is started by default */
8826 set_bit(__LINK_STATE_PRESENT, &dev->state);
8827 set_bit(__LINK_STATE_START, &dev->state);
8828
8829 /* napi_busy_loop stats accounting wants this */
8830 dev_net_set(dev, &init_net);
8831
8832 /* Note: We don't allocate pcpu_refcnt for dummy devices,
8833 * because users of this 'device' don't need to change
8834 * its refcount.
8835 */
8836
8837 return 0;
8838}
8839EXPORT_SYMBOL_GPL(init_dummy_netdev);
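
/*
 * Illustrative sketch (hypothetical driver): a dummy netdev gives
 * several hardware channels a NAPI scheduling context even though no
 * real interface is registered. example_poll is a made-up poll
 * callback.
 *
 *	static struct net_device example_dummy_dev;
 *	static struct napi_struct example_napi;
 *
 *	init_dummy_netdev(&example_dummy_dev);
 *	netif_napi_add(&example_dummy_dev, &example_napi,
 *		       example_poll, NAPI_POLL_WEIGHT);
 *	napi_enable(&example_napi);
 */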
8840
8841
8842/**
8843 * register_netdev - register a network device
8844 * @dev: device to register
8845 *
8846 * Take a completed network device structure and add it to the kernel
8847 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
8848 * chain. 0 is returned on success. A negative errno code is returned
8849 * on a failure to set up the device, or if the name is a duplicate.
8850 *
8851 * This is a wrapper around register_netdevice that takes the rtnl semaphore
8852 * and expands the device name if you passed a format string to
8853 * alloc_netdev.
8854 */
8855int register_netdev(struct net_device *dev)
8856{
8857 int err;
8858
8859 if (rtnl_lock_killable())
8860 return -EINTR;
8861 err = register_netdevice(dev);
8862 rtnl_unlock();
8863 return err;
8864}
8865EXPORT_SYMBOL(register_netdev);
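
/*
 * Illustrative sketch (hypothetical probe routine, not from this
 * file): the usual allocate/initialize/register sequence built on the
 * APIs above. struct example_priv and example_netdev_ops are made-up
 * names.
 *
 *	dev = alloc_etherdev(sizeof(struct example_priv));
 *	if (!dev)
 *		return -ENOMEM;
 *	dev->netdev_ops = &example_netdev_ops;
 *	eth_hw_addr_random(dev);
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 */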
8866
8867int netdev_refcnt_read(const struct net_device *dev)
8868{
8869 int i, refcnt = 0;
8870
8871 for_each_possible_cpu(i)
8872 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
8873 return refcnt;
8874}
8875EXPORT_SYMBOL(netdev_refcnt_read);
8876
8877/**
8878 * netdev_wait_allrefs - wait until all references are gone.
8879 * @dev: target net_device
8880 *
8881 * This is called when unregistering network devices.
8882 *
8883 * Any protocol or device that holds a reference should register
8884 * for netdevice notification, and clean up and put back the
8885 * reference if it receives an UNREGISTER event.
8886 * We can get stuck here if buggy protocols don't correctly
8887 * call dev_put.
8888 */
8889static void netdev_wait_allrefs(struct net_device *dev)
8890{
8891 unsigned long rebroadcast_time, warning_time;
8892 int refcnt;
8893
8894 linkwatch_forget_dev(dev);
8895
8896 rebroadcast_time = warning_time = jiffies;
8897 refcnt = netdev_refcnt_read(dev);
8898
8899 while (refcnt != 0) {
8900 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
8901 rtnl_lock();
8902
8903 /* Rebroadcast unregister notification */
8904 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
8905
8906 __rtnl_unlock();
8907 rcu_barrier();
8908 rtnl_lock();
8909
8910 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
8911 &dev->state)) {
8912 /* We must not have linkwatch events
8913 * pending on unregister. If this
8914 * happens, we simply run the queue
8915 * unscheduled, resulting in a noop
8916 * for this device.
8917 */
8918 linkwatch_run_queue();
8919 }
8920
8921 __rtnl_unlock();
8922
8923 rebroadcast_time = jiffies;
8924 }
8925
8926 msleep(250);
8927
8928 refcnt = netdev_refcnt_read(dev);
8929
8930 if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) {
8931 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
8932 dev->name, refcnt);
8933 warning_time = jiffies;
8934 }
8935 }
8936}
8937
8938/* The sequence is:
8939 *
8940 * rtnl_lock();
8941 * ...
8942 * register_netdevice(x1);
8943 * register_netdevice(x2);
8944 * ...
8945 * unregister_netdevice(y1);
8946 * unregister_netdevice(y2);
8947 * ...
8948 * rtnl_unlock();
8949 * free_netdev(y1);
8950 * free_netdev(y2);
8951 *
8952 * We are invoked by rtnl_unlock().
8953 * This allows us to deal with problems:
8954 * 1) We can delete sysfs objects which invoke hotplug
8955 * without deadlocking with linkwatch via keventd.
8956 * 2) Since we run with the RTNL semaphore not held, we can sleep
8957 * safely in order to wait for the netdev refcnt to drop to zero.
8958 *
8959 * We must not return until all unregister events added during
8960 * the interval the lock was held have been completed.
8961 */
8962void netdev_run_todo(void)
8963{
8964 struct list_head list;
8965
8966 /* Snapshot list, allow later requests */
8967 list_replace_init(&net_todo_list, &list);
8968
8969 __rtnl_unlock();
8970
8971
8972 /* Wait for rcu callbacks to finish before next phase */
8973 if (!list_empty(&list))
8974 rcu_barrier();
8975
8976 while (!list_empty(&list)) {
8977 struct net_device *dev
8978 = list_first_entry(&list, struct net_device, todo_list);
8979 list_del(&dev->todo_list);
8980
8981 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
8982 pr_err("network todo '%s' but state %d\n",
8983 dev->name, dev->reg_state);
8984 dump_stack();
8985 continue;
8986 }
8987
8988 dev->reg_state = NETREG_UNREGISTERED;
8989
8990 netdev_wait_allrefs(dev);
8991
8992 /* paranoia */
8993 BUG_ON(netdev_refcnt_read(dev));
8994 BUG_ON(!list_empty(&dev->ptype_all));
8995 BUG_ON(!list_empty(&dev->ptype_specific));
8996 WARN_ON(rcu_access_pointer(dev->ip_ptr));
8997 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
8998#if IS_ENABLED(CONFIG_DECNET)
8999 WARN_ON(dev->dn_ptr);
9000#endif
9001 if (dev->priv_destructor)
9002 dev->priv_destructor(dev);
9003 if (dev->needs_free_netdev)
9004 free_netdev(dev);
9005
9006 /* Report a network device has been unregistered */
9007 rtnl_lock();
9008 dev_net(dev)->dev_unreg_count--;
9009 __rtnl_unlock();
9010 wake_up(&netdev_unregistering_wq);
9011
9012 /* Free network device */
9013 kobject_put(&dev->dev.kobj);
9014 }
9015}
9016
9017/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
9018 * all the same fields in the same order as net_device_stats, with only
9019 * the type differing, but rtnl_link_stats64 may have additional fields
9020 * at the end for newer counters.
9021 */
9022void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
9023 const struct net_device_stats *netdev_stats)
9024{
9025#if BITS_PER_LONG == 64
9026 BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
9027 memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
9028 /* zero out counters that only exist in rtnl_link_stats64 */
9029 memset((char *)stats64 + sizeof(*netdev_stats), 0,
9030 sizeof(*stats64) - sizeof(*netdev_stats));
9031#else
9032 size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
9033 const unsigned long *src = (const unsigned long *)netdev_stats;
9034 u64 *dst = (u64 *)stats64;
9035
9036 BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
9037 for (i = 0; i < n; i++)
9038 dst[i] = src[i];
9039 /* zero out counters that only exist in rtnl_link_stats64 */
9040 memset((char *)stats64 + n * sizeof(u64), 0,
9041 sizeof(*stats64) - n * sizeof(u64));
9042#endif
9043}
9044EXPORT_SYMBOL(netdev_stats_to_stats64);
9045
9046/**
9047 * dev_get_stats - get network device statistics
9048 * @dev: device to get statistics from
9049 * @storage: place to store stats
9050 *
9051 * Get network statistics from device. Return @storage.
9052 * The device driver may provide its own method by setting
9053 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
9054 * otherwise the internal statistics structure is used.
9055 */
9056struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
9057 struct rtnl_link_stats64 *storage)
9058{
9059 const struct net_device_ops *ops = dev->netdev_ops;
9060
9061 if (ops->ndo_get_stats64) {
9062 memset(storage, 0, sizeof(*storage));
9063 ops->ndo_get_stats64(dev, storage);
9064 } else if (ops->ndo_get_stats) {
9065 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
9066 } else {
9067 netdev_stats_to_stats64(storage, &dev->stats);
9068 }
9069 storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
9070 storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
9071 storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
9072 return storage;
9073}
9074EXPORT_SYMBOL(dev_get_stats);
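
/*
 * Illustrative sketch (hypothetical driver): a minimal ndo_get_stats64
 * implementation that starts from the generic conversion helper above
 * and layers driver-private counters on top. example_priv and
 * rx_ring_drops are made-up names.
 *
 *	static void example_get_stats64(struct net_device *dev,
 *					struct rtnl_link_stats64 *stats)
 *	{
 *		struct example_priv *priv = netdev_priv(dev);
 *
 *		netdev_stats_to_stats64(stats, &dev->stats);
 *		stats->rx_dropped += priv->rx_ring_drops;
 *	}
 */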
9075
9076struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
9077{
9078 struct netdev_queue *queue = dev_ingress_queue(dev);
9079
9080#ifdef CONFIG_NET_CLS_ACT
9081 if (queue)
9082 return queue;
9083 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
9084 if (!queue)
9085 return NULL;
9086 netdev_init_one_queue(dev, queue, NULL);
9087 RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
9088 queue->qdisc_sleeping = &noop_qdisc;
9089 rcu_assign_pointer(dev->ingress_queue, queue);
9090#endif
9091 return queue;
9092}
9093
9094static const struct ethtool_ops default_ethtool_ops;
9095
9096void netdev_set_default_ethtool_ops(struct net_device *dev,
9097 const struct ethtool_ops *ops)
9098{
9099 if (dev->ethtool_ops == &default_ethtool_ops)
9100 dev->ethtool_ops = ops;
9101}
9102EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
9103
9104void netdev_freemem(struct net_device *dev)
9105{
9106 char *addr = (char *)dev - dev->padded;
9107
9108 kvfree(addr);
9109}
9110
9111/**
9112 * alloc_netdev_mqs - allocate network device
9113 * @sizeof_priv: size of private data to allocate space for
9114 * @name: device name format string
9115 * @name_assign_type: origin of device name
9116 * @setup: callback to initialize device
9117 * @txqs: the number of TX subqueues to allocate
9118 * @rxqs: the number of RX subqueues to allocate
9119 *
9120 * Allocates a struct net_device with private data area for driver use
9121 * and performs basic initialization. Also allocates subqueue structs
9122 * for each queue on the device.
9123 */
9124struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
9125 unsigned char name_assign_type,
9126 void (*setup)(struct net_device *),
9127 unsigned int txqs, unsigned int rxqs)
9128{
9129 struct net_device *dev;
9130 unsigned int alloc_size;
9131 struct net_device *p;
9132
9133 BUG_ON(strlen(name) >= sizeof(dev->name));
9134
9135 if (txqs < 1) {
9136 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
9137 return NULL;
9138 }
9139
9140 if (rxqs < 1) {
9141 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
9142 return NULL;
9143 }
9144
9145 alloc_size = sizeof(struct net_device);
9146 if (sizeof_priv) {
9147 /* ensure 32-byte alignment of private area */
9148 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
9149 alloc_size += sizeof_priv;
9150 }
9151 /* ensure 32-byte alignment of whole construct */
9152 alloc_size += NETDEV_ALIGN - 1;
9153
9154 p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
9155 if (!p)
9156 return NULL;
9157
9158 dev = PTR_ALIGN(p, NETDEV_ALIGN);
9159 dev->padded = (char *)dev - (char *)p;
9160
9161 dev->pcpu_refcnt = alloc_percpu(int);
9162 if (!dev->pcpu_refcnt)
9163 goto free_dev;
9164
9165 if (dev_addr_init(dev))
9166 goto free_pcpu;
9167
9168 dev_mc_init(dev);
9169 dev_uc_init(dev);
9170
9171 dev_net_set(dev, &init_net);
9172
9173 dev->gso_max_size = GSO_MAX_SIZE;
9174 dev->gso_max_segs = GSO_MAX_SEGS;
9175
9176 INIT_LIST_HEAD(&dev->napi_list);
9177 INIT_LIST_HEAD(&dev->unreg_list);
9178 INIT_LIST_HEAD(&dev->close_list);
9179 INIT_LIST_HEAD(&dev->link_watch_list);
9180 INIT_LIST_HEAD(&dev->adj_list.upper);
9181 INIT_LIST_HEAD(&dev->adj_list.lower);
9182 INIT_LIST_HEAD(&dev->ptype_all);
9183 INIT_LIST_HEAD(&dev->ptype_specific);
9184#ifdef CONFIG_NET_SCHED
9185 hash_init(dev->qdisc_hash);
9186#endif
9187 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
9188 setup(dev);
9189
9190 if (!dev->tx_queue_len) {
9191 dev->priv_flags |= IFF_NO_QUEUE;
9192 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
9193 }
9194
9195 dev->num_tx_queues = txqs;
9196 dev->real_num_tx_queues = txqs;
9197 if (netif_alloc_netdev_queues(dev))
9198 goto free_all;
9199
9200 dev->num_rx_queues = rxqs;
9201 dev->real_num_rx_queues = rxqs;
9202 if (netif_alloc_rx_queues(dev))
9203 goto free_all;
9204
9205 strcpy(dev->name, name);
9206 dev->name_assign_type = name_assign_type;
9207 dev->group = INIT_NETDEV_GROUP;
9208 if (!dev->ethtool_ops)
9209 dev->ethtool_ops = &default_ethtool_ops;
9210
9211 nf_hook_ingress_init(dev);
9212
9213 return dev;
9214
9215free_all:
9216 free_netdev(dev);
9217 return NULL;
9218
9219free_pcpu:
9220 free_percpu(dev->pcpu_refcnt);
9221free_dev:
9222 netdev_freemem(dev);
9223 return NULL;
9224}
9225EXPORT_SYMBOL(alloc_netdev_mqs);
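
/*
 * Illustrative sketch (assumption): most drivers reach this function
 * through wrappers such as alloc_etherdev_mqs(), but a direct call
 * allocating a device with 8 TX and 8 RX queues could look like this.
 * example_setup and struct example_priv are hypothetical.
 *
 *	dev = alloc_netdev_mqs(sizeof(struct example_priv), "ex%d",
 *			       NET_NAME_UNKNOWN, example_setup, 8, 8);
 *	if (!dev)
 *		return -ENOMEM;
 */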
9226
9227/**
9228 * free_netdev - free network device
9229 * @dev: device
9230 *
9231 * This function does the last stage of destroying an allocated device
9232 * interface. The reference to the device object is released. If this
9233 * is the last reference then it will be freed.Must be called in process
9234 * context.
9235 */
9236void free_netdev(struct net_device *dev)
9237{
9238 struct napi_struct *p, *n;
9239
9240 might_sleep();
9241 netif_free_tx_queues(dev);
9242 netif_free_rx_queues(dev);
9243
9244 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
9245
9246 /* Flush device addresses */
9247 dev_addr_flush(dev);
9248
9249 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
9250 netif_napi_del(p);
9251
9252 free_percpu(dev->pcpu_refcnt);
9253 dev->pcpu_refcnt = NULL;
9254
9255 /* Compatibility with error handling in drivers */
9256 if (dev->reg_state == NETREG_UNINITIALIZED) {
9257 netdev_freemem(dev);
9258 return;
9259 }
9260
9261 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
9262 dev->reg_state = NETREG_RELEASED;
9263
9264 /* will free via device release */
9265 put_device(&dev->dev);
9266}
9267EXPORT_SYMBOL(free_netdev);
9268
9269/**
9270 * synchronize_net - Synchronize with packet receive processing
9271 *
9272 * Wait for packets currently being received to be done.
9273 * Does not block later packets from starting.
9274 */
9275void synchronize_net(void)
9276{
9277 might_sleep();
9278 if (rtnl_is_locked())
9279 synchronize_rcu_expedited();
9280 else
9281 synchronize_rcu();
9282}
9283EXPORT_SYMBOL(synchronize_net);
9284
9285/**
9286 * unregister_netdevice_queue - remove device from the kernel
9287 * @dev: device
9288 * @head: list
9289 *
9290 * This function shuts down a device interface and removes it
9291 * from the kernel tables.
9292 * If head is not NULL, the device is queued to be unregistered later.
9293 *
9294 * Callers must hold the rtnl semaphore. You may want
9295 * unregister_netdev() instead of this.
9296 */
9297
9298void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
9299{
9300 ASSERT_RTNL();
9301
9302 if (head) {
9303 list_move_tail(&dev->unreg_list, head);
9304 } else {
9305 rollback_registered(dev);
9306 /* Finish processing unregister after unlock */
9307 net_set_todo(dev);
9308 }
9309}
9310EXPORT_SYMBOL(unregister_netdevice_queue);
9311
9312/**
9313 * unregister_netdevice_many - unregister many devices
9314 * @head: list of devices
9315 *
9316 * Note: As most callers use a stack-allocated list_head,
9317 * we force a list_del() to make sure the stack won't be corrupted later.
9318 */
9319void unregister_netdevice_many(struct list_head *head)
9320{
9321 struct net_device *dev;
9322
9323 if (!list_empty(head)) {
9324 rollback_registered_many(head);
9325 list_for_each_entry(dev, head, unreg_list)
9326 net_set_todo(dev);
9327 list_del(head);
9328 }
9329}
9330EXPORT_SYMBOL(unregister_netdevice_many);
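
/*
 * Illustrative sketch: batching several unregistrations under a single
 * RTNL hold, the pattern rtnl_link ->dellink implementations use. dev1
 * and dev2 are hypothetical registered devices.
 *
 *	LIST_HEAD(kill_list);
 *
 *	rtnl_lock();
 *	unregister_netdevice_queue(dev1, &kill_list);
 *	unregister_netdevice_queue(dev2, &kill_list);
 *	unregister_netdevice_many(&kill_list);
 *	rtnl_unlock();
 */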
9331
9332/**
9333 * unregister_netdev - remove device from the kernel
9334 * @dev: device
9335 *
9336 * This function shuts down a device interface and removes it
9337 * from the kernel tables.
9338 *
9339 * This is just a wrapper for unregister_netdevice that takes
9340 * the rtnl semaphore. In general you want to use this and not
9341 * unregister_netdevice.
9342 */
9343void unregister_netdev(struct net_device *dev)
9344{
9345 rtnl_lock();
9346 unregister_netdevice(dev);
9347 rtnl_unlock();
9348}
9349EXPORT_SYMBOL(unregister_netdev);
9350
9351/**
9352 * dev_change_net_namespace - move device to a different network namespace
9353 * @dev: device
9354 * @net: network namespace
9355 * @pat: If not NULL name pattern to try if the current device name
9356 * is already taken in the destination network namespace.
9357 *
9358 * This function shuts down a device interface and moves it
9359 * to a new network namespace. On success 0 is returned, on
9360 * a failure a negative errno code is returned.
9361 *
9362 * Callers must hold the rtnl semaphore.
9363 */
9364
9365int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
9366{
9367 int err, new_nsid, new_ifindex;
9368
9369 ASSERT_RTNL();
9370
9371 /* Don't allow namespace local devices to be moved. */
9372 err = -EINVAL;
9373 if (dev->features & NETIF_F_NETNS_LOCAL)
9374 goto out;
9375
9376 /* Ensure the device has been registered */
9377 if (dev->reg_state != NETREG_REGISTERED)
9378 goto out;
9379
9380 /* Get out if there is nothing to do */
9381 err = 0;
9382 if (net_eq(dev_net(dev), net))
9383 goto out;
9384
9385 /* Pick the destination device name, and ensure
9386 * we can use it in the destination network namespace.
9387 */
9388 err = -EEXIST;
9389 if (__dev_get_by_name(net, dev->name)) {
9390 /* We get here if we can't use the current device name */
9391 if (!pat)
9392 goto out;
9393 err = dev_get_valid_name(net, dev, pat);
9394 if (err < 0)
9395 goto out;
9396 }
9397
9398 /*
9399 * And now a mini version of register_netdevice/unregister_netdevice.
9400 */
9401
9402 /* If device is running, close it first. */
9403 dev_close(dev);
9404
9405 /* And unlink it from device chain */
9406 unlist_netdevice(dev);
9407
9408 synchronize_net();
9409
9410 /* Shutdown queueing discipline. */
9411 dev_shutdown(dev);
9412
9413 /* Notify protocols that we are about to destroy
9414 * this device. They should clean up all of their state.
9415 *
9416 * Note that dev->reg_state stays at NETREG_REGISTERED.
9417 * This is intentional, so that 8021q and macvlan know
9418 * the device is just moving and can keep their slaves up.
9419 */
9420 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
9421 rcu_barrier();
9422
9423 new_nsid = peernet2id_alloc(dev_net(dev), net);
9424 /* If there is an ifindex conflict, assign a new one */
9425 if (__dev_get_by_index(net, dev->ifindex))
9426 new_ifindex = dev_new_index(net);
9427 else
9428 new_ifindex = dev->ifindex;
9429
9430 rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
9431 new_ifindex);
9432
9433 /*
9434 * Flush the unicast and multicast chains
9435 */
9436 dev_uc_flush(dev);
9437 dev_mc_flush(dev);
9438
9439 /* Send a netdev-removed uevent to the old namespace */
9440 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
9441 netdev_adjacent_del_links(dev);
9442
9443 /* Actually switch the network namespace */
9444 dev_net_set(dev, net);
9445 dev->ifindex = new_ifindex;
9446
9447 /* Send a netdev-add uevent to the new namespace */
9448 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
9449 netdev_adjacent_add_links(dev);
9450
9451 /* Fixup kobjects */
9452 err = device_rename(&dev->dev, dev->name);
9453 WARN_ON(err);
9454
9455 /* Add the device back in the hashes */
9456 list_netdevice(dev);
9457
9458 /* Notify protocols that a new device appeared. */
9459 call_netdevice_notifiers(NETDEV_REGISTER, dev);
9460
9461 /*
9462 * Prevent userspace races by waiting until the network
9463 * device is fully set up before sending notifications.
9464 */
9465 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
9466
9467 synchronize_net();
9468 err = 0;
9469out:
9470 return err;
9471}
9472EXPORT_SYMBOL_GPL(dev_change_net_namespace);
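
/*
 * Illustrative sketch (assumption): moving a device into another
 * namespace from kernel code; net would typically come from
 * get_net_ns_by_fd() or get_net_ns_by_pid(), and the "eth%d" pattern
 * is only consulted if the current name is already taken there.
 *
 *	rtnl_lock();
 *	err = dev_change_net_namespace(dev, net, "eth%d");
 *	rtnl_unlock();
 */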
9473
9474static int dev_cpu_dead(unsigned int oldcpu)
9475{
9476 struct sk_buff **list_skb;
9477 struct sk_buff *skb;
9478 unsigned int cpu;
9479 struct softnet_data *sd, *oldsd, *remsd = NULL;
9480
9481 local_irq_disable();
9482 cpu = smp_processor_id();
9483 sd = &per_cpu(softnet_data, cpu);
9484 oldsd = &per_cpu(softnet_data, oldcpu);
9485
9486 /* Find end of our completion_queue. */
9487 list_skb = &sd->completion_queue;
9488 while (*list_skb)
9489 list_skb = &(*list_skb)->next;
9490 /* Append completion queue from offline CPU. */
9491 *list_skb = oldsd->completion_queue;
9492 oldsd->completion_queue = NULL;
9493
9494 /* Append output queue from offline CPU. */
9495 if (oldsd->output_queue) {
9496 *sd->output_queue_tailp = oldsd->output_queue;
9497 sd->output_queue_tailp = oldsd->output_queue_tailp;
9498 oldsd->output_queue = NULL;
9499 oldsd->output_queue_tailp = &oldsd->output_queue;
9500 }
9501 /* Append NAPI poll list from offline CPU, with one exception:
9502 * process_backlog() must be called by cpu owning percpu backlog.
9503 * We properly handle process_queue & input_pkt_queue later.
9504 */
9505 while (!list_empty(&oldsd->poll_list)) {
9506 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
9507 struct napi_struct,
9508 poll_list);
9509
9510 list_del_init(&napi->poll_list);
9511 if (napi->poll == process_backlog)
9512 napi->state = 0;
9513 else
9514 ____napi_schedule(sd, napi);
9515 }
9516
9517 raise_softirq_irqoff(NET_TX_SOFTIRQ);
9518 local_irq_enable();
9519
9520#ifdef CONFIG_RPS
9521 remsd = oldsd->rps_ipi_list;
9522 oldsd->rps_ipi_list = NULL;
9523#endif
9524 /* send out pending IPIs on offline CPU */
9525 net_rps_send_ipi(remsd);
9526
9527 /* Process offline CPU's input_pkt_queue */
9528 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
9529 netif_rx_ni(skb);
9530 input_queue_head_incr(oldsd);
9531 }
9532 while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
9533 netif_rx_ni(skb);
9534 input_queue_head_incr(oldsd);
9535 }
9536
9537 return 0;
9538}
9539
9540/**
9541 * netdev_increment_features - increment feature set by one
9542 * @all: current feature set
9543 * @one: new feature set
9544 * @mask: mask feature set
9545 *
9546 * Computes a new feature set after adding a device with feature set
9547 * @one to the master device with current feature set @all. Will not
9548 * enable anything that is off in @mask. Returns the new feature set.
9549 */
9550netdev_features_t netdev_increment_features(netdev_features_t all,
9551 netdev_features_t one, netdev_features_t mask)
9552{
9553 if (mask & NETIF_F_HW_CSUM)
9554 mask |= NETIF_F_CSUM_MASK;
9555 mask |= NETIF_F_VLAN_CHALLENGED;
9556
9557 all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
9558 all &= one | ~NETIF_F_ALL_FOR_ALL;
9559
9560 /* If one device supports hw checksumming, set for all. */
9561 if (all & NETIF_F_HW_CSUM)
9562 all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
9563
9564 return all;
9565}
9566EXPORT_SYMBOL(netdev_increment_features);
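
/*
 * Illustrative sketch (schematic, not taken from a real driver): a
 * master device such as a bond might fold each slave's feature set
 * into its own with the helper above; slaves, slave and mask are
 * placeholder names.
 *
 *	netdev_features_t features = NETIF_F_ALL_FOR_ALL;
 *
 *	list_for_each_entry(slave, &slaves, list)
 *		features = netdev_increment_features(features,
 *						     slave->dev->features,
 *						     mask);
 *	dev->features = features;
 */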
9567
9568static struct hlist_head * __net_init netdev_create_hash(void)
9569{
9570 int i;
9571 struct hlist_head *hash;
9572
9573 hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
9574 if (hash != NULL)
9575 for (i = 0; i < NETDEV_HASHENTRIES; i++)
9576 INIT_HLIST_HEAD(&hash[i]);
9577
9578 return hash;
9579}
9580
9581/* Initialize per network namespace state */
9582static int __net_init netdev_init(struct net *net)
9583{
9584 BUILD_BUG_ON(GRO_HASH_BUCKETS >
9585 8 * FIELD_SIZEOF(struct napi_struct, gro_bitmask));
9586
9587 if (net != &init_net)
9588 INIT_LIST_HEAD(&net->dev_base_head);
9589
9590 net->dev_name_head = netdev_create_hash();
9591 if (net->dev_name_head == NULL)
9592 goto err_name;
9593
9594 net->dev_index_head = netdev_create_hash();
9595 if (net->dev_index_head == NULL)
9596 goto err_idx;
9597
9598 return 0;
9599
9600err_idx:
9601 kfree(net->dev_name_head);
9602err_name:
9603 return -ENOMEM;
9604}
9605
9606/**
9607 * netdev_drivername - network driver for the device
9608 * @dev: network device
9609 *
9610 * Determine network driver for device.
9611 */
9612const char *netdev_drivername(const struct net_device *dev)
9613{
9614 const struct device_driver *driver;
9615 const struct device *parent;
9616 const char *empty = "";
9617
9618 parent = dev->dev.parent;
9619 if (!parent)
9620 return empty;
9621
9622 driver = parent->driver;
9623 if (driver && driver->name)
9624 return driver->name;
9625 return empty;
9626}
9627
9628static void __netdev_printk(const char *level, const struct net_device *dev,
9629 struct va_format *vaf)
9630{
9631 if (dev && dev->dev.parent) {
9632 dev_printk_emit(level[1] - '0',
9633 dev->dev.parent,
9634 "%s %s %s%s: %pV",
9635 dev_driver_string(dev->dev.parent),
9636 dev_name(dev->dev.parent),
9637 netdev_name(dev), netdev_reg_state(dev),
9638 vaf);
9639 } else if (dev) {
9640 printk("%s%s%s: %pV",
9641 level, netdev_name(dev), netdev_reg_state(dev), vaf);
9642 } else {
9643 printk("%s(NULL net_device): %pV", level, vaf);
9644 }
9645}
9646
9647void netdev_printk(const char *level, const struct net_device *dev,
9648 const char *format, ...)
9649{
9650 struct va_format vaf;
9651 va_list args;
9652
9653 va_start(args, format);
9654
9655 vaf.fmt = format;
9656 vaf.va = &args;
9657
9658 __netdev_printk(level, dev, &vaf);
9659
9660 va_end(args);
9661}
9662EXPORT_SYMBOL(netdev_printk);
9663
9664#define define_netdev_printk_level(func, level) \
9665void func(const struct net_device *dev, const char *fmt, ...) \
9666{ \
9667 struct va_format vaf; \
9668 va_list args; \
9669 \
9670 va_start(args, fmt); \
9671 \
9672 vaf.fmt = fmt; \
9673 vaf.va = &args; \
9674 \
9675 __netdev_printk(level, dev, &vaf); \
9676 \
9677 va_end(args); \
9678} \
9679EXPORT_SYMBOL(func);
9680
9681define_netdev_printk_level(netdev_emerg, KERN_EMERG);
9682define_netdev_printk_level(netdev_alert, KERN_ALERT);
9683define_netdev_printk_level(netdev_crit, KERN_CRIT);
9684define_netdev_printk_level(netdev_err, KERN_ERR);
9685define_netdev_printk_level(netdev_warn, KERN_WARNING);
9686define_netdev_printk_level(netdev_notice, KERN_NOTICE);
9687define_netdev_printk_level(netdev_info, KERN_INFO);
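
/*
 * Illustrative usage of the level helpers generated above; the format
 * strings and arguments are arbitrary examples.
 *
 *	netdev_err(dev, "TX timeout on queue %d\n", txq);
 *	netdev_info(dev, "link up, %u Mbps full duplex\n", speed);
 */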
9688
9689static void __net_exit netdev_exit(struct net *net)
9690{
9691 kfree(net->dev_name_head);
9692 kfree(net->dev_index_head);
9693 if (net != &init_net)
9694 WARN_ON_ONCE(!list_empty(&net->dev_base_head));
9695}
9696
9697static struct pernet_operations __net_initdata netdev_net_ops = {
9698 .init = netdev_init,
9699 .exit = netdev_exit,
9700};
9701
9702static void __net_exit default_device_exit(struct net *net)
9703{
9704 struct net_device *dev, *aux;
9705 /*
9706 * Push all migratable network devices back to the
9707 * initial network namespace
9708 */
9709 rtnl_lock();
9710 for_each_netdev_safe(net, dev, aux) {
9711 int err;
9712 char fb_name[IFNAMSIZ];
9713
9714 /* Ignore unmovable devices (e.g. loopback) */
9715 if (dev->features & NETIF_F_NETNS_LOCAL)
9716 continue;
9717
9718 /* Leave virtual devices for the generic cleanup */
9719 if (dev->rtnl_link_ops)
9720 continue;
9721
9722 /* Push remaining network devices to init_net */
9723 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
9724 err = dev_change_net_namespace(dev, &init_net, fb_name);
9725 if (err) {
9726 pr_emerg("%s: failed to move %s to init_net: %d\n",
9727 __func__, dev->name, err);
9728 BUG();
9729 }
9730 }
9731 rtnl_unlock();
9732}
9733
9734static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
9735{
9736 /* Return with the rtnl_lock held when there are no network
9737 * devices unregistering in any network namespace in net_list.
9738 */
9739 struct net *net;
9740 bool unregistering;
9741 DEFINE_WAIT_FUNC(wait, woken_wake_function);
9742
9743 add_wait_queue(&netdev_unregistering_wq, &wait);
9744 for (;;) {
9745 unregistering = false;
9746 rtnl_lock();
9747 list_for_each_entry(net, net_list, exit_list) {
9748 if (net->dev_unreg_count > 0) {
9749 unregistering = true;
9750 break;
9751 }
9752 }
9753 if (!unregistering)
9754 break;
9755 __rtnl_unlock();
9756
9757 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
9758 }
9759 remove_wait_queue(&netdev_unregistering_wq, &wait);
9760}
9761
9762static void __net_exit default_device_exit_batch(struct list_head *net_list)
9763{
9764 /* At exit all network devices must be removed from a network
9765 * namespace. Do this in the reverse order of registration.
9766 * Do this across as many network namespaces as possible to
9767 * improve batching efficiency.
9768 */
9769 struct net_device *dev;
9770 struct net *net;
9771 LIST_HEAD(dev_kill_list);
9772
9773 /* To prevent network device cleanup code from dereferencing
9774 * loopback devices or network devices that have been freed,
9775 * wait here for all pending unregistrations to complete
9776 * before unregistering the loopback device and allowing the
9777 * network namespace to be freed.
9778 *
9779 * The netdev todo list containing all network device
9780 * unregistrations that happen in default_device_exit_batch
9781 * will run in the rtnl_unlock() at the end of
9782 * default_device_exit_batch.
9783 */
9784 rtnl_lock_unregistering(net_list);
9785 list_for_each_entry(net, net_list, exit_list) {
9786 for_each_netdev_reverse(net, dev) {
9787 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
9788 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
9789 else
9790 unregister_netdevice_queue(dev, &dev_kill_list);
9791 }
9792 }
9793 unregister_netdevice_many(&dev_kill_list);
9794 rtnl_unlock();
9795}
9796
9797static struct pernet_operations __net_initdata default_device_ops = {
9798 .exit = default_device_exit,
9799 .exit_batch = default_device_exit_batch,
9800};
9801
9802/*
9803 * Initialize the DEV module. At boot time this walks the device list and
9804 * unhooks any devices that fail to initialize (normally hardware not
9805 * present) and leaves us with a valid list of present and active devices.
9806 *
9807 */
9808
9809/*
9810 * This is called single threaded during boot, so no need
9811 * to take the rtnl semaphore.
9812 */
9813static int __init net_dev_init(void)
9814{
9815 int i, rc = -ENOMEM;
9816
9817 BUG_ON(!dev_boot_phase);
9818
9819 if (dev_proc_init())
9820 goto out;
9821
9822 if (netdev_kobject_init())
9823 goto out;
9824
9825 INIT_LIST_HEAD(&ptype_all);
9826 for (i = 0; i < PTYPE_HASH_SIZE; i++)
9827 INIT_LIST_HEAD(&ptype_base[i]);
9828
9829 INIT_LIST_HEAD(&offload_base);
9830
9831 if (register_pernet_subsys(&netdev_net_ops))
9832 goto out;
9833
9834 /*
9835 * Initialize the packet receive queues.
9836 */
9837
9838 for_each_possible_cpu(i) {
9839 struct work_struct *flush = per_cpu_ptr(&flush_works, i);
9840 struct softnet_data *sd = &per_cpu(softnet_data, i);
9841
9842 INIT_WORK(flush, flush_backlog);
9843
9844 skb_queue_head_init(&sd->input_pkt_queue);
9845 skb_queue_head_init(&sd->process_queue);
9846#ifdef CONFIG_XFRM_OFFLOAD
9847 skb_queue_head_init(&sd->xfrm_backlog);
9848#endif
9849 INIT_LIST_HEAD(&sd->poll_list);
9850 sd->output_queue_tailp = &sd->output_queue;
9851#ifdef CONFIG_RPS
9852 sd->csd.func = rps_trigger_softirq;
9853 sd->csd.info = sd;
9854 sd->cpu = i;
9855#endif
9856
9857 init_gro_hash(&sd->backlog);
9858 sd->backlog.poll = process_backlog;
9859 sd->backlog.weight = weight_p;
9860 }
9861
9862 dev_boot_phase = 0;
9863
9864 /* The loopback device is special: if any other network device
9865 * is present in a network namespace, the loopback device must
9866 * be present too. Since we now dynamically allocate and free
9867 * the loopback device, ensure this invariant is maintained by
9868 * keeping the loopback device the first device on the
9869 * list of network devices, so that it is the first
9870 * device that appears and the last network device
9871 * that disappears.
9872 */
9873 if (register_pernet_device(&loopback_net_ops))
9874 goto out;
9875
9876 if (register_pernet_device(&default_device_ops))
9877 goto out;
9878
9879 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
9880 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
9881
9882 rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
9883 NULL, dev_cpu_dead);
9884 WARN_ON(rc < 0);
9885 rc = 0;
9886out:
9887 return rc;
9888}
9889
9890subsys_initcall(net_dev_init);