drivers/net/virtio_net.c at v5.19-rc8

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / drivers / net / virtio_net.c
at v5.19-rc8 3850 lines 103 kB view raw
wrap content
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/* A network driver using virtio.
   3 *
   4 * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
   5 */
   6//#define DEBUG
   7#include <linux/netdevice.h>
   8#include <linux/etherdevice.h>
   9#include <linux/ethtool.h>
  10#include <linux/module.h>
  11#include <linux/virtio.h>
  12#include <linux/virtio_net.h>
  13#include <linux/bpf.h>
  14#include <linux/bpf_trace.h>
  15#include <linux/scatterlist.h>
  16#include <linux/if_vlan.h>
  17#include <linux/slab.h>
  18#include <linux/cpu.h>
  19#include <linux/average.h>
  20#include <linux/filter.h>
  21#include <linux/kernel.h>
  22#include <net/route.h>
  23#include <net/xdp.h>
  24#include <net/net_failover.h>
  25
  26static int napi_weight = NAPI_POLL_WEIGHT;
  27module_param(napi_weight, int, 0444);
  28
  29static bool csum = true, gso = true, napi_tx = true;
  30module_param(csum, bool, 0444);
  31module_param(gso, bool, 0444);
  32module_param(napi_tx, bool, 0644);
  33
  34/* FIXME: MTU in config. */
  35#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
  36#define GOOD_COPY_LEN	128
  37
  38#define VIRTNET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
  39
  40/* Amount of XDP headroom to prepend to packets for use by xdp_adjust_head */
  41#define VIRTIO_XDP_HEADROOM 256
  42
  43/* Separating two types of XDP xmit */
  44#define VIRTIO_XDP_TX		BIT(0)
  45#define VIRTIO_XDP_REDIR	BIT(1)
  46
  47#define VIRTIO_XDP_FLAG	BIT(0)
  48
  49/* RX packet size EWMA. The average packet size is used to determine the packet
  50 * buffer size when refilling RX rings. As the entire RX ring may be refilled
  51 * at once, the weight is chosen so that the EWMA will be insensitive to short-
  52 * term, transient changes in packet size.
  53 */
  54DECLARE_EWMA(pkt_len, 0, 64)
  55
  56#define VIRTNET_DRIVER_VERSION "1.0.0"
  57
  58static const unsigned long guest_offloads[] = {
  59	VIRTIO_NET_F_GUEST_TSO4,
  60	VIRTIO_NET_F_GUEST_TSO6,
  61	VIRTIO_NET_F_GUEST_ECN,
  62	VIRTIO_NET_F_GUEST_UFO,
  63	VIRTIO_NET_F_GUEST_CSUM
  64};
  65
  66#define GUEST_OFFLOAD_GRO_HW_MASK ((1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
  67				(1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
  68				(1ULL << VIRTIO_NET_F_GUEST_ECN)  | \
  69				(1ULL << VIRTIO_NET_F_GUEST_UFO))
  70
  71struct virtnet_stat_desc {
  72	char desc[ETH_GSTRING_LEN];
  73	size_t offset;
  74};
  75
  76struct virtnet_sq_stats {
  77	struct u64_stats_sync syncp;
  78	u64 packets;
  79	u64 bytes;
  80	u64 xdp_tx;
  81	u64 xdp_tx_drops;
  82	u64 kicks;
  83	u64 tx_timeouts;
  84};
  85
  86struct virtnet_rq_stats {
  87	struct u64_stats_sync syncp;
  88	u64 packets;
  89	u64 bytes;
  90	u64 drops;
  91	u64 xdp_packets;
  92	u64 xdp_tx;
  93	u64 xdp_redirects;
  94	u64 xdp_drops;
  95	u64 kicks;
  96};
  97
  98#define VIRTNET_SQ_STAT(m)	offsetof(struct virtnet_sq_stats, m)
  99#define VIRTNET_RQ_STAT(m)	offsetof(struct virtnet_rq_stats, m)
 100
 101static const struct virtnet_stat_desc virtnet_sq_stats_desc[] = {
 102	{ "packets",		VIRTNET_SQ_STAT(packets) },
 103	{ "bytes",		VIRTNET_SQ_STAT(bytes) },
 104	{ "xdp_tx",		VIRTNET_SQ_STAT(xdp_tx) },
 105	{ "xdp_tx_drops",	VIRTNET_SQ_STAT(xdp_tx_drops) },
 106	{ "kicks",		VIRTNET_SQ_STAT(kicks) },
 107	{ "tx_timeouts",	VIRTNET_SQ_STAT(tx_timeouts) },
 108};
 109
 110static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
 111	{ "packets",		VIRTNET_RQ_STAT(packets) },
 112	{ "bytes",		VIRTNET_RQ_STAT(bytes) },
 113	{ "drops",		VIRTNET_RQ_STAT(drops) },
 114	{ "xdp_packets",	VIRTNET_RQ_STAT(xdp_packets) },
 115	{ "xdp_tx",		VIRTNET_RQ_STAT(xdp_tx) },
 116	{ "xdp_redirects",	VIRTNET_RQ_STAT(xdp_redirects) },
 117	{ "xdp_drops",		VIRTNET_RQ_STAT(xdp_drops) },
 118	{ "kicks",		VIRTNET_RQ_STAT(kicks) },
 119};
 120
 121#define VIRTNET_SQ_STATS_LEN	ARRAY_SIZE(virtnet_sq_stats_desc)
 122#define VIRTNET_RQ_STATS_LEN	ARRAY_SIZE(virtnet_rq_stats_desc)
 123
 124/* Internal representation of a send virtqueue */
 125struct send_queue {
 126	/* Virtqueue associated with this send _queue */
 127	struct virtqueue *vq;
 128
 129	/* TX: fragments + linear part + virtio header */
 130	struct scatterlist sg[MAX_SKB_FRAGS + 2];
 131
 132	/* Name of the send queue: output.$index */
 133	char name[40];
 134
 135	struct virtnet_sq_stats stats;
 136
 137	struct napi_struct napi;
 138};
 139
 140/* Internal representation of a receive virtqueue */
 141struct receive_queue {
 142	/* Virtqueue associated with this receive_queue */
 143	struct virtqueue *vq;
 144
 145	struct napi_struct napi;
 146
 147	struct bpf_prog __rcu *xdp_prog;
 148
 149	struct virtnet_rq_stats stats;
 150
 151	/* Chain pages by the private ptr. */
 152	struct page *pages;
 153
 154	/* Average packet length for mergeable receive buffers. */
 155	struct ewma_pkt_len mrg_avg_pkt_len;
 156
 157	/* Page frag for packet buffer allocation. */
 158	struct page_frag alloc_frag;
 159
 160	/* RX: fragments + linear part + virtio header */
 161	struct scatterlist sg[MAX_SKB_FRAGS + 2];
 162
 163	/* Min single buffer size for mergeable buffers case. */
 164	unsigned int min_buf_len;
 165
 166	/* Name of this receive queue: input.$index */
 167	char name[40];
 168
 169	struct xdp_rxq_info xdp_rxq;
 170};
 171
 172/* This structure can contain rss message with maximum settings for indirection table and keysize
 173 * Note, that default structure that describes RSS configuration virtio_net_rss_config
 174 * contains same info but can't handle table values.
 175 * In any case, structure would be passed to virtio hw through sg_buf split by parts
 176 * because table sizes may be differ according to the device configuration.
 177 */
 178#define VIRTIO_NET_RSS_MAX_KEY_SIZE     40
 179#define VIRTIO_NET_RSS_MAX_TABLE_LEN    128
 180struct virtio_net_ctrl_rss {
 181	u32 hash_types;
 182	u16 indirection_table_mask;
 183	u16 unclassified_queue;
 184	u16 indirection_table[VIRTIO_NET_RSS_MAX_TABLE_LEN];
 185	u16 max_tx_vq;
 186	u8 hash_key_length;
 187	u8 key[VIRTIO_NET_RSS_MAX_KEY_SIZE];
 188};
 189
 190/* Control VQ buffers: protected by the rtnl lock */
 191struct control_buf {
 192	struct virtio_net_ctrl_hdr hdr;
 193	virtio_net_ctrl_ack status;
 194	struct virtio_net_ctrl_mq mq;
 195	u8 promisc;
 196	u8 allmulti;
 197	__virtio16 vid;
 198	__virtio64 offloads;
 199	struct virtio_net_ctrl_rss rss;
 200};
 201
 202struct virtnet_info {
 203	struct virtio_device *vdev;
 204	struct virtqueue *cvq;
 205	struct net_device *dev;
 206	struct send_queue *sq;
 207	struct receive_queue *rq;
 208	unsigned int status;
 209
 210	/* Max # of queue pairs supported by the device */
 211	u16 max_queue_pairs;
 212
 213	/* # of queue pairs currently used by the driver */
 214	u16 curr_queue_pairs;
 215
 216	/* # of XDP queue pairs currently used by the driver */
 217	u16 xdp_queue_pairs;
 218
 219	/* xdp_queue_pairs may be 0, when xdp is already loaded. So add this. */
 220	bool xdp_enabled;
 221
 222	/* I like... big packets and I cannot lie! */
 223	bool big_packets;
 224
 225	/* Host will merge rx buffers for big packets (shake it! shake it!) */
 226	bool mergeable_rx_bufs;
 227
 228	/* Host supports rss and/or hash report */
 229	bool has_rss;
 230	bool has_rss_hash_report;
 231	u8 rss_key_size;
 232	u16 rss_indir_table_size;
 233	u32 rss_hash_types_supported;
 234	u32 rss_hash_types_saved;
 235
 236	/* Has control virtqueue */
 237	bool has_cvq;
 238
 239	/* Host can handle any s/g split between our header and packet data */
 240	bool any_header_sg;
 241
 242	/* Packet virtio header size */
 243	u8 hdr_len;
 244
 245	/* Work struct for refilling if we run low on memory. */
 246	struct delayed_work refill;
 247
 248	/* Work struct for config space updates */
 249	struct work_struct config_work;
 250
 251	/* Does the affinity hint is set for virtqueues? */
 252	bool affinity_hint_set;
 253
 254	/* CPU hotplug instances for online & dead */
 255	struct hlist_node node;
 256	struct hlist_node node_dead;
 257
 258	struct control_buf *ctrl;
 259
 260	/* Ethtool settings */
 261	u8 duplex;
 262	u32 speed;
 263
 264	unsigned long guest_offloads;
 265	unsigned long guest_offloads_capable;
 266
 267	/* failover when STANDBY feature enabled */
 268	struct failover *failover;
 269};
 270
 271struct padded_vnet_hdr {
 272	struct virtio_net_hdr_v1_hash hdr;
 273	/*
 274	 * hdr is in a separate sg buffer, and data sg buffer shares same page
 275	 * with this header sg. This padding makes next sg 16 byte aligned
 276	 * after the header.
 277	 */
 278	char padding[12];
 279};
 280
 281static bool is_xdp_frame(void *ptr)
 282{
 283	return (unsigned long)ptr & VIRTIO_XDP_FLAG;
 284}
 285
 286static void *xdp_to_ptr(struct xdp_frame *ptr)
 287{
 288	return (void *)((unsigned long)ptr | VIRTIO_XDP_FLAG);
 289}
 290
 291static struct xdp_frame *ptr_to_xdp(void *ptr)
 292{
 293	return (struct xdp_frame *)((unsigned long)ptr & ~VIRTIO_XDP_FLAG);
 294}
 295
 296/* Converting between virtqueue no. and kernel tx/rx queue no.
 297 * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
 298 */
 299static int vq2txq(struct virtqueue *vq)
 300{
 301	return (vq->index - 1) / 2;
 302}
 303
 304static int txq2vq(int txq)
 305{
 306	return txq * 2 + 1;
 307}
 308
 309static int vq2rxq(struct virtqueue *vq)
 310{
 311	return vq->index / 2;
 312}
 313
 314static int rxq2vq(int rxq)
 315{
 316	return rxq * 2;
 317}
 318
 319static inline struct virtio_net_hdr_mrg_rxbuf *skb_vnet_hdr(struct sk_buff *skb)
 320{
 321	return (struct virtio_net_hdr_mrg_rxbuf *)skb->cb;
 322}
 323
 324/*
 325 * private is used to chain pages for big packets, put the whole
 326 * most recent used list in the beginning for reuse
 327 */
 328static void give_pages(struct receive_queue *rq, struct page *page)
 329{
 330	struct page *end;
 331
 332	/* Find end of list, sew whole thing into vi->rq.pages. */
 333	for (end = page; end->private; end = (struct page *)end->private);
 334	end->private = (unsigned long)rq->pages;
 335	rq->pages = page;
 336}
 337
 338static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
 339{
 340	struct page *p = rq->pages;
 341
 342	if (p) {
 343		rq->pages = (struct page *)p->private;
 344		/* clear private here, it is used to chain pages */
 345		p->private = 0;
 346	} else
 347		p = alloc_page(gfp_mask);
 348	return p;
 349}
 350
 351static void virtqueue_napi_schedule(struct napi_struct *napi,
 352				    struct virtqueue *vq)
 353{
 354	if (napi_schedule_prep(napi)) {
 355		virtqueue_disable_cb(vq);
 356		__napi_schedule(napi);
 357	}
 358}
 359
 360static void virtqueue_napi_complete(struct napi_struct *napi,
 361				    struct virtqueue *vq, int processed)
 362{
 363	int opaque;
 364
 365	opaque = virtqueue_enable_cb_prepare(vq);
 366	if (napi_complete_done(napi, processed)) {
 367		if (unlikely(virtqueue_poll(vq, opaque)))
 368			virtqueue_napi_schedule(napi, vq);
 369	} else {
 370		virtqueue_disable_cb(vq);
 371	}
 372}
 373
 374static void skb_xmit_done(struct virtqueue *vq)
 375{
 376	struct virtnet_info *vi = vq->vdev->priv;
 377	struct napi_struct *napi = &vi->sq[vq2txq(vq)].napi;
 378
 379	/* Suppress further interrupts. */
 380	virtqueue_disable_cb(vq);
 381
 382	if (napi->weight)
 383		virtqueue_napi_schedule(napi, vq);
 384	else
 385		/* We were probably waiting for more output buffers. */
 386		netif_wake_subqueue(vi->dev, vq2txq(vq));
 387}
 388
 389#define MRG_CTX_HEADER_SHIFT 22
 390static void *mergeable_len_to_ctx(unsigned int truesize,
 391				  unsigned int headroom)
 392{
 393	return (void *)(unsigned long)((headroom << MRG_CTX_HEADER_SHIFT) | truesize);
 394}
 395
 396static unsigned int mergeable_ctx_to_headroom(void *mrg_ctx)
 397{
 398	return (unsigned long)mrg_ctx >> MRG_CTX_HEADER_SHIFT;
 399}
 400
 401static unsigned int mergeable_ctx_to_truesize(void *mrg_ctx)
 402{
 403	return (unsigned long)mrg_ctx & ((1 << MRG_CTX_HEADER_SHIFT) - 1);
 404}
 405
 406/* Called from bottom half context */
 407static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 408				   struct receive_queue *rq,
 409				   struct page *page, unsigned int offset,
 410				   unsigned int len, unsigned int truesize,
 411				   bool hdr_valid, unsigned int metasize,
 412				   unsigned int headroom)
 413{
 414	struct sk_buff *skb;
 415	struct virtio_net_hdr_mrg_rxbuf *hdr;
 416	unsigned int copy, hdr_len, hdr_padded_len;
 417	struct page *page_to_free = NULL;
 418	int tailroom, shinfo_size;
 419	char *p, *hdr_p, *buf;
 420
 421	p = page_address(page) + offset;
 422	hdr_p = p;
 423
 424	hdr_len = vi->hdr_len;
 425	if (vi->mergeable_rx_bufs)
 426		hdr_padded_len = hdr_len;
 427	else
 428		hdr_padded_len = sizeof(struct padded_vnet_hdr);
 429
 430	/* If headroom is not 0, there is an offset between the beginning of the
 431	 * data and the allocated space, otherwise the data and the allocated
 432	 * space are aligned.
 433	 *
 434	 * Buffers with headroom use PAGE_SIZE as alloc size, see
 435	 * add_recvbuf_mergeable() + get_mergeable_buf_len()
 436	 */
 437	truesize = headroom ? PAGE_SIZE : truesize;
 438	tailroom = truesize - headroom;
 439	buf = p - headroom;
 440
 441	len -= hdr_len;
 442	offset += hdr_padded_len;
 443	p += hdr_padded_len;
 444	tailroom -= hdr_padded_len + len;
 445
 446	shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 447
 448	/* copy small packet so we can reuse these pages */
 449	if (!NET_IP_ALIGN && len > GOOD_COPY_LEN && tailroom >= shinfo_size) {
 450		skb = build_skb(buf, truesize);
 451		if (unlikely(!skb))
 452			return NULL;
 453
 454		skb_reserve(skb, p - buf);
 455		skb_put(skb, len);
 456
 457		page = (struct page *)page->private;
 458		if (page)
 459			give_pages(rq, page);
 460		goto ok;
 461	}
 462
 463	/* copy small packet so we can reuse these pages for small data */
 464	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
 465	if (unlikely(!skb))
 466		return NULL;
 467
 468	/* Copy all frame if it fits skb->head, otherwise
 469	 * we let virtio_net_hdr_to_skb() and GRO pull headers as needed.
 470	 */
 471	if (len <= skb_tailroom(skb))
 472		copy = len;
 473	else
 474		copy = ETH_HLEN + metasize;
 475	skb_put_data(skb, p, copy);
 476
 477	len -= copy;
 478	offset += copy;
 479
 480	if (vi->mergeable_rx_bufs) {
 481		if (len)
 482			skb_add_rx_frag(skb, 0, page, offset, len, truesize);
 483		else
 484			page_to_free = page;
 485		goto ok;
 486	}
 487
 488	/*
 489	 * Verify that we can indeed put this data into a skb.
 490	 * This is here to handle cases when the device erroneously
 491	 * tries to receive more than is possible. This is usually
 492	 * the case of a broken device.
 493	 */
 494	if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
 495		net_dbg_ratelimited("%s: too much data\n", skb->dev->name);
 496		dev_kfree_skb(skb);
 497		return NULL;
 498	}
 499	BUG_ON(offset >= PAGE_SIZE);
 500	while (len) {
 501		unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len);
 502		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
 503				frag_size, truesize);
 504		len -= frag_size;
 505		page = (struct page *)page->private;
 506		offset = 0;
 507	}
 508
 509	if (page)
 510		give_pages(rq, page);
 511
 512ok:
 513	/* hdr_valid means no XDP, so we can copy the vnet header */
 514	if (hdr_valid) {
 515		hdr = skb_vnet_hdr(skb);
 516		memcpy(hdr, hdr_p, hdr_len);
 517	}
 518	if (page_to_free)
 519		put_page(page_to_free);
 520
 521	if (metasize) {
 522		__skb_pull(skb, metasize);
 523		skb_metadata_set(skb, metasize);
 524	}
 525
 526	return skb;
 527}
 528
 529static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
 530				   struct send_queue *sq,
 531				   struct xdp_frame *xdpf)
 532{
 533	struct virtio_net_hdr_mrg_rxbuf *hdr;
 534	int err;
 535
 536	if (unlikely(xdpf->headroom < vi->hdr_len))
 537		return -EOVERFLOW;
 538
 539	/* Make room for virtqueue hdr (also change xdpf->headroom?) */
 540	xdpf->data -= vi->hdr_len;
 541	/* Zero header and leave csum up to XDP layers */
 542	hdr = xdpf->data;
 543	memset(hdr, 0, vi->hdr_len);
 544	xdpf->len   += vi->hdr_len;
 545
 546	sg_init_one(sq->sg, xdpf->data, xdpf->len);
 547
 548	err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdp_to_ptr(xdpf),
 549				   GFP_ATOMIC);
 550	if (unlikely(err))
 551		return -ENOSPC; /* Caller handle free/refcnt */
 552
 553	return 0;
 554}
 555
 556/* when vi->curr_queue_pairs > nr_cpu_ids, the txq/sq is only used for xdp tx on
 557 * the current cpu, so it does not need to be locked.
 558 *
 559 * Here we use marco instead of inline functions because we have to deal with
 560 * three issues at the same time: 1. the choice of sq. 2. judge and execute the
 561 * lock/unlock of txq 3. make sparse happy. It is difficult for two inline
 562 * functions to perfectly solve these three problems at the same time.
 563 */
 564#define virtnet_xdp_get_sq(vi) ({                                       \
 565	int cpu = smp_processor_id();                                   \
 566	struct netdev_queue *txq;                                       \
 567	typeof(vi) v = (vi);                                            \
 568	unsigned int qp;                                                \
 569									\
 570	if (v->curr_queue_pairs > nr_cpu_ids) {                         \
 571		qp = v->curr_queue_pairs - v->xdp_queue_pairs;          \
 572		qp += cpu;                                              \
 573		txq = netdev_get_tx_queue(v->dev, qp);                  \
 574		__netif_tx_acquire(txq);                                \
 575	} else {                                                        \
 576		qp = cpu % v->curr_queue_pairs;                         \
 577		txq = netdev_get_tx_queue(v->dev, qp);                  \
 578		__netif_tx_lock(txq, cpu);                              \
 579	}                                                               \
 580	v->sq + qp;                                                     \
 581})
 582
 583#define virtnet_xdp_put_sq(vi, q) {                                     \
 584	struct netdev_queue *txq;                                       \
 585	typeof(vi) v = (vi);                                            \
 586									\
 587	txq = netdev_get_tx_queue(v->dev, (q) - v->sq);                 \
 588	if (v->curr_queue_pairs > nr_cpu_ids)                           \
 589		__netif_tx_release(txq);                                \
 590	else                                                            \
 591		__netif_tx_unlock(txq);                                 \
 592}
 593
 594static int virtnet_xdp_xmit(struct net_device *dev,
 595			    int n, struct xdp_frame **frames, u32 flags)
 596{
 597	struct virtnet_info *vi = netdev_priv(dev);
 598	struct receive_queue *rq = vi->rq;
 599	struct bpf_prog *xdp_prog;
 600	struct send_queue *sq;
 601	unsigned int len;
 602	int packets = 0;
 603	int bytes = 0;
 604	int nxmit = 0;
 605	int kicks = 0;
 606	void *ptr;
 607	int ret;
 608	int i;
 609
 610	/* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this
 611	 * indicate XDP resources have been successfully allocated.
 612	 */
 613	xdp_prog = rcu_access_pointer(rq->xdp_prog);
 614	if (!xdp_prog)
 615		return -ENXIO;
 616
 617	sq = virtnet_xdp_get_sq(vi);
 618
 619	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) {
 620		ret = -EINVAL;
 621		goto out;
 622	}
 623
 624	/* Free up any pending old buffers before queueing new ones. */
 625	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
 626		if (likely(is_xdp_frame(ptr))) {
 627			struct xdp_frame *frame = ptr_to_xdp(ptr);
 628
 629			bytes += frame->len;
 630			xdp_return_frame(frame);
 631		} else {
 632			struct sk_buff *skb = ptr;
 633
 634			bytes += skb->len;
 635			napi_consume_skb(skb, false);
 636		}
 637		packets++;
 638	}
 639
 640	for (i = 0; i < n; i++) {
 641		struct xdp_frame *xdpf = frames[i];
 642
 643		if (__virtnet_xdp_xmit_one(vi, sq, xdpf))
 644			break;
 645		nxmit++;
 646	}
 647	ret = nxmit;
 648
 649	if (flags & XDP_XMIT_FLUSH) {
 650		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq))
 651			kicks = 1;
 652	}
 653out:
 654	u64_stats_update_begin(&sq->stats.syncp);
 655	sq->stats.bytes += bytes;
 656	sq->stats.packets += packets;
 657	sq->stats.xdp_tx += n;
 658	sq->stats.xdp_tx_drops += n - nxmit;
 659	sq->stats.kicks += kicks;
 660	u64_stats_update_end(&sq->stats.syncp);
 661
 662	virtnet_xdp_put_sq(vi, sq);
 663	return ret;
 664}
 665
 666static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
 667{
 668	return vi->xdp_enabled ? VIRTIO_XDP_HEADROOM : 0;
 669}
 670
 671/* We copy the packet for XDP in the following cases:
 672 *
 673 * 1) Packet is scattered across multiple rx buffers.
 674 * 2) Headroom space is insufficient.
 675 *
 676 * This is inefficient but it's a temporary condition that
 677 * we hit right after XDP is enabled and until queue is refilled
 678 * with large buffers with sufficient headroom - so it should affect
 679 * at most queue size packets.
 680 * Afterwards, the conditions to enable
 681 * XDP should preclude the underlying device from sending packets
 682 * across multiple buffers (num_buf > 1), and we make sure buffers
 683 * have enough headroom.
 684 */
 685static struct page *xdp_linearize_page(struct receive_queue *rq,
 686				       u16 *num_buf,
 687				       struct page *p,
 688				       int offset,
 689				       int page_off,
 690				       unsigned int *len)
 691{
 692	struct page *page = alloc_page(GFP_ATOMIC);
 693
 694	if (!page)
 695		return NULL;
 696
 697	memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
 698	page_off += *len;
 699
 700	while (--*num_buf) {
 701		int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 702		unsigned int buflen;
 703		void *buf;
 704		int off;
 705
 706		buf = virtqueue_get_buf(rq->vq, &buflen);
 707		if (unlikely(!buf))
 708			goto err_buf;
 709
 710		p = virt_to_head_page(buf);
 711		off = buf - page_address(p);
 712
 713		/* guard against a misconfigured or uncooperative backend that
 714		 * is sending packet larger than the MTU.
 715		 */
 716		if ((page_off + buflen + tailroom) > PAGE_SIZE) {
 717			put_page(p);
 718			goto err_buf;
 719		}
 720
 721		memcpy(page_address(page) + page_off,
 722		       page_address(p) + off, buflen);
 723		page_off += buflen;
 724		put_page(p);
 725	}
 726
 727	/* Headroom does not contribute to packet length */
 728	*len = page_off - VIRTIO_XDP_HEADROOM;
 729	return page;
 730err_buf:
 731	__free_pages(page, 0);
 732	return NULL;
 733}
 734
 735static struct sk_buff *receive_small(struct net_device *dev,
 736				     struct virtnet_info *vi,
 737				     struct receive_queue *rq,
 738				     void *buf, void *ctx,
 739				     unsigned int len,
 740				     unsigned int *xdp_xmit,
 741				     struct virtnet_rq_stats *stats)
 742{
 743	struct sk_buff *skb;
 744	struct bpf_prog *xdp_prog;
 745	unsigned int xdp_headroom = (unsigned long)ctx;
 746	unsigned int header_offset = VIRTNET_RX_PAD + xdp_headroom;
 747	unsigned int headroom = vi->hdr_len + header_offset;
 748	unsigned int buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
 749			      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 750	struct page *page = virt_to_head_page(buf);
 751	unsigned int delta = 0;
 752	struct page *xdp_page;
 753	int err;
 754	unsigned int metasize = 0;
 755
 756	len -= vi->hdr_len;
 757	stats->bytes += len;
 758
 759	if (unlikely(len > GOOD_PACKET_LEN)) {
 760		pr_debug("%s: rx error: len %u exceeds max size %d\n",
 761			 dev->name, len, GOOD_PACKET_LEN);
 762		dev->stats.rx_length_errors++;
 763		goto err;
 764	}
 765
 766	if (likely(!vi->xdp_enabled)) {
 767		xdp_prog = NULL;
 768		goto skip_xdp;
 769	}
 770
 771	rcu_read_lock();
 772	xdp_prog = rcu_dereference(rq->xdp_prog);
 773	if (xdp_prog) {
 774		struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset;
 775		struct xdp_frame *xdpf;
 776		struct xdp_buff xdp;
 777		void *orig_data;
 778		u32 act;
 779
 780		if (unlikely(hdr->hdr.gso_type))
 781			goto err_xdp;
 782
 783		if (unlikely(xdp_headroom < virtnet_get_headroom(vi))) {
 784			int offset = buf - page_address(page) + header_offset;
 785			unsigned int tlen = len + vi->hdr_len;
 786			u16 num_buf = 1;
 787
 788			xdp_headroom = virtnet_get_headroom(vi);
 789			header_offset = VIRTNET_RX_PAD + xdp_headroom;
 790			headroom = vi->hdr_len + header_offset;
 791			buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
 792				 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 793			xdp_page = xdp_linearize_page(rq, &num_buf, page,
 794						      offset, header_offset,
 795						      &tlen);
 796			if (!xdp_page)
 797				goto err_xdp;
 798
 799			buf = page_address(xdp_page);
 800			put_page(page);
 801			page = xdp_page;
 802		}
 803
 804		xdp_init_buff(&xdp, buflen, &rq->xdp_rxq);
 805		xdp_prepare_buff(&xdp, buf + VIRTNET_RX_PAD + vi->hdr_len,
 806				 xdp_headroom, len, true);
 807		orig_data = xdp.data;
 808		act = bpf_prog_run_xdp(xdp_prog, &xdp);
 809		stats->xdp_packets++;
 810
 811		switch (act) {
 812		case XDP_PASS:
 813			/* Recalculate length in case bpf program changed it */
 814			delta = orig_data - xdp.data;
 815			len = xdp.data_end - xdp.data;
 816			metasize = xdp.data - xdp.data_meta;
 817			break;
 818		case XDP_TX:
 819			stats->xdp_tx++;
 820			xdpf = xdp_convert_buff_to_frame(&xdp);
 821			if (unlikely(!xdpf))
 822				goto err_xdp;
 823			err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);
 824			if (unlikely(!err)) {
 825				xdp_return_frame_rx_napi(xdpf);
 826			} else if (unlikely(err < 0)) {
 827				trace_xdp_exception(vi->dev, xdp_prog, act);
 828				goto err_xdp;
 829			}
 830			*xdp_xmit |= VIRTIO_XDP_TX;
 831			rcu_read_unlock();
 832			goto xdp_xmit;
 833		case XDP_REDIRECT:
 834			stats->xdp_redirects++;
 835			err = xdp_do_redirect(dev, &xdp, xdp_prog);
 836			if (err)
 837				goto err_xdp;
 838			*xdp_xmit |= VIRTIO_XDP_REDIR;
 839			rcu_read_unlock();
 840			goto xdp_xmit;
 841		default:
 842			bpf_warn_invalid_xdp_action(vi->dev, xdp_prog, act);
 843			fallthrough;
 844		case XDP_ABORTED:
 845			trace_xdp_exception(vi->dev, xdp_prog, act);
 846			goto err_xdp;
 847		case XDP_DROP:
 848			goto err_xdp;
 849		}
 850	}
 851	rcu_read_unlock();
 852
 853skip_xdp:
 854	skb = build_skb(buf, buflen);
 855	if (!skb)
 856		goto err;
 857	skb_reserve(skb, headroom - delta);
 858	skb_put(skb, len);
 859	if (!xdp_prog) {
 860		buf += header_offset;
 861		memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len);
 862	} /* keep zeroed vnet hdr since XDP is loaded */
 863
 864	if (metasize)
 865		skb_metadata_set(skb, metasize);
 866
 867	return skb;
 868
 869err_xdp:
 870	rcu_read_unlock();
 871	stats->xdp_drops++;
 872err:
 873	stats->drops++;
 874	put_page(page);
 875xdp_xmit:
 876	return NULL;
 877}
 878
 879static struct sk_buff *receive_big(struct net_device *dev,
 880				   struct virtnet_info *vi,
 881				   struct receive_queue *rq,
 882				   void *buf,
 883				   unsigned int len,
 884				   struct virtnet_rq_stats *stats)
 885{
 886	struct page *page = buf;
 887	struct sk_buff *skb =
 888		page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, true, 0, 0);
 889
 890	stats->bytes += len - vi->hdr_len;
 891	if (unlikely(!skb))
 892		goto err;
 893
 894	return skb;
 895
 896err:
 897	stats->drops++;
 898	give_pages(rq, page);
 899	return NULL;
 900}
 901
 902static struct sk_buff *receive_mergeable(struct net_device *dev,
 903					 struct virtnet_info *vi,
 904					 struct receive_queue *rq,
 905					 void *buf,
 906					 void *ctx,
 907					 unsigned int len,
 908					 unsigned int *xdp_xmit,
 909					 struct virtnet_rq_stats *stats)
 910{
 911	struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
 912	u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
 913	struct page *page = virt_to_head_page(buf);
 914	int offset = buf - page_address(page);
 915	struct sk_buff *head_skb, *curr_skb;
 916	struct bpf_prog *xdp_prog;
 917	unsigned int truesize = mergeable_ctx_to_truesize(ctx);
 918	unsigned int headroom = mergeable_ctx_to_headroom(ctx);
 919	unsigned int metasize = 0;
 920	unsigned int frame_sz;
 921	int err;
 922
 923	head_skb = NULL;
 924	stats->bytes += len - vi->hdr_len;
 925
 926	if (unlikely(len > truesize)) {
 927		pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
 928			 dev->name, len, (unsigned long)ctx);
 929		dev->stats.rx_length_errors++;
 930		goto err_skb;
 931	}
 932
 933	if (likely(!vi->xdp_enabled)) {
 934		xdp_prog = NULL;
 935		goto skip_xdp;
 936	}
 937
 938	rcu_read_lock();
 939	xdp_prog = rcu_dereference(rq->xdp_prog);
 940	if (xdp_prog) {
 941		struct xdp_frame *xdpf;
 942		struct page *xdp_page;
 943		struct xdp_buff xdp;
 944		void *data;
 945		u32 act;
 946
 947		/* Transient failure which in theory could occur if
 948		 * in-flight packets from before XDP was enabled reach
 949		 * the receive path after XDP is loaded.
 950		 */
 951		if (unlikely(hdr->hdr.gso_type))
 952			goto err_xdp;
 953
 954		/* Buffers with headroom use PAGE_SIZE as alloc size,
 955		 * see add_recvbuf_mergeable() + get_mergeable_buf_len()
 956		 */
 957		frame_sz = headroom ? PAGE_SIZE : truesize;
 958
 959		/* This happens when rx buffer size is underestimated
 960		 * or headroom is not enough because of the buffer
 961		 * was refilled before XDP is set. This should only
 962		 * happen for the first several packets, so we don't
 963		 * care much about its performance.
 964		 */
 965		if (unlikely(num_buf > 1 ||
 966			     headroom < virtnet_get_headroom(vi))) {
 967			/* linearize data for XDP */
 968			xdp_page = xdp_linearize_page(rq, &num_buf,
 969						      page, offset,
 970						      VIRTIO_XDP_HEADROOM,
 971						      &len);
 972			frame_sz = PAGE_SIZE;
 973
 974			if (!xdp_page)
 975				goto err_xdp;
 976			offset = VIRTIO_XDP_HEADROOM;
 977		} else {
 978			xdp_page = page;
 979		}
 980
 981		/* Allow consuming headroom but reserve enough space to push
 982		 * the descriptor on if we get an XDP_TX return code.
 983		 */
 984		data = page_address(xdp_page) + offset;
 985		xdp_init_buff(&xdp, frame_sz - vi->hdr_len, &rq->xdp_rxq);
 986		xdp_prepare_buff(&xdp, data - VIRTIO_XDP_HEADROOM + vi->hdr_len,
 987				 VIRTIO_XDP_HEADROOM, len - vi->hdr_len, true);
 988
 989		act = bpf_prog_run_xdp(xdp_prog, &xdp);
 990		stats->xdp_packets++;
 991
 992		switch (act) {
 993		case XDP_PASS:
 994			metasize = xdp.data - xdp.data_meta;
 995
 996			/* recalculate offset to account for any header
 997			 * adjustments and minus the metasize to copy the
 998			 * metadata in page_to_skb(). Note other cases do not
 999			 * build an skb and avoid using offset
1000			 */
1001			offset = xdp.data - page_address(xdp_page) -
1002				 vi->hdr_len - metasize;
1003
1004			/* recalculate len if xdp.data, xdp.data_end or
1005			 * xdp.data_meta were adjusted
1006			 */
1007			len = xdp.data_end - xdp.data + vi->hdr_len + metasize;
1008
1009			/* recalculate headroom if xdp.data or xdp_data_meta
1010			 * were adjusted, note that offset should always point
1011			 * to the start of the reserved bytes for virtio_net
1012			 * header which are followed by xdp.data, that means
1013			 * that offset is equal to the headroom (when buf is
1014			 * starting at the beginning of the page, otherwise
1015			 * there is a base offset inside the page) but it's used
1016			 * with a different starting point (buf start) than
1017			 * xdp.data (buf start + vnet hdr size). If xdp.data or
1018			 * data_meta were adjusted by the xdp prog then the
1019			 * headroom size has changed and so has the offset, we
1020			 * can use data_hard_start, which points at buf start +
1021			 * vnet hdr size, to calculate the new headroom and use
1022			 * it later to compute buf start in page_to_skb()
1023			 */
1024			headroom = xdp.data - xdp.data_hard_start - metasize;
1025
1026			/* We can only create skb based on xdp_page. */
1027			if (unlikely(xdp_page != page)) {
1028				rcu_read_unlock();
1029				put_page(page);
1030				head_skb = page_to_skb(vi, rq, xdp_page, offset,
1031						       len, PAGE_SIZE, false,
1032						       metasize,
1033						       headroom);
1034				return head_skb;
1035			}
1036			break;
1037		case XDP_TX:
1038			stats->xdp_tx++;
1039			xdpf = xdp_convert_buff_to_frame(&xdp);
1040			if (unlikely(!xdpf))
1041				goto err_xdp;
1042			err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);
1043			if (unlikely(!err)) {
1044				xdp_return_frame_rx_napi(xdpf);
1045			} else if (unlikely(err < 0)) {
1046				trace_xdp_exception(vi->dev, xdp_prog, act);
1047				if (unlikely(xdp_page != page))
1048					put_page(xdp_page);
1049				goto err_xdp;
1050			}
1051			*xdp_xmit |= VIRTIO_XDP_TX;
1052			if (unlikely(xdp_page != page))
1053				put_page(page);
1054			rcu_read_unlock();
1055			goto xdp_xmit;
1056		case XDP_REDIRECT:
1057			stats->xdp_redirects++;
1058			err = xdp_do_redirect(dev, &xdp, xdp_prog);
1059			if (err) {
1060				if (unlikely(xdp_page != page))
1061					put_page(xdp_page);
1062				goto err_xdp;
1063			}
1064			*xdp_xmit |= VIRTIO_XDP_REDIR;
1065			if (unlikely(xdp_page != page))
1066				put_page(page);
1067			rcu_read_unlock();
1068			goto xdp_xmit;
1069		default:
1070			bpf_warn_invalid_xdp_action(vi->dev, xdp_prog, act);
1071			fallthrough;
1072		case XDP_ABORTED:
1073			trace_xdp_exception(vi->dev, xdp_prog, act);
1074			fallthrough;
1075		case XDP_DROP:
1076			if (unlikely(xdp_page != page))
1077				__free_pages(xdp_page, 0);
1078			goto err_xdp;
1079		}
1080	}
1081	rcu_read_unlock();
1082
1083skip_xdp:
1084	head_skb = page_to_skb(vi, rq, page, offset, len, truesize, !xdp_prog,
1085			       metasize, headroom);
1086	curr_skb = head_skb;
1087
1088	if (unlikely(!curr_skb))
1089		goto err_skb;
1090	while (--num_buf) {
1091		int num_skb_frags;
1092
1093		buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
1094		if (unlikely(!buf)) {
1095			pr_debug("%s: rx error: %d buffers out of %d missing\n",
1096				 dev->name, num_buf,
1097				 virtio16_to_cpu(vi->vdev,
1098						 hdr->num_buffers));
1099			dev->stats.rx_length_errors++;
1100			goto err_buf;
1101		}
1102
1103		stats->bytes += len;
1104		page = virt_to_head_page(buf);
1105
1106		truesize = mergeable_ctx_to_truesize(ctx);
1107		if (unlikely(len > truesize)) {
1108			pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
1109				 dev->name, len, (unsigned long)ctx);
1110			dev->stats.rx_length_errors++;
1111			goto err_skb;
1112		}
1113
1114		num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
1115		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
1116			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
1117
1118			if (unlikely(!nskb))
1119				goto err_skb;
1120			if (curr_skb == head_skb)
1121				skb_shinfo(curr_skb)->frag_list = nskb;
1122			else
1123				curr_skb->next = nskb;
1124			curr_skb = nskb;
1125			head_skb->truesize += nskb->truesize;
1126			num_skb_frags = 0;
1127		}
1128		if (curr_skb != head_skb) {
1129			head_skb->data_len += len;
1130			head_skb->len += len;
1131			head_skb->truesize += truesize;
1132		}
1133		offset = buf - page_address(page);
1134		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
1135			put_page(page);
1136			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
1137					     len, truesize);
1138		} else {
1139			skb_add_rx_frag(curr_skb, num_skb_frags, page,
1140					offset, len, truesize);
1141		}
1142	}
1143
1144	ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
1145	return head_skb;
1146
1147err_xdp:
1148	rcu_read_unlock();
1149	stats->xdp_drops++;
1150err_skb:
1151	put_page(page);
1152	while (num_buf-- > 1) {
1153		buf = virtqueue_get_buf(rq->vq, &len);
1154		if (unlikely(!buf)) {
1155			pr_debug("%s: rx error: %d buffers missing\n",
1156				 dev->name, num_buf);
1157			dev->stats.rx_length_errors++;
1158			break;
1159		}
1160		stats->bytes += len;
1161		page = virt_to_head_page(buf);
1162		put_page(page);
1163	}
1164err_buf:
1165	stats->drops++;
1166	dev_kfree_skb(head_skb);
1167xdp_xmit:
1168	return NULL;
1169}
1170
1171static void virtio_skb_set_hash(const struct virtio_net_hdr_v1_hash *hdr_hash,
1172				struct sk_buff *skb)
1173{
1174	enum pkt_hash_types rss_hash_type;
1175
1176	if (!hdr_hash || !skb)
1177		return;
1178
1179	switch ((int)hdr_hash->hash_report) {
1180	case VIRTIO_NET_HASH_REPORT_TCPv4:
1181	case VIRTIO_NET_HASH_REPORT_UDPv4:
1182	case VIRTIO_NET_HASH_REPORT_TCPv6:
1183	case VIRTIO_NET_HASH_REPORT_UDPv6:
1184	case VIRTIO_NET_HASH_REPORT_TCPv6_EX:
1185	case VIRTIO_NET_HASH_REPORT_UDPv6_EX:
1186		rss_hash_type = PKT_HASH_TYPE_L4;
1187		break;
1188	case VIRTIO_NET_HASH_REPORT_IPv4:
1189	case VIRTIO_NET_HASH_REPORT_IPv6:
1190	case VIRTIO_NET_HASH_REPORT_IPv6_EX:
1191		rss_hash_type = PKT_HASH_TYPE_L3;
1192		break;
1193	case VIRTIO_NET_HASH_REPORT_NONE:
1194	default:
1195		rss_hash_type = PKT_HASH_TYPE_NONE;
1196	}
1197	skb_set_hash(skb, (unsigned int)hdr_hash->hash_value, rss_hash_type);
1198}
1199
1200static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
1201			void *buf, unsigned int len, void **ctx,
1202			unsigned int *xdp_xmit,
1203			struct virtnet_rq_stats *stats)
1204{
1205	struct net_device *dev = vi->dev;
1206	struct sk_buff *skb;
1207	struct virtio_net_hdr_mrg_rxbuf *hdr;
1208
1209	if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
1210		pr_debug("%s: short packet %i\n", dev->name, len);
1211		dev->stats.rx_length_errors++;
1212		if (vi->mergeable_rx_bufs) {
1213			put_page(virt_to_head_page(buf));
1214		} else if (vi->big_packets) {
1215			give_pages(rq, buf);
1216		} else {
1217			put_page(virt_to_head_page(buf));
1218		}
1219		return;
1220	}
1221
1222	if (vi->mergeable_rx_bufs)
1223		skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit,
1224					stats);
1225	else if (vi->big_packets)
1226		skb = receive_big(dev, vi, rq, buf, len, stats);
1227	else
1228		skb = receive_small(dev, vi, rq, buf, ctx, len, xdp_xmit, stats);
1229
1230	if (unlikely(!skb))
1231		return;
1232
1233	hdr = skb_vnet_hdr(skb);
1234	if (dev->features & NETIF_F_RXHASH && vi->has_rss_hash_report)
1235		virtio_skb_set_hash((const struct virtio_net_hdr_v1_hash *)hdr, skb);
1236
1237	if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID)
1238		skb->ip_summed = CHECKSUM_UNNECESSARY;
1239
1240	if (virtio_net_hdr_to_skb(skb, &hdr->hdr,
1241				  virtio_is_little_endian(vi->vdev))) {
1242		net_warn_ratelimited("%s: bad gso: type: %u, size: %u\n",
1243				     dev->name, hdr->hdr.gso_type,
1244				     hdr->hdr.gso_size);
1245		goto frame_err;
1246	}
1247
1248	skb_record_rx_queue(skb, vq2rxq(rq->vq));
1249	skb->protocol = eth_type_trans(skb, dev);
1250	pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
1251		 ntohs(skb->protocol), skb->len, skb->pkt_type);
1252
1253	napi_gro_receive(&rq->napi, skb);
1254	return;
1255
1256frame_err:
1257	dev->stats.rx_frame_errors++;
1258	dev_kfree_skb(skb);
1259}
1260
1261/* Unlike mergeable buffers, all buffers are allocated to the
1262 * same size, except for the headroom. For this reason we do
1263 * not need to use  mergeable_len_to_ctx here - it is enough
1264 * to store the headroom as the context ignoring the truesize.
1265 */
1266static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
1267			     gfp_t gfp)
1268{
1269	struct page_frag *alloc_frag = &rq->alloc_frag;
1270	char *buf;
1271	unsigned int xdp_headroom = virtnet_get_headroom(vi);
1272	void *ctx = (void *)(unsigned long)xdp_headroom;
1273	int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
1274	int err;
1275
1276	len = SKB_DATA_ALIGN(len) +
1277	      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
1278	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
1279		return -ENOMEM;
1280
1281	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
1282	get_page(alloc_frag->page);
1283	alloc_frag->offset += len;
1284	sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
1285		    vi->hdr_len + GOOD_PACKET_LEN);
1286	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
1287	if (err < 0)
1288		put_page(virt_to_head_page(buf));
1289	return err;
1290}
1291
1292static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
1293			   gfp_t gfp)
1294{
1295	struct page *first, *list = NULL;
1296	char *p;
1297	int i, err, offset;
1298
1299	sg_init_table(rq->sg, MAX_SKB_FRAGS + 2);
1300
1301	/* page in rq->sg[MAX_SKB_FRAGS + 1] is list tail */
1302	for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
1303		first = get_a_page(rq, gfp);
1304		if (!first) {
1305			if (list)
1306				give_pages(rq, list);
1307			return -ENOMEM;
1308		}
1309		sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);
1310
1311		/* chain new page in list head to match sg */
1312		first->private = (unsigned long)list;
1313		list = first;
1314	}
1315
1316	first = get_a_page(rq, gfp);
1317	if (!first) {
1318		give_pages(rq, list);
1319		return -ENOMEM;
1320	}
1321	p = page_address(first);
1322
1323	/* rq->sg[0], rq->sg[1] share the same page */
1324	/* a separated rq->sg[0] for header - required in case !any_header_sg */
1325	sg_set_buf(&rq->sg[0], p, vi->hdr_len);
1326
1327	/* rq->sg[1] for data packet, from offset */
1328	offset = sizeof(struct padded_vnet_hdr);
1329	sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);
1330
1331	/* chain first in list head */
1332	first->private = (unsigned long)list;
1333	err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
1334				  first, gfp);
1335	if (err < 0)
1336		give_pages(rq, first);
1337
1338	return err;
1339}
1340
1341static unsigned int get_mergeable_buf_len(struct receive_queue *rq,
1342					  struct ewma_pkt_len *avg_pkt_len,
1343					  unsigned int room)
1344{
1345	struct virtnet_info *vi = rq->vq->vdev->priv;
1346	const size_t hdr_len = vi->hdr_len;
1347	unsigned int len;
1348
1349	if (room)
1350		return PAGE_SIZE - room;
1351
1352	len = hdr_len +	clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
1353				rq->min_buf_len, PAGE_SIZE - hdr_len);
1354
1355	return ALIGN(len, L1_CACHE_BYTES);
1356}
1357
1358static int add_recvbuf_mergeable(struct virtnet_info *vi,
1359				 struct receive_queue *rq, gfp_t gfp)
1360{
1361	struct page_frag *alloc_frag = &rq->alloc_frag;
1362	unsigned int headroom = virtnet_get_headroom(vi);
1363	unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
1364	unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
1365	char *buf;
1366	void *ctx;
1367	int err;
1368	unsigned int len, hole;
1369
1370	/* Extra tailroom is needed to satisfy XDP's assumption. This
1371	 * means rx frags coalescing won't work, but consider we've
1372	 * disabled GSO for XDP, it won't be a big issue.
1373	 */
1374	len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
1375	if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
1376		return -ENOMEM;
1377
1378	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
1379	buf += headroom; /* advance address leaving hole at front of pkt */
1380	get_page(alloc_frag->page);
1381	alloc_frag->offset += len + room;
1382	hole = alloc_frag->size - alloc_frag->offset;
1383	if (hole < len + room) {
1384		/* To avoid internal fragmentation, if there is very likely not
1385		 * enough space for another buffer, add the remaining space to
1386		 * the current buffer.
1387		 */
1388		len += hole;
1389		alloc_frag->offset += hole;
1390	}
1391
1392	sg_init_one(rq->sg, buf, len);
1393	ctx = mergeable_len_to_ctx(len, headroom);
1394	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
1395	if (err < 0)
1396		put_page(virt_to_head_page(buf));
1397
1398	return err;
1399}
1400
1401/*
1402 * Returns false if we couldn't fill entirely (OOM).
1403 *
1404 * Normally run in the receive path, but can also be run from ndo_open
1405 * before we're receiving packets, or from refill_work which is
1406 * careful to disable receiving (using napi_disable).
1407 */
1408static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
1409			  gfp_t gfp)
1410{
1411	int err;
1412	bool oom;
1413
1414	do {
1415		if (vi->mergeable_rx_bufs)
1416			err = add_recvbuf_mergeable(vi, rq, gfp);
1417		else if (vi->big_packets)
1418			err = add_recvbuf_big(vi, rq, gfp);
1419		else
1420			err = add_recvbuf_small(vi, rq, gfp);
1421
1422		oom = err == -ENOMEM;
1423		if (err)
1424			break;
1425	} while (rq->vq->num_free);
1426	if (virtqueue_kick_prepare(rq->vq) && virtqueue_notify(rq->vq)) {
1427		unsigned long flags;
1428
1429		flags = u64_stats_update_begin_irqsave(&rq->stats.syncp);
1430		rq->stats.kicks++;
1431		u64_stats_update_end_irqrestore(&rq->stats.syncp, flags);
1432	}
1433
1434	return !oom;
1435}
1436
1437static void skb_recv_done(struct virtqueue *rvq)
1438{
1439	struct virtnet_info *vi = rvq->vdev->priv;
1440	struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];
1441
1442	virtqueue_napi_schedule(&rq->napi, rvq);
1443}
1444
/* Enable @napi and immediately schedule it once for @vq. */
static void virtnet_napi_enable(struct virtqueue *vq, struct napi_struct *napi)
{
	napi_enable(napi);

	/* The other side may have filled every buffer before napi was
	 * enabled, in which case no further interrupt will arrive.  Schedule
	 * a poll now to pick up anything outstanding; the scheduling is done
	 * with bottom halves disabled so that re-enabling them afterwards
	 * triggers softIRQ processing.
	 */
	local_bh_disable();
	virtqueue_napi_schedule(napi, vq);
	local_bh_enable();
}
1457
1458static void virtnet_napi_tx_enable(struct virtnet_info *vi,
1459				   struct virtqueue *vq,
1460				   struct napi_struct *napi)
1461{
1462	if (!napi->weight)
1463		return;
1464
1465	/* Tx napi touches cachelines on the cpu handling tx interrupts. Only
1466	 * enable the feature if this is likely affine with the transmit path.
1467	 */
1468	if (!vi->affinity_hint_set) {
1469		napi->weight = 0;
1470		return;
1471	}
1472
1473	return virtnet_napi_enable(vq, napi);
1474}
1475
1476static void virtnet_napi_tx_disable(struct napi_struct *napi)
1477{
1478	if (napi->weight)
1479		napi_disable(napi);
1480}
1481
1482static void refill_work(struct work_struct *work)
1483{
1484	struct virtnet_info *vi =
1485		container_of(work, struct virtnet_info, refill.work);
1486	bool still_empty;
1487	int i;
1488
1489	for (i = 0; i < vi->curr_queue_pairs; i++) {
1490		struct receive_queue *rq = &vi->rq[i];
1491
1492		napi_disable(&rq->napi);
1493		still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
1494		virtnet_napi_enable(rq->vq, &rq->napi);
1495
1496		/* In theory, this can happen: if we don't get any buffers in
1497		 * we will *never* try to fill again.
1498		 */
1499		if (still_empty)
1500			schedule_delayed_work(&vi->refill, HZ/2);
1501	}
1502}
1503
1504static int virtnet_receive(struct receive_queue *rq, int budget,
1505			   unsigned int *xdp_xmit)
1506{
1507	struct virtnet_info *vi = rq->vq->vdev->priv;
1508	struct virtnet_rq_stats stats = {};
1509	unsigned int len;
1510	void *buf;
1511	int i;
1512
1513	if (!vi->big_packets || vi->mergeable_rx_bufs) {
1514		void *ctx;
1515
1516		while (stats.packets < budget &&
1517		       (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
1518			receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
1519			stats.packets++;
1520		}
1521	} else {
1522		while (stats.packets < budget &&
1523		       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
1524			receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
1525			stats.packets++;
1526		}
1527	}
1528
1529	if (rq->vq->num_free > min((unsigned int)budget, virtqueue_get_vring_size(rq->vq)) / 2) {
1530		if (!try_fill_recv(vi, rq, GFP_ATOMIC))
1531			schedule_delayed_work(&vi->refill, 0);
1532	}
1533
1534	u64_stats_update_begin(&rq->stats.syncp);
1535	for (i = 0; i < VIRTNET_RQ_STATS_LEN; i++) {
1536		size_t offset = virtnet_rq_stats_desc[i].offset;
1537		u64 *item;
1538
1539		item = (u64 *)((u8 *)&rq->stats + offset);
1540		*item += *(u64 *)((u8 *)&stats + offset);
1541	}
1542	u64_stats_update_end(&rq->stats.syncp);
1543
1544	return stats.packets;
1545}
1546
1547static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
1548{
1549	unsigned int len;
1550	unsigned int packets = 0;
1551	unsigned int bytes = 0;
1552	void *ptr;
1553
1554	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
1555		if (likely(!is_xdp_frame(ptr))) {
1556			struct sk_buff *skb = ptr;
1557
1558			pr_debug("Sent skb %p\n", skb);
1559
1560			bytes += skb->len;
1561			napi_consume_skb(skb, in_napi);
1562		} else {
1563			struct xdp_frame *frame = ptr_to_xdp(ptr);
1564
1565			bytes += frame->len;
1566			xdp_return_frame(frame);
1567		}
1568		packets++;
1569	}
1570
1571	/* Avoid overhead when no packets have been processed
1572	 * happens when called speculatively from start_xmit.
1573	 */
1574	if (!packets)
1575		return;
1576
1577	u64_stats_update_begin(&sq->stats.syncp);
1578	sq->stats.bytes += bytes;
1579	sq->stats.packets += packets;
1580	u64_stats_update_end(&sq->stats.syncp);
1581}
1582
1583static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q)
1584{
1585	if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
1586		return false;
1587	else if (q < vi->curr_queue_pairs)
1588		return true;
1589	else
1590		return false;
1591}
1592
1593static void virtnet_poll_cleantx(struct receive_queue *rq)
1594{
1595	struct virtnet_info *vi = rq->vq->vdev->priv;
1596	unsigned int index = vq2rxq(rq->vq);
1597	struct send_queue *sq = &vi->sq[index];
1598	struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, index);
1599
1600	if (!sq->napi.weight || is_xdp_raw_buffer_queue(vi, index))
1601		return;
1602
1603	if (__netif_tx_trylock(txq)) {
1604		do {
1605			virtqueue_disable_cb(sq->vq);
1606			free_old_xmit_skbs(sq, true);
1607		} while (unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
1608
1609		if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
1610			netif_tx_wake_queue(txq);
1611
1612		__netif_tx_unlock(txq);
1613	}
1614}
1615
1616static int virtnet_poll(struct napi_struct *napi, int budget)
1617{
1618	struct receive_queue *rq =
1619		container_of(napi, struct receive_queue, napi);
1620	struct virtnet_info *vi = rq->vq->vdev->priv;
1621	struct send_queue *sq;
1622	unsigned int received;
1623	unsigned int xdp_xmit = 0;
1624
1625	virtnet_poll_cleantx(rq);
1626
1627	received = virtnet_receive(rq, budget, &xdp_xmit);
1628
1629	/* Out of packets? */
1630	if (received < budget)
1631		virtqueue_napi_complete(napi, rq->vq, received);
1632
1633	if (xdp_xmit & VIRTIO_XDP_REDIR)
1634		xdp_do_flush();
1635
1636	if (xdp_xmit & VIRTIO_XDP_TX) {
1637		sq = virtnet_xdp_get_sq(vi);
1638		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
1639			u64_stats_update_begin(&sq->stats.syncp);
1640			sq->stats.kicks++;
1641			u64_stats_update_end(&sq->stats.syncp);
1642		}
1643		virtnet_xdp_put_sq(vi, sq);
1644	}
1645
1646	return received;
1647}
1648
1649static int virtnet_open(struct net_device *dev)
1650{
1651	struct virtnet_info *vi = netdev_priv(dev);
1652	int i, err;
1653
1654	for (i = 0; i < vi->max_queue_pairs; i++) {
1655		if (i < vi->curr_queue_pairs)
1656			/* Make sure we have some buffers: if oom use wq. */
1657			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
1658				schedule_delayed_work(&vi->refill, 0);
1659
1660		err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i, vi->rq[i].napi.napi_id);
1661		if (err < 0)
1662			return err;
1663
1664		err = xdp_rxq_info_reg_mem_model(&vi->rq[i].xdp_rxq,
1665						 MEM_TYPE_PAGE_SHARED, NULL);
1666		if (err < 0) {
1667			xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
1668			return err;
1669		}
1670
1671		virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
1672		virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi);
1673	}
1674
1675	return 0;
1676}
1677
1678static int virtnet_poll_tx(struct napi_struct *napi, int budget)
1679{
1680	struct send_queue *sq = container_of(napi, struct send_queue, napi);
1681	struct virtnet_info *vi = sq->vq->vdev->priv;
1682	unsigned int index = vq2txq(sq->vq);
1683	struct netdev_queue *txq;
1684	int opaque;
1685	bool done;
1686
1687	if (unlikely(is_xdp_raw_buffer_queue(vi, index))) {
1688		/* We don't need to enable cb for XDP */
1689		napi_complete_done(napi, 0);
1690		return 0;
1691	}
1692
1693	txq = netdev_get_tx_queue(vi->dev, index);
1694	__netif_tx_lock(txq, raw_smp_processor_id());
1695	virtqueue_disable_cb(sq->vq);
1696	free_old_xmit_skbs(sq, true);
1697
1698	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
1699		netif_tx_wake_queue(txq);
1700
1701	opaque = virtqueue_enable_cb_prepare(sq->vq);
1702
1703	done = napi_complete_done(napi, 0);
1704
1705	if (!done)
1706		virtqueue_disable_cb(sq->vq);
1707
1708	__netif_tx_unlock(txq);
1709
1710	if (done) {
1711		if (unlikely(virtqueue_poll(sq->vq, opaque))) {
1712			if (napi_schedule_prep(napi)) {
1713				__netif_tx_lock(txq, raw_smp_processor_id());
1714				virtqueue_disable_cb(sq->vq);
1715				__netif_tx_unlock(txq);
1716				__napi_schedule(napi);
1717			}
1718		}
1719	}
1720
1721	return 0;
1722}
1723
1724static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
1725{
1726	struct virtio_net_hdr_mrg_rxbuf *hdr;
1727	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
1728	struct virtnet_info *vi = sq->vq->vdev->priv;
1729	int num_sg;
1730	unsigned hdr_len = vi->hdr_len;
1731	bool can_push;
1732
1733	pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);
1734
1735	can_push = vi->any_header_sg &&
1736		!((unsigned long)skb->data & (__alignof__(*hdr) - 1)) &&
1737		!skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len;
1738	/* Even if we can, don't push here yet as this would skew
1739	 * csum_start offset below. */
1740	if (can_push)
1741		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(skb->data - hdr_len);
1742	else
1743		hdr = skb_vnet_hdr(skb);
1744
1745	if (virtio_net_hdr_from_skb(skb, &hdr->hdr,
1746				    virtio_is_little_endian(vi->vdev), false,
1747				    0))
1748		return -EPROTO;
1749
1750	if (vi->mergeable_rx_bufs)
1751		hdr->num_buffers = 0;
1752
1753	sg_init_table(sq->sg, skb_shinfo(skb)->nr_frags + (can_push ? 1 : 2));
1754	if (can_push) {
1755		__skb_push(skb, hdr_len);
1756		num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len);
1757		if (unlikely(num_sg < 0))
1758			return num_sg;
1759		/* Pull header back to avoid skew in tx bytes calculations. */
1760		__skb_pull(skb, hdr_len);
1761	} else {
1762		sg_set_buf(sq->sg, hdr, hdr_len);
1763		num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
1764		if (unlikely(num_sg < 0))
1765			return num_sg;
1766		num_sg++;
1767	}
1768	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
1769}
1770
1771static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
1772{
1773	struct virtnet_info *vi = netdev_priv(dev);
1774	int qnum = skb_get_queue_mapping(skb);
1775	struct send_queue *sq = &vi->sq[qnum];
1776	int err;
1777	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
1778	bool kick = !netdev_xmit_more();
1779	bool use_napi = sq->napi.weight;
1780
1781	/* Free up any pending old buffers before queueing new ones. */
1782	do {
1783		if (use_napi)
1784			virtqueue_disable_cb(sq->vq);
1785
1786		free_old_xmit_skbs(sq, false);
1787
1788	} while (use_napi && kick &&
1789	       unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
1790
1791	/* timestamp packet in software */
1792	skb_tx_timestamp(skb);
1793
1794	/* Try to transmit */
1795	err = xmit_skb(sq, skb);
1796
1797	/* This should not happen! */
1798	if (unlikely(err)) {
1799		dev->stats.tx_fifo_errors++;
1800		if (net_ratelimit())
1801			dev_warn(&dev->dev,
1802				 "Unexpected TXQ (%d) queue failure: %d\n",
1803				 qnum, err);
1804		dev->stats.tx_dropped++;
1805		dev_kfree_skb_any(skb);
1806		return NETDEV_TX_OK;
1807	}
1808
1809	/* Don't wait up for transmitted skbs to be freed. */
1810	if (!use_napi) {
1811		skb_orphan(skb);
1812		nf_reset_ct(skb);
1813	}
1814
1815	/* If running out of space, stop queue to avoid getting packets that we
1816	 * are then unable to transmit.
1817	 * An alternative would be to force queuing layer to requeue the skb by
1818	 * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be
1819	 * returned in a normal path of operation: it means that driver is not
1820	 * maintaining the TX queue stop/start state properly, and causes
1821	 * the stack to do a non-trivial amount of useless work.
1822	 * Since most packets only take 1 or 2 ring slots, stopping the queue
1823	 * early means 16 slots are typically wasted.
1824	 */
1825	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
1826		netif_stop_subqueue(dev, qnum);
1827		if (!use_napi &&
1828		    unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
1829			/* More just got used, free them then recheck. */
1830			free_old_xmit_skbs(sq, false);
1831			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
1832				netif_start_subqueue(dev, qnum);
1833				virtqueue_disable_cb(sq->vq);
1834			}
1835		}
1836	}
1837
1838	if (kick || netif_xmit_stopped(txq)) {
1839		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
1840			u64_stats_update_begin(&sq->stats.syncp);
1841			sq->stats.kicks++;
1842			u64_stats_update_end(&sq->stats.syncp);
1843		}
1844	}
1845
1846	return NETDEV_TX_OK;
1847}
1848
1849/*
1850 * Send command via the control virtqueue and check status.  Commands
1851 * supported by the hypervisor, as indicated by feature bits, should
1852 * never fail unless improperly formatted.
1853 */
1854static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
1855				 struct scatterlist *out)
1856{
1857	struct scatterlist *sgs[4], hdr, stat;
1858	unsigned out_num = 0, tmp;
1859	int ret;
1860
1861	/* Caller should know better */
1862	BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));
1863
1864	vi->ctrl->status = ~0;
1865	vi->ctrl->hdr.class = class;
1866	vi->ctrl->hdr.cmd = cmd;
1867	/* Add header */
1868	sg_init_one(&hdr, &vi->ctrl->hdr, sizeof(vi->ctrl->hdr));
1869	sgs[out_num++] = &hdr;
1870
1871	if (out)
1872		sgs[out_num++] = out;
1873
1874	/* Add return status. */
1875	sg_init_one(&stat, &vi->ctrl->status, sizeof(vi->ctrl->status));
1876	sgs[out_num] = &stat;
1877
1878	BUG_ON(out_num + 1 > ARRAY_SIZE(sgs));
1879	ret = virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);
1880	if (ret < 0) {
1881		dev_warn(&vi->vdev->dev,
1882			 "Failed to add sgs for command vq: %d\n.", ret);
1883		return false;
1884	}
1885
1886	if (unlikely(!virtqueue_kick(vi->cvq)))
1887		return vi->ctrl->status == VIRTIO_NET_OK;
1888
1889	/* Spin for a response, the kick causes an ioport write, trapping
1890	 * into the hypervisor, so the request should be handled immediately.
1891	 */
1892	while (!virtqueue_get_buf(vi->cvq, &tmp) &&
1893	       !virtqueue_is_broken(vi->cvq))
1894		cpu_relax();
1895
1896	return vi->ctrl->status == VIRTIO_NET_OK;
1897}
1898
1899static int virtnet_set_mac_address(struct net_device *dev, void *p)
1900{
1901	struct virtnet_info *vi = netdev_priv(dev);
1902	struct virtio_device *vdev = vi->vdev;
1903	int ret;
1904	struct sockaddr *addr;
1905	struct scatterlist sg;
1906
1907	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY))
1908		return -EOPNOTSUPP;
1909
1910	addr = kmemdup(p, sizeof(*addr), GFP_KERNEL);
1911	if (!addr)
1912		return -ENOMEM;
1913
1914	ret = eth_prepare_mac_addr_change(dev, addr);
1915	if (ret)
1916		goto out;
1917
1918	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
1919		sg_init_one(&sg, addr->sa_data, dev->addr_len);
1920		if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
1921					  VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) {
1922			dev_warn(&vdev->dev,
1923				 "Failed to set mac address by vq command.\n");
1924			ret = -EINVAL;
1925			goto out;
1926		}
1927	} else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC) &&
1928		   !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
1929		unsigned int i;
1930
1931		/* Naturally, this has an atomicity problem. */
1932		for (i = 0; i < dev->addr_len; i++)
1933			virtio_cwrite8(vdev,
1934				       offsetof(struct virtio_net_config, mac) +
1935				       i, addr->sa_data[i]);
1936	}
1937
1938	eth_commit_mac_addr_change(dev, p);
1939	ret = 0;
1940
1941out:
1942	kfree(addr);
1943	return ret;
1944}
1945
1946static void virtnet_stats(struct net_device *dev,
1947			  struct rtnl_link_stats64 *tot)
1948{
1949	struct virtnet_info *vi = netdev_priv(dev);
1950	unsigned int start;
1951	int i;
1952
1953	for (i = 0; i < vi->max_queue_pairs; i++) {
1954		u64 tpackets, tbytes, terrors, rpackets, rbytes, rdrops;
1955		struct receive_queue *rq = &vi->rq[i];
1956		struct send_queue *sq = &vi->sq[i];
1957
1958		do {
1959			start = u64_stats_fetch_begin_irq(&sq->stats.syncp);
1960			tpackets = sq->stats.packets;
1961			tbytes   = sq->stats.bytes;
1962			terrors  = sq->stats.tx_timeouts;
1963		} while (u64_stats_fetch_retry_irq(&sq->stats.syncp, start));
1964
1965		do {
1966			start = u64_stats_fetch_begin_irq(&rq->stats.syncp);
1967			rpackets = rq->stats.packets;
1968			rbytes   = rq->stats.bytes;
1969			rdrops   = rq->stats.drops;
1970		} while (u64_stats_fetch_retry_irq(&rq->stats.syncp, start));
1971
1972		tot->rx_packets += rpackets;
1973		tot->tx_packets += tpackets;
1974		tot->rx_bytes   += rbytes;
1975		tot->tx_bytes   += tbytes;
1976		tot->rx_dropped += rdrops;
1977		tot->tx_errors  += terrors;
1978	}
1979
1980	tot->tx_dropped = dev->stats.tx_dropped;
1981	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
1982	tot->rx_length_errors = dev->stats.rx_length_errors;
1983	tot->rx_frame_errors = dev->stats.rx_frame_errors;
1984}
1985
1986static void virtnet_ack_link_announce(struct virtnet_info *vi)
1987{
1988	rtnl_lock();
1989	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
1990				  VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL))
1991		dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
1992	rtnl_unlock();
1993}
1994
1995static int _virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
1996{
1997	struct scatterlist sg;
1998	struct net_device *dev = vi->dev;
1999
2000	if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
2001		return 0;
2002
2003	vi->ctrl->mq.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs);
2004	sg_init_one(&sg, &vi->ctrl->mq, sizeof(vi->ctrl->mq));
2005
2006	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
2007				  VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg)) {
2008		dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
2009			 queue_pairs);
2010		return -EINVAL;
2011	} else {
2012		vi->curr_queue_pairs = queue_pairs;
2013		/* virtnet_open() will refill when device is going to up. */
2014		if (dev->flags & IFF_UP)
2015			schedule_delayed_work(&vi->refill, 0);
2016	}
2017
2018	return 0;
2019}
2020
2021static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
2022{
2023	int err;
2024
2025	rtnl_lock();
2026	err = _virtnet_set_queues(vi, queue_pairs);
2027	rtnl_unlock();
2028	return err;
2029}
2030
2031static int virtnet_close(struct net_device *dev)
2032{
2033	struct virtnet_info *vi = netdev_priv(dev);
2034	int i;
2035
2036	/* Make sure refill_work doesn't re-enable napi! */
2037	cancel_delayed_work_sync(&vi->refill);
2038
2039	for (i = 0; i < vi->max_queue_pairs; i++) {
2040		xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
2041		napi_disable(&vi->rq[i].napi);
2042		virtnet_napi_tx_disable(&vi->sq[i].napi);
2043	}
2044
2045	return 0;
2046}
2047
2048static void virtnet_set_rx_mode(struct net_device *dev)
2049{
2050	struct virtnet_info *vi = netdev_priv(dev);
2051	struct scatterlist sg[2];
2052	struct virtio_net_ctrl_mac *mac_data;
2053	struct netdev_hw_addr *ha;
2054	int uc_count;
2055	int mc_count;
2056	void *buf;
2057	int i;
2058
2059	/* We can't dynamically set ndo_set_rx_mode, so return gracefully */
2060	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
2061		return;
2062
2063	vi->ctrl->promisc = ((dev->flags & IFF_PROMISC) != 0);
2064	vi->ctrl->allmulti = ((dev->flags & IFF_ALLMULTI) != 0);
2065
2066	sg_init_one(sg, &vi->ctrl->promisc, sizeof(vi->ctrl->promisc));
2067
2068	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
2069				  VIRTIO_NET_CTRL_RX_PROMISC, sg))
2070		dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
2071			 vi->ctrl->promisc ? "en" : "dis");
2072
2073	sg_init_one(sg, &vi->ctrl->allmulti, sizeof(vi->ctrl->allmulti));
2074
2075	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
2076				  VIRTIO_NET_CTRL_RX_ALLMULTI, sg))
2077		dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
2078			 vi->ctrl->allmulti ? "en" : "dis");
2079
2080	uc_count = netdev_uc_count(dev);
2081	mc_count = netdev_mc_count(dev);
2082	/* MAC filter - use one buffer for both lists */
2083	buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) +
2084		      (2 * sizeof(mac_data->entries)), GFP_ATOMIC);
2085	mac_data = buf;
2086	if (!buf)
2087		return;
2088
2089	sg_init_table(sg, 2);
2090
2091	/* Store the unicast list and count in the front of the buffer */
2092	mac_data->entries = cpu_to_virtio32(vi->vdev, uc_count);
2093	i = 0;
2094	netdev_for_each_uc_addr(ha, dev)
2095		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
2096
2097	sg_set_buf(&sg[0], mac_data,
2098		   sizeof(mac_data->entries) + (uc_count * ETH_ALEN));
2099
2100	/* multicast list and count fill the end */
2101	mac_data = (void *)&mac_data->macs[uc_count][0];
2102
2103	mac_data->entries = cpu_to_virtio32(vi->vdev, mc_count);
2104	i = 0;
2105	netdev_for_each_mc_addr(ha, dev)
2106		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
2107
2108	sg_set_buf(&sg[1], mac_data,
2109		   sizeof(mac_data->entries) + (mc_count * ETH_ALEN));
2110
2111	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
2112				  VIRTIO_NET_CTRL_MAC_TABLE_SET, sg))
2113		dev_warn(&dev->dev, "Failed to set MAC filter table.\n");
2114
2115	kfree(buf);
2116}
2117
2118static int virtnet_vlan_rx_add_vid(struct net_device *dev,
2119				   __be16 proto, u16 vid)
2120{
2121	struct virtnet_info *vi = netdev_priv(dev);
2122	struct scatterlist sg;
2123
2124	vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid);
2125	sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));
2126
2127	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
2128				  VIRTIO_NET_CTRL_VLAN_ADD, &sg))
2129		dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);
2130	return 0;
2131}
2132
2133static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
2134				    __be16 proto, u16 vid)
2135{
2136	struct virtnet_info *vi = netdev_priv(dev);
2137	struct scatterlist sg;
2138
2139	vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid);
2140	sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));
2141
2142	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
2143				  VIRTIO_NET_CTRL_VLAN_DEL, &sg))
2144		dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
2145	return 0;
2146}
2147
2148static void virtnet_clean_affinity(struct virtnet_info *vi)
2149{
2150	int i;
2151
2152	if (vi->affinity_hint_set) {
2153		for (i = 0; i < vi->max_queue_pairs; i++) {
2154			virtqueue_set_affinity(vi->rq[i].vq, NULL);
2155			virtqueue_set_affinity(vi->sq[i].vq, NULL);
2156		}
2157
2158		vi->affinity_hint_set = false;
2159	}
2160}
2161
2162static void virtnet_set_affinity(struct virtnet_info *vi)
2163{
2164	cpumask_var_t mask;
2165	int stragglers;
2166	int group_size;
2167	int i, j, cpu;
2168	int num_cpu;
2169	int stride;
2170
2171	if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
2172		virtnet_clean_affinity(vi);
2173		return;
2174	}
2175
2176	num_cpu = num_online_cpus();
2177	stride = max_t(int, num_cpu / vi->curr_queue_pairs, 1);
2178	stragglers = num_cpu >= vi->curr_queue_pairs ?
2179			num_cpu % vi->curr_queue_pairs :
2180			0;
2181	cpu = cpumask_first(cpu_online_mask);
2182
2183	for (i = 0; i < vi->curr_queue_pairs; i++) {
2184		group_size = stride + (i < stragglers ? 1 : 0);
2185
2186		for (j = 0; j < group_size; j++) {
2187			cpumask_set_cpu(cpu, mask);
2188			cpu = cpumask_next_wrap(cpu, cpu_online_mask,
2189						nr_cpu_ids, false);
2190		}
2191		virtqueue_set_affinity(vi->rq[i].vq, mask);
2192		virtqueue_set_affinity(vi->sq[i].vq, mask);
2193		__netif_set_xps_queue(vi->dev, cpumask_bits(mask), i, XPS_CPUS);
2194		cpumask_clear(mask);
2195	}
2196
2197	vi->affinity_hint_set = true;
2198	free_cpumask_var(mask);
2199}
2200
2201static int virtnet_cpu_online(unsigned int cpu, struct hlist_node *node)
2202{
2203	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
2204						   node);
2205	virtnet_set_affinity(vi);
2206	return 0;
2207}
2208
2209static int virtnet_cpu_dead(unsigned int cpu, struct hlist_node *node)
2210{
2211	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
2212						   node_dead);
2213	virtnet_set_affinity(vi);
2214	return 0;
2215}
2216
2217static int virtnet_cpu_down_prep(unsigned int cpu, struct hlist_node *node)
2218{
2219	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
2220						   node);
2221
2222	virtnet_clean_affinity(vi);
2223	return 0;
2224}
2225
/* CPU hotplug state under which each device's virtnet_info::node instance
 * is registered; it is not assigned anywhere in this part of the file.
 */
static enum cpuhp_state virtionet_online;
2227
2228static int virtnet_cpu_notif_add(struct virtnet_info *vi)
2229{
2230	int ret;
2231
2232	ret = cpuhp_state_add_instance_nocalls(virtionet_online, &vi->node);
2233	if (ret)
2234		return ret;
2235	ret = cpuhp_state_add_instance_nocalls(CPUHP_VIRT_NET_DEAD,
2236					       &vi->node_dead);
2237	if (!ret)
2238		return ret;
2239	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
2240	return ret;
2241}
2242
2243static void virtnet_cpu_notif_remove(struct virtnet_info *vi)
2244{
2245	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
2246	cpuhp_state_remove_instance_nocalls(CPUHP_VIRT_NET_DEAD,
2247					    &vi->node_dead);
2248}
2249
2250static void virtnet_get_ringparam(struct net_device *dev,
2251				  struct ethtool_ringparam *ring,
2252				  struct kernel_ethtool_ringparam *kernel_ring,
2253				  struct netlink_ext_ack *extack)
2254{
2255	struct virtnet_info *vi = netdev_priv(dev);
2256
2257	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
2258	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
2259	ring->rx_pending = ring->rx_max_pending;
2260	ring->tx_pending = ring->tx_max_pending;
2261}
2262
2263static bool virtnet_commit_rss_command(struct virtnet_info *vi)
2264{
2265	struct net_device *dev = vi->dev;
2266	struct scatterlist sgs[4];
2267	unsigned int sg_buf_size;
2268
2269	/* prepare sgs */
2270	sg_init_table(sgs, 4);
2271
2272	sg_buf_size = offsetof(struct virtio_net_ctrl_rss, indirection_table);
2273	sg_set_buf(&sgs[0], &vi->ctrl->rss, sg_buf_size);
2274
2275	sg_buf_size = sizeof(uint16_t) * (vi->ctrl->rss.indirection_table_mask + 1);
2276	sg_set_buf(&sgs[1], vi->ctrl->rss.indirection_table, sg_buf_size);
2277
2278	sg_buf_size = offsetof(struct virtio_net_ctrl_rss, key)
2279			- offsetof(struct virtio_net_ctrl_rss, max_tx_vq);
2280	sg_set_buf(&sgs[2], &vi->ctrl->rss.max_tx_vq, sg_buf_size);
2281
2282	sg_buf_size = vi->rss_key_size;
2283	sg_set_buf(&sgs[3], vi->ctrl->rss.key, sg_buf_size);
2284
2285	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
2286				  vi->has_rss ? VIRTIO_NET_CTRL_MQ_RSS_CONFIG
2287				  : VIRTIO_NET_CTRL_MQ_HASH_CONFIG, sgs)) {
2288		dev_warn(&dev->dev, "VIRTIONET issue with committing RSS sgs\n");
2289		return false;
2290	}
2291	return true;
2292}
2293
2294static void virtnet_init_default_rss(struct virtnet_info *vi)
2295{
2296	u32 indir_val = 0;
2297	int i = 0;
2298
2299	vi->ctrl->rss.hash_types = vi->rss_hash_types_supported;
2300	vi->rss_hash_types_saved = vi->rss_hash_types_supported;
2301	vi->ctrl->rss.indirection_table_mask = vi->rss_indir_table_size
2302						? vi->rss_indir_table_size - 1 : 0;
2303	vi->ctrl->rss.unclassified_queue = 0;
2304
2305	for (; i < vi->rss_indir_table_size; ++i) {
2306		indir_val = ethtool_rxfh_indir_default(i, vi->curr_queue_pairs);
2307		vi->ctrl->rss.indirection_table[i] = indir_val;
2308	}
2309
2310	vi->ctrl->rss.max_tx_vq = vi->curr_queue_pairs;
2311	vi->ctrl->rss.hash_key_length = vi->rss_key_size;
2312
2313	netdev_rss_key_fill(vi->ctrl->rss.key, vi->rss_key_size);
2314}
2315
2316static void virtnet_get_hashflow(const struct virtnet_info *vi, struct ethtool_rxnfc *info)
2317{
2318	info->data = 0;
2319	switch (info->flow_type) {
2320	case TCP_V4_FLOW:
2321		if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_TCPv4) {
2322			info->data = RXH_IP_SRC | RXH_IP_DST |
2323						 RXH_L4_B_0_1 | RXH_L4_B_2_3;
2324		} else if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv4) {
2325			info->data = RXH_IP_SRC | RXH_IP_DST;
2326		}
2327		break;
2328	case TCP_V6_FLOW:
2329		if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_TCPv6) {
2330			info->data = RXH_IP_SRC | RXH_IP_DST |
2331						 RXH_L4_B_0_1 | RXH_L4_B_2_3;
2332		} else if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv6) {
2333			info->data = RXH_IP_SRC | RXH_IP_DST;
2334		}
2335		break;
2336	case UDP_V4_FLOW:
2337		if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_UDPv4) {
2338			info->data = RXH_IP_SRC | RXH_IP_DST |
2339						 RXH_L4_B_0_1 | RXH_L4_B_2_3;
2340		} else if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv4) {
2341			info->data = RXH_IP_SRC | RXH_IP_DST;
2342		}
2343		break;
2344	case UDP_V6_FLOW:
2345		if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_UDPv6) {
2346			info->data = RXH_IP_SRC | RXH_IP_DST |
2347						 RXH_L4_B_0_1 | RXH_L4_B_2_3;
2348		} else if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv6) {
2349			info->data = RXH_IP_SRC | RXH_IP_DST;
2350		}
2351		break;
2352	case IPV4_FLOW:
2353		if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv4)
2354			info->data = RXH_IP_SRC | RXH_IP_DST;
2355
2356		break;
2357	case IPV6_FLOW:
2358		if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv6)
2359			info->data = RXH_IP_SRC | RXH_IP_DST;
2360
2361		break;
2362	default:
2363		info->data = 0;
2364		break;
2365	}
2366}
2367
2368static bool virtnet_set_hashflow(struct virtnet_info *vi, struct ethtool_rxnfc *info)
2369{
2370	u32 new_hashtypes = vi->rss_hash_types_saved;
2371	bool is_disable = info->data & RXH_DISCARD;
2372	bool is_l4 = info->data == (RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 | RXH_L4_B_2_3);
2373
2374	/* supports only 'sd', 'sdfn' and 'r' */
2375	if (!((info->data == (RXH_IP_SRC | RXH_IP_DST)) | is_l4 | is_disable))
2376		return false;
2377
2378	switch (info->flow_type) {
2379	case TCP_V4_FLOW:
2380		new_hashtypes &= ~(VIRTIO_NET_RSS_HASH_TYPE_IPv4 | VIRTIO_NET_RSS_HASH_TYPE_TCPv4);
2381		if (!is_disable)
2382			new_hashtypes |= VIRTIO_NET_RSS_HASH_TYPE_IPv4
2383				| (is_l4 ? VIRTIO_NET_RSS_HASH_TYPE_TCPv4 : 0);
2384		break;
2385	case UDP_V4_FLOW:
2386		new_hashtypes &= ~(VIRTIO_NET_RSS_HASH_TYPE_IPv4 | VIRTIO_NET_RSS_HASH_TYPE_UDPv4);
2387		if (!is_disable)
2388			new_hashtypes |= VIRTIO_NET_RSS_HASH_TYPE_IPv4
2389				| (is_l4 ? VIRTIO_NET_RSS_HASH_TYPE_UDPv4 : 0);
2390		break;
2391	case IPV4_FLOW:
2392		new_hashtypes &= ~VIRTIO_NET_RSS_HASH_TYPE_IPv4;
2393		if (!is_disable)
2394			new_hashtypes = VIRTIO_NET_RSS_HASH_TYPE_IPv4;
2395		break;
2396	case TCP_V6_FLOW:
2397		new_hashtypes &= ~(VIRTIO_NET_RSS_HASH_TYPE_IPv6 | VIRTIO_NET_RSS_HASH_TYPE_TCPv6);
2398		if (!is_disable)
2399			new_hashtypes |= VIRTIO_NET_RSS_HASH_TYPE_IPv6
2400				| (is_l4 ? VIRTIO_NET_RSS_HASH_TYPE_TCPv6 : 0);
2401		break;
2402	case UDP_V6_FLOW:
2403		new_hashtypes &= ~(VIRTIO_NET_RSS_HASH_TYPE_IPv6 | VIRTIO_NET_RSS_HASH_TYPE_UDPv6);
2404		if (!is_disable)
2405			new_hashtypes |= VIRTIO_NET_RSS_HASH_TYPE_IPv6
2406				| (is_l4 ? VIRTIO_NET_RSS_HASH_TYPE_UDPv6 : 0);
2407		break;
2408	case IPV6_FLOW:
2409		new_hashtypes &= ~VIRTIO_NET_RSS_HASH_TYPE_IPv6;
2410		if (!is_disable)
2411			new_hashtypes = VIRTIO_NET_RSS_HASH_TYPE_IPv6;
2412		break;
2413	default:
2414		/* unsupported flow */
2415		return false;
2416	}
2417
2418	/* if unsupported hashtype was set */
2419	if (new_hashtypes != (new_hashtypes & vi->rss_hash_types_supported))
2420		return false;
2421
2422	if (new_hashtypes != vi->rss_hash_types_saved) {
2423		vi->rss_hash_types_saved = new_hashtypes;
2424		vi->ctrl->rss.hash_types = vi->rss_hash_types_saved;
2425		if (vi->dev->features & NETIF_F_RXHASH)
2426			return virtnet_commit_rss_command(vi);
2427	}
2428
2429	return true;
2430}
2431
2432static void virtnet_get_drvinfo(struct net_device *dev,
2433				struct ethtool_drvinfo *info)
2434{
2435	struct virtnet_info *vi = netdev_priv(dev);
2436	struct virtio_device *vdev = vi->vdev;
2437
2438	strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
2439	strlcpy(info->version, VIRTNET_DRIVER_VERSION, sizeof(info->version));
2440	strlcpy(info->bus_info, virtio_bus_name(vdev), sizeof(info->bus_info));
2441
2442}
2443
2444/* TODO: Eliminate OOO packets during switching */
2445static int virtnet_set_channels(struct net_device *dev,
2446				struct ethtool_channels *channels)
2447{
2448	struct virtnet_info *vi = netdev_priv(dev);
2449	u16 queue_pairs = channels->combined_count;
2450	int err;
2451
2452	/* We don't support separate rx/tx channels.
2453	 * We don't allow setting 'other' channels.
2454	 */
2455	if (channels->rx_count || channels->tx_count || channels->other_count)
2456		return -EINVAL;
2457
2458	if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0)
2459		return -EINVAL;
2460
2461	/* For now we don't support modifying channels while XDP is loaded
2462	 * also when XDP is loaded all RX queues have XDP programs so we only
2463	 * need to check a single RX queue.
2464	 */
2465	if (vi->rq[0].xdp_prog)
2466		return -EINVAL;
2467
2468	cpus_read_lock();
2469	err = _virtnet_set_queues(vi, queue_pairs);
2470	if (err) {
2471		cpus_read_unlock();
2472		goto err;
2473	}
2474	virtnet_set_affinity(vi);
2475	cpus_read_unlock();
2476
2477	netif_set_real_num_tx_queues(dev, queue_pairs);
2478	netif_set_real_num_rx_queues(dev, queue_pairs);
2479 err:
2480	return err;
2481}
2482
2483static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
2484{
2485	struct virtnet_info *vi = netdev_priv(dev);
2486	unsigned int i, j;
2487	u8 *p = data;
2488
2489	switch (stringset) {
2490	case ETH_SS_STATS:
2491		for (i = 0; i < vi->curr_queue_pairs; i++) {
2492			for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++)
2493				ethtool_sprintf(&p, "rx_queue_%u_%s", i,
2494						virtnet_rq_stats_desc[j].desc);
2495		}
2496
2497		for (i = 0; i < vi->curr_queue_pairs; i++) {
2498			for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++)
2499				ethtool_sprintf(&p, "tx_queue_%u_%s", i,
2500						virtnet_sq_stats_desc[j].desc);
2501		}
2502		break;
2503	}
2504}
2505
2506static int virtnet_get_sset_count(struct net_device *dev, int sset)
2507{
2508	struct virtnet_info *vi = netdev_priv(dev);
2509
2510	switch (sset) {
2511	case ETH_SS_STATS:
2512		return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
2513					       VIRTNET_SQ_STATS_LEN);
2514	default:
2515		return -EOPNOTSUPP;
2516	}
2517}
2518
2519static void virtnet_get_ethtool_stats(struct net_device *dev,
2520				      struct ethtool_stats *stats, u64 *data)
2521{
2522	struct virtnet_info *vi = netdev_priv(dev);
2523	unsigned int idx = 0, start, i, j;
2524	const u8 *stats_base;
2525	size_t offset;
2526
2527	for (i = 0; i < vi->curr_queue_pairs; i++) {
2528		struct receive_queue *rq = &vi->rq[i];
2529
2530		stats_base = (u8 *)&rq->stats;
2531		do {
2532			start = u64_stats_fetch_begin_irq(&rq->stats.syncp);
2533			for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++) {
2534				offset = virtnet_rq_stats_desc[j].offset;
2535				data[idx + j] = *(u64 *)(stats_base + offset);
2536			}
2537		} while (u64_stats_fetch_retry_irq(&rq->stats.syncp, start));
2538		idx += VIRTNET_RQ_STATS_LEN;
2539	}
2540
2541	for (i = 0; i < vi->curr_queue_pairs; i++) {
2542		struct send_queue *sq = &vi->sq[i];
2543
2544		stats_base = (u8 *)&sq->stats;
2545		do {
2546			start = u64_stats_fetch_begin_irq(&sq->stats.syncp);
2547			for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++) {
2548				offset = virtnet_sq_stats_desc[j].offset;
2549				data[idx + j] = *(u64 *)(stats_base + offset);
2550			}
2551		} while (u64_stats_fetch_retry_irq(&sq->stats.syncp, start));
2552		idx += VIRTNET_SQ_STATS_LEN;
2553	}
2554}
2555
2556static void virtnet_get_channels(struct net_device *dev,
2557				 struct ethtool_channels *channels)
2558{
2559	struct virtnet_info *vi = netdev_priv(dev);
2560
2561	channels->combined_count = vi->curr_queue_pairs;
2562	channels->max_combined = vi->max_queue_pairs;
2563	channels->max_other = 0;
2564	channels->rx_count = 0;
2565	channels->tx_count = 0;
2566	channels->other_count = 0;
2567}
2568
2569static int virtnet_set_link_ksettings(struct net_device *dev,
2570				      const struct ethtool_link_ksettings *cmd)
2571{
2572	struct virtnet_info *vi = netdev_priv(dev);
2573
2574	return ethtool_virtdev_set_link_ksettings(dev, cmd,
2575						  &vi->speed, &vi->duplex);
2576}
2577
2578static int virtnet_get_link_ksettings(struct net_device *dev,
2579				      struct ethtool_link_ksettings *cmd)
2580{
2581	struct virtnet_info *vi = netdev_priv(dev);
2582
2583	cmd->base.speed = vi->speed;
2584	cmd->base.duplex = vi->duplex;
2585	cmd->base.port = PORT_OTHER;
2586
2587	return 0;
2588}
2589
2590static int virtnet_set_coalesce(struct net_device *dev,
2591				struct ethtool_coalesce *ec,
2592				struct kernel_ethtool_coalesce *kernel_coal,
2593				struct netlink_ext_ack *extack)
2594{
2595	struct virtnet_info *vi = netdev_priv(dev);
2596	int i, napi_weight;
2597
2598	if (ec->tx_max_coalesced_frames > 1 ||
2599	    ec->rx_max_coalesced_frames != 1)
2600		return -EINVAL;
2601
2602	napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
2603	if (napi_weight ^ vi->sq[0].napi.weight) {
2604		if (dev->flags & IFF_UP)
2605			return -EBUSY;
2606		for (i = 0; i < vi->max_queue_pairs; i++)
2607			vi->sq[i].napi.weight = napi_weight;
2608	}
2609
2610	return 0;
2611}
2612
2613static int virtnet_get_coalesce(struct net_device *dev,
2614				struct ethtool_coalesce *ec,
2615				struct kernel_ethtool_coalesce *kernel_coal,
2616				struct netlink_ext_ack *extack)
2617{
2618	struct ethtool_coalesce ec_default = {
2619		.cmd = ETHTOOL_GCOALESCE,
2620		.rx_max_coalesced_frames = 1,
2621	};
2622	struct virtnet_info *vi = netdev_priv(dev);
2623
2624	memcpy(ec, &ec_default, sizeof(ec_default));
2625
2626	if (vi->sq[0].napi.weight)
2627		ec->tx_max_coalesced_frames = 1;
2628
2629	return 0;
2630}
2631
2632static void virtnet_init_settings(struct net_device *dev)
2633{
2634	struct virtnet_info *vi = netdev_priv(dev);
2635
2636	vi->speed = SPEED_UNKNOWN;
2637	vi->duplex = DUPLEX_UNKNOWN;
2638}
2639
2640static void virtnet_update_settings(struct virtnet_info *vi)
2641{
2642	u32 speed;
2643	u8 duplex;
2644
2645	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX))
2646		return;
2647
2648	virtio_cread_le(vi->vdev, struct virtio_net_config, speed, &speed);
2649
2650	if (ethtool_validate_speed(speed))
2651		vi->speed = speed;
2652
2653	virtio_cread_le(vi->vdev, struct virtio_net_config, duplex, &duplex);
2654
2655	if (ethtool_validate_duplex(duplex))
2656		vi->duplex = duplex;
2657}
2658
2659static u32 virtnet_get_rxfh_key_size(struct net_device *dev)
2660{
2661	return ((struct virtnet_info *)netdev_priv(dev))->rss_key_size;
2662}
2663
2664static u32 virtnet_get_rxfh_indir_size(struct net_device *dev)
2665{
2666	return ((struct virtnet_info *)netdev_priv(dev))->rss_indir_table_size;
2667}
2668
2669static int virtnet_get_rxfh(struct net_device *dev, u32 *indir, u8 *key, u8 *hfunc)
2670{
2671	struct virtnet_info *vi = netdev_priv(dev);
2672	int i;
2673
2674	if (indir) {
2675		for (i = 0; i < vi->rss_indir_table_size; ++i)
2676			indir[i] = vi->ctrl->rss.indirection_table[i];
2677	}
2678
2679	if (key)
2680		memcpy(key, vi->ctrl->rss.key, vi->rss_key_size);
2681
2682	if (hfunc)
2683		*hfunc = ETH_RSS_HASH_TOP;
2684
2685	return 0;
2686}
2687
2688static int virtnet_set_rxfh(struct net_device *dev, const u32 *indir, const u8 *key, const u8 hfunc)
2689{
2690	struct virtnet_info *vi = netdev_priv(dev);
2691	int i;
2692
2693	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
2694		return -EOPNOTSUPP;
2695
2696	if (indir) {
2697		for (i = 0; i < vi->rss_indir_table_size; ++i)
2698			vi->ctrl->rss.indirection_table[i] = indir[i];
2699	}
2700	if (key)
2701		memcpy(vi->ctrl->rss.key, key, vi->rss_key_size);
2702
2703	virtnet_commit_rss_command(vi);
2704
2705	return 0;
2706}
2707
2708static int virtnet_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info, u32 *rule_locs)
2709{
2710	struct virtnet_info *vi = netdev_priv(dev);
2711	int rc = 0;
2712
2713	switch (info->cmd) {
2714	case ETHTOOL_GRXRINGS:
2715		info->data = vi->curr_queue_pairs;
2716		break;
2717	case ETHTOOL_GRXFH:
2718		virtnet_get_hashflow(vi, info);
2719		break;
2720	default:
2721		rc = -EOPNOTSUPP;
2722	}
2723
2724	return rc;
2725}
2726
2727static int virtnet_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info)
2728{
2729	struct virtnet_info *vi = netdev_priv(dev);
2730	int rc = 0;
2731
2732	switch (info->cmd) {
2733	case ETHTOOL_SRXFH:
2734		if (!virtnet_set_hashflow(vi, info))
2735			rc = -EINVAL;
2736
2737		break;
2738	default:
2739		rc = -EOPNOTSUPP;
2740	}
2741
2742	return rc;
2743}
2744
2745static const struct ethtool_ops virtnet_ethtool_ops = {
2746	.supported_coalesce_params = ETHTOOL_COALESCE_MAX_FRAMES,
2747	.get_drvinfo = virtnet_get_drvinfo,
2748	.get_link = ethtool_op_get_link,
2749	.get_ringparam = virtnet_get_ringparam,
2750	.get_strings = virtnet_get_strings,
2751	.get_sset_count = virtnet_get_sset_count,
2752	.get_ethtool_stats = virtnet_get_ethtool_stats,
2753	.set_channels = virtnet_set_channels,
2754	.get_channels = virtnet_get_channels,
2755	.get_ts_info = ethtool_op_get_ts_info,
2756	.get_link_ksettings = virtnet_get_link_ksettings,
2757	.set_link_ksettings = virtnet_set_link_ksettings,
2758	.set_coalesce = virtnet_set_coalesce,
2759	.get_coalesce = virtnet_get_coalesce,
2760	.get_rxfh_key_size = virtnet_get_rxfh_key_size,
2761	.get_rxfh_indir_size = virtnet_get_rxfh_indir_size,
2762	.get_rxfh = virtnet_get_rxfh,
2763	.set_rxfh = virtnet_set_rxfh,
2764	.get_rxnfc = virtnet_get_rxnfc,
2765	.set_rxnfc = virtnet_set_rxnfc,
2766};
2767
2768static void virtnet_freeze_down(struct virtio_device *vdev)
2769{
2770	struct virtnet_info *vi = vdev->priv;
2771
2772	/* Make sure no work handler is accessing the device */
2773	flush_work(&vi->config_work);
2774
2775	netif_tx_lock_bh(vi->dev);
2776	netif_device_detach(vi->dev);
2777	netif_tx_unlock_bh(vi->dev);
2778	if (netif_running(vi->dev))
2779		virtnet_close(vi->dev);
2780}
2781
2782static int init_vqs(struct virtnet_info *vi);
2783
2784static int virtnet_restore_up(struct virtio_device *vdev)
2785{
2786	struct virtnet_info *vi = vdev->priv;
2787	int err;
2788
2789	err = init_vqs(vi);
2790	if (err)
2791		return err;
2792
2793	virtio_device_ready(vdev);
2794
2795	if (netif_running(vi->dev)) {
2796		err = virtnet_open(vi->dev);
2797		if (err)
2798			return err;
2799	}
2800
2801	netif_tx_lock_bh(vi->dev);
2802	netif_device_attach(vi->dev);
2803	netif_tx_unlock_bh(vi->dev);
2804	return err;
2805}
2806
2807static int virtnet_set_guest_offloads(struct virtnet_info *vi, u64 offloads)
2808{
2809	struct scatterlist sg;
2810	vi->ctrl->offloads = cpu_to_virtio64(vi->vdev, offloads);
2811
2812	sg_init_one(&sg, &vi->ctrl->offloads, sizeof(vi->ctrl->offloads));
2813
2814	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_GUEST_OFFLOADS,
2815				  VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET, &sg)) {
2816		dev_warn(&vi->dev->dev, "Fail to set guest offload.\n");
2817		return -EINVAL;
2818	}
2819
2820	return 0;
2821}
2822
2823static int virtnet_clear_guest_offloads(struct virtnet_info *vi)
2824{
2825	u64 offloads = 0;
2826
2827	if (!vi->guest_offloads)
2828		return 0;
2829
2830	return virtnet_set_guest_offloads(vi, offloads);
2831}
2832
2833static int virtnet_restore_guest_offloads(struct virtnet_info *vi)
2834{
2835	u64 offloads = vi->guest_offloads;
2836
2837	if (!vi->guest_offloads)
2838		return 0;
2839
2840	return virtnet_set_guest_offloads(vi, offloads);
2841}
2842
2843static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
2844			   struct netlink_ext_ack *extack)
2845{
2846	unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
2847	struct virtnet_info *vi = netdev_priv(dev);
2848	struct bpf_prog *old_prog;
2849	u16 xdp_qp = 0, curr_qp;
2850	int i, err;
2851
2852	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)
2853	    && (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
2854	        virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
2855	        virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
2856		virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO) ||
2857		virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM))) {
2858		NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing GRO_HW/CSUM, disable GRO_HW/CSUM first");
2859		return -EOPNOTSUPP;
2860	}
2861
2862	if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
2863		NL_SET_ERR_MSG_MOD(extack, "XDP expects header/data in single page, any_header_sg required");
2864		return -EINVAL;
2865	}
2866
2867	if (dev->mtu > max_sz) {
2868		NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP");
2869		netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz);
2870		return -EINVAL;
2871	}
2872
2873	curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
2874	if (prog)
2875		xdp_qp = nr_cpu_ids;
2876
2877	/* XDP requires extra queues for XDP_TX */
2878	if (curr_qp + xdp_qp > vi->max_queue_pairs) {
2879		netdev_warn_once(dev, "XDP request %i queues but max is %i. XDP_TX and XDP_REDIRECT will operate in a slower locked tx mode.\n",
2880				 curr_qp + xdp_qp, vi->max_queue_pairs);
2881		xdp_qp = 0;
2882	}
2883
2884	old_prog = rtnl_dereference(vi->rq[0].xdp_prog);
2885	if (!prog && !old_prog)
2886		return 0;
2887
2888	if (prog)
2889		bpf_prog_add(prog, vi->max_queue_pairs - 1);
2890
2891	/* Make sure NAPI is not using any XDP TX queues for RX. */
2892	if (netif_running(dev)) {
2893		for (i = 0; i < vi->max_queue_pairs; i++) {
2894			napi_disable(&vi->rq[i].napi);
2895			virtnet_napi_tx_disable(&vi->sq[i].napi);
2896		}
2897	}
2898
2899	if (!prog) {
2900		for (i = 0; i < vi->max_queue_pairs; i++) {
2901			rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
2902			if (i == 0)
2903				virtnet_restore_guest_offloads(vi);
2904		}
2905		synchronize_net();
2906	}
2907
2908	err = _virtnet_set_queues(vi, curr_qp + xdp_qp);
2909	if (err)
2910		goto err;
2911	netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
2912	vi->xdp_queue_pairs = xdp_qp;
2913
2914	if (prog) {
2915		vi->xdp_enabled = true;
2916		for (i = 0; i < vi->max_queue_pairs; i++) {
2917			rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
2918			if (i == 0 && !old_prog)
2919				virtnet_clear_guest_offloads(vi);
2920		}
2921	} else {
2922		vi->xdp_enabled = false;
2923	}
2924
2925	for (i = 0; i < vi->max_queue_pairs; i++) {
2926		if (old_prog)
2927			bpf_prog_put(old_prog);
2928		if (netif_running(dev)) {
2929			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
2930			virtnet_napi_tx_enable(vi, vi->sq[i].vq,
2931					       &vi->sq[i].napi);
2932		}
2933	}
2934
2935	return 0;
2936
2937err:
2938	if (!prog) {
2939		virtnet_clear_guest_offloads(vi);
2940		for (i = 0; i < vi->max_queue_pairs; i++)
2941			rcu_assign_pointer(vi->rq[i].xdp_prog, old_prog);
2942	}
2943
2944	if (netif_running(dev)) {
2945		for (i = 0; i < vi->max_queue_pairs; i++) {
2946			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
2947			virtnet_napi_tx_enable(vi, vi->sq[i].vq,
2948					       &vi->sq[i].napi);
2949		}
2950	}
2951	if (prog)
2952		bpf_prog_sub(prog, vi->max_queue_pairs - 1);
2953	return err;
2954}
2955
2956static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
2957{
2958	switch (xdp->command) {
2959	case XDP_SETUP_PROG:
2960		return virtnet_xdp_set(dev, xdp->prog, xdp->extack);
2961	default:
2962		return -EINVAL;
2963	}
2964}
2965
2966static int virtnet_get_phys_port_name(struct net_device *dev, char *buf,
2967				      size_t len)
2968{
2969	struct virtnet_info *vi = netdev_priv(dev);
2970	int ret;
2971
2972	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY))
2973		return -EOPNOTSUPP;
2974
2975	ret = snprintf(buf, len, "sby");
2976	if (ret >= len)
2977		return -EOPNOTSUPP;
2978
2979	return 0;
2980}
2981
2982static int virtnet_set_features(struct net_device *dev,
2983				netdev_features_t features)
2984{
2985	struct virtnet_info *vi = netdev_priv(dev);
2986	u64 offloads;
2987	int err;
2988
2989	if ((dev->features ^ features) & NETIF_F_GRO_HW) {
2990		if (vi->xdp_enabled)
2991			return -EBUSY;
2992
2993		if (features & NETIF_F_GRO_HW)
2994			offloads = vi->guest_offloads_capable;
2995		else
2996			offloads = vi->guest_offloads_capable &
2997				   ~GUEST_OFFLOAD_GRO_HW_MASK;
2998
2999		err = virtnet_set_guest_offloads(vi, offloads);
3000		if (err)
3001			return err;
3002		vi->guest_offloads = offloads;
3003	}
3004
3005	if ((dev->features ^ features) & NETIF_F_RXHASH) {
3006		if (features & NETIF_F_RXHASH)
3007			vi->ctrl->rss.hash_types = vi->rss_hash_types_saved;
3008		else
3009			vi->ctrl->rss.hash_types = VIRTIO_NET_HASH_REPORT_NONE;
3010
3011		if (!virtnet_commit_rss_command(vi))
3012			return -EINVAL;
3013	}
3014
3015	return 0;
3016}
3017
3018static void virtnet_tx_timeout(struct net_device *dev, unsigned int txqueue)
3019{
3020	struct virtnet_info *priv = netdev_priv(dev);
3021	struct send_queue *sq = &priv->sq[txqueue];
3022	struct netdev_queue *txq = netdev_get_tx_queue(dev, txqueue);
3023
3024	u64_stats_update_begin(&sq->stats.syncp);
3025	sq->stats.tx_timeouts++;
3026	u64_stats_update_end(&sq->stats.syncp);
3027
3028	netdev_err(dev, "TX timeout on queue: %u, sq: %s, vq: 0x%x, name: %s, %u usecs ago\n",
3029		   txqueue, sq->name, sq->vq->index, sq->vq->name,
3030		   jiffies_to_usecs(jiffies - READ_ONCE(txq->trans_start)));
3031}
3032
3033static const struct net_device_ops virtnet_netdev = {
3034	.ndo_open            = virtnet_open,
3035	.ndo_stop   	     = virtnet_close,
3036	.ndo_start_xmit      = start_xmit,
3037	.ndo_validate_addr   = eth_validate_addr,
3038	.ndo_set_mac_address = virtnet_set_mac_address,
3039	.ndo_set_rx_mode     = virtnet_set_rx_mode,
3040	.ndo_get_stats64     = virtnet_stats,
3041	.ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
3042	.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
3043	.ndo_bpf		= virtnet_xdp,
3044	.ndo_xdp_xmit		= virtnet_xdp_xmit,
3045	.ndo_features_check	= passthru_features_check,
3046	.ndo_get_phys_port_name	= virtnet_get_phys_port_name,
3047	.ndo_set_features	= virtnet_set_features,
3048	.ndo_tx_timeout		= virtnet_tx_timeout,
3049};
3050
3051static void virtnet_config_changed_work(struct work_struct *work)
3052{
3053	struct virtnet_info *vi =
3054		container_of(work, struct virtnet_info, config_work);
3055	u16 v;
3056
3057	if (virtio_cread_feature(vi->vdev, VIRTIO_NET_F_STATUS,
3058				 struct virtio_net_config, status, &v) < 0)
3059		return;
3060
3061	if (v & VIRTIO_NET_S_ANNOUNCE) {
3062		netdev_notify_peers(vi->dev);
3063		virtnet_ack_link_announce(vi);
3064	}
3065
3066	/* Ignore unknown (future) status bits */
3067	v &= VIRTIO_NET_S_LINK_UP;
3068
3069	if (vi->status == v)
3070		return;
3071
3072	vi->status = v;
3073
3074	if (vi->status & VIRTIO_NET_S_LINK_UP) {
3075		virtnet_update_settings(vi);
3076		netif_carrier_on(vi->dev);
3077		netif_tx_wake_all_queues(vi->dev);
3078	} else {
3079		netif_carrier_off(vi->dev);
3080		netif_tx_stop_all_queues(vi->dev);
3081	}
3082}
3083
3084static void virtnet_config_changed(struct virtio_device *vdev)
3085{
3086	struct virtnet_info *vi = vdev->priv;
3087
3088	schedule_work(&vi->config_work);
3089}
3090
3091static void virtnet_free_queues(struct virtnet_info *vi)
3092{
3093	int i;
3094
3095	for (i = 0; i < vi->max_queue_pairs; i++) {
3096		__netif_napi_del(&vi->rq[i].napi);
3097		__netif_napi_del(&vi->sq[i].napi);
3098	}
3099
3100	/* We called __netif_napi_del(),
3101	 * we need to respect an RCU grace period before freeing vi->rq
3102	 */
3103	synchronize_net();
3104
3105	kfree(vi->rq);
3106	kfree(vi->sq);
3107	kfree(vi->ctrl);
3108}
3109
3110static void _free_receive_bufs(struct virtnet_info *vi)
3111{
3112	struct bpf_prog *old_prog;
3113	int i;
3114
3115	for (i = 0; i < vi->max_queue_pairs; i++) {
3116		while (vi->rq[i].pages)
3117			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);
3118
3119		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
3120		RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
3121		if (old_prog)
3122			bpf_prog_put(old_prog);
3123	}
3124}
3125
/* RTNL-taking wrapper around _free_receive_bufs(). */
static void free_receive_bufs(struct virtnet_info *vi)
{
	rtnl_lock();

	_free_receive_bufs(vi);

	rtnl_unlock();
}
3132
3133static void free_receive_page_frags(struct virtnet_info *vi)
3134{
3135	int i;
3136	for (i = 0; i < vi->max_queue_pairs; i++)
3137		if (vi->rq[i].alloc_frag.page)
3138			put_page(vi->rq[i].alloc_frag.page);
3139}
3140
3141static void free_unused_bufs(struct virtnet_info *vi)
3142{
3143	void *buf;
3144	int i;
3145
3146	for (i = 0; i < vi->max_queue_pairs; i++) {
3147		struct virtqueue *vq = vi->sq[i].vq;
3148		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
3149			if (!is_xdp_frame(buf))
3150				dev_kfree_skb(buf);
3151			else
3152				xdp_return_frame(ptr_to_xdp(buf));
3153		}
3154	}
3155
3156	for (i = 0; i < vi->max_queue_pairs; i++) {
3157		struct virtqueue *vq = vi->rq[i].vq;
3158
3159		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
3160			if (vi->mergeable_rx_bufs) {
3161				put_page(virt_to_head_page(buf));
3162			} else if (vi->big_packets) {
3163				give_pages(&vi->rq[i], buf);
3164			} else {
3165				put_page(virt_to_head_page(buf));
3166			}
3167		}
3168	}
3169}
3170
3171static void virtnet_del_vqs(struct virtnet_info *vi)
3172{
3173	struct virtio_device *vdev = vi->vdev;
3174
3175	virtnet_clean_affinity(vi);
3176
3177	vdev->config->del_vqs(vdev);
3178
3179	virtnet_free_queues(vi);
3180}
3181
3182/* How large should a single buffer be so a queue full of these can fit at
3183 * least one full packet?
3184 * Logic below assumes the mergeable buffer header is used.
3185 */
3186static unsigned int mergeable_min_buf_len(struct virtnet_info *vi, struct virtqueue *vq)
3187{
3188	const unsigned int hdr_len = vi->hdr_len;
3189	unsigned int rq_size = virtqueue_get_vring_size(vq);
3190	unsigned int packet_len = vi->big_packets ? IP_MAX_MTU : vi->dev->max_mtu;
3191	unsigned int buf_len = hdr_len + ETH_HLEN + VLAN_HLEN + packet_len;
3192	unsigned int min_buf_len = DIV_ROUND_UP(buf_len, rq_size);
3193
3194	return max(max(min_buf_len, hdr_len) - hdr_len,
3195		   (unsigned int)GOOD_PACKET_LEN);
3196}
3197
3198static int virtnet_find_vqs(struct virtnet_info *vi)
3199{
3200	vq_callback_t **callbacks;
3201	struct virtqueue **vqs;
3202	int ret = -ENOMEM;
3203	int i, total_vqs;
3204	const char **names;
3205	bool *ctx;
3206
3207	/* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by
3208	 * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by
3209	 * possible control vq.
3210	 */
3211	total_vqs = vi->max_queue_pairs * 2 +
3212		    virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);
3213
3214	/* Allocate space for find_vqs parameters */
3215	vqs = kcalloc(total_vqs, sizeof(*vqs), GFP_KERNEL);
3216	if (!vqs)
3217		goto err_vq;
3218	callbacks = kmalloc_array(total_vqs, sizeof(*callbacks), GFP_KERNEL);
3219	if (!callbacks)
3220		goto err_callback;
3221	names = kmalloc_array(total_vqs, sizeof(*names), GFP_KERNEL);
3222	if (!names)
3223		goto err_names;
3224	if (!vi->big_packets || vi->mergeable_rx_bufs) {
3225		ctx = kcalloc(total_vqs, sizeof(*ctx), GFP_KERNEL);
3226		if (!ctx)
3227			goto err_ctx;
3228	} else {
3229		ctx = NULL;
3230	}
3231
3232	/* Parameters for control virtqueue, if any */
3233	if (vi->has_cvq) {
3234		callbacks[total_vqs - 1] = NULL;
3235		names[total_vqs - 1] = "control";
3236	}
3237
3238	/* Allocate/initialize parameters for send/receive virtqueues */
3239	for (i = 0; i < vi->max_queue_pairs; i++) {
3240		callbacks[rxq2vq(i)] = skb_recv_done;
3241		callbacks[txq2vq(i)] = skb_xmit_done;
3242		sprintf(vi->rq[i].name, "input.%d", i);
3243		sprintf(vi->sq[i].name, "output.%d", i);
3244		names[rxq2vq(i)] = vi->rq[i].name;
3245		names[txq2vq(i)] = vi->sq[i].name;
3246		if (ctx)
3247			ctx[rxq2vq(i)] = true;
3248	}
3249
3250	ret = virtio_find_vqs_ctx(vi->vdev, total_vqs, vqs, callbacks,
3251				  names, ctx, NULL);
3252	if (ret)
3253		goto err_find;
3254
3255	if (vi->has_cvq) {
3256		vi->cvq = vqs[total_vqs - 1];
3257		if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
3258			vi->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
3259	}
3260
3261	for (i = 0; i < vi->max_queue_pairs; i++) {
3262		vi->rq[i].vq = vqs[rxq2vq(i)];
3263		vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
3264		vi->sq[i].vq = vqs[txq2vq(i)];
3265	}
3266
3267	/* run here: ret == 0. */
3268
3269
3270err_find:
3271	kfree(ctx);
3272err_ctx:
3273	kfree(names);
3274err_names:
3275	kfree(callbacks);
3276err_callback:
3277	kfree(vqs);
3278err_vq:
3279	return ret;
3280}
3281
3282static int virtnet_alloc_queues(struct virtnet_info *vi)
3283{
3284	int i;
3285
3286	if (vi->has_cvq) {
3287		vi->ctrl = kzalloc(sizeof(*vi->ctrl), GFP_KERNEL);
3288		if (!vi->ctrl)
3289			goto err_ctrl;
3290	} else {
3291		vi->ctrl = NULL;
3292	}
3293	vi->sq = kcalloc(vi->max_queue_pairs, sizeof(*vi->sq), GFP_KERNEL);
3294	if (!vi->sq)
3295		goto err_sq;
3296	vi->rq = kcalloc(vi->max_queue_pairs, sizeof(*vi->rq), GFP_KERNEL);
3297	if (!vi->rq)
3298		goto err_rq;
3299
3300	INIT_DELAYED_WORK(&vi->refill, refill_work);
3301	for (i = 0; i < vi->max_queue_pairs; i++) {
3302		vi->rq[i].pages = NULL;
3303		netif_napi_add_weight(vi->dev, &vi->rq[i].napi, virtnet_poll,
3304				      napi_weight);
3305		netif_napi_add_tx_weight(vi->dev, &vi->sq[i].napi,
3306					 virtnet_poll_tx,
3307					 napi_tx ? napi_weight : 0);
3308
3309		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
3310		ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len);
3311		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
3312
3313		u64_stats_init(&vi->rq[i].stats.syncp);
3314		u64_stats_init(&vi->sq[i].stats.syncp);
3315	}
3316
3317	return 0;
3318
3319err_rq:
3320	kfree(vi->sq);
3321err_sq:
3322	kfree(vi->ctrl);
3323err_ctrl:
3324	return -ENOMEM;
3325}
3326
/* Allocate the queue structures, find the virtqueues and set the initial
 * affinity.
 *
 * Returns 0 on success or a negative errno; on failure nothing allocated
 * here is left behind.
 */
static int init_vqs(struct virtnet_info *vi)
{
	int err;

	/* Send & receive queue structures */
	err = virtnet_alloc_queues(vi);
	if (err)
		return err;

	err = virtnet_find_vqs(vi);
	if (err) {
		virtnet_free_queues(vi);
		return err;
	}

	/* Affinity is set under the CPU hotplug read lock. */
	cpus_read_lock();
	virtnet_set_affinity(vi);
	cpus_read_unlock();

	return 0;
}
3351
3352#ifdef CONFIG_SYSFS
3353static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue,
3354		char *buf)
3355{
3356	struct virtnet_info *vi = netdev_priv(queue->dev);
3357	unsigned int queue_index = get_netdev_rx_queue_index(queue);
3358	unsigned int headroom = virtnet_get_headroom(vi);
3359	unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
3360	struct ewma_pkt_len *avg;
3361
3362	BUG_ON(queue_index >= vi->max_queue_pairs);
3363	avg = &vi->rq[queue_index].mrg_avg_pkt_len;
3364	return sprintf(buf, "%u\n",
3365		       get_mergeable_buf_len(&vi->rq[queue_index], avg,
3366				       SKB_DATA_ALIGN(headroom + tailroom)));
3367}
3368
/* Read-only attribute backed by mergeable_rx_buffer_size_show(). */
static struct rx_queue_attribute mergeable_rx_buffer_size_attribute =
	__ATTR_RO(mergeable_rx_buffer_size);

/* NULL-terminated attribute list for the group below. */
static struct attribute *virtio_net_mrg_rx_attrs[] = {
	&mergeable_rx_buffer_size_attribute.attr,
	NULL
};

/* Attribute group named "virtio_net"; virtnet_probe() installs it as
 * dev->sysfs_rx_queue_group when mergeable RX buffers are in use.
 */
static const struct attribute_group virtio_net_mrg_rx_group = {
	.name = "virtio_net",
	.attrs = virtio_net_mrg_rx_attrs
};
3381#endif
3382
3383static bool virtnet_fail_on_feature(struct virtio_device *vdev,
3384				    unsigned int fbit,
3385				    const char *fname, const char *dname)
3386{
3387	if (!virtio_has_feature(vdev, fbit))
3388		return false;
3389
3390	dev_err(&vdev->dev, "device advertises feature %s but not %s",
3391		fname, dname);
3392
3393	return true;
3394}
3395
/* Wrapper around virtnet_fail_on_feature() that passes the stringified
 * feature bit (#fbit) as its printable name; dbit is the printable name of
 * the feature it depends on.
 */
#define VIRTNET_FAIL_ON(vdev, fbit, dbit)			\
	virtnet_fail_on_feature(vdev, fbit, #fbit, dbit)
3398
3399static bool virtnet_validate_features(struct virtio_device *vdev)
3400{
3401	if (!virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) &&
3402	    (VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_RX,
3403			     "VIRTIO_NET_F_CTRL_VQ") ||
3404	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_VLAN,
3405			     "VIRTIO_NET_F_CTRL_VQ") ||
3406	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE,
3407			     "VIRTIO_NET_F_CTRL_VQ") ||
3408	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_MQ, "VIRTIO_NET_F_CTRL_VQ") ||
3409	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR,
3410			     "VIRTIO_NET_F_CTRL_VQ") ||
3411	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_RSS,
3412			     "VIRTIO_NET_F_CTRL_VQ") ||
3413	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_HASH_REPORT,
3414			     "VIRTIO_NET_F_CTRL_VQ"))) {
3415		return false;
3416	}
3417
3418	return true;
3419}
3420
/* MTU bounds: virtnet_probe() uses them as the default dev->min_mtu and
 * dev->max_mtu, and virtnet_validate() compares the device-reported MTU
 * against MIN_MTU.
 */
#define MIN_MTU ETH_MIN_MTU
#define MAX_MTU ETH_MAX_MTU
3423
3424static int virtnet_validate(struct virtio_device *vdev)
3425{
3426	if (!vdev->config->get) {
3427		dev_err(&vdev->dev, "%s failure: config access disabled\n",
3428			__func__);
3429		return -EINVAL;
3430	}
3431
3432	if (!virtnet_validate_features(vdev))
3433		return -EINVAL;
3434
3435	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
3436		int mtu = virtio_cread16(vdev,
3437					 offsetof(struct virtio_net_config,
3438						  mtu));
3439		if (mtu < MIN_MTU)
3440			__virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);
3441	}
3442
3443	return 0;
3444}
3445
/* .probe hook of the virtio driver.
 *
 * Allocates the net_device, translates the negotiated virtio feature bits
 * into netdev features and driver state, sets up the virtqueues, registers
 * the net device and marks the virtio device ready.
 *
 * Returns 0 on success or a negative errno. On failure the labels at the end
 * of the function undo, in reverse order, whatever had been set up by then.
 */
static int virtnet_probe(struct virtio_device *vdev)
{
	int i, err = -ENOMEM;
	struct net_device *dev;
	struct virtnet_info *vi;
	u16 max_queue_pairs;
	int mtu;

	/* Find if host supports multiqueue/rss virtio_net device */
	max_queue_pairs = 1;
	if (virtio_has_feature(vdev, VIRTIO_NET_F_MQ) || virtio_has_feature(vdev, VIRTIO_NET_F_RSS))
		max_queue_pairs =
		     virtio_cread16(vdev, offsetof(struct virtio_net_config, max_virtqueue_pairs));

	/* Fall back to a single queue pair when the advertised count is out
	 * of range or the device has no control virtqueue.
	 */
	if (max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
	    max_queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
	    !virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		max_queue_pairs = 1;

	/* Allocate ourselves a network device with room for our info */
	dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);
	if (!dev)
		return -ENOMEM;

	/* Set up network device as normal. */
	dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE |
			   IFF_TX_SKB_NO_LINEAR;
	dev->netdev_ops = &virtnet_netdev;
	dev->features = NETIF_F_HIGHDMA;

	dev->ethtool_ops = &virtnet_ethtool_ops;
	SET_NETDEV_DEV(dev, &vdev->dev);

	/* Do we support "hardware" checksums? */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
		/* This opens up the world of extra features. */
		dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG;
		if (csum)
			dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;

		if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
			dev->hw_features |= NETIF_F_TSO
				| NETIF_F_TSO_ECN | NETIF_F_TSO6;
		}
		/* Individual feature bits: what can host handle? */
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
			dev->hw_features |= NETIF_F_TSO;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6))
			dev->hw_features |= NETIF_F_TSO6;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
			dev->hw_features |= NETIF_F_TSO_ECN;

		dev->features |= NETIF_F_GSO_ROBUST;

		if (gso)
			dev->features |= dev->hw_features & NETIF_F_ALL_TSO;
		/* (!csum && gso) case will be fixed by register_netdev() */
	}
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
		dev->features |= NETIF_F_RXCSUM;
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6))
		dev->features |= NETIF_F_GRO_HW;
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS))
		dev->hw_features |= NETIF_F_GRO_HW;

	dev->vlan_features = dev->features;

	/* MTU range: 68 - 65535 */
	dev->min_mtu = MIN_MTU;
	dev->max_mtu = MAX_MTU;

	/* Configuration may specify what MAC to use.  Otherwise random. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) {
		u8 addr[ETH_ALEN];

		virtio_cread_bytes(vdev,
				   offsetof(struct virtio_net_config, mac),
				   addr, ETH_ALEN);
		eth_hw_addr_set(dev, addr);
	} else {
		eth_hw_addr_random(dev);
	}

	/* Set up our device-specific information */
	vi = netdev_priv(dev);
	vi->dev = dev;
	vi->vdev = vdev;
	vdev->priv = vi;

	INIT_WORK(&vi->config_work, virtnet_config_changed_work);

	/* If we can receive ANY GSO packets, we must allocate large ones. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UFO))
		vi->big_packets = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
		vi->mergeable_rx_bufs = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_HASH_REPORT))
		vi->has_rss_hash_report = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_RSS))
		vi->has_rss = true;

	/* Read the RSS limits from config space; the *_EX hash types are
	 * masked out of the supported set.
	 */
	if (vi->has_rss || vi->has_rss_hash_report) {
		vi->rss_indir_table_size =
			virtio_cread16(vdev, offsetof(struct virtio_net_config,
				rss_max_indirection_table_length));
		vi->rss_key_size =
			virtio_cread8(vdev, offsetof(struct virtio_net_config, rss_max_key_size));

		vi->rss_hash_types_supported =
		    virtio_cread32(vdev, offsetof(struct virtio_net_config, supported_hash_types));
		vi->rss_hash_types_supported &=
				~(VIRTIO_NET_RSS_HASH_TYPE_IP_EX |
				  VIRTIO_NET_RSS_HASH_TYPE_TCP_EX |
				  VIRTIO_NET_RSS_HASH_TYPE_UDP_EX);

		dev->hw_features |= NETIF_F_RXHASH;
	}

	/* The header layout, and hence its length, depends on the features. */
	if (vi->has_rss_hash_report)
		vi->hdr_len = sizeof(struct virtio_net_hdr_v1_hash);
	else if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) ||
		 virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	else
		vi->hdr_len = sizeof(struct virtio_net_hdr);

	if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->any_header_sg = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		vi->has_cvq = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
		mtu = virtio_cread16(vdev,
				     offsetof(struct virtio_net_config,
					      mtu));
		if (mtu < dev->min_mtu) {
			/* Should never trigger: MTU was previously validated
			 * in virtnet_validate.
			 */
			dev_err(&vdev->dev,
				"device MTU appears to have changed it is now %d < %d",
				mtu, dev->min_mtu);
			err = -EINVAL;
			goto free;
		}

		/* The device-provided MTU is both the current and the
		 * maximum MTU.
		 */
		dev->mtu = mtu;
		dev->max_mtu = mtu;

		/* TODO: size buffers correctly in this case. */
		if (dev->mtu > ETH_DATA_LEN)
			vi->big_packets = true;
	}

	if (vi->any_header_sg)
		dev->needed_headroom = vi->hdr_len;

	/* Enable multiqueue by default */
	if (num_online_cpus() >= max_queue_pairs)
		vi->curr_queue_pairs = max_queue_pairs;
	else
		vi->curr_queue_pairs = num_online_cpus();
	vi->max_queue_pairs = max_queue_pairs;

	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
	err = init_vqs(vi);
	if (err)
		goto free;

#ifdef CONFIG_SYSFS
	if (vi->mergeable_rx_bufs)
		dev->sysfs_rx_queue_group = &virtio_net_mrg_rx_group;
#endif
	netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
	netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);

	virtnet_init_settings(dev);

	if (virtio_has_feature(vdev, VIRTIO_NET_F_STANDBY)) {
		vi->failover = net_failover_create(vi->dev);
		if (IS_ERR(vi->failover)) {
			err = PTR_ERR(vi->failover);
			goto free_vqs;
		}
	}

	if (vi->has_rss || vi->has_rss_hash_report)
		virtnet_init_default_rss(vi);

	/* serialize netdev register + virtio_device_ready() with ndo_open() */
	rtnl_lock();

	err = register_netdevice(dev);
	if (err) {
		pr_debug("virtio_net: registering device failed\n");
		rtnl_unlock();
		goto free_failover;
	}

	virtio_device_ready(vdev);

	rtnl_unlock();

	err = virtnet_cpu_notif_add(vi);
	if (err) {
		pr_debug("virtio_net: registering cpu notifier failed\n");
		goto free_unregister_netdev;
	}

	virtnet_set_queues(vi, vi->curr_queue_pairs);

	/* Assume link up if device can't report link status,
	   otherwise get link status from config. */
	netif_carrier_off(dev);
	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
		schedule_work(&vi->config_work);
	} else {
		vi->status = VIRTIO_NET_S_LINK_UP;
		virtnet_update_settings(vi);
		netif_carrier_on(dev);
	}

	/* Record which guest offloads were negotiated; the same set is kept
	 * as the "capable" mask.
	 */
	for (i = 0; i < ARRAY_SIZE(guest_offloads); i++)
		if (virtio_has_feature(vi->vdev, guest_offloads[i]))
			set_bit(guest_offloads[i], &vi->guest_offloads);
	vi->guest_offloads_capable = vi->guest_offloads;

	pr_debug("virtnet: registered device %s with %d RX and TX vq's\n",
		 dev->name, max_queue_pairs);

	return 0;

	/* Error unwinding: each label undoes one setup stage and falls
	 * through to the labels of the earlier stages.
	 */
free_unregister_netdev:
	virtio_reset_device(vdev);

	unregister_netdev(dev);
free_failover:
	net_failover_destroy(vi->failover);
free_vqs:
	cancel_delayed_work_sync(&vi->refill);
	free_receive_page_frags(vi);
	virtnet_del_vqs(vi);
free:
	free_netdev(dev);
	return err;
}
3702
3703static void remove_vq_common(struct virtnet_info *vi)
3704{
3705	virtio_reset_device(vi->vdev);
3706
3707	/* Free unused buffers in both send and recv, if any. */
3708	free_unused_bufs(vi);
3709
3710	free_receive_bufs(vi);
3711
3712	free_receive_page_frags(vi);
3713
3714	virtnet_del_vqs(vi);
3715}
3716
3717static void virtnet_remove(struct virtio_device *vdev)
3718{
3719	struct virtnet_info *vi = vdev->priv;
3720
3721	virtnet_cpu_notif_remove(vi);
3722
3723	/* Make sure no work handler is accessing the device. */
3724	flush_work(&vi->config_work);
3725
3726	unregister_netdev(vi->dev);
3727
3728	net_failover_destroy(vi->failover);
3729
3730	remove_vq_common(vi);
3731
3732	free_netdev(vi->dev);
3733}
3734
3735static __maybe_unused int virtnet_freeze(struct virtio_device *vdev)
3736{
3737	struct virtnet_info *vi = vdev->priv;
3738
3739	virtnet_cpu_notif_remove(vi);
3740	virtnet_freeze_down(vdev);
3741	remove_vq_common(vi);
3742
3743	return 0;
3744}
3745
3746static __maybe_unused int virtnet_restore(struct virtio_device *vdev)
3747{
3748	struct virtnet_info *vi = vdev->priv;
3749	int err;
3750
3751	err = virtnet_restore_up(vdev);
3752	if (err)
3753		return err;
3754	virtnet_set_queues(vi, vi->curr_queue_pairs);
3755
3756	err = virtnet_cpu_notif_add(vi);
3757	if (err) {
3758		virtnet_freeze_down(vdev);
3759		remove_vq_common(vi);
3760		return err;
3761	}
3762
3763	return 0;
3764}
3765
/* Device match table: VIRTIO_ID_NET devices, with the second field
 * (presumably the vendor; confirm against struct virtio_device_id) set to
 * the VIRTIO_DEV_ANY_ID wildcard. The zeroed entry ends the table.
 */
static struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
	{ 0 },
};
3770
/* Feature bits common to both feature tables below (features[] and
 * features_legacy[]).
 */
#define VIRTNET_FEATURES \
	VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \
	VIRTIO_NET_F_MAC, \
	VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \
	VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \
	VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, \
	VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, \
	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
	VIRTIO_NET_F_CTRL_MAC_ADDR, \
	VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \
	VIRTIO_NET_F_SPEED_DUPLEX, VIRTIO_NET_F_STANDBY, \
	VIRTIO_NET_F_RSS, VIRTIO_NET_F_HASH_REPORT
3784
/* Feature table installed as virtio_net_driver.feature_table. */
static unsigned int features[] = {
	VIRTNET_FEATURES,
};

/* Feature table installed as virtio_net_driver.feature_table_legacy: the
 * common set plus VIRTIO_NET_F_GSO and VIRTIO_F_ANY_LAYOUT.
 */
static unsigned int features_legacy[] = {
	VIRTNET_FEATURES,
	VIRTIO_NET_F_GSO,
	VIRTIO_F_ANY_LAYOUT,
};
3794
/* Driver description registered by virtio_net_driver_init(): feature
 * tables, device match table and the validate/probe/remove/config_changed
 * hooks. The freeze/restore hooks are only wired up with CONFIG_PM_SLEEP.
 */
static struct virtio_driver virtio_net_driver = {
	.feature_table = features,
	.feature_table_size = ARRAY_SIZE(features),
	.feature_table_legacy = features_legacy,
	.feature_table_size_legacy = ARRAY_SIZE(features_legacy),
	.driver.name =	KBUILD_MODNAME,
	.driver.owner =	THIS_MODULE,
	.id_table =	id_table,
	.validate =	virtnet_validate,
	.probe =	virtnet_probe,
	.remove =	virtnet_remove,
	.config_changed = virtnet_config_changed,
#ifdef CONFIG_PM_SLEEP
	.freeze =	virtnet_freeze,
	.restore =	virtnet_restore,
#endif
};
3812
3813static __init int virtio_net_driver_init(void)
3814{
3815	int ret;
3816
3817	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "virtio/net:online",
3818				      virtnet_cpu_online,
3819				      virtnet_cpu_down_prep);
3820	if (ret < 0)
3821		goto out;
3822	virtionet_online = ret;
3823	ret = cpuhp_setup_state_multi(CPUHP_VIRT_NET_DEAD, "virtio/net:dead",
3824				      NULL, virtnet_cpu_dead);
3825	if (ret)
3826		goto err_dead;
3827	ret = register_virtio_driver(&virtio_net_driver);
3828	if (ret)
3829		goto err_virtio;
3830	return 0;
3831err_virtio:
3832	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
3833err_dead:
3834	cpuhp_remove_multi_state(virtionet_online);
3835out:
3836	return ret;
3837}
3838module_init(virtio_net_driver_init);
3839
3840static __exit void virtio_net_driver_exit(void)
3841{
3842	unregister_virtio_driver(&virtio_net_driver);
3843	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
3844	cpuhp_remove_multi_state(virtionet_online);
3845}
3846module_exit(virtio_net_driver_exit);
3847
/* Module metadata; id_table is exported as the module's virtio device
 * table.
 */
MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio network driver");
MODULE_LICENSE("GPL");
Configure Feed

Configure Feed