Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

gve: Reduce alloc and copy costs in the GQ rx path

Previously, even if just one of the many fragments of a 9k packet
required a copy, we'd copy the whole packet into a freshly-allocated
9k-sized linear SKB, and this led to performance issues.

By having a pool of pages to copy into, each fragment can be
independently handled, leading to a reduced incidence of
allocation and copy.
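
The mechanism is easiest to see as a small ring of copy pages indexed by a head counter masked into the pool. The sketch below is illustrative plain C, not the driver code: the rotation and page-flip bookkeeping mirror the patch, but the types, the "busy" flag, and copy_pool_next_dst() are simplified stand-ins (in the driver the busy check is gve_rx_can_recycle_buffer(), and the fallback path allocates a fresh page with alloc_page(GFP_ATOMIC)).

/*
 * Illustrative sketch of the copy-pool rotation described above; this is
 * not the driver code. "busy" stands in for gve_rx_can_recycle_buffer()
 * reporting that the networking stack still holds a reference to the page.
 */
#include <stdbool.h>
#include <stdint.h>

#define PAGE_SIZE 4096u

struct copy_page {
	char *addr;		/* mapped page address */
	uint32_t offset;	/* next buffer-sized slice to copy into */
	bool can_flip;		/* first half handed out, second half still free */
	bool busy;		/* stack still references the page (assumed input) */
};

/*
 * Pick the destination for the next fragment copy, or return NULL when the
 * least recently used pool page is still busy, in which case the caller
 * falls back to allocating a fresh page instead of stalling the queue.
 */
static char *copy_pool_next_dst(struct copy_page *pool, uint32_t *head,
				uint32_t mask, uint16_t buf_size)
{
	struct copy_page *p = &pool[*head & mask];
	char *dst;

	if (!p->can_flip && p->busy) {
		(*head)++;	/* skip it to avoid head-of-line blocking */
		return NULL;
	}

	dst = p->addr + p->offset;
	p->offset = (p->offset + buf_size) & (PAGE_SIZE - 1);

	if (p->can_flip) {
		/* Both halves used: rotate this page to the back of the ring. */
		p->can_flip = false;
		(*head)++;
	} else {
		p->can_flip = true;
	}
	return dst;
}

In the patch itself the pool holds twice as many entries as the ring has slots (qpl_copy_pool_mask = slots * 2 - 1), each page is primed with a large reference bias (page_ref_add(page, INT_MAX - 1)) so the recycle check reduces to a reference-count comparison, and a busy entry is skipped rather than waited on.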

Signed-off-by: Shailend Chand <shailend@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

Authored by Shailend Chand, committed by David S. Miller
82fd151d d08b0f8f

6 files changed: +333 -261
drivers/net/ethernet/google/gve/gve.h | +20 -6

···
 	void *page_address;
 	u32 page_offset; /* offset to write to in page */
 	int pagecnt_bias; /* expected pagecnt if only the driver has a ref */
-	u8 can_flip;
+	u16 pad; /* adjustment for rx padding */
+	u8 can_flip; /* tracks if the networking stack is using the page */
 };
 
 /* A list of pages registered with the device during setup and used by a queue
···
 	/* head and tail of skb chain for the current packet or NULL if none */
 	struct sk_buff *skb_head;
 	struct sk_buff *skb_tail;
-	u16 total_expected_size;
-	u8 expected_frag_cnt;
-	u8 curr_frag_cnt;
-	u8 reuse_frags;
+	u32 total_size;
+	u8 frag_cnt;
+	bool drop_pkt;
+};
+
+struct gve_rx_cnts {
+	u32 ok_pkt_bytes;
+	u16 ok_pkt_cnt;
+	u16 total_pkt_cnt;
+	u16 cont_pkt_cnt;
+	u16 desc_err_pkt_cnt;
 };
 
 /* Contains datapath state used to represent an RX queue. */
···
 			/* threshold for posting new buffs and descs */
 			u32 db_threshold;
 			u16 packet_buffer_size;
+
+			u32 qpl_copy_pool_mask;
+			u32 qpl_copy_pool_head;
+			struct gve_rx_slot_page_info *qpl_copy_pool;
 		};
 
 		/* DQO fields. */
···
 	u64 rx_desc_err_dropped_pkt; /* free-running count of packets dropped by descriptor error */
 	u64 rx_cont_packet_cnt; /* free-running multi-fragment packets received */
 	u64 rx_frag_flip_cnt; /* free-running count of rx segments where page_flip was used */
-	u64 rx_frag_copy_cnt; /* free-running count of rx segments copied into skb linear portion */
+	u64 rx_frag_copy_cnt; /* free-running count of rx segments copied */
+	u64 rx_frag_alloc_cnt; /* free-running count of rx page allocations */
+
 	u32 q_num; /* queue index */
 	u32 ntfy_id; /* notification block index */
 	struct gve_queue_resources *q_resources; /* head and tail pointer idx */
drivers/net/ethernet/google/gve/gve_ethtool.c | +2

···
 static const char gve_gstrings_rx_stats[][ETH_GSTRING_LEN] = {
 	"rx_posted_desc[%u]", "rx_completed_desc[%u]", "rx_consumed_desc[%u]", "rx_bytes[%u]",
 	"rx_cont_packet_cnt[%u]", "rx_frag_flip_cnt[%u]", "rx_frag_copy_cnt[%u]",
+	"rx_frag_alloc_cnt[%u]",
 	"rx_dropped_pkt[%u]", "rx_copybreak_pkt[%u]", "rx_copied_pkt[%u]",
 	"rx_queue_drop_cnt[%u]", "rx_no_buffers_posted[%u]",
 	"rx_drops_packet_over_mru[%u]", "rx_drops_invalid_checksum[%u]",
···
 			data[i++] = rx->rx_cont_packet_cnt;
 			data[i++] = rx->rx_frag_flip_cnt;
 			data[i++] = rx->rx_frag_copy_cnt;
+			data[i++] = rx->rx_frag_alloc_cnt;
 			/* rx dropped packets */
 			data[i++] = tmp_rx_skb_alloc_fail +
 				tmp_rx_buf_alloc_fail +
drivers/net/ethernet/google/gve/gve_rx.c | +303 -231

···
 				     rx->data.page_info[i].pagecnt_bias - 1);
 		gve_unassign_qpl(priv, rx->data.qpl->id);
 		rx->data.qpl = NULL;
+
+		for (i = 0; i < rx->qpl_copy_pool_mask + 1; i++) {
+			page_ref_sub(rx->qpl_copy_pool[i].page,
+				     rx->qpl_copy_pool[i].pagecnt_bias - 1);
+			put_page(rx->qpl_copy_pool[i].page);
+		}
 	}
 	kvfree(rx->data.page_info);
 	rx->data.page_info = NULL;
···
 	dma_free_coherent(dev, bytes, rx->data.data_ring,
 			  rx->data.data_bus);
 	rx->data.data_ring = NULL;
+
+	kvfree(rx->qpl_copy_pool);
+	rx->qpl_copy_pool = NULL;
+
 	netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx);
 }
···
 	u32 slots;
 	int err;
 	int i;
+	int j;
 
 	/* Allocate one page per Rx queue slot. Each page is split into two
 	 * packet buffers, when possible we "page flip" between the two.
···
 			goto alloc_err;
 	}
 
+	if (!rx->data.raw_addressing) {
+		for (j = 0; j < rx->qpl_copy_pool_mask + 1; j++) {
+			struct page *page = alloc_page(GFP_KERNEL);
+
+			if (!page)
+				goto alloc_err_qpl;
+
+			rx->qpl_copy_pool[j].page = page;
+			rx->qpl_copy_pool[j].page_offset = 0;
+			rx->qpl_copy_pool[j].page_address = page_address(page);
+
+			/* The page already has 1 ref. */
+			page_ref_add(page, INT_MAX - 1);
+			rx->qpl_copy_pool[j].pagecnt_bias = INT_MAX;
+		}
+	}
+
 	return slots;
+
+alloc_err_qpl:
+	while (j--) {
+		page_ref_sub(rx->qpl_copy_pool[j].page,
+			     rx->qpl_copy_pool[j].pagecnt_bias - 1);
+		put_page(rx->qpl_copy_pool[j].page);
+	}
 alloc_err:
 	while (i--)
 		gve_rx_free_buffer(&priv->pdev->dev,
···
 
 static void gve_rx_ctx_clear(struct gve_rx_ctx *ctx)
 {
-	ctx->curr_frag_cnt = 0;
-	ctx->total_expected_size = 0;
-	ctx->expected_frag_cnt = 0;
 	ctx->skb_head = NULL;
 	ctx->skb_tail = NULL;
-	ctx->reuse_frags = false;
+	ctx->total_size = 0;
+	ctx->frag_cnt = 0;
+	ctx->drop_pkt = false;
 }
 
 static int gve_rx_alloc_ring(struct gve_priv *priv, int idx)
···
 						GFP_KERNEL);
 	if (!rx->data.data_ring)
 		return -ENOMEM;
+
+	rx->qpl_copy_pool_mask = min_t(u32, U32_MAX, slots * 2) - 1;
+	rx->qpl_copy_pool_head = 0;
+	rx->qpl_copy_pool = kvcalloc(rx->qpl_copy_pool_mask + 1,
+				     sizeof(rx->qpl_copy_pool[0]),
+				     GFP_KERNEL);
+
+	if (!rx->qpl_copy_pool) {
+		err = -ENOMEM;
+		goto abort_with_slots;
+	}
+
 	filled_pages = gve_prefill_rx_pages(rx);
 	if (filled_pages < 0) {
 		err = -ENOMEM;
-		goto abort_with_slots;
+		goto abort_with_copy_pool;
 	}
 	rx->fill_cnt = filled_pages;
 	/* Ensure data ring slots (packet buffers) are visible. */
···
 	rx->q_resources = NULL;
 abort_filled:
 	gve_rx_unfill_pages(priv, rx);
+abort_with_copy_pool:
+	kvfree(rx->qpl_copy_pool);
+	rx->qpl_copy_pool = NULL;
 abort_with_slots:
 	bytes = sizeof(*rx->data.data_ring) * slots;
 	dma_free_coherent(hdev, bytes, rx->data.data_ring, rx->data.data_bus);
···
 	return PKT_HASH_TYPE_L2;
 }
 
-static u16 gve_rx_ctx_padding(struct gve_rx_ctx *ctx)
-{
-	return (ctx->curr_frag_cnt == 0) ? GVE_RX_PAD : 0;
-}
-
 static struct sk_buff *gve_rx_add_frags(struct napi_struct *napi,
 					struct gve_rx_slot_page_info *page_info,
 					u16 packet_buffer_size, u16 len,
 					struct gve_rx_ctx *ctx)
 {
-	u32 offset = page_info->page_offset + gve_rx_ctx_padding(ctx);
-	struct sk_buff *skb;
+	u32 offset = page_info->page_offset + page_info->pad;
+	struct sk_buff *skb = ctx->skb_tail;
+	int num_frags = 0;
 
-	if (!ctx->skb_head)
-		ctx->skb_head = napi_get_frags(napi);
+	if (!skb) {
+		skb = napi_get_frags(napi);
+		if (unlikely(!skb))
+			return NULL;
 
-	if (unlikely(!ctx->skb_head))
-		return NULL;
+		ctx->skb_head = skb;
+		ctx->skb_tail = skb;
+	} else {
+		num_frags = skb_shinfo(ctx->skb_tail)->nr_frags;
+		if (num_frags == MAX_SKB_FRAGS) {
+			skb = napi_alloc_skb(napi, 0);
+			if (!skb)
+				return NULL;
 
-	skb = ctx->skb_head;
-	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page_info->page,
+			// We will never chain more than two SKBs: 2 * 16 * 2k > 64k
+			// which is why we do not need to chain by using skb->next
+			skb_shinfo(ctx->skb_tail)->frag_list = skb;
+
+			ctx->skb_tail = skb;
+			num_frags = 0;
+		}
+	}
+
+	if (skb != ctx->skb_head) {
+		ctx->skb_head->len += len;
+		ctx->skb_head->data_len += len;
+		ctx->skb_head->truesize += packet_buffer_size;
+	}
+	skb_add_rx_frag(skb, num_frags, page_info->page,
 			offset, len, packet_buffer_size);
 
-	return skb;
+	return ctx->skb_head;
 }
 
 static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *slot_addr)
···
 	return skb;
 }
 
+static struct sk_buff *gve_rx_copy_to_pool(struct gve_rx_ring *rx,
+					   struct gve_rx_slot_page_info *page_info,
+					   u16 len, struct napi_struct *napi)
+{
+	u32 pool_idx = rx->qpl_copy_pool_head & rx->qpl_copy_pool_mask;
+	void *src = page_info->page_address + page_info->page_offset;
+	struct gve_rx_slot_page_info *copy_page_info;
+	struct gve_rx_ctx *ctx = &rx->ctx;
+	bool alloc_page = false;
+	struct sk_buff *skb;
+	void *dst;
+
+	copy_page_info = &rx->qpl_copy_pool[pool_idx];
+	if (!copy_page_info->can_flip) {
+		int recycle = gve_rx_can_recycle_buffer(copy_page_info);
+
+		if (unlikely(recycle < 0)) {
+			gve_schedule_reset(rx->gve);
+			return NULL;
+		}
+		alloc_page = !recycle;
+	}
+
+	if (alloc_page) {
+		struct gve_rx_slot_page_info alloc_page_info;
+		struct page *page;
+
+		/* The least recently used page turned out to be
+		 * still in use by the kernel. Ignoring it and moving
+		 * on alleviates head-of-line blocking.
+		 */
+		rx->qpl_copy_pool_head++;
+
+		page = alloc_page(GFP_ATOMIC);
+		if (!page)
+			return NULL;
+
+		alloc_page_info.page = page;
+		alloc_page_info.page_offset = 0;
+		alloc_page_info.page_address = page_address(page);
+		alloc_page_info.pad = page_info->pad;
+
+		memcpy(alloc_page_info.page_address, src, page_info->pad + len);
+		skb = gve_rx_add_frags(napi, &alloc_page_info,
+				       rx->packet_buffer_size,
+				       len, ctx);
+
+		u64_stats_update_begin(&rx->statss);
+		rx->rx_frag_copy_cnt++;
+		rx->rx_frag_alloc_cnt++;
+		u64_stats_update_end(&rx->statss);
+
+		return skb;
+	}
+
+	dst = copy_page_info->page_address + copy_page_info->page_offset;
+	memcpy(dst, src, page_info->pad + len);
+	copy_page_info->pad = page_info->pad;
+
+	skb = gve_rx_add_frags(napi, copy_page_info,
+			       rx->packet_buffer_size, len, ctx);
+	if (unlikely(!skb))
+		return NULL;
+
+	gve_dec_pagecnt_bias(copy_page_info);
+	copy_page_info->page_offset += rx->packet_buffer_size;
+	copy_page_info->page_offset &= (PAGE_SIZE - 1);
+
+	if (copy_page_info->can_flip) {
+		/* We have used both halves of this copy page, it
+		 * is time for it to go to the back of the queue.
+		 */
+		copy_page_info->can_flip = false;
+		rx->qpl_copy_pool_head++;
+		prefetch(rx->qpl_copy_pool[rx->qpl_copy_pool_head & rx->qpl_copy_pool_mask].page);
+	} else {
+		copy_page_info->can_flip = true;
+	}
+
+	u64_stats_update_begin(&rx->statss);
+	rx->rx_frag_copy_cnt++;
+	u64_stats_update_end(&rx->statss);
+
+	return skb;
+}
+
 static struct sk_buff *
 gve_rx_qpl(struct device *dev, struct net_device *netdev,
 	   struct gve_rx_ring *rx, struct gve_rx_slot_page_info *page_info,
···
 	 * choice is to copy the data out of it so that we can return it to the
 	 * device.
 	 */
-	if (ctx->reuse_frags) {
+	if (page_info->can_flip) {
 		skb = gve_rx_add_frags(napi, page_info, rx->packet_buffer_size, len, ctx);
 		/* No point in recycling if we didn't get the skb */
 		if (skb) {
···
 			gve_rx_flip_buff(page_info, &data_slot->qpl_offset);
 		}
 	} else {
-		const u16 padding = gve_rx_ctx_padding(ctx);
-
-		skb = gve_rx_copy(netdev, napi, page_info, len, padding, ctx);
-		if (skb) {
-			u64_stats_update_begin(&rx->statss);
-			rx->rx_frag_copy_cnt++;
-			u64_stats_update_end(&rx->statss);
-		}
+		skb = gve_rx_copy_to_pool(rx, page_info, len, napi);
 	}
 	return skb;
 }
 
-#define GVE_PKTCONT_BIT_IS_SET(x) (GVE_RXF_PKT_CONT & (x))
-static u16 gve_rx_get_fragment_size(struct gve_rx_ctx *ctx, struct gve_rx_desc *desc)
-{
-	return be16_to_cpu(desc->len) - gve_rx_ctx_padding(ctx);
-}
-
-static bool gve_rx_ctx_init(struct gve_rx_ctx *ctx, struct gve_rx_ring *rx)
-{
-	bool qpl_mode = !rx->data.raw_addressing, packet_size_error = false;
-	bool buffer_error = false, desc_error = false, seqno_error = false;
-	struct gve_rx_slot_page_info *page_info;
-	struct gve_priv *priv = rx->gve;
-	u32 idx = rx->cnt & rx->mask;
-	bool reuse_frags, can_flip;
-	struct gve_rx_desc *desc;
-	u16 packet_size = 0;
-	u16 n_frags = 0;
-	int recycle;
-
-	/** In QPL mode, we only flip buffers when all buffers containing the packet
-	 * can be flipped. RDA can_flip decisions will be made later, per frag.
-	 */
-	can_flip = qpl_mode;
-	reuse_frags = can_flip;
-	do {
-		u16 frag_size;
-
-		n_frags++;
-		desc = &rx->desc.desc_ring[idx];
-		desc_error = unlikely(desc->flags_seq & GVE_RXF_ERR) || desc_error;
-		if (GVE_SEQNO(desc->flags_seq) != rx->desc.seqno) {
-			seqno_error = true;
-			netdev_warn(priv->dev,
-				    "RX seqno error: want=%d, got=%d, dropping packet and scheduling reset.",
-				    rx->desc.seqno, GVE_SEQNO(desc->flags_seq));
-		}
-		frag_size = be16_to_cpu(desc->len);
-		packet_size += frag_size;
-		if (frag_size > rx->packet_buffer_size) {
-			packet_size_error = true;
-			netdev_warn(priv->dev,
-				    "RX fragment error: packet_buffer_size=%d, frag_size=%d, dropping packet.",
-				    rx->packet_buffer_size, be16_to_cpu(desc->len));
-		}
-		page_info = &rx->data.page_info[idx];
-		if (can_flip) {
-			recycle = gve_rx_can_recycle_buffer(page_info);
-			reuse_frags = reuse_frags && recycle > 0;
-			buffer_error = buffer_error || unlikely(recycle < 0);
-		}
-		idx = (idx + 1) & rx->mask;
-		rx->desc.seqno = gve_next_seqno(rx->desc.seqno);
-	} while (GVE_PKTCONT_BIT_IS_SET(desc->flags_seq));
-
-	prefetch(rx->desc.desc_ring + idx);
-
-	ctx->curr_frag_cnt = 0;
-	ctx->total_expected_size = packet_size - GVE_RX_PAD;
-	ctx->expected_frag_cnt = n_frags;
-	ctx->skb_head = NULL;
-	ctx->reuse_frags = reuse_frags;
-
-	if (ctx->expected_frag_cnt > 1) {
-		u64_stats_update_begin(&rx->statss);
-		rx->rx_cont_packet_cnt++;
-		u64_stats_update_end(&rx->statss);
-	}
-	if (ctx->total_expected_size > priv->rx_copybreak && !ctx->reuse_frags && qpl_mode) {
-		u64_stats_update_begin(&rx->statss);
-		rx->rx_copied_pkt++;
-		u64_stats_update_end(&rx->statss);
-	}
-
-	if (unlikely(buffer_error || seqno_error || packet_size_error)) {
-		gve_schedule_reset(priv);
-		return false;
-	}
-
-	if (unlikely(desc_error)) {
-		u64_stats_update_begin(&rx->statss);
-		rx->rx_desc_err_dropped_pkt++;
-		u64_stats_update_end(&rx->statss);
-		return false;
-	}
-	return true;
-}
-
 static struct sk_buff *gve_rx_skb(struct gve_priv *priv, struct gve_rx_ring *rx,
 				  struct gve_rx_slot_page_info *page_info, struct napi_struct *napi,
-				  u16 len, union gve_rx_data_slot *data_slot)
+				  u16 len, union gve_rx_data_slot *data_slot,
+				  bool is_only_frag)
 {
 	struct net_device *netdev = priv->dev;
 	struct gve_rx_ctx *ctx = &rx->ctx;
 	struct sk_buff *skb = NULL;
 
-	if (len <= priv->rx_copybreak && ctx->expected_frag_cnt == 1) {
+	if (len <= priv->rx_copybreak && is_only_frag) {
 		/* Just copy small packets */
-		skb = gve_rx_copy(netdev, napi, page_info, len, GVE_RX_PAD, ctx);
+		skb = gve_rx_copy(netdev, napi, page_info, len, GVE_RX_PAD);
 		if (skb) {
 			u64_stats_update_begin(&rx->statss);
 			rx->rx_copied_pkt++;
···
 			u64_stats_update_end(&rx->statss);
 		}
 	} else {
-		if (rx->data.raw_addressing) {
-			int recycle = gve_rx_can_recycle_buffer(page_info);
+		int recycle = gve_rx_can_recycle_buffer(page_info);
 
-			if (unlikely(recycle < 0)) {
-				gve_schedule_reset(priv);
-				return NULL;
-			}
-			page_info->can_flip = recycle;
-			if (page_info->can_flip) {
-				u64_stats_update_begin(&rx->statss);
-				rx->rx_frag_flip_cnt++;
-				u64_stats_update_end(&rx->statss);
-			}
+		if (unlikely(recycle < 0)) {
+			gve_schedule_reset(priv);
+			return NULL;
+		}
+		page_info->can_flip = recycle;
+		if (page_info->can_flip) {
+			u64_stats_update_begin(&rx->statss);
+			rx->rx_frag_flip_cnt++;
+			u64_stats_update_end(&rx->statss);
+		}
+
+		if (rx->data.raw_addressing) {
 			skb = gve_rx_raw_addressing(&priv->pdev->dev, netdev,
 						    page_info, len, napi,
 						    data_slot,
 						    rx->packet_buffer_size, ctx);
 		} else {
-			if (ctx->reuse_frags) {
-				u64_stats_update_begin(&rx->statss);
-				rx->rx_frag_flip_cnt++;
-				u64_stats_update_end(&rx->statss);
-			}
 			skb = gve_rx_qpl(&priv->pdev->dev, netdev, rx,
 					 page_info, len, napi, data_slot);
 		}
···
 	return skb;
 }
 
-static bool gve_rx(struct gve_rx_ring *rx, netdev_features_t feat,
-		   u64 *packet_size_bytes, u32 *work_done)
+#define GVE_PKTCONT_BIT_IS_SET(x) (GVE_RXF_PKT_CONT & (x))
+static void gve_rx(struct gve_rx_ring *rx, netdev_features_t feat,
+		   struct gve_rx_desc *desc, u32 idx,
+		   struct gve_rx_cnts *cnts)
 {
+	bool is_last_frag = !GVE_PKTCONT_BIT_IS_SET(desc->flags_seq);
 	struct gve_rx_slot_page_info *page_info;
+	u16 frag_size = be16_to_cpu(desc->len);
 	struct gve_rx_ctx *ctx = &rx->ctx;
 	union gve_rx_data_slot *data_slot;
 	struct gve_priv *priv = rx->gve;
-	struct gve_rx_desc *first_desc;
 	struct sk_buff *skb = NULL;
-	struct gve_rx_desc *desc;
-	struct napi_struct *napi;
 	dma_addr_t page_bus;
-	u32 work_cnt = 0;
 	void *va;
-	u32 idx;
-	u16 len;
 
-	idx = rx->cnt & rx->mask;
-	first_desc = &rx->desc.desc_ring[idx];
-	desc = first_desc;
-	napi = &priv->ntfy_blocks[rx->ntfy_id].napi;
+	struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;
+	bool is_first_frag = ctx->frag_cnt == 0;
 
-	if (unlikely(!gve_rx_ctx_init(ctx, rx)))
-		goto skb_alloc_fail;
+	bool is_only_frag = is_first_frag && is_last_frag;
 
-	while (ctx->curr_frag_cnt < ctx->expected_frag_cnt) {
-		/* Prefetch two packet buffers ahead, we will need it soon. */
-		page_info = &rx->data.page_info[(idx + 2) & rx->mask];
-		va = page_info->page_address + page_info->page_offset;
+	if (unlikely(ctx->drop_pkt))
+		goto finish_frag;
 
-		prefetch(page_info->page); /* Kernel page struct. */
-		prefetch(va); /* Packet header. */
-		prefetch(va + 64); /* Next cacheline too. */
+	if (desc->flags_seq & GVE_RXF_ERR) {
+		ctx->drop_pkt = true;
+		cnts->desc_err_pkt_cnt++;
+		napi_free_frags(napi);
+		goto finish_frag;
+	}
 
-		len = gve_rx_get_fragment_size(ctx, desc);
+	if (unlikely(frag_size > rx->packet_buffer_size)) {
+		netdev_warn(priv->dev, "Unexpected frag size %d, can't exceed %d, scheduling reset",
+			    frag_size, rx->packet_buffer_size);
+		ctx->drop_pkt = true;
+		napi_free_frags(napi);
+		gve_schedule_reset(rx->gve);
+		goto finish_frag;
+	}
 
-		page_info = &rx->data.page_info[idx];
-		data_slot = &rx->data.data_ring[idx];
-		page_bus = rx->data.raw_addressing ?
-			be64_to_cpu(data_slot->addr) - page_info->page_offset :
-			rx->data.qpl->page_buses[idx];
-		dma_sync_single_for_cpu(&priv->pdev->dev, page_bus, PAGE_SIZE, DMA_FROM_DEVICE);
+	/* Prefetch two packet buffers ahead, we will need it soon. */
+	page_info = &rx->data.page_info[(idx + 2) & rx->mask];
+	va = page_info->page_address + page_info->page_offset;
+	prefetch(page_info->page); /* Kernel page struct. */
+	prefetch(va); /* Packet header. */
+	prefetch(va + 64); /* Next cacheline too. */
 
-		skb = gve_rx_skb(priv, rx, page_info, napi, len, data_slot);
-		if (!skb) {
-			u64_stats_update_begin(&rx->statss);
-			rx->rx_skb_alloc_fail++;
-			u64_stats_update_end(&rx->statss);
-			goto skb_alloc_fail;
+	page_info = &rx->data.page_info[idx];
+	data_slot = &rx->data.data_ring[idx];
+	page_bus = (rx->data.raw_addressing) ?
+		be64_to_cpu(data_slot->addr) - page_info->page_offset :
+		rx->data.qpl->page_buses[idx];
+	dma_sync_single_for_cpu(&priv->pdev->dev, page_bus,
+				PAGE_SIZE, DMA_FROM_DEVICE);
+	page_info->pad = is_first_frag ? GVE_RX_PAD : 0;
+	frag_size -= page_info->pad;
+
+	skb = gve_rx_skb(priv, rx, page_info, napi, frag_size,
+			 data_slot, is_only_frag);
+	if (!skb) {
+		u64_stats_update_begin(&rx->statss);
+		rx->rx_skb_alloc_fail++;
+		u64_stats_update_end(&rx->statss);
+
+		napi_free_frags(napi);
+		ctx->drop_pkt = true;
+		goto finish_frag;
+	}
+	ctx->total_size += frag_size;
+
+	if (is_first_frag) {
+		if (likely(feat & NETIF_F_RXCSUM)) {
+			/* NIC passes up the partial sum */
+			if (desc->csum)
+				skb->ip_summed = CHECKSUM_COMPLETE;
+			else
+				skb->ip_summed = CHECKSUM_NONE;
+			skb->csum = csum_unfold(desc->csum);
 		}
 
-		ctx->curr_frag_cnt++;
-		rx->cnt++;
-		idx = rx->cnt & rx->mask;
-		work_cnt++;
-		desc = &rx->desc.desc_ring[idx];
+		/* parse flags & pass relevant info up */
+		if (likely(feat & NETIF_F_RXHASH) &&
+		    gve_needs_rss(desc->flags_seq))
+			skb_set_hash(skb, be32_to_cpu(desc->rss_hash),
+				     gve_rss_type(desc->flags_seq));
 	}
 
-	if (likely(feat & NETIF_F_RXCSUM)) {
-		/* NIC passes up the partial sum */
-		if (first_desc->csum)
-			skb->ip_summed = CHECKSUM_COMPLETE;
+	if (is_last_frag) {
+		skb_record_rx_queue(skb, rx->q_num);
+		if (skb_is_nonlinear(skb))
+			napi_gro_frags(napi);
 		else
-			skb->ip_summed = CHECKSUM_NONE;
-		skb->csum = csum_unfold(first_desc->csum);
+			napi_gro_receive(napi, skb);
+		goto finish_ok_pkt;
 	}
 
-	/* parse flags & pass relevant info up */
-	if (likely(feat & NETIF_F_RXHASH) &&
-	    gve_needs_rss(first_desc->flags_seq))
-		skb_set_hash(skb, be32_to_cpu(first_desc->rss_hash),
-			     gve_rss_type(first_desc->flags_seq));
+	goto finish_frag;
 
-	*packet_size_bytes = skb->len + (skb->protocol ? ETH_HLEN : 0);
-	*work_done = work_cnt;
-	skb_record_rx_queue(skb, rx->q_num);
-	if (skb_is_nonlinear(skb))
-		napi_gro_frags(napi);
-	else
-		napi_gro_receive(napi, skb);
-
-	gve_rx_ctx_clear(ctx);
-	return true;
-
-skb_alloc_fail:
-	if (napi->skb)
-		napi_free_frags(napi);
-	*packet_size_bytes = 0;
-	*work_done = ctx->expected_frag_cnt;
-	while (ctx->curr_frag_cnt < ctx->expected_frag_cnt) {
-		rx->cnt++;
-		ctx->curr_frag_cnt++;
+finish_ok_pkt:
+	cnts->ok_pkt_bytes += ctx->total_size;
+	cnts->ok_pkt_cnt++;
+finish_frag:
+	ctx->frag_cnt++;
+	if (is_last_frag) {
+		cnts->total_pkt_cnt++;
+		cnts->cont_pkt_cnt += (ctx->frag_cnt > 1);
+		gve_rx_ctx_clear(ctx);
 	}
-	gve_rx_ctx_clear(ctx);
-	return false;
 }
 
 bool gve_rx_work_pending(struct gve_rx_ring *rx)
···
 static int gve_clean_rx_done(struct gve_rx_ring *rx, int budget,
 			     netdev_features_t feat)
 {
-	u32 work_done = 0, total_packet_cnt = 0, ok_packet_cnt = 0;
+	struct gve_rx_ctx *ctx = &rx->ctx;
 	struct gve_priv *priv = rx->gve;
+	struct gve_rx_cnts cnts = {0};
+	struct gve_rx_desc *next_desc;
 	u32 idx = rx->cnt & rx->mask;
-	struct gve_rx_desc *desc;
-	u64 bytes = 0;
+	u32 work_done = 0;
 
-	desc = &rx->desc.desc_ring[idx];
+	struct gve_rx_desc *desc = &rx->desc.desc_ring[idx];
+
+	// Exceed budget only if (and till) the inflight packet is consumed.
 	while ((GVE_SEQNO(desc->flags_seq) == rx->desc.seqno) &&
-	       work_done < budget) {
-		u64 packet_size_bytes = 0;
-		u32 work_cnt = 0;
-		bool dropped;
+	       (work_done < budget || ctx->frag_cnt)) {
+		next_desc = &rx->desc.desc_ring[(idx + 1) & rx->mask];
+		prefetch(next_desc);
 
-		netif_info(priv, rx_status, priv->dev,
-			   "[%d] idx=%d desc=%p desc->flags_seq=0x%x\n",
-			   rx->q_num, idx, desc, desc->flags_seq);
-		netif_info(priv, rx_status, priv->dev,
-			   "[%d] seqno=%d rx->desc.seqno=%d\n",
-			   rx->q_num, GVE_SEQNO(desc->flags_seq),
-			   rx->desc.seqno);
+		gve_rx(rx, feat, desc, idx, &cnts);
 
-		dropped = !gve_rx(rx, feat, &packet_size_bytes, &work_cnt);
-		if (!dropped) {
-			bytes += packet_size_bytes;
-			ok_packet_cnt++;
-		}
-		total_packet_cnt++;
+		rx->cnt++;
 		idx = rx->cnt & rx->mask;
 		desc = &rx->desc.desc_ring[idx];
-		work_done += work_cnt;
+		rx->desc.seqno = gve_next_seqno(rx->desc.seqno);
+		work_done++;
+	}
+
+	// The device will only send whole packets.
+	if (unlikely(ctx->frag_cnt)) {
+		struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;
+
+		napi_free_frags(napi);
+		gve_rx_ctx_clear(&rx->ctx);
+		netdev_warn(priv->dev, "Unexpected seq number %d with incomplete packet, expected %d, scheduling reset",
+			    GVE_SEQNO(desc->flags_seq), rx->desc.seqno);
+		gve_schedule_reset(rx->gve);
 	}
 
 	if (!work_done && rx->fill_cnt - rx->cnt > rx->db_threshold)
···
 
 	if (work_done) {
 		u64_stats_update_begin(&rx->statss);
-		rx->rpackets += ok_packet_cnt;
-		rx->rbytes += bytes;
+		rx->rpackets += cnts.ok_pkt_cnt;
+		rx->rbytes += cnts.ok_pkt_bytes;
+		rx->rx_cont_packet_cnt += cnts.cont_pkt_cnt;
+		rx->rx_desc_err_dropped_pkt += cnts.desc_err_pkt_cnt;
 		u64_stats_update_end(&rx->statss);
 	}
 
···
 	}
 
 	gve_rx_write_doorbell(priv, rx);
-	return total_packet_cnt;
+	return cnts.total_pkt_cnt;
 }
 
 int gve_rx_poll(struct gve_notify_block *block, int budget)
drivers/net/ethernet/google/gve/gve_rx_dqo.c | +1 -1

···
 
 	if (eop && buf_len <= priv->rx_copybreak) {
 		rx->ctx.skb_head = gve_rx_copy(priv->dev, napi,
-					       &buf_state->page_info, buf_len, 0, NULL);
+					       &buf_state->page_info, buf_len, 0);
 		if (unlikely(!rx->ctx.skb_head))
 			goto error;
 		rx->ctx.skb_tail = rx->ctx.skb_head;
drivers/net/ethernet/google/gve/gve_utils.c | +6 -22

···
 
 struct sk_buff *gve_rx_copy(struct net_device *dev, struct napi_struct *napi,
 			    struct gve_rx_slot_page_info *page_info, u16 len,
-			    u16 padding, struct gve_rx_ctx *ctx)
+			    u16 padding)
 {
 	void *va = page_info->page_address + padding + page_info->page_offset;
-	int skb_linear_offset = 0;
-	bool set_protocol = false;
 	struct sk_buff *skb;
 
-	if (ctx) {
-		if (!ctx->skb_head)
-			ctx->skb_head = napi_alloc_skb(napi, ctx->total_expected_size);
+	skb = napi_alloc_skb(napi, len);
+	if (unlikely(!skb))
+		return NULL;
 
-		if (unlikely(!ctx->skb_head))
-			return NULL;
-		skb = ctx->skb_head;
-		skb_linear_offset = skb->len;
-		set_protocol = ctx->curr_frag_cnt == ctx->expected_frag_cnt - 1;
-	} else {
-		skb = napi_alloc_skb(napi, len);
-
-		if (unlikely(!skb))
-			return NULL;
-		set_protocol = true;
-	}
 	__skb_put(skb, len);
-	skb_copy_to_linear_data_offset(skb, skb_linear_offset, va, len);
-
-	if (set_protocol)
-		skb->protocol = eth_type_trans(skb, dev);
+	skb_copy_to_linear_data_offset(skb, 0, va, len);
+	skb->protocol = eth_type_trans(skb, dev);
 
 	return skb;
 }
drivers/net/ethernet/google/gve/gve_utils.h | +1 -1

···
 
 struct sk_buff *gve_rx_copy(struct net_device *dev, struct napi_struct *napi,
 			    struct gve_rx_slot_page_info *page_info, u16 len,
-			    u16 pad, struct gve_rx_ctx *ctx);
+			    u16 pad);
 
 /* Decrement pagecnt_bias. Set it back to INT_MAX if it reached zero. */
 void gve_dec_pagecnt_bias(struct gve_rx_slot_page_info *page_info);