Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'mlx4-next'

Amir Vadai says:

====================
net/mlx4_en: Optimizations to TX flow

This patchset contains optimizations to the TX flow in the mlx4_en driver. It also introduces
setting/getting tx copybreak, to enable controlling the inline threshold dynamically.

The TX flow optimizations were authored and posted to the mailing list by Eric
Dumazet [1] as a single patch. I split this patch into smaller patches,
reviewed them and tested them.
Changes from the original patch:
- s/iowrite32be/iowrite32/, since ring->doorbell_qpn is stored as be32

The tx copybreak patch was also suggested by Eric Dumazet, and was edited and
reviewed by me. User space patch will be sent after kernel code is ready.

I am sending this patchset now since the merge window is near and I don't want to
miss it.

More work that needs to be done:
- Disable BF when xmit_more is in use
- Make TSO use xmit_more too. Maybe by splitting small TSO packets in the
driver itself, to avoid extra cpu/memory costs of GSO before the driver
- Fix mlx4_en_xmit buggy handling of queue full in the middle of a burst
partially posted to send queue using xmit_more

Eric, I edited the patches to have you as the Author and the first
Signed-off-by. I hope this is ok with you (I wasn't sure if it is ok to sign by
you); anyway, all the credit for those changes should go to you.

Patchset was tested and applied over commit 1e203c1 ("net: sched:
suspicious RCU usage in qdisc_watchdog")

[1] - https://patchwork.ozlabs.org/patch/394256/
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+290 -178
+44
drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
··· 1267 1267 return priv->pflags; 1268 1268 } 1269 1269 1270 + static int mlx4_en_get_tunable(struct net_device *dev, 1271 + const struct ethtool_tunable *tuna, 1272 + void *data) 1273 + { 1274 + const struct mlx4_en_priv *priv = netdev_priv(dev); 1275 + int ret = 0; 1276 + 1277 + switch (tuna->id) { 1278 + case ETHTOOL_TX_COPYBREAK: 1279 + *(u32 *)data = priv->prof->inline_thold; 1280 + break; 1281 + default: 1282 + ret = -EINVAL; 1283 + break; 1284 + } 1285 + 1286 + return ret; 1287 + } 1288 + 1289 + static int mlx4_en_set_tunable(struct net_device *dev, 1290 + const struct ethtool_tunable *tuna, 1291 + const void *data) 1292 + { 1293 + struct mlx4_en_priv *priv = netdev_priv(dev); 1294 + int val, ret = 0; 1295 + 1296 + switch (tuna->id) { 1297 + case ETHTOOL_TX_COPYBREAK: 1298 + val = *(u32 *)data; 1299 + if (val < MIN_PKT_LEN || val > MAX_INLINE) 1300 + ret = -EINVAL; 1301 + else 1302 + priv->prof->inline_thold = val; 1303 + break; 1304 + default: 1305 + ret = -EINVAL; 1306 + break; 1307 + } 1308 + 1309 + return ret; 1310 + } 1311 + 1270 1312 1271 1313 const struct ethtool_ops mlx4_en_ethtool_ops = { 1272 1314 .get_drvinfo = mlx4_en_get_drvinfo, ··· 1339 1297 .get_ts_info = mlx4_en_get_ts_info, 1340 1298 .set_priv_flags = mlx4_en_set_priv_flags, 1341 1299 .get_priv_flags = mlx4_en_get_priv_flags, 1300 + .get_tunable = mlx4_en_get_tunable, 1301 + .set_tunable = mlx4_en_set_tunable, 1342 1302 }; 1343 1303 1344 1304
+194 -136
drivers/net/ethernet/mellanox/mlx4/en_tx.c
··· 37 37 #include <linux/mlx4/qp.h> 38 38 #include <linux/skbuff.h> 39 39 #include <linux/if_vlan.h> 40 + #include <linux/prefetch.h> 40 41 #include <linux/vmalloc.h> 41 42 #include <linux/tcp.h> 42 43 #include <linux/ip.h> ··· 66 65 ring->size = size; 67 66 ring->size_mask = size - 1; 68 67 ring->stride = stride; 69 - ring->inline_thold = priv->prof->inline_thold; 70 68 71 69 tmp = size * sizeof(struct mlx4_en_tx_info); 72 - ring->tx_info = vmalloc_node(tmp, node); 70 + ring->tx_info = kmalloc_node(tmp, GFP_KERNEL | __GFP_NOWARN, node); 73 71 if (!ring->tx_info) { 74 72 ring->tx_info = vmalloc(tmp); 75 73 if (!ring->tx_info) { ··· 151 151 kfree(ring->bounce_buf); 152 152 ring->bounce_buf = NULL; 153 153 err_info: 154 - vfree(ring->tx_info); 154 + kvfree(ring->tx_info); 155 155 ring->tx_info = NULL; 156 156 err_ring: 157 157 kfree(ring); ··· 174 174 mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size); 175 175 kfree(ring->bounce_buf); 176 176 ring->bounce_buf = NULL; 177 - vfree(ring->tx_info); 177 + kvfree(ring->tx_info); 178 178 ring->tx_info = NULL; 179 179 kfree(ring); 180 180 *pring = NULL; ··· 191 191 ring->prod = 0; 192 192 ring->cons = 0xffffffff; 193 193 ring->last_nr_txbb = 1; 194 - ring->poll_cnt = 0; 195 194 memset(ring->tx_info, 0, ring->size * sizeof(struct mlx4_en_tx_info)); 196 195 memset(ring->buf, 0, ring->buf_size); 197 196 198 197 ring->qp_state = MLX4_QP_STATE_RST; 199 - ring->doorbell_qpn = ring->qp.qpn << 8; 198 + ring->doorbell_qpn = cpu_to_be32(ring->qp.qpn << 8); 199 + ring->mr_key = cpu_to_be32(mdev->mr.key); 200 200 201 201 mlx4_en_fill_qp_context(priv, ring->size, ring->stride, 1, 0, ring->qpn, 202 202 ring->cqn, user_prio, &ring->context); ··· 259 259 struct mlx4_en_tx_ring *ring, 260 260 int index, u8 owner, u64 timestamp) 261 261 { 262 - struct mlx4_en_dev *mdev = priv->mdev; 263 262 struct mlx4_en_tx_info *tx_info = &ring->tx_info[index]; 264 263 struct mlx4_en_tx_desc *tx_desc = ring->buf + index * TXBB_SIZE; 265 264 struct 
mlx4_wqe_data_seg *data = (void *) tx_desc + tx_info->data_offset; 266 - struct sk_buff *skb = tx_info->skb; 267 - struct skb_frag_struct *frag; 268 265 void *end = ring->buf + ring->buf_size; 269 - int frags = skb_shinfo(skb)->nr_frags; 266 + struct sk_buff *skb = tx_info->skb; 267 + int nr_maps = tx_info->nr_maps; 270 268 int i; 271 - struct skb_shared_hwtstamps hwts; 272 269 273 - if (timestamp) { 274 - mlx4_en_fill_hwtstamps(mdev, &hwts, timestamp); 270 + /* We do not touch skb here, so prefetch skb->users location 271 + * to speedup consume_skb() 272 + */ 273 + prefetchw(&skb->users); 274 + 275 + if (unlikely(timestamp)) { 276 + struct skb_shared_hwtstamps hwts; 277 + 278 + mlx4_en_fill_hwtstamps(priv->mdev, &hwts, timestamp); 275 279 skb_tstamp_tx(skb, &hwts); 276 280 } 277 281 278 282 /* Optimize the common case when there are no wraparounds */ 279 283 if (likely((void *) tx_desc + tx_info->nr_txbb * TXBB_SIZE <= end)) { 280 284 if (!tx_info->inl) { 281 - if (tx_info->linear) { 285 + if (tx_info->linear) 282 286 dma_unmap_single(priv->ddev, 283 - (dma_addr_t) be64_to_cpu(data->addr), 284 - be32_to_cpu(data->byte_count), 285 - PCI_DMA_TODEVICE); 286 - ++data; 287 - } 288 - 289 - for (i = 0; i < frags; i++) { 290 - frag = &skb_shinfo(skb)->frags[i]; 287 + tx_info->map0_dma, 288 + tx_info->map0_byte_count, 289 + PCI_DMA_TODEVICE); 290 + else 291 291 dma_unmap_page(priv->ddev, 292 - (dma_addr_t) be64_to_cpu(data[i].addr), 293 - skb_frag_size(frag), PCI_DMA_TODEVICE); 292 + tx_info->map0_dma, 293 + tx_info->map0_byte_count, 294 + PCI_DMA_TODEVICE); 295 + for (i = 1; i < nr_maps; i++) { 296 + data++; 297 + dma_unmap_page(priv->ddev, 298 + (dma_addr_t)be64_to_cpu(data->addr), 299 + be32_to_cpu(data->byte_count), 300 + PCI_DMA_TODEVICE); 294 301 } 295 302 } 296 303 } else { ··· 306 299 data = ring->buf + ((void *)data - end); 307 300 } 308 301 309 - if (tx_info->linear) { 302 + if (tx_info->linear) 310 303 dma_unmap_single(priv->ddev, 311 - (dma_addr_t) 
be64_to_cpu(data->addr), 312 - be32_to_cpu(data->byte_count), 313 - PCI_DMA_TODEVICE); 314 - ++data; 315 - } 316 - 317 - for (i = 0; i < frags; i++) { 304 + tx_info->map0_dma, 305 + tx_info->map0_byte_count, 306 + PCI_DMA_TODEVICE); 307 + else 308 + dma_unmap_page(priv->ddev, 309 + tx_info->map0_dma, 310 + tx_info->map0_byte_count, 311 + PCI_DMA_TODEVICE); 312 + for (i = 1; i < nr_maps; i++) { 313 + data++; 318 314 /* Check for wraparound before unmapping */ 319 315 if ((void *) data >= end) 320 316 data = ring->buf; 321 - frag = &skb_shinfo(skb)->frags[i]; 322 317 dma_unmap_page(priv->ddev, 323 - (dma_addr_t) be64_to_cpu(data->addr), 324 - skb_frag_size(frag), PCI_DMA_TODEVICE); 325 - ++data; 318 + (dma_addr_t)be64_to_cpu(data->addr), 319 + be32_to_cpu(data->byte_count), 320 + PCI_DMA_TODEVICE); 326 321 } 327 322 } 328 323 } ··· 386 377 u64 timestamp = 0; 387 378 int done = 0; 388 379 int budget = priv->tx_work_limit; 380 + u32 last_nr_txbb; 381 + u32 ring_cons; 389 382 390 383 if (!priv->port_up) 391 384 return true; 392 385 386 + prefetchw(&ring->tx_queue->dql.limit); 393 387 index = cons_index & size_mask; 394 388 cqe = mlx4_en_get_cqe(buf, index, priv->cqe_size) + factor; 395 - ring_index = ring->cons & size_mask; 389 + last_nr_txbb = ACCESS_ONCE(ring->last_nr_txbb); 390 + ring_cons = ACCESS_ONCE(ring->cons); 391 + ring_index = ring_cons & size_mask; 396 392 stamp_index = ring_index; 397 393 398 394 /* Process all completed CQEs */ ··· 422 408 new_index = be16_to_cpu(cqe->wqe_index) & size_mask; 423 409 424 410 do { 425 - txbbs_skipped += ring->last_nr_txbb; 426 - ring_index = (ring_index + ring->last_nr_txbb) & size_mask; 411 + txbbs_skipped += last_nr_txbb; 412 + ring_index = (ring_index + last_nr_txbb) & size_mask; 427 413 if (ring->tx_info[ring_index].ts_requested) 428 414 timestamp = mlx4_en_get_cqe_ts(cqe); 429 415 430 416 /* free next descriptor */ 431 - ring->last_nr_txbb = mlx4_en_free_tx_desc( 417 + last_nr_txbb = mlx4_en_free_tx_desc( 432 418 priv, 
ring, ring_index, 433 - !!((ring->cons + txbbs_skipped) & 419 + !!((ring_cons + txbbs_skipped) & 434 420 ring->size), timestamp); 435 421 436 422 mlx4_en_stamp_wqe(priv, ring, stamp_index, 437 - !!((ring->cons + txbbs_stamp) & 423 + !!((ring_cons + txbbs_stamp) & 438 424 ring->size)); 439 425 stamp_index = ring_index; 440 426 txbbs_stamp = txbbs_skipped; ··· 455 441 mcq->cons_index = cons_index; 456 442 mlx4_cq_set_ci(mcq); 457 443 wmb(); 458 - ring->cons += txbbs_skipped; 444 + 445 + /* we want to dirty this cache line once */ 446 + ACCESS_ONCE(ring->last_nr_txbb) = last_nr_txbb; 447 + ACCESS_ONCE(ring->cons) = ring_cons + txbbs_skipped; 448 + 459 449 netdev_tx_completed_queue(ring->tx_queue, packets, bytes); 460 450 461 451 /* ··· 530 512 return ring->buf + index * TXBB_SIZE; 531 513 } 532 514 533 - static int is_inline(int inline_thold, struct sk_buff *skb, void **pfrag) 515 + /* Decide if skb can be inlined in tx descriptor to avoid dma mapping 516 + * 517 + * It seems strange we do not simply use skb_copy_bits(). 
518 + * This would allow to inline all skbs iff skb->len <= inline_thold 519 + * 520 + * Note that caller already checked skb was not a gso packet 521 + */ 522 + static bool is_inline(int inline_thold, const struct sk_buff *skb, 523 + const struct skb_shared_info *shinfo, 524 + void **pfrag) 534 525 { 535 526 void *ptr; 536 527 537 - if (inline_thold && !skb_is_gso(skb) && skb->len <= inline_thold) { 538 - if (skb_shinfo(skb)->nr_frags == 1) { 539 - ptr = skb_frag_address_safe(&skb_shinfo(skb)->frags[0]); 540 - if (unlikely(!ptr)) 541 - return 0; 528 + if (skb->len > inline_thold || !inline_thold) 529 + return false; 542 530 543 - if (pfrag) 544 - *pfrag = ptr; 545 - 546 - return 1; 547 - } else if (unlikely(skb_shinfo(skb)->nr_frags)) 548 - return 0; 549 - else 550 - return 1; 531 + if (shinfo->nr_frags == 1) { 532 + ptr = skb_frag_address_safe(&shinfo->frags[0]); 533 + if (unlikely(!ptr)) 534 + return false; 535 + *pfrag = ptr; 536 + return true; 551 537 } 552 - 553 - return 0; 538 + if (shinfo->nr_frags) 539 + return false; 540 + return true; 554 541 } 555 542 556 - static int inline_size(struct sk_buff *skb) 543 + static int inline_size(const struct sk_buff *skb) 557 544 { 558 545 if (skb->len + CTRL_SIZE + sizeof(struct mlx4_wqe_inline_seg) 559 546 <= MLX4_INLINE_ALIGN) ··· 569 546 sizeof(struct mlx4_wqe_inline_seg), 16); 570 547 } 571 548 572 - static int get_real_size(struct sk_buff *skb, struct net_device *dev, 573 - int *lso_header_size) 549 + static int get_real_size(const struct sk_buff *skb, 550 + const struct skb_shared_info *shinfo, 551 + struct net_device *dev, 552 + int *lso_header_size, 553 + bool *inline_ok, 554 + void **pfrag) 574 555 { 575 556 struct mlx4_en_priv *priv = netdev_priv(dev); 576 557 int real_size; 577 558 578 - if (skb_is_gso(skb)) { 559 + if (shinfo->gso_size) { 560 + *inline_ok = false; 579 561 if (skb->encapsulation) 580 562 *lso_header_size = (skb_inner_transport_header(skb) - skb->data) + inner_tcp_hdrlen(skb); 581 563 else 
582 564 *lso_header_size = skb_transport_offset(skb) + tcp_hdrlen(skb); 583 - real_size = CTRL_SIZE + skb_shinfo(skb)->nr_frags * DS_SIZE + 565 + real_size = CTRL_SIZE + shinfo->nr_frags * DS_SIZE + 584 566 ALIGN(*lso_header_size + 4, DS_SIZE); 585 567 if (unlikely(*lso_header_size != skb_headlen(skb))) { 586 568 /* We add a segment for the skb linear buffer only if ··· 600 572 } 601 573 } else { 602 574 *lso_header_size = 0; 603 - if (!is_inline(priv->prof->inline_thold, skb, NULL)) 604 - real_size = CTRL_SIZE + (skb_shinfo(skb)->nr_frags + 1) * DS_SIZE; 605 - else 575 + *inline_ok = is_inline(priv->prof->inline_thold, skb, 576 + shinfo, pfrag); 577 + 578 + if (*inline_ok) 606 579 real_size = inline_size(skb); 580 + else 581 + real_size = CTRL_SIZE + 582 + (shinfo->nr_frags + 1) * DS_SIZE; 607 583 } 608 584 609 585 return real_size; 610 586 } 611 587 612 - static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc, struct sk_buff *skb, 613 - int real_size, u16 *vlan_tag, int tx_ind, void *fragptr) 588 + static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc, 589 + const struct sk_buff *skb, 590 + const struct skb_shared_info *shinfo, 591 + int real_size, u16 *vlan_tag, 592 + int tx_ind, void *fragptr) 614 593 { 615 594 struct mlx4_wqe_inline_seg *inl = &tx_desc->inl; 616 595 int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - sizeof *inl; 596 + unsigned int hlen = skb_headlen(skb); 617 597 618 598 if (skb->len <= spc) { 619 599 if (likely(skb->len >= MIN_PKT_LEN)) { ··· 631 595 memset(((void *)(inl + 1)) + skb->len, 0, 632 596 MIN_PKT_LEN - skb->len); 633 597 } 634 - skb_copy_from_linear_data(skb, inl + 1, skb_headlen(skb)); 635 - if (skb_shinfo(skb)->nr_frags) 636 - memcpy(((void *)(inl + 1)) + skb_headlen(skb), fragptr, 637 - skb_frag_size(&skb_shinfo(skb)->frags[0])); 598 + skb_copy_from_linear_data(skb, inl + 1, hlen); 599 + if (shinfo->nr_frags) 600 + memcpy(((void *)(inl + 1)) + hlen, fragptr, 601 + skb_frag_size(&shinfo->frags[0])); 638 602 639 603 } else { 
640 604 inl->byte_count = cpu_to_be32(1 << 31 | spc); 641 - if (skb_headlen(skb) <= spc) { 642 - skb_copy_from_linear_data(skb, inl + 1, skb_headlen(skb)); 643 - if (skb_headlen(skb) < spc) { 644 - memcpy(((void *)(inl + 1)) + skb_headlen(skb), 645 - fragptr, spc - skb_headlen(skb)); 646 - fragptr += spc - skb_headlen(skb); 605 + if (hlen <= spc) { 606 + skb_copy_from_linear_data(skb, inl + 1, hlen); 607 + if (hlen < spc) { 608 + memcpy(((void *)(inl + 1)) + hlen, 609 + fragptr, spc - hlen); 610 + fragptr += spc - hlen; 647 611 } 648 612 inl = (void *) (inl + 1) + spc; 649 613 memcpy(((void *)(inl + 1)), fragptr, skb->len - spc); ··· 651 615 skb_copy_from_linear_data(skb, inl + 1, spc); 652 616 inl = (void *) (inl + 1) + spc; 653 617 skb_copy_from_linear_data_offset(skb, spc, inl + 1, 654 - skb_headlen(skb) - spc); 655 - if (skb_shinfo(skb)->nr_frags) 656 - memcpy(((void *)(inl + 1)) + skb_headlen(skb) - spc, 657 - fragptr, skb_frag_size(&skb_shinfo(skb)->frags[0])); 618 + hlen - spc); 619 + if (shinfo->nr_frags) 620 + memcpy(((void *)(inl + 1)) + hlen - spc, 621 + fragptr, 622 + skb_frag_size(&shinfo->frags[0])); 658 623 } 659 624 660 625 wmb(); ··· 679 642 return fallback(dev, skb) % rings_p_up + up * rings_p_up; 680 643 } 681 644 682 - static void mlx4_bf_copy(void __iomem *dst, unsigned long *src, unsigned bytecnt) 645 + static void mlx4_bf_copy(void __iomem *dst, const void *src, 646 + unsigned int bytecnt) 683 647 { 684 648 __iowrite64_copy(dst, src, bytecnt / 8); 685 649 } 686 650 687 651 netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) 688 652 { 653 + struct skb_shared_info *shinfo = skb_shinfo(skb); 689 654 struct mlx4_en_priv *priv = netdev_priv(dev); 690 - struct mlx4_en_dev *mdev = priv->mdev; 691 655 struct device *ddev = priv->ddev; 692 656 struct mlx4_en_tx_ring *ring; 693 657 struct mlx4_en_tx_desc *tx_desc; ··· 701 663 u32 index, bf_index; 702 664 __be32 op_own; 703 665 u16 vlan_tag = 0; 704 - int i; 666 + int i_frag; 705 667 
int lso_header_size; 706 - void *fragptr; 668 + void *fragptr = NULL; 707 669 bool bounce = false; 708 670 bool send_doorbell; 671 + bool inline_ok; 672 + u32 ring_cons; 709 673 710 674 if (!priv->port_up) 711 675 goto tx_drop; 712 676 713 - real_size = get_real_size(skb, dev, &lso_header_size); 677 + tx_ind = skb_get_queue_mapping(skb); 678 + ring = priv->tx_ring[tx_ind]; 679 + 680 + /* fetch ring->cons far ahead before needing it to avoid stall */ 681 + ring_cons = ACCESS_ONCE(ring->cons); 682 + 683 + real_size = get_real_size(skb, shinfo, dev, &lso_header_size, 684 + &inline_ok, &fragptr); 714 685 if (unlikely(!real_size)) 715 686 goto tx_drop; 716 687 ··· 732 685 goto tx_drop; 733 686 } 734 687 735 - tx_ind = skb->queue_mapping; 736 - ring = priv->tx_ring[tx_ind]; 737 688 if (vlan_tx_tag_present(skb)) 738 689 vlan_tag = vlan_tx_tag_get(skb); 739 690 740 691 /* Check available TXBBs And 2K spare for prefetch */ 741 - if (unlikely(((int)(ring->prod - ring->cons)) > 692 + if (unlikely(((int)(ring->prod - ring_cons)) > 742 693 ring->size - HEADROOM - MAX_DESC_TXBBS)) { 743 694 /* every full Tx ring stops queue */ 744 695 netif_tx_stop_queue(ring->tx_queue); ··· 750 705 */ 751 706 wmb(); 752 707 753 - if (unlikely(((int)(ring->prod - ring->cons)) <= 708 + ring_cons = ACCESS_ONCE(ring->cons); 709 + if (unlikely(((int)(ring->prod - ring_cons)) <= 754 710 ring->size - HEADROOM - MAX_DESC_TXBBS)) { 755 711 netif_tx_wake_queue(ring->tx_queue); 756 712 ring->wake_queue++; ··· 760 714 } 761 715 } 762 716 717 + prefetchw(&ring->tx_queue->dql); 718 + 763 719 /* Track current inflight packets for performance analysis */ 764 720 AVG_PERF_COUNTER(priv->pstats.inflight_avg, 765 - (u32) (ring->prod - ring->cons - 1)); 721 + (u32)(ring->prod - ring_cons - 1)); 766 722 767 723 /* Packet is good - grab an index and transmit it */ 768 724 index = ring->prod & ring->size_mask; ··· 784 736 tx_info->skb = skb; 785 737 tx_info->nr_txbb = nr_txbb; 786 738 739 + data = &tx_desc->data; 787 
740 if (lso_header_size) 788 741 data = ((void *)&tx_desc->lso + ALIGN(lso_header_size + 4, 789 742 DS_SIZE)); 790 - else 791 - data = &tx_desc->data; 792 743 793 744 /* valid only for none inline segments */ 794 745 tx_info->data_offset = (void *)data - (void *)tx_desc; 795 746 747 + tx_info->inl = inline_ok; 748 + 796 749 tx_info->linear = (lso_header_size < skb_headlen(skb) && 797 - !is_inline(ring->inline_thold, skb, NULL)) ? 1 : 0; 750 + !inline_ok) ? 1 : 0; 798 751 799 - data += skb_shinfo(skb)->nr_frags + tx_info->linear - 1; 752 + tx_info->nr_maps = shinfo->nr_frags + tx_info->linear; 753 + data += tx_info->nr_maps - 1; 800 754 801 - if (is_inline(ring->inline_thold, skb, &fragptr)) { 802 - tx_info->inl = 1; 803 - } else { 804 - /* Map fragments */ 805 - for (i = skb_shinfo(skb)->nr_frags - 1; i >= 0; i--) { 806 - struct skb_frag_struct *frag; 807 - dma_addr_t dma; 755 + if (!tx_info->inl) { 756 + dma_addr_t dma = 0; 757 + u32 byte_count = 0; 808 758 809 - frag = &skb_shinfo(skb)->frags[i]; 759 + /* Map fragments if any */ 760 + for (i_frag = shinfo->nr_frags - 1; i_frag >= 0; i_frag--) { 761 + const struct skb_frag_struct *frag; 762 + 763 + frag = &shinfo->frags[i_frag]; 764 + byte_count = skb_frag_size(frag); 810 765 dma = skb_frag_dma_map(ddev, frag, 811 - 0, skb_frag_size(frag), 766 + 0, byte_count, 812 767 DMA_TO_DEVICE); 813 768 if (dma_mapping_error(ddev, dma)) 814 769 goto tx_drop_unmap; 815 770 816 771 data->addr = cpu_to_be64(dma); 817 - data->lkey = cpu_to_be32(mdev->mr.key); 772 + data->lkey = ring->mr_key; 818 773 wmb(); 819 - data->byte_count = cpu_to_be32(skb_frag_size(frag)); 774 + data->byte_count = cpu_to_be32(byte_count); 820 775 --data; 821 776 } 822 777 823 - /* Map linear part */ 778 + /* Map linear part if needed */ 824 779 if (tx_info->linear) { 825 - u32 byte_count = skb_headlen(skb) - lso_header_size; 826 - dma_addr_t dma; 780 + byte_count = skb_headlen(skb) - lso_header_size; 827 781 828 782 dma = dma_map_single(ddev, skb->data + 
829 783 lso_header_size, byte_count, ··· 834 784 goto tx_drop_unmap; 835 785 836 786 data->addr = cpu_to_be64(dma); 837 - data->lkey = cpu_to_be32(mdev->mr.key); 787 + data->lkey = ring->mr_key; 838 788 wmb(); 839 789 data->byte_count = cpu_to_be32(byte_count); 840 790 } 841 - tx_info->inl = 0; 791 + /* tx completion can avoid cache line miss for common cases */ 792 + tx_info->map0_dma = dma; 793 + tx_info->map0_byte_count = byte_count; 842 794 } 843 795 844 796 /* 845 797 * For timestamping add flag to skb_shinfo and 846 798 * set flag for further reference 847 799 */ 848 - if (ring->hwtstamp_tx_type == HWTSTAMP_TX_ON && 849 - skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) { 850 - skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; 800 + tx_info->ts_requested = 0; 801 + if (unlikely(ring->hwtstamp_tx_type == HWTSTAMP_TX_ON && 802 + shinfo->tx_flags & SKBTX_HW_TSTAMP)) { 803 + shinfo->tx_flags |= SKBTX_IN_PROGRESS; 851 804 tx_info->ts_requested = 1; 852 805 } 853 806 854 807 /* Prepare ctrl segement apart opcode+ownership, which depends on 855 808 * whether LSO is used */ 856 - tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag); 857 - tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN * 858 - !!vlan_tx_tag_present(skb); 859 - tx_desc->ctrl.fence_size = (real_size / 16) & 0x3f; 860 809 tx_desc->ctrl.srcrb_flags = priv->ctrl_flags; 861 810 if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) { 862 811 tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM | ··· 876 827 877 828 /* Handle LSO (TSO) packets */ 878 829 if (lso_header_size) { 830 + int i; 831 + 879 832 /* Mark opcode as LSO */ 880 833 op_own = cpu_to_be32(MLX4_OPCODE_LSO | (1 << 6)) | 881 834 ((ring->prod & ring->size) ? 
··· 885 834 886 835 /* Fill in the LSO prefix */ 887 836 tx_desc->lso.mss_hdr_size = cpu_to_be32( 888 - skb_shinfo(skb)->gso_size << 16 | lso_header_size); 837 + shinfo->gso_size << 16 | lso_header_size); 889 838 890 839 /* Copy headers; 891 840 * note that we already verified that it is linear */ 892 841 memcpy(tx_desc->lso.header, skb->data, lso_header_size); 893 842 894 843 ring->tso_packets++; 895 - i = ((skb->len - lso_header_size) / skb_shinfo(skb)->gso_size) + 896 - !!((skb->len - lso_header_size) % skb_shinfo(skb)->gso_size); 844 + 845 + i = ((skb->len - lso_header_size) / shinfo->gso_size) + 846 + !!((skb->len - lso_header_size) % shinfo->gso_size); 897 847 tx_info->nr_bytes = skb->len + (i - 1) * lso_header_size; 898 848 ring->packets += i; 899 849 } else { ··· 904 852 cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0); 905 853 tx_info->nr_bytes = max_t(unsigned int, skb->len, ETH_ZLEN); 906 854 ring->packets++; 907 - 908 855 } 909 856 ring->bytes += tx_info->nr_bytes; 910 857 netdev_tx_sent_queue(ring->tx_queue, tx_info->nr_bytes); 911 858 AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, skb->len); 912 859 913 - if (tx_info->inl) { 914 - build_inline_wqe(tx_desc, skb, real_size, &vlan_tag, tx_ind, fragptr); 915 - tx_info->inl = 1; 916 - } 860 + if (tx_info->inl) 861 + build_inline_wqe(tx_desc, skb, shinfo, real_size, &vlan_tag, 862 + tx_ind, fragptr); 917 863 918 864 if (skb->encapsulation) { 919 865 struct iphdr *ipv4 = (struct iphdr *)skb_inner_network_header(skb); ··· 924 874 ring->prod += nr_txbb; 925 875 926 876 /* If we used a bounce buffer then copy descriptor back into place */ 927 - if (bounce) 877 + if (unlikely(bounce)) 928 878 tx_desc = mlx4_en_bounce_to_desc(priv, ring, index, desc_size); 929 879 930 880 skb_tx_timestamp(skb); 931 881 932 882 send_doorbell = !skb->xmit_more || netif_xmit_stopped(ring->tx_queue); 933 883 884 + real_size = (real_size / 16) & 0x3f; 885 + 934 886 if (ring->bf_enabled && desc_size <= MAX_BF && !bounce && 935 887 
!vlan_tx_tag_present(skb) && send_doorbell) { 936 - tx_desc->ctrl.bf_qpn |= cpu_to_be32(ring->doorbell_qpn); 888 + tx_desc->ctrl.bf_qpn = ring->doorbell_qpn | 889 + cpu_to_be32(real_size); 937 890 938 891 op_own |= htonl((bf_index & 0xffff) << 8); 939 892 /* Ensure new descriptor hits memory ··· 947 894 948 895 wmb(); 949 896 950 - mlx4_bf_copy(ring->bf.reg + ring->bf.offset, (unsigned long *) &tx_desc->ctrl, 951 - desc_size); 897 + mlx4_bf_copy(ring->bf.reg + ring->bf.offset, &tx_desc->ctrl, 898 + desc_size); 952 899 953 900 wmb(); 954 901 955 902 ring->bf.offset ^= ring->bf.buf_size; 956 903 } else { 904 + tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag); 905 + tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN * 906 + !!vlan_tx_tag_present(skb); 907 + tx_desc->ctrl.fence_size = real_size; 908 + 957 909 /* Ensure new descriptor hits memory 958 910 * before setting ownership of this descriptor to HW 959 911 */ ··· 966 908 tx_desc->ctrl.owner_opcode = op_own; 967 909 if (send_doorbell) { 968 910 wmb(); 969 - iowrite32be(ring->doorbell_qpn, 970 - ring->bf.uar->map + MLX4_SEND_DOORBELL); 911 + iowrite32(ring->doorbell_qpn, 912 + ring->bf.uar->map + MLX4_SEND_DOORBELL); 971 913 } else { 972 914 ring->xmit_more++; 973 915 } ··· 978 920 tx_drop_unmap: 979 921 en_err(priv, "DMA mapping error\n"); 980 922 981 - for (i++; i < skb_shinfo(skb)->nr_frags; i++) { 982 - data++; 923 + while (++i_frag < shinfo->nr_frags) { 924 + ++data; 983 925 dma_unmap_page(ddev, (dma_addr_t) be64_to_cpu(data->addr), 984 926 be32_to_cpu(data->byte_count), 985 927 PCI_DMA_TODEVICE);
+49 -41
drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
··· 216 216 217 217 struct mlx4_en_tx_info { 218 218 struct sk_buff *skb; 219 - u32 nr_txbb; 220 - u32 nr_bytes; 221 - u8 linear; 222 - u8 data_offset; 223 - u8 inl; 224 - u8 ts_requested; 225 - }; 219 + dma_addr_t map0_dma; 220 + u32 map0_byte_count; 221 + u32 nr_txbb; 222 + u32 nr_bytes; 223 + u8 linear; 224 + u8 data_offset; 225 + u8 inl; 226 + u8 ts_requested; 227 + u8 nr_maps; 228 + } ____cacheline_aligned_in_smp; 226 229 227 230 228 231 #define MLX4_EN_BIT_DESC_OWN 0x80000000 ··· 256 253 }; 257 254 258 255 struct mlx4_en_tx_ring { 256 + /* cache line used and dirtied in tx completion 257 + * (mlx4_en_free_tx_buf()) 258 + */ 259 + u32 last_nr_txbb; 260 + u32 cons; 261 + unsigned long wake_queue; 262 + 263 + /* cache line used and dirtied in mlx4_en_xmit() */ 264 + u32 prod ____cacheline_aligned_in_smp; 265 + unsigned long bytes; 266 + unsigned long packets; 267 + unsigned long tx_csum; 268 + unsigned long tso_packets; 269 + unsigned long xmit_more; 270 + struct mlx4_bf bf; 271 + unsigned long queue_stopped; 272 + 273 + /* Following part should be mostly read */ 274 + cpumask_t affinity_mask; 275 + struct mlx4_qp qp; 259 276 struct mlx4_hwq_resources wqres; 260 - u32 size ; /* number of TXBBs */ 261 - u32 size_mask; 262 - u16 stride; 263 - u16 cqn; /* index of port CQ associated with this ring */ 264 - u32 prod; 265 - u32 cons; 266 - u32 buf_size; 267 - u32 doorbell_qpn; 268 - void *buf; 269 - u16 poll_cnt; 270 - struct mlx4_en_tx_info *tx_info; 271 - u8 *bounce_buf; 272 - u8 queue_index; 273 - cpumask_t affinity_mask; 274 - u32 last_nr_txbb; 275 - struct mlx4_qp qp; 276 - struct mlx4_qp_context context; 277 - int qpn; 278 - enum mlx4_qp_state qp_state; 279 - struct mlx4_srq dummy; 280 - unsigned long bytes; 281 - unsigned long packets; 282 - unsigned long tx_csum; 283 - unsigned long queue_stopped; 284 - unsigned long wake_queue; 285 - unsigned long tso_packets; 286 - unsigned long xmit_more; 287 - struct mlx4_bf bf; 288 - bool bf_enabled; 289 - bool 
bf_alloced; 290 - struct netdev_queue *tx_queue; 291 - int hwtstamp_tx_type; 292 - int inline_thold; 293 - }; 277 + u32 size; /* number of TXBBs */ 278 + u32 size_mask; 279 + u16 stride; 280 + u16 cqn; /* index of port CQ associated with this ring */ 281 + u32 buf_size; 282 + __be32 doorbell_qpn; 283 + __be32 mr_key; 284 + void *buf; 285 + struct mlx4_en_tx_info *tx_info; 286 + u8 *bounce_buf; 287 + struct mlx4_qp_context context; 288 + int qpn; 289 + enum mlx4_qp_state qp_state; 290 + u8 queue_index; 291 + bool bf_enabled; 292 + bool bf_alloced; 293 + struct netdev_queue *tx_queue; 294 + int hwtstamp_tx_type; 295 + } ____cacheline_aligned_in_smp; 294 296 295 297 struct mlx4_en_rx_desc { 296 298 /* actual number of entries depends on rx ring stride */
+1 -1
include/linux/mlx4/device.h
··· 583 583 }; 584 584 585 585 struct mlx4_bf { 586 - unsigned long offset; 586 + unsigned int offset; 587 587 int buf_size; 588 588 struct mlx4_uar *uar; 589 589 void __iomem *reg;
+1
include/uapi/linux/ethtool.h
··· 212 212 enum tunable_id { 213 213 ETHTOOL_ID_UNSPEC, 214 214 ETHTOOL_RX_COPYBREAK, 215 + ETHTOOL_TX_COPYBREAK, 215 216 }; 216 217 217 218 enum tunable_type_id {
+1
net/core/ethtool.c
··· 1625 1625 { 1626 1626 switch (tuna->id) { 1627 1627 case ETHTOOL_RX_COPYBREAK: 1628 + case ETHTOOL_TX_COPYBREAK: 1628 1629 if (tuna->len != sizeof(u32) || 1629 1630 tuna->type_id != ETHTOOL_TUNABLE_U32) 1630 1631 return -EINVAL;