Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

IB/ipoib: Scatter-Gather support in connected mode

By default, IPoIB-CM driver uses 64k MTU. Larger MTU gives better
performance.
This MTU plus overhead puts the memory allocation for IP based packets at
32 4k pages (order 5), which have to be contiguous.
When the system memory is under pressure, it was observed that allocating 128k
of contiguous physical memory is difficult and causes serious errors (such as
the system becoming unusable).

This enhancement resolves the issue by removing the physically contiguous
memory requirement, using the Scatter/Gather feature that exists in the Linux stack.

With this fix Scatter-Gather will be supported also in connected mode.

This change reverts some of the changes made in commit e112373fd6aa
("IPoIB/cm: Reduce connected mode TX object size").

The ability to use SG in IPoIB CM is possible because the coupling
between NETIF_F_SG and NETIF_F_CSUM was removed in commit
ec5f06156423 ("net: Kill link between CSUM and SG features.")

Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com>
Acked-by: Christian Marie <christian@ponies.io>
Signed-off-by: Doug Ledford <dledford@redhat.com>

authored by

Yuval Shaia and committed by
Doug Ledford
c4268778 59d40dd9

+54 -46
+28 -1
drivers/infiniband/ulp/ipoib/ipoib.h
··· 239 239 struct net_device *dev; 240 240 struct ipoib_neigh *neigh; 241 241 struct ipoib_path *path; 242 - struct ipoib_cm_tx_buf *tx_ring; 242 + struct ipoib_tx_buf *tx_ring; 243 243 unsigned tx_head; 244 244 unsigned tx_tail; 245 245 unsigned long flags; ··· 503 503 504 504 void ipoib_mcast_dev_down(struct net_device *dev); 505 505 void ipoib_mcast_dev_flush(struct net_device *dev); 506 + 507 + int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req); 508 + void ipoib_dma_unmap_tx(struct ipoib_dev_priv *priv, 509 + struct ipoib_tx_buf *tx_req); 510 + 511 + static inline void ipoib_build_sge(struct ipoib_dev_priv *priv, 512 + struct ipoib_tx_buf *tx_req) 513 + { 514 + int i, off; 515 + struct sk_buff *skb = tx_req->skb; 516 + skb_frag_t *frags = skb_shinfo(skb)->frags; 517 + int nr_frags = skb_shinfo(skb)->nr_frags; 518 + u64 *mapping = tx_req->mapping; 519 + 520 + if (skb_headlen(skb)) { 521 + priv->tx_sge[0].addr = mapping[0]; 522 + priv->tx_sge[0].length = skb_headlen(skb); 523 + off = 1; 524 + } else 525 + off = 0; 526 + 527 + for (i = 0; i < nr_frags; ++i) { 528 + priv->tx_sge[i + off].addr = mapping[i + off]; 529 + priv->tx_sge[i + off].length = skb_frag_size(&frags[i]); 530 + } 531 + priv->tx_wr.num_sge = nr_frags + off; 532 + } 506 533 507 534 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG 508 535 struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev);
+14 -19
drivers/infiniband/ulp/ipoib/ipoib_cm.c
··· 694 694 static inline int post_send(struct ipoib_dev_priv *priv, 695 695 struct ipoib_cm_tx *tx, 696 696 unsigned int wr_id, 697 - u64 addr, int len) 697 + struct ipoib_tx_buf *tx_req) 698 698 { 699 699 struct ib_send_wr *bad_wr; 700 700 701 - priv->tx_sge[0].addr = addr; 702 - priv->tx_sge[0].length = len; 701 + ipoib_build_sge(priv, tx_req); 703 702 704 - priv->tx_wr.num_sge = 1; 705 703 priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM; 706 704 707 705 return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr); ··· 708 710 void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx) 709 711 { 710 712 struct ipoib_dev_priv *priv = netdev_priv(dev); 711 - struct ipoib_cm_tx_buf *tx_req; 712 - u64 addr; 713 + struct ipoib_tx_buf *tx_req; 713 714 int rc; 714 715 715 716 if (unlikely(skb->len > tx->mtu)) { ··· 732 735 */ 733 736 tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)]; 734 737 tx_req->skb = skb; 735 - addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE); 736 - if (unlikely(ib_dma_mapping_error(priv->ca, addr))) { 738 + 739 + if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) { 737 740 ++dev->stats.tx_errors; 738 741 dev_kfree_skb_any(skb); 739 742 return; 740 743 } 741 744 742 - tx_req->mapping = addr; 743 - 744 745 skb_orphan(skb); 745 746 skb_dst_drop(skb); 746 747 747 - rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), 748 - addr, skb->len); 748 + rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), tx_req); 749 749 if (unlikely(rc)) { 750 750 ipoib_warn(priv, "post_send failed, error %d\n", rc); 751 751 ++dev->stats.tx_errors; 752 - ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE); 752 + ipoib_dma_unmap_tx(priv, tx_req); 753 753 dev_kfree_skb_any(skb); 754 754 } else { 755 755 dev->trans_start = jiffies; ··· 771 777 struct ipoib_dev_priv *priv = netdev_priv(dev); 772 778 struct ipoib_cm_tx *tx = wc->qp->qp_context; 773 779 unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM; 774 - 
struct ipoib_cm_tx_buf *tx_req; 780 + struct ipoib_tx_buf *tx_req; 775 781 unsigned long flags; 776 782 777 783 ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n", ··· 785 791 786 792 tx_req = &tx->tx_ring[wr_id]; 787 793 788 - ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE); 794 + ipoib_dma_unmap_tx(priv, tx_req); 789 795 790 796 /* FIXME: is this right? Shouldn't we only increment on success? */ 791 797 ++dev->stats.tx_packets; ··· 1030 1036 1031 1037 struct ib_qp *tx_qp; 1032 1038 1039 + if (dev->features & NETIF_F_SG) 1040 + attr.cap.max_send_sge = MAX_SKB_FRAGS + 1; 1041 + 1033 1042 tx_qp = ib_create_qp(priv->pd, &attr); 1034 1043 if (PTR_ERR(tx_qp) == -EINVAL) { 1035 1044 ipoib_warn(priv, "can't use GFP_NOIO for QPs on device %s, using GFP_KERNEL\n", ··· 1167 1170 static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p) 1168 1171 { 1169 1172 struct ipoib_dev_priv *priv = netdev_priv(p->dev); 1170 - struct ipoib_cm_tx_buf *tx_req; 1173 + struct ipoib_tx_buf *tx_req; 1171 1174 unsigned long begin; 1172 1175 1173 1176 ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n", ··· 1194 1197 1195 1198 while ((int) p->tx_tail - (int) p->tx_head < 0) { 1196 1199 tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)]; 1197 - ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, 1198 - DMA_TO_DEVICE); 1200 + ipoib_dma_unmap_tx(priv, tx_req); 1199 1201 dev_kfree_skb_any(tx_req->skb); 1200 1202 ++p->tx_tail; 1201 1203 netif_tx_lock_bh(p->dev); ··· 1450 1454 &priv->cm.stale_task, IPOIB_CM_RX_DELAY); 1451 1455 spin_unlock_irq(&priv->lock); 1452 1456 } 1453 - 1454 1457 1455 1458 static ssize_t show_mode(struct device *d, struct device_attribute *attr, 1456 1459 char *buf)
+11 -25
drivers/infiniband/ulp/ipoib/ipoib_ib.c
··· 263 263 "for buf %d\n", wr_id); 264 264 } 265 265 266 - static int ipoib_dma_map_tx(struct ib_device *ca, 267 - struct ipoib_tx_buf *tx_req) 266 + int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req) 268 267 { 269 268 struct sk_buff *skb = tx_req->skb; 270 269 u64 *mapping = tx_req->mapping; ··· 304 305 return -EIO; 305 306 } 306 307 307 - static void ipoib_dma_unmap_tx(struct ib_device *ca, 308 - struct ipoib_tx_buf *tx_req) 308 + void ipoib_dma_unmap_tx(struct ipoib_dev_priv *priv, 309 + struct ipoib_tx_buf *tx_req) 309 310 { 310 311 struct sk_buff *skb = tx_req->skb; 311 312 u64 *mapping = tx_req->mapping; ··· 313 314 int off; 314 315 315 316 if (skb_headlen(skb)) { 316 - ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE); 317 + ib_dma_unmap_single(priv->ca, mapping[0], skb_headlen(skb), 318 + DMA_TO_DEVICE); 317 319 off = 1; 318 320 } else 319 321 off = 0; ··· 322 322 for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) { 323 323 const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 324 324 325 - ib_dma_unmap_page(ca, mapping[i + off], skb_frag_size(frag), 326 - DMA_TO_DEVICE); 325 + ib_dma_unmap_page(priv->ca, mapping[i + off], 326 + skb_frag_size(frag), DMA_TO_DEVICE); 327 327 } 328 328 } 329 329 ··· 389 389 390 390 tx_req = &priv->tx_ring[wr_id]; 391 391 392 - ipoib_dma_unmap_tx(priv->ca, tx_req); 392 + ipoib_dma_unmap_tx(priv, tx_req); 393 393 394 394 ++dev->stats.tx_packets; 395 395 dev->stats.tx_bytes += tx_req->skb->len; ··· 514 514 void *head, int hlen) 515 515 { 516 516 struct ib_send_wr *bad_wr; 517 - int i, off; 518 517 struct sk_buff *skb = tx_req->skb; 519 - skb_frag_t *frags = skb_shinfo(skb)->frags; 520 - int nr_frags = skb_shinfo(skb)->nr_frags; 521 - u64 *mapping = tx_req->mapping; 522 518 523 - if (skb_headlen(skb)) { 524 - priv->tx_sge[0].addr = mapping[0]; 525 - priv->tx_sge[0].length = skb_headlen(skb); 526 - off = 1; 527 - } else 528 - off = 0; 519 + ipoib_build_sge(priv, tx_req); 529 520 530 - for (i = 
0; i < nr_frags; ++i) { 531 - priv->tx_sge[i + off].addr = mapping[i + off]; 532 - priv->tx_sge[i + off].length = skb_frag_size(&frags[i]); 533 - } 534 - priv->tx_wr.num_sge = nr_frags + off; 535 521 priv->tx_wr.wr_id = wr_id; 536 522 priv->tx_wr.wr.ud.remote_qpn = qpn; 537 523 priv->tx_wr.wr.ud.ah = address; ··· 603 617 ipoib_warn(priv, "post_send failed, error %d\n", rc); 604 618 ++dev->stats.tx_errors; 605 619 --priv->tx_outstanding; 606 - ipoib_dma_unmap_tx(priv->ca, tx_req); 620 + ipoib_dma_unmap_tx(priv, tx_req); 607 621 dev_kfree_skb_any(skb); 608 622 if (netif_queue_stopped(dev)) 609 623 netif_wake_queue(dev); ··· 854 868 while ((int) priv->tx_tail - (int) priv->tx_head < 0) { 855 869 tx_req = &priv->tx_ring[priv->tx_tail & 856 870 (ipoib_sendq_size - 1)]; 857 - ipoib_dma_unmap_tx(priv->ca, tx_req); 871 + ipoib_dma_unmap_tx(priv, tx_req); 858 872 dev_kfree_skb_any(tx_req->skb); 859 873 ++priv->tx_tail; 860 874 --priv->tx_outstanding;
+1 -1
drivers/infiniband/ulp/ipoib/ipoib_main.c
··· 190 190 struct ipoib_dev_priv *priv = netdev_priv(dev); 191 191 192 192 if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags)) 193 - features &= ~(NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO); 193 + features &= ~(NETIF_F_IP_CSUM | NETIF_F_TSO); 194 194 195 195 return features; 196 196 }