IPoIB: Add send gather support

This patch is a preparation for using checksum offload on IB
devices capable of inserting/verifying checksums in IP packets. The
patch does not actually turn on NETIF_F_SG - we defer that to the
patches adding checksum offload capabilities.

We only add send gather support for datagram mode, since existing
HW does not support checksum offload on connected QPs.
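For reference, a minimal sketch of how a follow-on patch might advertise the feature, gated on the HCA reporting UD checksum offload. The IB_DEVICE_UD_IP_CSUM flag, the helper name, and where it would be called are assumptions here, not something this patch adds:

#include <linux/netdevice.h>
#include <rdma/ib_verbs.h>

/*
 * Sketch only -- not part of this patch.  A later patch would turn on
 * NETIF_F_SG (and checksum offload) for the datagram/UD path once the
 * HCA reports it can insert checksums; the capability flag name is an
 * assumption.
 */
static void ipoib_maybe_enable_sg(struct net_device *dev,
				  struct ib_device_attr *attr)
{
	if (attr->device_cap_flags & IB_DEVICE_UD_IP_CSUM)
		dev->features |= NETIF_F_SG | NETIF_F_IP_CSUM;
}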

Signed-off-by: Michael S. Tsirkin <mst@mellanox.co.il>
Signed-off-by: Eli Cohen <eli@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>

Authored by Eli Cohen and committed by Roland Dreier · 7143740d eb14032f

+83 -30
drivers/infiniband/ulp/ipoib/ipoib.h (+2 -2)

 ···
 
 struct ipoib_tx_buf {
 	struct sk_buff *skb;
-	u64 mapping;
+	u64 mapping[MAX_SKB_FRAGS + 1];
 };
 
 struct ib_cm_id;
 ···
 	struct ipoib_tx_buf *tx_ring;
 	unsigned tx_head;
 	unsigned tx_tail;
-	struct ib_sge tx_sge;
+	struct ib_sge tx_sge[MAX_SKB_FRAGS + 1];
 	struct ib_send_wr tx_wr;
 	unsigned tx_outstanding;
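The "+ 1" in both array sizes is for the skb's linear head; the remaining entries cover up to MAX_SKB_FRAGS page fragments. A small illustrative helper (not in the patch) showing the count a single gather send can consume:

#include <linux/skbuff.h>

/*
 * Illustrative only: the number of scatter/gather entries one send of
 * this skb needs -- one for the linear head plus one per page fragment.
 * MAX_SKB_FRAGS + 1 is the worst case, hence the array sizes above.
 */
static inline unsigned int ipoib_send_sge_count(struct sk_buff *skb)
{
	return skb_shinfo(skb)->nr_frags + 1;
}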
drivers/infiniband/ulp/ipoib/ipoib_cm.c (+5 -5)

 ···
 {
 	struct ib_send_wr *bad_wr;
 
-	priv->tx_sge.addr = addr;
-	priv->tx_sge.length = len;
+	priv->tx_sge[0].addr = addr;
+	priv->tx_sge[0].length = len;
 
 	priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM;
 
 ···
 		return;
 	}
 
-	tx_req->mapping = addr;
+	tx_req->mapping[0] = addr;
 
 	if (unlikely(post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
 			       addr, skb->len))) {
 ···
 
 	tx_req = &tx->tx_ring[wr_id];
 
-	ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE);
+	ib_dma_unmap_single(priv->ca, tx_req->mapping[0], tx_req->skb->len, DMA_TO_DEVICE);
 
 	/* FIXME: is this right? Shouldn't we only increment on success? */
 	++dev->stats.tx_packets;
 ···
 	while ((int) p->tx_tail - (int) p->tx_head < 0) {
 		tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
-		ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
+		ib_dma_unmap_single(priv->ca, tx_req->mapping[0], tx_req->skb->len,
 				    DMA_TO_DEVICE);
 		dev_kfree_skb_any(tx_req->skb);
 		++p->tx_tail;
drivers/infiniband/ulp/ipoib/ipoib_ib.c (+69 -20)

 ···
 			   "for buf %d\n", wr_id);
 }
 
+static int ipoib_dma_map_tx(struct ib_device *ca,
+			    struct ipoib_tx_buf *tx_req)
+{
+	struct sk_buff *skb = tx_req->skb;
+	u64 *mapping = tx_req->mapping;
+	int i;
+
+	mapping[0] = ib_dma_map_single(ca, skb->data, skb_headlen(skb),
+				       DMA_TO_DEVICE);
+	if (unlikely(ib_dma_mapping_error(ca, mapping[0])))
+		return -EIO;
+
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
+		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+		mapping[i + 1] = ib_dma_map_page(ca, frag->page,
+						 frag->page_offset, frag->size,
+						 DMA_TO_DEVICE);
+		if (unlikely(ib_dma_mapping_error(ca, mapping[i + 1])))
+			goto partial_error;
+	}
+	return 0;
+
+partial_error:
+	ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE);
+
+	for (; i > 0; --i) {
+		skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
+		ib_dma_unmap_page(ca, mapping[i], frag->size, DMA_TO_DEVICE);
+	}
+	return -EIO;
+}
+
+static void ipoib_dma_unmap_tx(struct ib_device *ca,
+			       struct ipoib_tx_buf *tx_req)
+{
+	struct sk_buff *skb = tx_req->skb;
+	u64 *mapping = tx_req->mapping;
+	int i;
+
+	ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE);
+
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
+		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+		ib_dma_unmap_page(ca, mapping[i + 1], frag->size,
+				  DMA_TO_DEVICE);
+	}
+}
+
 static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 ···
 
 	tx_req = &priv->tx_ring[wr_id];
 
-	ib_dma_unmap_single(priv->ca, tx_req->mapping,
-			    tx_req->skb->len, DMA_TO_DEVICE);
+	ipoib_dma_unmap_tx(priv->ca, tx_req);
 
 	++dev->stats.tx_packets;
 	dev->stats.tx_bytes += tx_req->skb->len;
 ···
 static inline int post_send(struct ipoib_dev_priv *priv,
 			    unsigned int wr_id,
 			    struct ib_ah *address, u32 qpn,
-			    u64 addr, int len)
+			    u64 *mapping, int headlen,
+			    skb_frag_t *frags,
+			    int nr_frags)
 {
 	struct ib_send_wr *bad_wr;
+	int i;
 
-	priv->tx_sge.addr = addr;
-	priv->tx_sge.length = len;
-
-	priv->tx_wr.wr_id = wr_id;
-	priv->tx_wr.wr.ud.remote_qpn = qpn;
-	priv->tx_wr.wr.ud.ah = address;
+	priv->tx_sge[0].addr = mapping[0];
+	priv->tx_sge[0].length = headlen;
+	for (i = 0; i < nr_frags; ++i) {
+		priv->tx_sge[i + 1].addr = mapping[i + 1];
+		priv->tx_sge[i + 1].length = frags[i].size;
+	}
+	priv->tx_wr.num_sge = nr_frags + 1;
+	priv->tx_wr.wr_id = wr_id;
+	priv->tx_wr.wr.ud.remote_qpn = qpn;
+	priv->tx_wr.wr.ud.ah = address;
 
 	return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr);
 }
 ···
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ipoib_tx_buf *tx_req;
-	u64 addr;
 
 	if (unlikely(skb->len > priv->mcast_mtu + IPOIB_ENCAP_LEN)) {
 		ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
 ···
 	 */
 	tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)];
 	tx_req->skb = skb;
-	addr = ib_dma_map_single(priv->ca, skb->data, skb->len,
-				 DMA_TO_DEVICE);
-	if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
+	if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
 		++dev->stats.tx_errors;
 		dev_kfree_skb_any(skb);
 		return;
 	}
-	tx_req->mapping = addr;
 
 	if (unlikely(post_send(priv, priv->tx_head & (ipoib_sendq_size - 1),
-			       address->ah, qpn, addr, skb->len))) {
+			       address->ah, qpn,
+			       tx_req->mapping, skb_headlen(skb),
+			       skb_shinfo(skb)->frags, skb_shinfo(skb)->nr_frags))) {
 		ipoib_warn(priv, "post_send failed\n");
 		++dev->stats.tx_errors;
-		ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
+		ipoib_dma_unmap_tx(priv->ca, tx_req);
 		dev_kfree_skb_any(skb);
 	} else {
 		dev->trans_start = jiffies;
 ···
 			while ((int) priv->tx_tail - (int) priv->tx_head < 0) {
 				tx_req = &priv->tx_ring[priv->tx_tail &
 							(ipoib_sendq_size - 1)];
-				ib_dma_unmap_single(priv->ca,
-						    tx_req->mapping,
-						    tx_req->skb->len,
-						    DMA_TO_DEVICE);
+				ipoib_dma_unmap_tx(priv->ca, tx_req);
 				dev_kfree_skb_any(tx_req->skb);
 				++priv->tx_tail;
 				--priv->tx_outstanding;
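As a sanity check on the new mapping path: for the skbs this driver sees (linear head plus page fragments, no fragment list), the pieces mapped by ipoib_dma_map_tx() always add up to skb->len, which is why post_send() can describe the whole packet with nr_frags + 1 SGEs. A minimal illustration of that invariant (the helper is not part of the patch):

#include <linux/skbuff.h>

/*
 * Illustrative only: the linear head plus all page fragments cover
 * exactly skb->len for a linear+frags skb (no frag_list), matching the
 * nr_frags + 1 SGEs that post_send() builds.
 */
static int ipoib_gather_covers_skb(struct sk_buff *skb)
{
	unsigned int len = skb_headlen(skb);
	int i;

	for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i)
		len += skb_shinfo(skb)->frags[i].size;

	return len == skb->len;
}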
drivers/infiniband/ulp/ipoib/ipoib_verbs.c (+7 -3)

 ···
 	};
 
 	int ret, size;
+	int i;
 
 	priv->pd = ib_alloc_pd(priv->ca);
 	if (IS_ERR(priv->pd)) {
 ···
 	init_attr.send_cq = priv->cq;
 	init_attr.recv_cq = priv->cq;
 
+	if (dev->features & NETIF_F_SG)
+		init_attr.cap.max_send_sge = MAX_SKB_FRAGS + 1;
+
 	priv->qp = ib_create_qp(priv->pd, &init_attr);
 	if (IS_ERR(priv->qp)) {
 		printk(KERN_WARNING "%s: failed to create QP\n", ca->name);
 ···
 	priv->dev->dev_addr[2] = (priv->qp->qp_num >> 8) & 0xff;
 	priv->dev->dev_addr[3] = (priv->qp->qp_num     ) & 0xff;
 
-	priv->tx_sge.lkey = priv->mr->lkey;
+	for (i = 0; i < MAX_SKB_FRAGS + 1; ++i)
+		priv->tx_sge[i].lkey = priv->mr->lkey;
 
 	priv->tx_wr.opcode = IB_WR_SEND;
-	priv->tx_wr.sg_list = &priv->tx_sge;
-	priv->tx_wr.num_sge = 1;
+	priv->tx_wr.sg_list = priv->tx_sge;
 	priv->tx_wr.send_flags = IB_SEND_SIGNALED;
 
 	return 0;
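One note on the QP attribute change: max_send_sge is bumped to MAX_SKB_FRAGS + 1 whenever NETIF_F_SG is set, with no check against what the HCA actually supports. A hedged sketch of how a caller could verify that before enabling the feature (placement and helper name are assumptions, not part of this patch):

#include <linux/skbuff.h>
#include <rdma/ib_verbs.h>

/*
 * Sketch only -- not part of this patch.  Confirm the HCA can supply
 * enough SGEs for a worst-case skb (linear head + MAX_SKB_FRAGS
 * fragments) before a later patch sets NETIF_F_SG.
 */
static int ipoib_hca_supports_sg(struct ib_device *ca)
{
	struct ib_device_attr attr;

	if (ib_query_device(ca, &attr))
		return 0;

	return attr.max_sge >= MAX_SKB_FRAGS + 1;
}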