Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

RDMA/cxgb3: Support peer-2-peer connection setup

Open MPI, Intel MPI and other applications don't respect the iWARP
requirement that the client (active) side of the connection send the
first RDMA message. This class of application connection setup is
called peer-to-peer. Typically once the connection is setup, _both_
sides want to send data.

This patch enables supporting peer-to-peer over the chelsio RNIC by
enforcing this iWARP requirement in the driver itself as part of RDMA
connection setup.

Connection setup is extended, when the peer2peer module option is 1,
such that the MPA initiator will send a 0B Read (the RTR) just after
connection setup. The MPA responder will suspend SQ processing until
the RTR message is received and replied to.

In the longer term, this will be handled in a standardized way by
enhancing the MPA negotiation so peers can indicate whether they
want/need the RTR and what type of RTR (0B read, 0B write, or 0B send)
should be sent. This will be done by standardizing a few bits of the
private data in order to negotiate all this. However this patch
enables peer-to-peer applications now and allows most of the required
firmware and driver changes to be done and tested now.

Design:

- Add a module option, peer2peer, to enable this mode.

- New firmware support for peer-to-peer mode:

- a new bit in the rdma_init WR to tell it to do peer-2-peer
and what form of RTR message to send or expect.

- process _all_ preposted recvs before moving the connection
into rdma mode.

- passive side: defer completing the rdma_init WR until all
pre-posted recvs are processed. Suspend SQ processing until
the RTR is received.

- active side: expect and process the 0B read WR on offload TX
queue. Defer completing the rdma_init WR until all
pre-posted recvs are processed. Suspend SQ processing until
the 0B read WR is processed from the offload TX queue.

- If peer2peer is set, driver posts 0B read request on offload TX
queue just after posting the rdma_init WR to the offload TX queue.

- Add CQ poll logic to ignore unsolicited read responses.

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>

authored by

Steve Wise and committed by
Roland Dreier
f8b0dfd1 ccaf10d0

+137 -29
+16 -2
drivers/infiniband/hw/cxgb3/cxio_hal.c
··· 456 456 ptr = cq->sw_rptr; 457 457 while (!Q_EMPTY(ptr, cq->sw_wptr)) { 458 458 cqe = cq->sw_queue + (Q_PTR2IDX(ptr, cq->size_log2)); 459 - if ((SQ_TYPE(*cqe) || (CQE_OPCODE(*cqe) == T3_READ_RESP)) && 459 + if ((SQ_TYPE(*cqe) || 460 + ((CQE_OPCODE(*cqe) == T3_READ_RESP) && wq->oldest_read)) && 460 461 (CQE_QPID(*cqe) == wq->qpid)) 461 462 (*count)++; 462 463 ptr++; ··· 830 829 wqe->mpaattrs = attr->mpaattrs; 831 830 wqe->qpcaps = attr->qpcaps; 832 831 wqe->ulpdu_size = cpu_to_be16(attr->tcp_emss); 833 - wqe->flags = cpu_to_be32(attr->flags); 832 + wqe->rqe_count = cpu_to_be16(attr->rqe_count); 833 + wqe->flags_rtr_type = cpu_to_be16(attr->flags|V_RTR_TYPE(attr->rtr_type)); 834 834 wqe->ord = cpu_to_be32(attr->ord); 835 835 wqe->ird = cpu_to_be32(attr->ird); 836 836 wqe->qp_dma_addr = cpu_to_be64(attr->qp_dma_addr); ··· 1135 1133 * 4) cq_type is RQ_TYPE not SQ_TYPE. 1136 1134 */ 1137 1135 if (RQ_TYPE(*hw_cqe) && (CQE_OPCODE(*hw_cqe) == T3_READ_RESP)) { 1136 + 1137 + /* 1138 + * If this is an unsolicited read response, then the read 1139 + * was generated by the kernel driver as part of peer-2-peer 1140 + * connection setup. So ignore the completion. 1141 + */ 1142 + if (!wq->oldest_read) { 1143 + if (CQE_STATUS(*hw_cqe)) 1144 + wq->error = 1; 1145 + ret = -1; 1146 + goto skip_cqe; 1147 + } 1138 1148 1139 1149 /* 1140 1150 * Don't write to the HWCQ, so create a new read req CQE
+17 -4
drivers/infiniband/hw/cxgb3/cxio_wr.h
··· 278 278 uP_RI_QP_STAG0_ENABLE = 0x10 279 279 } __attribute__ ((packed)); 280 280 281 + enum rdma_init_rtr_types { 282 + RTR_READ = 1, 283 + RTR_WRITE = 2, 284 + RTR_SEND = 3, 285 + }; 286 + 287 + #define S_RTR_TYPE 2 288 + #define M_RTR_TYPE 0x3 289 + #define V_RTR_TYPE(x) ((x) << S_RTR_TYPE) 290 + #define G_RTR_TYPE(x) ((((x) >> S_RTR_TYPE)) & M_RTR_TYPE) 291 + 281 292 struct t3_rdma_init_attr { 282 293 u32 tid; 283 294 u32 qpid; ··· 304 293 u32 ird; 305 294 u64 qp_dma_addr; 306 295 u32 qp_dma_size; 307 - u32 flags; 296 + enum rdma_init_rtr_types rtr_type; 297 + u16 flags; 298 + u16 rqe_count; 308 299 u32 irs; 309 300 }; 310 301 ··· 322 309 u8 mpaattrs; /* 5 */ 323 310 u8 qpcaps; 324 311 __be16 ulpdu_size; 325 - __be32 flags; /* bits 31-1 - reservered */ 326 - /* bit 0 - set if RECV posted */ 312 + __be16 flags_rtr_type; 313 + __be16 rqe_count; 327 314 __be32 ord; /* 6 */ 328 315 __be32 ird; 329 316 __be64 qp_dma_addr; /* 7 */ ··· 337 324 }; 338 325 339 326 enum rdma_init_wr_flags { 340 - RECVS_POSTED = (1<<0), 327 + MPA_INITIATOR = (1<<0), 341 328 PRIV_QP = (1<<1), 342 329 }; 343 330
+49 -18
drivers/infiniband/hw/cxgb3/iwch_cm.c
··· 63 63 NULL, 64 64 }; 65 65 66 + int peer2peer = 0; 67 + module_param(peer2peer, int, 0644); 68 + MODULE_PARM_DESC(peer2peer, "Support peer2peer ULPs (default=0)"); 69 + 66 70 static int ep_timeout_secs = 10; 67 71 module_param(ep_timeout_secs, int, 0644); 68 72 MODULE_PARM_DESC(ep_timeout_secs, "CM Endpoint operation timeout " ··· 518 514 skb_reset_transport_header(skb); 519 515 len = skb->len; 520 516 req = (struct tx_data_wr *) skb_push(skb, sizeof(*req)); 521 - req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)); 517 + req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)|F_WR_COMPL); 522 518 req->wr_lo = htonl(V_WR_TID(ep->hwtid)); 523 519 req->len = htonl(len); 524 520 req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) | ··· 569 565 set_arp_failure_handler(skb, arp_failure_discard); 570 566 skb_reset_transport_header(skb); 571 567 req = (struct tx_data_wr *) skb_push(skb, sizeof(*req)); 572 - req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)); 568 + req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)|F_WR_COMPL); 573 569 req->wr_lo = htonl(V_WR_TID(ep->hwtid)); 574 570 req->len = htonl(mpalen); 575 571 req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) | ··· 621 617 skb_reset_transport_header(skb); 622 618 len = skb->len; 623 619 req = (struct tx_data_wr *) skb_push(skb, sizeof(*req)); 624 - req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)); 620 + req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)|F_WR_COMPL); 625 621 req->wr_lo = htonl(V_WR_TID(ep->hwtid)); 626 622 req->len = htonl(len); 627 623 req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) | ··· 889 885 * the MPA header is valid. 890 886 */ 891 887 state_set(&ep->com, FPDU_MODE); 888 + ep->mpa_attr.initiator = 1; 892 889 ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; 893 890 ep->mpa_attr.recv_marker_enabled = markers_enabled; 894 891 ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 
1 : 0; ··· 912 907 /* bind QP and TID with INIT_WR */ 913 908 err = iwch_modify_qp(ep->com.qp->rhp, 914 909 ep->com.qp, mask, &attrs, 1); 915 - if (!err) 916 - goto out; 910 + if (err) 911 + goto err; 912 + 913 + if (peer2peer && iwch_rqes_posted(ep->com.qp) == 0) { 914 + iwch_post_zb_read(ep->com.qp); 915 + } 916 + 917 + goto out; 917 918 err: 918 919 abort_connection(ep, skb, GFP_KERNEL); 919 920 out: ··· 1012 1001 * If we get here we have accumulated the entire mpa 1013 1002 * start reply message including private data. 1014 1003 */ 1004 + ep->mpa_attr.initiator = 0; 1015 1005 ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; 1016 1006 ep->mpa_attr.recv_marker_enabled = markers_enabled; 1017 1007 ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; ··· 1083 1071 1084 1072 PDBG("%s ep %p credits %u\n", __func__, ep, credits); 1085 1073 1086 - if (credits == 0) 1074 + if (credits == 0) { 1075 + PDBG(KERN_ERR "%s 0 credit ack ep %p state %u\n", 1076 + __func__, ep, state_read(&ep->com)); 1087 1077 return CPL_RET_BUF_DONE; 1078 + } 1079 + 1088 1080 BUG_ON(credits != 1); 1089 - BUG_ON(ep->mpa_skb == NULL); 1090 - kfree_skb(ep->mpa_skb); 1091 - ep->mpa_skb = NULL; 1092 1081 dst_confirm(ep->dst); 1093 - if (state_read(&ep->com) == MPA_REP_SENT) { 1094 - ep->com.rpl_done = 1; 1095 - PDBG("waking up ep %p\n", ep); 1096 - wake_up(&ep->com.waitq); 1082 + if (!ep->mpa_skb) { 1083 + PDBG("%s rdma_init wr_ack ep %p state %u\n", 1084 + __func__, ep, state_read(&ep->com)); 1085 + if (ep->mpa_attr.initiator) { 1086 + PDBG("%s initiator ep %p state %u\n", 1087 + __func__, ep, state_read(&ep->com)); 1088 + if (peer2peer) 1089 + iwch_post_zb_read(ep->com.qp); 1090 + } else { 1091 + PDBG("%s responder ep %p state %u\n", 1092 + __func__, ep, state_read(&ep->com)); 1093 + ep->com.rpl_done = 1; 1094 + wake_up(&ep->com.waitq); 1095 + } 1096 + } else { 1097 + PDBG("%s lsm ack ep %p state %u freeing skb\n", 1098 + __func__, ep, state_read(&ep->com)); 
1099 + kfree_skb(ep->mpa_skb); 1100 + ep->mpa_skb = NULL; 1097 1101 } 1098 1102 return CPL_RET_BUF_DONE; 1099 1103 } ··· 1823 1795 if (err) 1824 1796 goto err; 1825 1797 1798 + /* if needed, wait for wr_ack */ 1799 + if (iwch_rqes_posted(qp)) { 1800 + wait_event(ep->com.waitq, ep->com.rpl_done); 1801 + err = ep->com.rpl_err; 1802 + if (err) 1803 + goto err; 1804 + } 1805 + 1826 1806 err = send_mpa_reply(ep, conn_param->private_data, 1827 1807 conn_param->private_data_len); 1828 1808 if (err) 1829 1809 goto err; 1830 1810 1831 - /* wait for wr_ack */ 1832 - wait_event(ep->com.waitq, ep->com.rpl_done); 1833 - err = ep->com.rpl_err; 1834 - if (err) 1835 - goto err; 1836 1811 1837 1812 state_set(&ep->com, FPDU_MODE); 1838 1813 established_upcall(ep);
+1
drivers/infiniband/hw/cxgb3/iwch_cm.h
··· 226 226 227 227 int __init iwch_cm_init(void); 228 228 void __exit iwch_cm_term(void); 229 + extern int peer2peer; 229 230 230 231 #endif /* _IWCH_CM_H_ */
+3
drivers/infiniband/hw/cxgb3/iwch_provider.h
··· 118 118 }; 119 119 120 120 struct iwch_mpa_attributes { 121 + u8 initiator; 121 122 u8 recv_marker_enabled; 122 123 u8 xmit_marker_enabled; /* iWARP: enable inbound Read Resp. */ 123 124 u8 crc_enabled; ··· 323 322 IWCH_QP_QUERY_TEST_USERWRITE = 0x32 /* Test special */ 324 323 }; 325 324 325 + u16 iwch_rqes_posted(struct iwch_qp *qhp); 326 326 int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, 327 327 struct ib_send_wr **bad_wr); 328 328 int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, ··· 333 331 struct ib_mw_bind *mw_bind); 334 332 int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); 335 333 int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg); 334 + int iwch_post_zb_read(struct iwch_qp *qhp); 336 335 int iwch_register_device(struct iwch_dev *dev); 337 336 void iwch_unregister_device(struct iwch_dev *dev); 338 337 int iwch_quiesce_qps(struct iwch_cq *chp);
+50 -4
drivers/infiniband/hw/cxgb3/iwch_qp.c
··· 586 586 } 587 587 } 588 588 589 + int iwch_post_zb_read(struct iwch_qp *qhp) 590 + { 591 + union t3_wr *wqe; 592 + struct sk_buff *skb; 593 + u8 flit_cnt = sizeof(struct t3_rdma_read_wr) >> 3; 594 + 595 + PDBG("%s enter\n", __func__); 596 + skb = alloc_skb(40, GFP_KERNEL); 597 + if (!skb) { 598 + printk(KERN_ERR "%s cannot send zb_read!!\n", __func__); 599 + return -ENOMEM; 600 + } 601 + wqe = (union t3_wr *)skb_put(skb, sizeof(struct t3_rdma_read_wr)); 602 + memset(wqe, 0, sizeof(struct t3_rdma_read_wr)); 603 + wqe->read.rdmaop = T3_READ_REQ; 604 + wqe->read.reserved[0] = 0; 605 + wqe->read.reserved[1] = 0; 606 + wqe->read.reserved[2] = 0; 607 + wqe->read.rem_stag = cpu_to_be32(1); 608 + wqe->read.rem_to = cpu_to_be64(1); 609 + wqe->read.local_stag = cpu_to_be32(1); 610 + wqe->read.local_len = cpu_to_be32(0); 611 + wqe->read.local_to = cpu_to_be64(1); 612 + wqe->send.wrh.op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(T3_WR_READ)); 613 + wqe->send.wrh.gen_tid_len = cpu_to_be32(V_FW_RIWR_TID(qhp->ep->hwtid)| 614 + V_FW_RIWR_LEN(flit_cnt)); 615 + skb->priority = CPL_PRIORITY_DATA; 616 + return cxgb3_ofld_send(qhp->rhp->rdev.t3cdev_p, skb); 617 + } 618 + 589 619 /* 590 620 * This posts a TERMINATE with layer=RDMA, type=catastrophic. 591 621 */ ··· 701 671 702 672 703 673 /* 704 - * Return non zero if at least one RECV was pre-posted. 
674 + * Return count of RECV WRs posted 705 675 */ 706 - static int rqes_posted(struct iwch_qp *qhp) 676 + u16 iwch_rqes_posted(struct iwch_qp *qhp) 707 677 { 708 - return fw_riwrh_opcode((struct fw_riwrh *)qhp->wq.queue) == T3_WR_RCV; 678 + union t3_wr *wqe = qhp->wq.queue; 679 + u16 count = 0; 680 + while ((count+1) != 0 && fw_riwrh_opcode((struct fw_riwrh *)wqe) == T3_WR_RCV) { 681 + count++; 682 + wqe++; 683 + } 684 + PDBG("%s qhp %p count %u\n", __func__, qhp, count); 685 + return count; 709 686 } 710 687 711 688 static int rdma_init(struct iwch_dev *rhp, struct iwch_qp *qhp, ··· 753 716 init_attr.ird = qhp->attr.max_ird; 754 717 init_attr.qp_dma_addr = qhp->wq.dma_addr; 755 718 init_attr.qp_dma_size = (1UL << qhp->wq.size_log2); 756 - init_attr.flags = rqes_posted(qhp) ? RECVS_POSTED : 0; 719 + init_attr.rqe_count = iwch_rqes_posted(qhp); 720 + init_attr.flags = qhp->attr.mpa_attr.initiator ? MPA_INITIATOR : 0; 757 721 init_attr.flags |= capable(CAP_NET_BIND_SERVICE) ? PRIV_QP : 0; 722 + if (peer2peer) { 723 + init_attr.rtr_type = RTR_READ; 724 + if (init_attr.ord == 0 && qhp->attr.mpa_attr.initiator) 725 + init_attr.ord = 1; 726 + if (init_attr.ird == 0 && !qhp->attr.mpa_attr.initiator) 727 + init_attr.ird = 1; 728 + } else 729 + init_attr.rtr_type = 0; 758 730 init_attr.irs = qhp->ep->rcv_seq; 759 731 PDBG("%s init_attr.rq_addr 0x%x init_attr.rq_size = %d " 760 732 "flags 0x%x qpcaps 0x%x\n", __func__,
+1 -1
drivers/net/cxgb3/version.h
··· 38 38 #define DRV_VERSION "1.0-ko" 39 39 40 40 /* Firmware version */ 41 - #define FW_VERSION_MAJOR 5 41 + #define FW_VERSION_MAJOR 6 42 42 #define FW_VERSION_MINOR 0 43 43 #define FW_VERSION_MICRO 0 44 44 #endif /* __CHELSIO_VERSION_H */