Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband:
IB/core: Remove unused struct ib_device.flags member
IB/core: Add IP checksum offload support
IPoIB: Add send gather support
IPoIB: Add high DMA feature flag
IB/mlx4: Use multiple WQ blocks to post smaller send WQEs
mlx4_core: Clean up struct mlx4_buf
mlx4_core: For 64-bit systems, vmap() kernel queue buffers
IB/mlx4: Consolidate code to get an entry from a struct mlx4_buf

14 files changed, 342 insertions(+), 112 deletions(-)

drivers/infiniband/hw/mlx4/cq.c | +11 -9

···
 
 static void *get_cqe_from_buf(struct mlx4_ib_cq_buf *buf, int n)
 {
-        int offset = n * sizeof (struct mlx4_cqe);
-
-        if (buf->buf.nbufs == 1)
-                return buf->buf.u.direct.buf + offset;
-        else
-                return buf->buf.u.page_list[offset >> PAGE_SHIFT].buf +
-                        (offset & (PAGE_SIZE - 1));
+        return mlx4_buf_offset(&buf->buf, n * sizeof (struct mlx4_cqe));
 }
 
 static void *get_cqe(struct mlx4_ib_cq *cq, int n)
···
         is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
                 MLX4_CQE_OPCODE_ERROR;
 
+        if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_OPCODE_NOP &&
+                     is_send)) {
+                printk(KERN_WARNING "Completion for NOP opcode detected!\n");
+                return -EINVAL;
+        }
+
         if (!*cur_qp ||
             (be32_to_cpu(cqe->my_qpn) & 0xffffff) != (*cur_qp)->mqp.qpn) {
                 /*
···
 
         if (is_send) {
                 wq = &(*cur_qp)->sq;
-                wqe_ctr = be16_to_cpu(cqe->wqe_index);
-                wq->tail += (u16) (wqe_ctr - (u16) wq->tail);
+                if (!(*cur_qp)->sq_signal_bits) {
+                        wqe_ctr = be16_to_cpu(cqe->wqe_index);
+                        wq->tail += (u16) (wqe_ctr - (u16) wq->tail);
+                }
                 wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
                 ++wq->tail;
         } else if ((*cur_qp)->ibqp.srq) {

drivers/infiniband/hw/mlx4/mlx4_ib.h | +2 -0

···
 
         u32                     doorbell_qpn;
         __be32                  sq_signal_bits;
+        unsigned                sq_next_wqe;
+        int                     sq_max_wqes_per_wr;
         int                     sq_spare_wqes;
         struct mlx4_ib_wq       sq;
 

drivers/infiniband/hw/mlx4/qp.c | +177 -39

···
  * SOFTWARE.
  */
 
+#include <linux/log2.h>
+
 #include <rdma/ib_cache.h>
 #include <rdma/ib_pack.h>
 
···
 
 static void *get_wqe(struct mlx4_ib_qp *qp, int offset)
 {
-        if (qp->buf.nbufs == 1)
-                return qp->buf.u.direct.buf + offset;
-        else
-                return qp->buf.u.page_list[offset >> PAGE_SHIFT].buf +
-                        (offset & (PAGE_SIZE - 1));
+        return mlx4_buf_offset(&qp->buf, offset);
 }
 
 static void *get_recv_wqe(struct mlx4_ib_qp *qp, int n)
···
 
 /*
  * Stamp a SQ WQE so that it is invalid if prefetched by marking the
- * first four bytes of every 64 byte chunk with 0xffffffff, except for
- * the very first chunk of the WQE.
+ * first four bytes of every 64 byte chunk with
+ * 0x7FFFFFF | (invalid_ownership_value << 31).
+ *
+ * When the max work request size is less than or equal to the WQE
+ * basic block size, as an optimization, we can stamp all WQEs with
+ * 0xffffffff, and skip the very first chunk of each WQE.
  */
-static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n)
+static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size)
 {
-        u32 *wqe = get_send_wqe(qp, n);
+        u32 *wqe;
         int i;
+        int s;
+        int ind;
+        void *buf;
+        __be32 stamp;
 
-        for (i = 16; i < 1 << (qp->sq.wqe_shift - 2); i += 16)
-                wqe[i] = 0xffffffff;
+        s = roundup(size, 1U << qp->sq.wqe_shift);
+        if (qp->sq_max_wqes_per_wr > 1) {
+                for (i = 0; i < s; i += 64) {
+                        ind = (i >> qp->sq.wqe_shift) + n;
+                        stamp = ind & qp->sq.wqe_cnt ? cpu_to_be32(0x7fffffff) :
+                                                       cpu_to_be32(0xffffffff);
+                        buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
+                        wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1));
+                        *wqe = stamp;
+                }
+        } else {
+                buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+                for (i = 64; i < s; i += 64) {
+                        wqe = buf + i;
+                        *wqe = 0xffffffff;
+                }
+        }
+}
+
+static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size)
+{
+        struct mlx4_wqe_ctrl_seg *ctrl;
+        struct mlx4_wqe_inline_seg *inl;
+        void *wqe;
+        int s;
+
+        ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+        s = sizeof(struct mlx4_wqe_ctrl_seg);
+
+        if (qp->ibqp.qp_type == IB_QPT_UD) {
+                struct mlx4_wqe_datagram_seg *dgram = wqe + sizeof *ctrl;
+                struct mlx4_av *av = (struct mlx4_av *)dgram->av;
+                memset(dgram, 0, sizeof *dgram);
+                av->port_pd = cpu_to_be32((qp->port << 24) | to_mpd(qp->ibqp.pd)->pdn);
+                s += sizeof(struct mlx4_wqe_datagram_seg);
+        }
+
+        /* Pad the remainder of the WQE with an inline data segment. */
+        if (size > s) {
+                inl = wqe + s;
+                inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl));
+        }
+        ctrl->srcrb_flags = 0;
+        ctrl->fence_size = size / 16;
+        /*
+         * Make sure descriptor is fully written before setting ownership bit
+         * (because HW can start executing as soon as we do).
+         */
+        wmb();
+
+        ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) |
+                (n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
+
+        stamp_send_wqe(qp, n + qp->sq_spare_wqes, size);
+}
+
+/* Post NOP WQE to prevent wrap-around in the middle of WR */
+static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind)
+{
+        unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1));
+        if (unlikely(s < qp->sq_max_wqes_per_wr)) {
+                post_nop_wqe(qp, ind, s << qp->sq.wqe_shift);
+                ind += s;
+        }
+        return ind;
 }
 
 static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
···
 static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
                               enum ib_qp_type type, struct mlx4_ib_qp *qp)
 {
+        int s;
+
         /* Sanity check SQ size before proceeding */
         if (cap->max_send_wr  > dev->dev->caps.max_wqes  ||
             cap->max_send_sge > dev->dev->caps.max_sq_sg ||
···
             cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg)
                 return -EINVAL;
 
-        qp->sq.wqe_shift = ilog2(roundup_pow_of_two(max(cap->max_send_sge *
-                                                        sizeof (struct mlx4_wqe_data_seg),
-                                                        cap->max_inline_data +
-                                                        sizeof (struct mlx4_wqe_inline_seg)) +
-                                                    send_wqe_overhead(type)));
-        qp->sq.max_gs = ((1 << qp->sq.wqe_shift) - send_wqe_overhead(type)) /
-                sizeof (struct mlx4_wqe_data_seg);
+        s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg),
+                cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) +
+                send_wqe_overhead(type);
 
         /*
-         * We need to leave 2 KB + 1 WQE of headroom in the SQ to
-         * allow HW to prefetch.
+         * Hermon supports shrinking WQEs, such that a single work
+         * request can include multiple units of 1 << wqe_shift.  This
+         * way, work requests can differ in size, and do not have to
+         * be a power of 2 in size, saving memory and speeding up send
+         * WR posting.  Unfortunately, if we do this then the
+         * wqe_index field in CQEs can't be used to look up the WR ID
+         * anymore, so we do this only if selective signaling is off.
+         *
+         * Further, on 32-bit platforms, we can't use vmap() to make
+         * the QP buffer virtually contiguous.  Thus we have to use
+         * constant-sized WRs to make sure a WR is always fully within
+         * a single page-sized chunk.
+         *
+         * Finally, we use NOP work requests to pad the end of the
+         * work queue, to avoid wrap-around in the middle of WR.  We
+         * set NEC bit to avoid getting completions with error for
+         * these NOP WRs, but since NEC is only supported starting
+         * with firmware 2.2.232, we use constant-sized WRs for older
+         * firmware.
+         *
+         * And, since MLX QPs only support SEND, we use constant-sized
+         * WRs in this case.
+         *
+         * We look for the smallest value of wqe_shift such that the
+         * resulting number of wqes does not exceed device
+         * capabilities.
+         *
+         * We set WQE size to at least 64 bytes, this way stamping
+         * invalidates each WQE.
          */
-        qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1;
-        qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr + qp->sq_spare_wqes);
+        if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC &&
+            qp->sq_signal_bits && BITS_PER_LONG == 64 &&
+            type != IB_QPT_SMI && type != IB_QPT_GSI)
+                qp->sq.wqe_shift = ilog2(64);
+        else
+                qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s));
+
+        for (;;) {
+                if (1 << qp->sq.wqe_shift > dev->dev->caps.max_sq_desc_sz)
+                        return -EINVAL;
+
+                qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1U << qp->sq.wqe_shift);
+
+                /*
+                 * We need to leave 2 KB + 1 WR of headroom in the SQ to
+                 * allow HW to prefetch.
+                 */
+                qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr;
+                qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr *
+                                                    qp->sq_max_wqes_per_wr +
+                                                    qp->sq_spare_wqes);
+
+                if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes)
+                        break;
+
+                if (qp->sq_max_wqes_per_wr <= 1)
+                        return -EINVAL;
+
+                ++qp->sq.wqe_shift;
+        }
+
+        qp->sq.max_gs = ((qp->sq_max_wqes_per_wr << qp->sq.wqe_shift) -
+                         send_wqe_overhead(type)) / sizeof (struct mlx4_wqe_data_seg);
 
         qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
                 (qp->sq.wqe_cnt << qp->sq.wqe_shift);
···
                 qp->sq.offset = 0;
         }
 
-        cap->max_send_wr  = qp->sq.max_post = qp->sq.wqe_cnt - qp->sq_spare_wqes;
+        cap->max_send_wr  = qp->sq.max_post =
+                (qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr;
         cap->max_send_sge = qp->sq.max_gs;
         /* We don't support inline sends for kernel QPs (yet) */
         cap->max_inline_data = 0;
···
         qp->rq.tail = 0;
         qp->sq.head = 0;
         qp->sq.tail = 0;
+        qp->sq_next_wqe = 0;
+
+        if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
+                qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+        else
+                qp->sq_signal_bits = 0;
 
         err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, !!init_attr->srq, qp);
         if (err)
···
          * a little bit when posting sends.
          */
         qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
-
-        if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
-                qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
-        else
-                qp->sq_signal_bits = 0;
 
         qp->mqp.event = mlx4_ib_qp_event;
 
···
                         ctrl = get_send_wqe(qp, i);
                         ctrl->owner_opcode = cpu_to_be32(1 << 31);
 
-                        stamp_send_wqe(qp, i);
+                        stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift);
                 }
         }
···
                 qp->rq.tail = 0;
                 qp->sq.head = 0;
                 qp->sq.tail = 0;
+                qp->sq_next_wqe = 0;
                 if (!ibqp->srq)
                         *qp->db.db = 0;
         }
···
         unsigned long flags;
         int nreq;
         int err = 0;
-        int ind;
-        int size;
+        unsigned ind;
+        int uninitialized_var(stamp);
+        int uninitialized_var(size);
         int i;
 
         spin_lock_irqsave(&qp->sq.lock, flags);
 
-        ind = qp->sq.head;
+        ind = qp->sq_next_wqe;
 
         for (nreq = 0; wr; ++nreq, wr = wr->next) {
                 if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
···
                 }
 
                 ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
-                qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
+                qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
 
                 ctrl->srcrb_flags =
                         (wr->send_flags & IB_SEND_SIGNALED ?
···
                 ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] |
                         (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
 
+                stamp = ind + qp->sq_spare_wqes;
+                ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift);
+
                 /*
                  * We can improve latency by not stamping the last
                  * send queue WQE until after ringing the doorbell, so
                  * only stamp here if there are still more WQEs to post.
+                 *
+                 * Same optimization applies to padding with NOP wqe
+                 * in case of WQE shrinking (used to prevent wrap-around
+                 * in the middle of WR).
                  */
-                if (wr->next)
-                        stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
-                                       (qp->sq.wqe_cnt - 1));
+                if (wr->next) {
+                        stamp_send_wqe(qp, stamp, size * 16);
+                        ind = pad_wraparound(qp, ind);
+                }
 
-                ++ind;
         }
···
                  */
                 mmiowb();
 
-                stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
-                               (qp->sq.wqe_cnt - 1));
+                stamp_send_wqe(qp, stamp, size * 16);
+
+                ind = pad_wraparound(qp, ind);
+                qp->sq_next_wqe = ind;
         }
 
         spin_unlock_irqrestore(&qp->sq.lock, flags);

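To make the new SQ sizing concrete, here is an illustrative calculation (not taken from the patch; it assumes the usual mlx4 segment sizes of a 16-byte control segment, 16-byte data segments, a 4-byte inline header and a 48-byte UD datagram segment, i.e. send_wqe_overhead() = 64 for a UD QP). For a kernel UD QP with max_send_sge = 4 and no inline data, s = max(4 * 16, 0 + 4) + 64 = 128 bytes. If the firmware supports NEC, the kernel is 64-bit and the QP signals every send (sq_signal_bits != 0), wqe_shift starts at ilog2(64) = 6, so sq_max_wqes_per_wr = DIV_ROUND_UP(128, 64) = 2 and sq_spare_wqes = (2048 >> 6) + 2 = 34. Requesting max_send_wr = 100 then gives sq.wqe_cnt = roundup_pow_of_two(100 * 2 + 34) = 256, sq.max_gs = ((2 << 6) - 64) / 16 = 4 and max_post = (256 - 34) / 2 = 111. If any of those conditions does not hold, the code falls back to one power-of-two WQE per WR (here wqe_shift = ilog2(roundup_pow_of_two(128)) = 7), which matches the pre-patch behaviour.
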
drivers/infiniband/hw/mlx4/srq.c | +1 -7

···
 
 static void *get_wqe(struct mlx4_ib_srq *srq, int n)
 {
-        int offset = n << srq->msrq.wqe_shift;
-
-        if (srq->buf.nbufs == 1)
-                return srq->buf.u.direct.buf + offset;
-        else
-                return srq->buf.u.page_list[offset >> PAGE_SHIFT].buf +
-                        (offset & (PAGE_SIZE - 1));
+        return mlx4_buf_offset(&srq->buf, n << srq->msrq.wqe_shift);
 }
 
 static void mlx4_ib_srq_event(struct mlx4_srq *srq, enum mlx4_event type)

drivers/infiniband/ulp/ipoib/ipoib.h | +2 -2

···
 
 struct ipoib_tx_buf {
         struct sk_buff *skb;
-        u64             mapping;
+        u64             mapping[MAX_SKB_FRAGS + 1];
 };
 
 struct ib_cm_id;
···
         struct ipoib_tx_buf *tx_ring;
         unsigned             tx_head;
         unsigned             tx_tail;
-        struct ib_sge        tx_sge;
+        struct ib_sge        tx_sge[MAX_SKB_FRAGS + 1];
         struct ib_send_wr    tx_wr;
         unsigned             tx_outstanding;
 

drivers/infiniband/ulp/ipoib/ipoib_cm.c | +5 -5

···
 {
         struct ib_send_wr *bad_wr;
 
-        priv->tx_sge.addr             = addr;
-        priv->tx_sge.length           = len;
+        priv->tx_sge[0].addr          = addr;
+        priv->tx_sge[0].length        = len;
 
         priv->tx_wr.wr_id             = wr_id | IPOIB_OP_CM;
 
···
                 return;
         }
 
-        tx_req->mapping = addr;
+        tx_req->mapping[0] = addr;
 
         if (unlikely(post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
                                addr, skb->len))) {
···
 
         tx_req = &tx->tx_ring[wr_id];
 
-        ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE);
+        ib_dma_unmap_single(priv->ca, tx_req->mapping[0], tx_req->skb->len, DMA_TO_DEVICE);
 
         /* FIXME: is this right? Shouldn't we only increment on success? */
         ++dev->stats.tx_packets;
···
 
         while ((int) p->tx_tail - (int) p->tx_head < 0) {
                 tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
-                ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
+                ib_dma_unmap_single(priv->ca, tx_req->mapping[0], tx_req->skb->len,
                                     DMA_TO_DEVICE);
                 dev_kfree_skb_any(tx_req->skb);
                 ++p->tx_tail;

drivers/infiniband/ulp/ipoib/ipoib_ib.c | +69 -20

···
                            "for buf %d\n", wr_id);
 }
 
+static int ipoib_dma_map_tx(struct ib_device *ca,
+                            struct ipoib_tx_buf *tx_req)
+{
+        struct sk_buff *skb = tx_req->skb;
+        u64 *mapping = tx_req->mapping;
+        int i;
+
+        mapping[0] = ib_dma_map_single(ca, skb->data, skb_headlen(skb),
+                                       DMA_TO_DEVICE);
+        if (unlikely(ib_dma_mapping_error(ca, mapping[0])))
+                return -EIO;
+
+        for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
+                skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+                mapping[i + 1] = ib_dma_map_page(ca, frag->page,
+                                                 frag->page_offset, frag->size,
+                                                 DMA_TO_DEVICE);
+                if (unlikely(ib_dma_mapping_error(ca, mapping[i + 1])))
+                        goto partial_error;
+        }
+        return 0;
+
+partial_error:
+        ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE);
+
+        for (; i > 0; --i) {
+                skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
+                ib_dma_unmap_page(ca, mapping[i], frag->size, DMA_TO_DEVICE);
+        }
+        return -EIO;
+}
+
+static void ipoib_dma_unmap_tx(struct ib_device *ca,
+                               struct ipoib_tx_buf *tx_req)
+{
+        struct sk_buff *skb = tx_req->skb;
+        u64 *mapping = tx_req->mapping;
+        int i;
+
+        ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE);
+
+        for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
+                skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+                ib_dma_unmap_page(ca, mapping[i + 1], frag->size,
+                                  DMA_TO_DEVICE);
+        }
+}
+
 static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 {
         struct ipoib_dev_priv *priv = netdev_priv(dev);
···
 
         tx_req = &priv->tx_ring[wr_id];
 
-        ib_dma_unmap_single(priv->ca, tx_req->mapping,
-                            tx_req->skb->len, DMA_TO_DEVICE);
+        ipoib_dma_unmap_tx(priv->ca, tx_req);
 
         ++dev->stats.tx_packets;
         dev->stats.tx_bytes += tx_req->skb->len;
···
 static inline int post_send(struct ipoib_dev_priv *priv,
                             unsigned int wr_id,
                             struct ib_ah *address, u32 qpn,
-                            u64 addr, int len)
+                            u64 *mapping, int headlen,
+                            skb_frag_t *frags,
+                            int nr_frags)
 {
         struct ib_send_wr *bad_wr;
+        int i;
 
-        priv->tx_sge.addr             = addr;
-        priv->tx_sge.length           = len;
-
-        priv->tx_wr.wr_id             = wr_id;
-        priv->tx_wr.wr.ud.remote_qpn  = qpn;
-        priv->tx_wr.wr.ud.ah          = address;
+        priv->tx_sge[0].addr          = mapping[0];
+        priv->tx_sge[0].length        = headlen;
+        for (i = 0; i < nr_frags; ++i) {
+                priv->tx_sge[i + 1].addr = mapping[i + 1];
+                priv->tx_sge[i + 1].length = frags[i].size;
+        }
+        priv->tx_wr.num_sge           = nr_frags + 1;
+        priv->tx_wr.wr_id             = wr_id;
+        priv->tx_wr.wr.ud.remote_qpn  = qpn;
+        priv->tx_wr.wr.ud.ah          = address;
 
         return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr);
 }
···
 {
         struct ipoib_dev_priv *priv = netdev_priv(dev);
         struct ipoib_tx_buf *tx_req;
-        u64 addr;
 
         if (unlikely(skb->len > priv->mcast_mtu + IPOIB_ENCAP_LEN)) {
                 ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
···
          */
         tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)];
         tx_req->skb = skb;
-        addr = ib_dma_map_single(priv->ca, skb->data, skb->len,
-                                 DMA_TO_DEVICE);
-        if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
+        if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
                 ++dev->stats.tx_errors;
                 dev_kfree_skb_any(skb);
                 return;
         }
-        tx_req->mapping = addr;
 
         if (unlikely(post_send(priv, priv->tx_head & (ipoib_sendq_size - 1),
-                               address->ah, qpn, addr, skb->len))) {
+                               address->ah, qpn,
+                               tx_req->mapping, skb_headlen(skb),
+                               skb_shinfo(skb)->frags, skb_shinfo(skb)->nr_frags))) {
                 ipoib_warn(priv, "post_send failed\n");
                 ++dev->stats.tx_errors;
-                ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
+                ipoib_dma_unmap_tx(priv->ca, tx_req);
                 dev_kfree_skb_any(skb);
         } else {
                 dev->trans_start = jiffies;
···
                 while ((int) priv->tx_tail - (int) priv->tx_head < 0) {
                         tx_req = &priv->tx_ring[priv->tx_tail &
                                                 (ipoib_sendq_size - 1)];
-                        ib_dma_unmap_single(priv->ca,
-                                            tx_req->mapping,
-                                            tx_req->skb->len,
-                                            DMA_TO_DEVICE);
+                        ipoib_dma_unmap_tx(priv->ca, tx_req);
                         dev_kfree_skb_any(tx_req->skb);
                         ++priv->tx_tail;
                         --priv->tx_outstanding;

drivers/infiniband/ulp/ipoib/ipoib_main.c | +3 -1

···
         dev->addr_len            = INFINIBAND_ALEN;
         dev->type                = ARPHRD_INFINIBAND;
         dev->tx_queue_len        = ipoib_sendq_size * 2;
-        dev->features            = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX;
+        dev->features            = (NETIF_F_VLAN_CHALLENGED |
+                                    NETIF_F_LLTX |
+                                    NETIF_F_HIGHDMA);
 
         /* MTU will be reset when mcast join happens */
         dev->mtu                 = IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN;

drivers/infiniband/ulp/ipoib/ipoib_verbs.c | +7 -3

···
         };
 
         int ret, size;
+        int i;
 
         priv->pd = ib_alloc_pd(priv->ca);
         if (IS_ERR(priv->pd)) {
···
         init_attr.send_cq = priv->cq;
         init_attr.recv_cq = priv->cq;
 
+        if (dev->features & NETIF_F_SG)
+                init_attr.cap.max_send_sge = MAX_SKB_FRAGS + 1;
+
         priv->qp = ib_create_qp(priv->pd, &init_attr);
         if (IS_ERR(priv->qp)) {
                 printk(KERN_WARNING "%s: failed to create QP\n", ca->name);
···
         priv->dev->dev_addr[2] = (priv->qp->qp_num >>  8) & 0xff;
         priv->dev->dev_addr[3] = (priv->qp->qp_num      ) & 0xff;
 
-        priv->tx_sge.lkey       = priv->mr->lkey;
+        for (i = 0; i < MAX_SKB_FRAGS + 1; ++i)
+                priv->tx_sge[i].lkey = priv->mr->lkey;
 
         priv->tx_wr.opcode      = IB_WR_SEND;
-        priv->tx_wr.sg_list     = &priv->tx_sge;
-        priv->tx_wr.num_sge     = 1;
+        priv->tx_wr.sg_list     = priv->tx_sge;
         priv->tx_wr.send_flags  = IB_SEND_SIGNALED;
 
         return 0;

drivers/net/mlx4/alloc.c | +32 -16

···
                 buf->nbufs        = 1;
                 buf->npages       = 1;
                 buf->page_shift   = get_order(size) + PAGE_SHIFT;
-                buf->u.direct.buf = dma_alloc_coherent(&dev->pdev->dev,
+                buf->direct.buf   = dma_alloc_coherent(&dev->pdev->dev,
                                                        size, &t, GFP_KERNEL);
-                if (!buf->u.direct.buf)
+                if (!buf->direct.buf)
                         return -ENOMEM;
 
-                buf->u.direct.map = t;
+                buf->direct.map = t;
 
                 while (t & ((1 << buf->page_shift) - 1)) {
                         --buf->page_shift;
                         buf->npages *= 2;
                 }
 
-                memset(buf->u.direct.buf, 0, size);
+                memset(buf->direct.buf, 0, size);
         } else {
                 int i;
 
                 buf->nbufs      = (size + PAGE_SIZE - 1) / PAGE_SIZE;
                 buf->npages     = buf->nbufs;
                 buf->page_shift = PAGE_SHIFT;
-                buf->u.page_list = kzalloc(buf->nbufs * sizeof *buf->u.page_list,
+                buf->page_list  = kzalloc(buf->nbufs * sizeof *buf->page_list,
                                            GFP_KERNEL);
-                if (!buf->u.page_list)
+                if (!buf->page_list)
                         return -ENOMEM;
 
                 for (i = 0; i < buf->nbufs; ++i) {
-                        buf->u.page_list[i].buf =
+                        buf->page_list[i].buf =
                                 dma_alloc_coherent(&dev->pdev->dev, PAGE_SIZE,
                                                    &t, GFP_KERNEL);
-                        if (!buf->u.page_list[i].buf)
+                        if (!buf->page_list[i].buf)
                                 goto err_free;
 
-                        buf->u.page_list[i].map = t;
+                        buf->page_list[i].map = t;
 
-                        memset(buf->u.page_list[i].buf, 0, PAGE_SIZE);
+                        memset(buf->page_list[i].buf, 0, PAGE_SIZE);
+                }
+
+                if (BITS_PER_LONG == 64) {
+                        struct page **pages;
+                        pages = kmalloc(sizeof *pages * buf->nbufs, GFP_KERNEL);
+                        if (!pages)
+                                goto err_free;
+                        for (i = 0; i < buf->nbufs; ++i)
+                                pages[i] = virt_to_page(buf->page_list[i].buf);
+                        buf->direct.buf = vmap(pages, buf->nbufs, VM_MAP, PAGE_KERNEL);
+                        kfree(pages);
+                        if (!buf->direct.buf)
+                                goto err_free;
                 }
         }
 
···
         int i;
 
         if (buf->nbufs == 1)
-                dma_free_coherent(&dev->pdev->dev, size, buf->u.direct.buf,
-                                  buf->u.direct.map);
+                dma_free_coherent(&dev->pdev->dev, size, buf->direct.buf,
+                                  buf->direct.map);
         else {
+                if (BITS_PER_LONG == 64)
+                        vunmap(buf->direct.buf);
+
                 for (i = 0; i < buf->nbufs; ++i)
-                        if (buf->u.page_list[i].buf)
+                        if (buf->page_list[i].buf)
                                 dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
-                                                  buf->u.page_list[i].buf,
-                                                  buf->u.page_list[i].map);
-                kfree(buf->u.page_list);
+                                                  buf->page_list[i].buf,
+                                                  buf->page_list[i].map);
+                kfree(buf->page_list);
         }
 }
 EXPORT_SYMBOL_GPL(mlx4_buf_free);

drivers/net/mlx4/mr.c | +2 -2

···
 
         for (i = 0; i < buf->npages; ++i)
                 if (buf->nbufs == 1)
-                        page_list[i] = buf->u.direct.map + (i << buf->page_shift);
+                        page_list[i] = buf->direct.map + (i << buf->page_shift);
                 else
-                        page_list[i] = buf->u.page_list[i].map;
+                        page_list[i] = buf->page_list[i].map;
 
         err = mlx4_write_mtt(dev, mtt, 0, buf->npages, page_list);
 

include/linux/mlx4/device.h | +15 -4

···
         MLX4_STAT_RATE_OFFSET   = 5
 };
 
+static inline u64 mlx4_fw_ver(u64 major, u64 minor, u64 subminor)
+{
+        return (major << 32) | (minor << 16) | subminor;
+}
+
 struct mlx4_caps {
         u64                     fw_ver;
         int                     num_ports;
···
 };
 
 struct mlx4_buf {
-        union {
-                struct mlx4_buf_list    direct;
-                struct mlx4_buf_list   *page_list;
-        } u;
+        struct mlx4_buf_list    direct;
+        struct mlx4_buf_list   *page_list;
         int                     nbufs;
         int                     npages;
         int                     page_shift;
···
 int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
                    struct mlx4_buf *buf);
 void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf);
+static inline void *mlx4_buf_offset(struct mlx4_buf *buf, int offset)
+{
+        if (BITS_PER_LONG == 64 || buf->nbufs == 1)
+                return buf->direct.buf + offset;
+        else
+                return buf->page_list[offset >> PAGE_SHIFT].buf +
+                        (offset & (PAGE_SIZE - 1));
+}
 
 int mlx4_pd_alloc(struct mlx4_dev *dev, u32 *pdn);
 void mlx4_pd_free(struct mlx4_dev *dev, u32 pdn);

include/linux/mlx4/qp.h | +4 -0

···
         u32                     reserved5[10];
 };
 
+/* Which firmware version adds support for NEC (NoErrorCompletion) bit */
+#define MLX4_FW_VER_WQE_CTRL_NEC mlx4_fw_ver(2, 2, 232)
+
 enum {
+        MLX4_WQE_CTRL_NEC       = 1 << 29,
         MLX4_WQE_CTRL_FENCE     = 1 << 6,
         MLX4_WQE_CTRL_CQ_UPDATE = 3 << 2,
         MLX4_WQE_CTRL_SOLICITED = 1 << 1,

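For reference, mlx4_fw_ver(2, 2, 232) evaluates to (2 << 32) | (2 << 16) | 232 = 0x2000200e8, so the MLX4_FW_VER_WQE_CTRL_NEC test in set_kernel_sq_size() above is just a 64-bit integer comparison against dev->dev->caps.fw_ver.
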
include/rdma/ib_verbs.h | +12 -4

···
         IB_DEVICE_N_NOTIFY_CQ           = (1<<14),
         IB_DEVICE_ZERO_STAG             = (1<<15),
         IB_DEVICE_SEND_W_INV            = (1<<16),
-        IB_DEVICE_MEM_WINDOW            = (1<<17)
+        IB_DEVICE_MEM_WINDOW            = (1<<17),
+        /*
+         * Devices should set IB_DEVICE_UD_IP_SUM if they support
+         * insertion of UDP and TCP checksum on outgoing UD IPoIB
+         * messages and can verify the validity of checksum for
+         * incoming messages.  Setting this flag implies that the
+         * IPoIB driver may set NETIF_F_IP_CSUM for datagram mode.
+         */
+        IB_DEVICE_UD_IP_CSUM            = (1<<18),
 };
 
 enum ib_atomic_cap {
···
         u8                      sl;
         u8                      dlid_path_bits;
         u8                      port_num;       /* valid only for DR SMPs on switches */
+        int                     csum_ok;
 };
 
 enum ib_cq_notify_flags {
···
         IB_SEND_FENCE           = 1,
         IB_SEND_SIGNALED        = (1<<1),
         IB_SEND_SOLICITED       = (1<<2),
-        IB_SEND_INLINE          = (1<<3)
+        IB_SEND_INLINE          = (1<<3),
+        IB_SEND_IP_CSUM         = (1<<4)
 };
 
 struct ib_sge {
···
         struct ib_cache               cache;
         int                          *pkey_tbl_len;
         int                          *gid_tbl_len;
-
-        u32                           flags;
 
         int                           num_comp_vectors;
 

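The ib_verbs.h changes above only add plumbing: a device capability bit, a per-WR send flag, and a per-completion csum_ok indicator. As a rough, hypothetical sketch of how a UD consumer might wire them together (not code from this merge; the ulp_* helpers and their placement are invented for illustration, only the IB verbs calls and the new flags come from the tree):

#include <rdma/ib_verbs.h>
#include <linux/skbuff.h>

/* Does the HCA advertise UD IP checksum offload? */
static int ulp_csum_offload_supported(struct ib_device *ca)
{
        struct ib_device_attr attr;

        if (ib_query_device(ca, &attr))
                return 0;

        return !!(attr.device_cap_flags & IB_DEVICE_UD_IP_CSUM);
}

/* On the send path, ask the HCA to insert the UDP/TCP checksum. */
static void ulp_mark_send(struct ib_send_wr *wr, int offload)
{
        if (offload)
                wr->send_flags |= IB_SEND_IP_CSUM;
}

/* On the receive path, trust the checksum the HCA already verified. */
static void ulp_handle_recv(struct sk_buff *skb, struct ib_wc *wc)
{
        if (wc->csum_ok)
                skb->ip_summed = CHECKSUM_UNNECESSARY;
}

Because the offload is requested per work request and reported per completion, a ULP can keep computing checksums in software whenever IB_DEVICE_UD_IP_CSUM is absent, without changing its data path.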