IB/mlx4: Use multiple WQ blocks to post smaller send WQEs

ConnectX HCA supports shrinking WQEs, so that a single work request
can be made up of multiple units of 1 << wqe_shift. This way, WRs can differ
in size, and do not have to be a power of 2 in size, saving memory and
speeding up send WR posting. Unfortunately, if we do this then the
wqe_index field in CQEs can't be used to look up the WR ID anymore, so
our implementation does this only if selective signaling is off.
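
A rough sketch of the bookkeeping this implies (not part of the patch; the
struct and names below are simplified stand-ins for the driver's send-queue
state), showing why the WR ID lookup can rely on wqe_index only when WQEs are
constant-sized, and why "all WRs signaled" makes in-order consumption safe:

/*
 * Hedged sketch only: how a send completion maps back to a WR ID.
 * "demo_sq" stands in for the driver's real send-queue bookkeeping.
 */
#include <stdint.h>
#include <stdio.h>

struct demo_sq {
        uint64_t wrid[16];     /* WR IDs in posting order */
        unsigned wqe_cnt;      /* number of basic WQE blocks, power of two */
        uint16_t tail;
        int all_signaled;      /* nonzero: every WR generates a completion */
};

static uint64_t demo_complete_send(struct demo_sq *wq, uint16_t cqe_wqe_index)
{
        /*
         * Constant-sized WQEs: wqe_index in the CQE identifies the WR slot,
         * so the tail can jump straight to it.  With shrinking WQEs a WR
         * spans a variable number of blocks, so wqe_index no longer indexes
         * WRs; instead every WR is signaled and WR IDs are consumed in order.
         */
        if (!wq->all_signaled)
                wq->tail += (uint16_t)(cqe_wqe_index - wq->tail);

        return wq->wrid[wq->tail++ & (wq->wqe_cnt - 1)];
}

int main(void)
{
        struct demo_sq wq = { .wqe_cnt = 16, .all_signaled = 1 };

        wq.wrid[0] = 100;
        wq.wrid[1] = 101;
        printf("%llu\n", (unsigned long long)demo_complete_send(&wq, 7));  /* 100 */
        printf("%llu\n", (unsigned long long)demo_complete_send(&wq, 11)); /* 101 */
        return 0;
}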

Further, on 32-bit platforms, we can't use vmap() to make the QP
buffer virtually contiguous. Thus we have to use constant-sized WRs to
make sure a WR is always fully within a single page-sized chunk.

Finally, we use WRs with the NOP opcode to avoid wrapping around the
queue buffer in the middle of posting a WR, and we set the
NoErrorCompletion bit to avoid getting completions with error for NOP
WRs. However, NEC is only supported starting with firmware 2.2.232,
so we use constant-sized WRs for older firmware. And, since MLX
(SMI/GSI) QPs only support SEND, we use constant-sized WRs for them as
well.
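
Taken together, the gate for shrinking WQEs looks roughly like the sketch
below. All names here are simplified stand-ins; the real check is the one
added to set_kernel_sq_size() in the qp.c hunk further down:

/*
 * Sketch of the conditions under which shrinking (variable-size) WQEs
 * are used.  Types and constants are stand-ins, not the kernel's.
 */
#include <limits.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_FW_VER(maj, min, sub) \
        (((uint64_t)(maj) << 32) | ((uint64_t)(min) << 16) | (uint64_t)(sub))
#define DEMO_FW_VER_NEC DEMO_FW_VER(2, 2, 232)  /* first firmware with the NEC bit */

enum demo_qp_type { DEMO_QPT_RC, DEMO_QPT_UD, DEMO_QPT_SMI, DEMO_QPT_GSI };

static bool demo_can_shrink_wqes(uint64_t fw_ver, bool all_wrs_signaled,
                                 enum demo_qp_type type)
{
        return fw_ver >= DEMO_FW_VER_NEC &&      /* NOP padding needs NoErrorCompletion */
               all_wrs_signaled &&               /* WR IDs can be popped in order */
               sizeof(long) * CHAR_BIT == 64 &&  /* QP buffer can be vmap()ed */
               type != DEMO_QPT_SMI &&           /* MLX (SMI/GSI) QPs only do SEND, */
               type != DEMO_QPT_GSI;             /* so keep their WQEs constant-sized */
}

int main(void)
{
        printf("%d\n", demo_can_shrink_wqes(DEMO_FW_VER(2, 3, 0), true, DEMO_QPT_RC));
        return 0;
}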

When stamping during NOP posting, do the stamping only after setting
the NOP WQE's valid (ownership) bit.

Signed-off-by: Michael S. Tsirkin <mst@dev.mellanox.co.il>
Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>

authored by Jack Morgenstein and committed by Roland Dreier ea54b10c b57aacfa

5 files changed, +197 -36

drivers/infiniband/hw/mlx4/cq.c  (+10 -2)

···
         is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
                 MLX4_CQE_OPCODE_ERROR;
 
+        if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_OPCODE_NOP &&
+                     is_send)) {
+                printk(KERN_WARNING "Completion for NOP opcode detected!\n");
+                return -EINVAL;
+        }
+
         if (!*cur_qp ||
             (be32_to_cpu(cqe->my_qpn) & 0xffffff) != (*cur_qp)->mqp.qpn) {
                 /*
···
 
         if (is_send) {
                 wq = &(*cur_qp)->sq;
-                wqe_ctr = be16_to_cpu(cqe->wqe_index);
-                wq->tail += (u16) (wqe_ctr - (u16) wq->tail);
+                if (!(*cur_qp)->sq_signal_bits) {
+                        wqe_ctr = be16_to_cpu(cqe->wqe_index);
+                        wq->tail += (u16) (wqe_ctr - (u16) wq->tail);
+                }
                 wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
                 ++wq->tail;
         } else if ((*cur_qp)->ibqp.srq) {

drivers/infiniband/hw/mlx4/mlx4_ib.h  (+2 -0)

···
 
         u32                     doorbell_qpn;
         __be32                  sq_signal_bits;
+        unsigned                sq_next_wqe;
+        int                     sq_max_wqes_per_wr;
         int                     sq_spare_wqes;
         struct mlx4_ib_wq       sq;
 
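
A sketch of what the two new fields track (values below are made up): sq.head
keeps counting whole WRs, while sq_next_wqe tracks the next free basic WQE
block, so the two counters advance at different rates once a WR spans several
1 << wqe_shift units:

/*
 * Illustrative only: with shrinking WQEs, the WR counter and the
 * basic-block index diverge.  Numbers are hypothetical.
 */
#include <stdio.h>

int main(void)
{
        unsigned wqe_shift = 6;                  /* 64-byte basic blocks */
        unsigned head = 0, next_wqe = 0;         /* like sq.head / sq_next_wqe */
        unsigned wr_sizes[] = { 96, 208, 64 };   /* descriptor sizes in bytes */

        for (unsigned i = 0; i < 3; ++i) {
                head += 1;                       /* one WR posted */
                next_wqe += (wr_sizes[i] + (1u << wqe_shift) - 1) >> wqe_shift;
                printf("after WR %u: head=%u next_wqe=%u\n", i, head, next_wqe);
        }
        /* prints: head=1 next_wqe=2, head=2 next_wqe=6, head=3 next_wqe=7 */
        return 0;
}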

drivers/infiniband/hw/mlx4/qp.c  (+176 -34)

···
  * SOFTWARE.
  */
 
+#include <linux/log2.h>
+
 #include <rdma/ib_cache.h>
 #include <rdma/ib_pack.h>
 
···
 
 /*
  * Stamp a SQ WQE so that it is invalid if prefetched by marking the
- * first four bytes of every 64 byte chunk with 0xffffffff, except for
- * the very first chunk of the WQE.
+ * first four bytes of every 64 byte chunk with
+ *     0x7FFFFFF | (invalid_ownership_value << 31).
+ *
+ * When the max work request size is less than or equal to the WQE
+ * basic block size, as an optimization, we can stamp all WQEs with
+ * 0xffffffff, and skip the very first chunk of each WQE.
  */
-static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n)
+static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size)
 {
-        u32 *wqe = get_send_wqe(qp, n);
+        u32 *wqe;
         int i;
+        int s;
+        int ind;
+        void *buf;
+        __be32 stamp;
 
-        for (i = 16; i < 1 << (qp->sq.wqe_shift - 2); i += 16)
-                wqe[i] = 0xffffffff;
+        s = roundup(size, 1U << qp->sq.wqe_shift);
+        if (qp->sq_max_wqes_per_wr > 1) {
+                for (i = 0; i < s; i += 64) {
+                        ind = (i >> qp->sq.wqe_shift) + n;
+                        stamp = ind & qp->sq.wqe_cnt ? cpu_to_be32(0x7fffffff) :
+                                                       cpu_to_be32(0xffffffff);
+                        buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
+                        wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1));
+                        *wqe = stamp;
+                }
+        } else {
+                buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+                for (i = 64; i < s; i += 64) {
+                        wqe = buf + i;
+                        *wqe = 0xffffffff;
+                }
+        }
+}
+
+static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size)
+{
+        struct mlx4_wqe_ctrl_seg *ctrl;
+        struct mlx4_wqe_inline_seg *inl;
+        void *wqe;
+        int s;
+
+        ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+        s = sizeof(struct mlx4_wqe_ctrl_seg);
+
+        if (qp->ibqp.qp_type == IB_QPT_UD) {
+                struct mlx4_wqe_datagram_seg *dgram = wqe + sizeof *ctrl;
+                struct mlx4_av *av = (struct mlx4_av *)dgram->av;
+                memset(dgram, 0, sizeof *dgram);
+                av->port_pd = cpu_to_be32((qp->port << 24) | to_mpd(qp->ibqp.pd)->pdn);
+                s += sizeof(struct mlx4_wqe_datagram_seg);
+        }
+
+        /* Pad the remainder of the WQE with an inline data segment. */
+        if (size > s) {
+                inl = wqe + s;
+                inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl));
+        }
+        ctrl->srcrb_flags = 0;
+        ctrl->fence_size = size / 16;
+        /*
+         * Make sure descriptor is fully written before setting ownership bit
+         * (because HW can start executing as soon as we do).
+         */
+        wmb();
+
+        ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) |
+                (n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
+
+        stamp_send_wqe(qp, n + qp->sq_spare_wqes, size);
+}
+
+/* Post NOP WQE to prevent wrap-around in the middle of WR */
+static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind)
+{
+        unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1));
+        if (unlikely(s < qp->sq_max_wqes_per_wr)) {
+                post_nop_wqe(qp, ind, s << qp->sq.wqe_shift);
+                ind += s;
+        }
+        return ind;
 }
 
 static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
···
 static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
                               enum ib_qp_type type, struct mlx4_ib_qp *qp)
 {
+        int s;
+
         /* Sanity check SQ size before proceeding */
         if (cap->max_send_wr  > dev->dev->caps.max_wqes ||
             cap->max_send_sge > dev->dev->caps.max_sq_sg ||
···
             cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg)
                 return -EINVAL;
 
-        qp->sq.wqe_shift = ilog2(roundup_pow_of_two(max(cap->max_send_sge *
-                                                        sizeof (struct mlx4_wqe_data_seg),
-                                                        cap->max_inline_data +
-                                                        sizeof (struct mlx4_wqe_inline_seg)) +
-                                                    send_wqe_overhead(type)));
-        qp->sq.max_gs = ((1 << qp->sq.wqe_shift) - send_wqe_overhead(type)) /
-                sizeof (struct mlx4_wqe_data_seg);
+        s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg),
+                cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) +
+                send_wqe_overhead(type);
 
         /*
-         * We need to leave 2 KB + 1 WQE of headroom in the SQ to
-         * allow HW to prefetch.
+         * Hermon supports shrinking WQEs, such that a single work
+         * request can include multiple units of 1 << wqe_shift.  This
+         * way, work requests can differ in size, and do not have to
+         * be a power of 2 in size, saving memory and speeding up send
+         * WR posting.  Unfortunately, if we do this then the
+         * wqe_index field in CQEs can't be used to look up the WR ID
+         * anymore, so we do this only if selective signaling is off.
+         *
+         * Further, on 32-bit platforms, we can't use vmap() to make
+         * the QP buffer virtually contigious.  Thus we have to use
+         * constant-sized WRs to make sure a WR is always fully within
+         * a single page-sized chunk.
+         *
+         * Finally, we use NOP work requests to pad the end of the
+         * work queue, to avoid wrap-around in the middle of WR.  We
+         * set NEC bit to avoid getting completions with error for
+         * these NOP WRs, but since NEC is only supported starting
+         * with firmware 2.2.232, we use constant-sized WRs for older
+         * firmware.
+         *
+         * And, since MLX QPs only support SEND, we use constant-sized
+         * WRs in this case.
+         *
+         * We look for the smallest value of wqe_shift such that the
+         * resulting number of wqes does not exceed device
+         * capabilities.
+         *
+         * We set WQE size to at least 64 bytes, this way stamping
+         * invalidates each WQE.
          */
-        qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1;
-        qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr + qp->sq_spare_wqes);
+        if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC &&
+            qp->sq_signal_bits && BITS_PER_LONG == 64 &&
+            type != IB_QPT_SMI && type != IB_QPT_GSI)
+                qp->sq.wqe_shift = ilog2(64);
+        else
+                qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s));
+
+        for (;;) {
+                if (1 << qp->sq.wqe_shift > dev->dev->caps.max_sq_desc_sz)
+                        return -EINVAL;
+
+                qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1U << qp->sq.wqe_shift);
+
+                /*
+                 * We need to leave 2 KB + 1 WR of headroom in the SQ to
+                 * allow HW to prefetch.
+                 */
+                qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr;
+                qp->sq.wqe_cnt   = roundup_pow_of_two(cap->max_send_wr *
+                                                      qp->sq_max_wqes_per_wr +
+                                                      qp->sq_spare_wqes);
+
+                if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes)
+                        break;
+
+                if (qp->sq_max_wqes_per_wr <= 1)
+                        return -EINVAL;
+
+                ++qp->sq.wqe_shift;
+        }
+
+        qp->sq.max_gs = ((qp->sq_max_wqes_per_wr << qp->sq.wqe_shift) -
+                         send_wqe_overhead(type)) / sizeof (struct mlx4_wqe_data_seg);
 
         qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
                 (qp->sq.wqe_cnt << qp->sq.wqe_shift);
···
                 qp->sq.offset = 0;
         }
 
-        cap->max_send_wr  = qp->sq.max_post = qp->sq.wqe_cnt - qp->sq_spare_wqes;
+        cap->max_send_wr  = qp->sq.max_post =
+                (qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr;
         cap->max_send_sge = qp->sq.max_gs;
         /* We don't support inline sends for kernel QPs (yet) */
         cap->max_inline_data = 0;
···
         qp->rq.tail         = 0;
         qp->sq.head         = 0;
         qp->sq.tail         = 0;
+        qp->sq_next_wqe     = 0;
+
+        if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
+                qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+        else
+                qp->sq_signal_bits = 0;
 
         err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, !!init_attr->srq, qp);
         if (err)
···
          * a little bit when posting sends.
          */
         qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
-
-        if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
-                qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
-        else
-                qp->sq_signal_bits = 0;
 
         qp->mqp.event = mlx4_ib_qp_event;
 
···
                         ctrl = get_send_wqe(qp, i);
                         ctrl->owner_opcode = cpu_to_be32(1 << 31);
 
-                        stamp_send_wqe(qp, i);
+                        stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift);
                 }
         }
···
                 qp->rq.tail = 0;
                 qp->sq.head = 0;
                 qp->sq.tail = 0;
+                qp->sq_next_wqe = 0;
                 if (!ibqp->srq)
                         *qp->db.db = 0;
         }
···
         unsigned long flags;
         int nreq;
         int err = 0;
-        int ind;
-        int size;
+        unsigned ind;
+        int uninitialized_var(stamp);
+        int uninitialized_var(size);
         int i;
 
         spin_lock_irqsave(&qp->sq.lock, flags);
 
-        ind = qp->sq.head;
+        ind = qp->sq_next_wqe;
 
         for (nreq = 0; wr; ++nreq, wr = wr->next) {
                 if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
···
                 }
 
                 ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
-                qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
+                qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
 
                 ctrl->srcrb_flags =
                         (wr->send_flags & IB_SEND_SIGNALED ?
···
                 ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] |
                         (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
 
+                stamp = ind + qp->sq_spare_wqes;
+                ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift);
+
                 /*
                  * We can improve latency by not stamping the last
                  * send queue WQE until after ringing the doorbell, so
                  * only stamp here if there are still more WQEs to post.
+                 *
+                 * Same optimization applies to padding with NOP wqe
+                 * in case of WQE shrinking (used to prevent wrap-around
+                 * in the middle of WR).
                  */
-                if (wr->next)
-                        stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
-                                       (qp->sq.wqe_cnt - 1));
+                if (wr->next) {
+                        stamp_send_wqe(qp, stamp, size * 16);
+                        ind = pad_wraparound(qp, ind);
+                }
 
-                ++ind;
         }
 
 out:
···
                  */
                 mmiowb();
 
-                stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
-                               (qp->sq.wqe_cnt - 1));
+                stamp_send_wqe(qp, stamp, size * 16);
+
+                ind = pad_wraparound(qp, ind);
+                qp->sq_next_wqe = ind;
         }
 
         spin_unlock_irqrestore(&qp->sq.lock, flags);
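
A worked example of the sizing search above, with made-up capabilities; it
mirrors the loop in set_kernel_sq_size() but drops the max_sq_desc_sz check
and the error paths:

/*
 * Worked example of the SQ sizing search.  All inputs are hypothetical,
 * not taken from any particular HCA.
 */
#include <stdio.h>

static unsigned roundup_pow2(unsigned v)
{
        unsigned r = 1;
        while (r < v)
                r <<= 1;
        return r;
}

int main(void)
{
        unsigned s = 208;               /* max WR size in bytes (overhead + SGEs) */
        unsigned max_send_wr = 100;     /* requested by the consumer */
        unsigned max_wqes = 16384;      /* hypothetical device limit */
        unsigned wqe_shift = 6;         /* shrinking case: start at 64-byte blocks */
        unsigned wqes_per_wr, spare, wqe_cnt;

        for (;;) {
                wqes_per_wr = (s + (1u << wqe_shift) - 1) >> wqe_shift;
                /* 2 KB plus one full WR of prefetch headroom */
                spare = (2048 >> wqe_shift) + wqes_per_wr;
                wqe_cnt = roundup_pow2(max_send_wr * wqes_per_wr + spare);
                if (wqe_cnt <= max_wqes)
                        break;
                ++wqe_shift;            /* ring too big: use larger, fewer blocks */
        }

        printf("wqe_shift=%u wqes_per_wr=%u spare=%u wqe_cnt=%u max_post=%u\n",
               wqe_shift, wqes_per_wr, spare, wqe_cnt,
               (wqe_cnt - spare) / wqes_per_wr);
        /* with these numbers: wqe_shift=6 wqes_per_wr=4 spare=36 wqe_cnt=512 max_post=119 */
        return 0;
}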

include/linux/mlx4/device.h  (+5 -0)

···
         MLX4_STAT_RATE_OFFSET   = 5
 };
 
+static inline u64 mlx4_fw_ver(u64 major, u64 minor, u64 subminor)
+{
+        return (major << 32) | (minor << 16) | subminor;
+}
+
 struct mlx4_caps {
         u64                     fw_ver;
         int                     num_ports;
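
A quick illustration of the packing (standalone, using a stand-in for the
helper above): major goes in bits 63..32, minor in 31..16, subminor in 15..0,
so whole firmware versions compare with a single >=:

/*
 * Quick check of the version packing used by mlx4_fw_ver().  The
 * stand-in assumes minor and subminor fit in 16 bits each.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t pack_fw_ver(uint64_t major, uint64_t minor, uint64_t subminor)
{
        return (major << 32) | (minor << 16) | subminor;
}

int main(void)
{
        printf("2.2.232 -> %#llx\n",
               (unsigned long long)pack_fw_ver(2, 2, 232));      /* 0x2000200e8 */
        printf("2.3.0 newer than 2.2.232: %d\n",
               pack_fw_ver(2, 3, 0) >= pack_fw_ver(2, 2, 232));  /* 1 */
        return 0;
}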

include/linux/mlx4/qp.h  (+4 -0)

···
         u32                     reserved5[10];
 };
 
+/* Which firmware version adds support for NEC (NoErrorCompletion) bit */
+#define MLX4_FW_VER_WQE_CTRL_NEC mlx4_fw_ver(2, 2, 232)
+
 enum {
+        MLX4_WQE_CTRL_NEC       = 1 << 29,
         MLX4_WQE_CTRL_FENCE     = 1 << 6,
         MLX4_WQE_CTRL_CQ_UPDATE = 3 << 2,
         MLX4_WQE_CTRL_SOLICITED = 1 << 1,