Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

svcrdma: support Remote Invalidation

Support Remote Invalidation. A private message is exchanged with
the client upon RDMA transport connect that indicates whether
Send With Invalidate may be used by the server to send RPC
replies. The invalidate_rkey is arbitrarily chosen from among
rkeys present in the RPC-over-RDMA header's chunk lists.

Send With Invalidate improves performance only when clients can
recognize, while processing an RPC reply, that an rkey has already
been invalidated. Client-side support for recognizing invalidated
rkeys has been submitted as a separate change.

In the future, the RPC-over-RDMA protocol might support Remote
Invalidation properly. The protocol needs to enable signaling
between peers to indicate when Remote Invalidation can be used
for each individual RPC.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>

authored by

Chuck Lever and committed by
J. Bruce Fields
25d55296 cc9d8340

+65 -6
+1
include/linux/sunrpc/svc_rdma.h
··· 137 137 int sc_ord; /* RDMA read limit */ 138 138 int sc_max_sge; 139 139 int sc_max_sge_rd; /* max sge for read target */ 140 + bool sc_snd_w_inv; /* OK to use Send With Invalidate */ 140 141 141 142 atomic_t sc_sq_count; /* Number of SQ WR on queue */ 142 143 unsigned int sc_sq_depth; /* Depth of SQ */
+55 -3
net/sunrpc/xprtrdma/svc_rdma_sendto.c
··· 225 225 return rp_ary; 226 226 } 227 227 228 + /* RPC-over-RDMA Version One private extension: Remote Invalidation. 229 + * Responder's choice: requester signals it can handle Send With 230 + * Invalidate, and responder chooses one rkey to invalidate. 231 + * 232 + * Find a candidate rkey to invalidate when sending a reply. Picks the 233 + * first rkey it finds in the chunks lists. 234 + * 235 + * Returns zero if RPC's chunk lists are empty. 236 + */ 237 + static u32 svc_rdma_get_inv_rkey(struct rpcrdma_msg *rdma_argp, 238 + struct rpcrdma_write_array *wr_ary, 239 + struct rpcrdma_write_array *rp_ary) 240 + { 241 + struct rpcrdma_read_chunk *rd_ary; 242 + struct rpcrdma_segment *arg_ch; 243 + u32 inv_rkey; 244 + 245 + inv_rkey = 0; 246 + 247 + rd_ary = svc_rdma_get_read_chunk(rdma_argp); 248 + if (rd_ary) { 249 + inv_rkey = be32_to_cpu(rd_ary->rc_target.rs_handle); 250 + goto out; 251 + } 252 + 253 + if (wr_ary && be32_to_cpu(wr_ary->wc_nchunks)) { 254 + arg_ch = &wr_ary->wc_array[0].wc_target; 255 + inv_rkey = be32_to_cpu(arg_ch->rs_handle); 256 + goto out; 257 + } 258 + 259 + if (rp_ary && be32_to_cpu(rp_ary->wc_nchunks)) { 260 + arg_ch = &rp_ary->wc_array[0].wc_target; 261 + inv_rkey = be32_to_cpu(arg_ch->rs_handle); 262 + goto out; 263 + } 264 + 265 + out: 266 + dprintk("svcrdma: Send With Invalidate rkey=%08x\n", inv_rkey); 267 + return inv_rkey; 268 + } 269 + 228 270 /* Assumptions: 229 271 * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE 230 272 */ ··· 506 464 struct page *page, 507 465 struct rpcrdma_msg *rdma_resp, 508 466 struct svc_rdma_req_map *vec, 509 - int byte_count) 467 + int byte_count, 468 + u32 inv_rkey) 510 469 { 511 470 struct svc_rdma_op_ctxt *ctxt; 512 471 struct ib_send_wr send_wr; ··· 578 535 send_wr.wr_cqe = &ctxt->cqe; 579 536 send_wr.sg_list = ctxt->sge; 580 537 send_wr.num_sge = sge_no; 581 - send_wr.opcode = IB_WR_SEND; 538 + if (inv_rkey) { 539 + send_wr.opcode = IB_WR_SEND_WITH_INV; 540 + send_wr.ex.invalidate_rkey = inv_rkey; 541 + } else 542 + send_wr.opcode = IB_WR_SEND; 582 543 send_wr.send_flags = IB_SEND_SIGNALED; 583 544 584 545 ret = svc_rdma_send(rdma, &send_wr); ··· 614 567 int inline_bytes; 615 568 struct page *res_page; 616 569 struct svc_rdma_req_map *vec; 570 + u32 inv_rkey; 617 571 618 572 dprintk("svcrdma: sending response for rqstp=%p\n", rqstp); 619 573 ··· 624 576 rdma_argp = page_address(rqstp->rq_pages[0]); 625 577 wr_ary = svc_rdma_get_write_array(rdma_argp); 626 578 rp_ary = svc_rdma_get_reply_array(rdma_argp, wr_ary); 579 + 580 + inv_rkey = 0; 581 + if (rdma->sc_snd_w_inv) 582 + inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_ary, rp_ary); 627 583 628 584 /* Build an req vec for the XDR */ 629 585 vec = svc_rdma_get_req_map(rdma); ··· 671 619 goto err1; 672 620 673 621 ret = send_reply(rdma, rqstp, res_page, rdma_resp, vec, 674 - inline_bytes); 622 + inline_bytes, inv_rkey); 675 623 if (ret < 0) 676 624 goto err0; 677 625
+9 -3
net/sunrpc/xprtrdma/svc_rdma_transport.c
··· 657 657 if (pmsg && 658 658 pmsg->cp_magic == rpcrdma_cmp_magic && 659 659 pmsg->cp_version == RPCRDMA_CMP_VERSION) { 660 - dprintk("svcrdma: client send_size %u, recv_size %u\n", 660 + newxprt->sc_snd_w_inv = pmsg->cp_flags & 661 + RPCRDMA_CMP_F_SND_W_INV_OK; 662 + 663 + dprintk("svcrdma: client send_size %u, recv_size %u " 664 + "remote inv %ssupported\n", 661 665 rpcrdma_decode_buffer_size(pmsg->cp_send_size), 662 - rpcrdma_decode_buffer_size(pmsg->cp_recv_size)); 666 + rpcrdma_decode_buffer_size(pmsg->cp_recv_size), 667 + newxprt->sc_snd_w_inv ? "" : "un"); 663 668 } 664 669 } 665 670 ··· 1098 1093 dev->attrs.max_fast_reg_page_list_len; 1099 1094 newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG; 1100 1095 newxprt->sc_reader = rdma_read_chunk_frmr; 1101 - } 1096 + } else 1097 + newxprt->sc_snd_w_inv = false; 1102 1098 1103 1099 /* 1104 1100 * Determine if a DMA MR is required and if so, what privs are required