Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

xprtrdma: Provide a buffer to pad Write chunks of unaligned length

This is a buffer to be left persistently registered while a
connection is up. Connection tear-down will automatically DMA-unmap,
invalidate, and dereg the MR. A persistently registered buffer is
lower in cost to provide, and it can never be coalesced into the
RDMA segment that carries the data payload.

An RPC that provisions a Write chunk with a non-aligned length now
uses this MR rather than the tail buffer of the RPC's rq_rcv_buf.

Reviewed-By: Tom Talpey <tom@talpey.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>

Authored by Chuck Lever; committed by Trond Myklebust.
21037b8c d5f458a9

+65 -12
+10 -3
include/trace/events/rpcrdma.h
··· 375 375 376 376 TP_fast_assign( 377 377 const struct rpcrdma_req *req = mr->mr_req; 378 - const struct rpc_task *task = req->rl_slot.rq_task; 379 378 380 - __entry->task_id = task->tk_pid; 381 - __entry->client_id = task->tk_client->cl_clid; 379 + if (req) { 380 + const struct rpc_task *task = req->rl_slot.rq_task; 381 + 382 + __entry->task_id = task->tk_pid; 383 + __entry->client_id = task->tk_client->cl_clid; 384 + } else { 385 + __entry->task_id = 0; 386 + __entry->client_id = -1; 387 + } 382 388 __entry->mr_id = mr->mr_ibmr->res.id; 383 389 __entry->nents = mr->mr_nents; 384 390 __entry->handle = mr->mr_handle; ··· 645 639 DEFINE_RDCH_EVENT(read); 646 640 DEFINE_WRCH_EVENT(write); 647 641 DEFINE_WRCH_EVENT(reply); 642 + DEFINE_WRCH_EVENT(wp); 648 643 649 644 TRACE_DEFINE_ENUM(rpcrdma_noch); 650 645 TRACE_DEFINE_ENUM(rpcrdma_noch_pullup);
+35
net/sunrpc/xprtrdma/frwr_ops.c
··· 666 666 */ 667 667 rpcrdma_force_disconnect(ep); 668 668 } 669 + 670 + /** 671 + * frwr_wp_create - Create an MR for padding Write chunks 672 + * @r_xprt: transport resources to use 673 + * 674 + * Return 0 on success, negative errno on failure. 675 + */ 676 + int frwr_wp_create(struct rpcrdma_xprt *r_xprt) 677 + { 678 + struct rpcrdma_ep *ep = r_xprt->rx_ep; 679 + struct rpcrdma_mr_seg seg; 680 + struct rpcrdma_mr *mr; 681 + 682 + mr = rpcrdma_mr_get(r_xprt); 683 + if (!mr) 684 + return -EAGAIN; 685 + mr->mr_req = NULL; 686 + ep->re_write_pad_mr = mr; 687 + 688 + seg.mr_len = XDR_UNIT; 689 + seg.mr_page = virt_to_page(ep->re_write_pad); 690 + seg.mr_offset = offset_in_page(ep->re_write_pad); 691 + if (IS_ERR(frwr_map(r_xprt, &seg, 1, true, xdr_zero, mr))) 692 + return -EIO; 693 + trace_xprtrdma_mr_fastreg(mr); 694 + 695 + mr->mr_cqe.done = frwr_wc_fastreg; 696 + mr->mr_regwr.wr.next = NULL; 697 + mr->mr_regwr.wr.wr_cqe = &mr->mr_cqe; 698 + mr->mr_regwr.wr.num_sge = 0; 699 + mr->mr_regwr.wr.opcode = IB_WR_REG_MR; 700 + mr->mr_regwr.wr.send_flags = 0; 701 + 702 + return ib_post_send(ep->re_id->qp, &mr->mr_regwr.wr, NULL); 703 + }
+14 -9
net/sunrpc/xprtrdma/rpc_rdma.c
··· 255 255 page_base = 0; 256 256 } 257 257 258 - if (type == rpcrdma_readch) 259 - goto out; 260 - 261 - /* When encoding a Write chunk, some servers need to see an 262 - * extra segment for non-XDR-aligned Write chunks. The upper 263 - * layer provides space in the tail iovec that may be used 264 - * for this purpose. 265 - */ 266 - if (type == rpcrdma_writech && r_xprt->rx_ep->re_implicit_roundup) 258 + if (type == rpcrdma_readch || type == rpcrdma_writech) 267 259 goto out; 268 260 269 261 if (xdrbuf->tail[0].iov_len) ··· 397 405 enum rpcrdma_chunktype wtype) 398 406 { 399 407 struct xdr_stream *xdr = &req->rl_stream; 408 + struct rpcrdma_ep *ep = r_xprt->rx_ep; 400 409 struct rpcrdma_mr_seg *seg; 401 410 struct rpcrdma_mr *mr; 402 411 int nsegs, nchunks; ··· 435 442 nchunks++; 436 443 nsegs -= mr->mr_nents; 437 444 } while (nsegs); 445 + 446 + if (xdr_pad_size(rqst->rq_rcv_buf.page_len)) { 447 + if (encode_rdma_segment(xdr, ep->re_write_pad_mr) < 0) 448 + return -EMSGSIZE; 449 + 450 + trace_xprtrdma_chunk_wp(rqst->rq_task, ep->re_write_pad_mr, 451 + nsegs); 452 + r_xprt->rx_stats.write_chunk_count++; 453 + r_xprt->rx_stats.total_rdma_request += mr->mr_length; 454 + nchunks++; 455 + nsegs -= mr->mr_nents; 456 + } 438 457 439 458 /* Update count of segments in this Write chunk */ 440 459 *segcount = cpu_to_be32(nchunks);
+1
net/sunrpc/xprtrdma/verbs.c
··· 551 551 goto out; 552 552 } 553 553 rpcrdma_mrs_create(r_xprt); 554 + frwr_wp_create(r_xprt); 554 555 555 556 out: 556 557 trace_xprtrdma_connect(r_xprt, rc);
+5
net/sunrpc/xprtrdma/xprt_rdma.h
··· 68 68 /* 69 69 * RDMA Endpoint -- connection endpoint details 70 70 */ 71 + struct rpcrdma_mr; 71 72 struct rpcrdma_ep { 72 73 struct kref re_kref; 73 74 struct rdma_cm_id *re_id; 74 75 struct ib_pd *re_pd; 75 76 unsigned int re_max_rdma_segs; 76 77 unsigned int re_max_fr_depth; 78 + struct rpcrdma_mr *re_write_pad_mr; 77 79 bool re_implicit_roundup; 78 80 enum ib_mr_type re_mrtype; 79 81 struct completion re_done; ··· 99 97 unsigned int re_inline_recv; /* negotiated */ 100 98 101 99 atomic_t re_completion_ids; 100 + 101 + char re_write_pad[XDR_UNIT]; 102 102 }; 103 103 104 104 /* Pre-allocate extra Work Requests for handling reverse-direction ··· 539 535 void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs); 540 536 void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); 541 537 void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); 538 + int frwr_wp_create(struct rpcrdma_xprt *r_xprt); 542 539 543 540 /* 544 541 * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c