Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

xprtrdma: Pull up sometimes

On some platforms, DMA mapping part of a page is more costly than
copying bytes. Restore the pull-up code and use that when we
think it's going to be faster. The heuristic for now is to pull-up
when the size of the RPC message body fits in the buffer underlying
the head iovec.

Indeed, not involving the I/O MMU can help the RPC/RDMA transport
scale better for tiny I/Os across more RDMA devices. This is because
interaction with the I/O MMU is eliminated, as is handling a Send
completion, for each of these small I/Os. Without the explicit
unmapping, the NIC no longer needs to do a costly internal TLB
shootdown for buffers that are just a handful of bytes.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>

authored by

Chuck Lever and committed by
Anna Schumaker
614f3c96 d6764bbd

+85 -7
+4
include/trace/events/rpcrdma.h
··· 532 532 DEFINE_WRCH_EVENT(reply); 533 533 534 534 TRACE_DEFINE_ENUM(rpcrdma_noch); 535 + TRACE_DEFINE_ENUM(rpcrdma_noch_pullup); 536 + TRACE_DEFINE_ENUM(rpcrdma_noch_mapped); 535 537 TRACE_DEFINE_ENUM(rpcrdma_readch); 536 538 TRACE_DEFINE_ENUM(rpcrdma_areadch); 537 539 TRACE_DEFINE_ENUM(rpcrdma_writech); ··· 542 540 #define xprtrdma_show_chunktype(x) \ 543 541 __print_symbolic(x, \ 544 542 { rpcrdma_noch, "inline" }, \ 543 + { rpcrdma_noch_pullup, "pullup" }, \ 544 + { rpcrdma_noch_mapped, "mapped" }, \ 545 545 { rpcrdma_readch, "read list" }, \ 546 546 { rpcrdma_areadch, "*read list" }, \ 547 547 { rpcrdma_writech, "write list" }, \
+1 -1
net/sunrpc/xprtrdma/backchannel.c
··· 79 79 *p = xdr_zero; 80 80 81 81 if (rpcrdma_prepare_send_sges(r_xprt, req, RPCRDMA_HDRLEN_MIN, 82 - &rqst->rq_snd_buf, rpcrdma_noch)) 82 + &rqst->rq_snd_buf, rpcrdma_noch_pullup)) 83 83 return -EIO; 84 84 85 85 trace_xprtrdma_cb_reply(rqst);
+77 -5
net/sunrpc/xprtrdma/rpc_rdma.c
··· 392 392 unsigned int pos; 393 393 int nsegs; 394 394 395 - if (rtype == rpcrdma_noch) 395 + if (rtype == rpcrdma_noch_pullup || rtype == rpcrdma_noch_mapped) 396 396 goto done; 397 397 398 398 pos = rqst->rq_snd_buf.head[0].iov_len; ··· 691 691 return false; 692 692 } 693 693 694 + /* Copy the tail to the end of the head buffer. 695 + */ 696 + static void rpcrdma_pullup_tail_iov(struct rpcrdma_xprt *r_xprt, 697 + struct rpcrdma_req *req, 698 + struct xdr_buf *xdr) 699 + { 700 + unsigned char *dst; 701 + 702 + dst = (unsigned char *)xdr->head[0].iov_base; 703 + dst += xdr->head[0].iov_len + xdr->page_len; 704 + memmove(dst, xdr->tail[0].iov_base, xdr->tail[0].iov_len); 705 + r_xprt->rx_stats.pullup_copy_count += xdr->tail[0].iov_len; 706 + } 707 + 708 + /* Copy pagelist content into the head buffer. 709 + */ 710 + static void rpcrdma_pullup_pagelist(struct rpcrdma_xprt *r_xprt, 711 + struct rpcrdma_req *req, 712 + struct xdr_buf *xdr) 713 + { 714 + unsigned int len, page_base, remaining; 715 + struct page **ppages; 716 + unsigned char *src, *dst; 717 + 718 + dst = (unsigned char *)xdr->head[0].iov_base; 719 + dst += xdr->head[0].iov_len; 720 + ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); 721 + page_base = offset_in_page(xdr->page_base); 722 + remaining = xdr->page_len; 723 + while (remaining) { 724 + src = page_address(*ppages); 725 + src += page_base; 726 + len = min_t(unsigned int, PAGE_SIZE - page_base, remaining); 727 + memcpy(dst, src, len); 728 + r_xprt->rx_stats.pullup_copy_count += len; 729 + 730 + ppages++; 731 + dst += len; 732 + remaining -= len; 733 + page_base = 0; 734 + } 735 + } 736 + 737 + /* Copy the contents of @xdr into @rl_sendbuf and DMA sync it. 738 + * When the head, pagelist, and tail are small, a pull-up copy 739 + * is considerably less costly than DMA mapping the components 740 + * of @xdr. 
741 + * 742 + * Assumptions: 743 + * - the caller has already verified that the total length 744 + * of the RPC Call body will fit into @rl_sendbuf. 745 + */ 746 + static bool rpcrdma_prepare_noch_pullup(struct rpcrdma_xprt *r_xprt, 747 + struct rpcrdma_req *req, 748 + struct xdr_buf *xdr) 749 + { 750 + if (unlikely(xdr->tail[0].iov_len)) 751 + rpcrdma_pullup_tail_iov(r_xprt, req, xdr); 752 + 753 + if (unlikely(xdr->page_len)) 754 + rpcrdma_pullup_pagelist(r_xprt, req, xdr); 755 + 756 + /* The whole RPC message resides in the head iovec now */ 757 + return rpcrdma_prepare_head_iov(r_xprt, req, xdr->len); 758 + } 759 + 694 760 static bool rpcrdma_prepare_noch_mapped(struct rpcrdma_xprt *r_xprt, 695 761 struct rpcrdma_req *req, 696 762 struct xdr_buf *xdr) ··· 845 779 goto out_unmap; 846 780 847 781 switch (rtype) { 848 - case rpcrdma_noch: 782 + case rpcrdma_noch_pullup: 783 + if (!rpcrdma_prepare_noch_pullup(r_xprt, req, xdr)) 784 + goto out_unmap; 785 + break; 786 + case rpcrdma_noch_mapped: 849 787 if (!rpcrdma_prepare_noch_mapped(r_xprt, req, xdr)) 850 788 goto out_unmap; 851 789 break; ··· 897 827 struct rpcrdma_req *req = rpcr_to_rdmar(rqst); 898 828 struct xdr_stream *xdr = &req->rl_stream; 899 829 enum rpcrdma_chunktype rtype, wtype; 830 + struct xdr_buf *buf = &rqst->rq_snd_buf; 900 831 bool ddp_allowed; 901 832 __be32 *p; 902 833 int ret; ··· 955 884 */ 956 885 if (rpcrdma_args_inline(r_xprt, rqst)) { 957 886 *p++ = rdma_msg; 958 - rtype = rpcrdma_noch; 959 - } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) { 887 + rtype = buf->len < rdmab_length(req->rl_sendbuf) ? 
888 + rpcrdma_noch_pullup : rpcrdma_noch_mapped; 889 + } else if (ddp_allowed && buf->flags & XDRBUF_WRITE) { 960 890 *p++ = rdma_msg; 961 891 rtype = rpcrdma_readch; 962 892 } else { ··· 999 927 goto out_err; 1000 928 1001 929 ret = rpcrdma_prepare_send_sges(r_xprt, req, req->rl_hdrbuf.len, 1002 - &rqst->rq_snd_buf, rtype); 930 + buf, rtype); 1003 931 if (ret) 1004 932 goto out_err; 1005 933
+1 -1
net/sunrpc/xprtrdma/verbs.c
··· 1165 1165 for (i = 0; i < buf->rb_max_requests; i++) { 1166 1166 struct rpcrdma_req *req; 1167 1167 1168 - req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE, 1168 + req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE * 2, 1169 1169 GFP_KERNEL); 1170 1170 if (!req) 1171 1171 goto out;
+2
net/sunrpc/xprtrdma/xprt_rdma.h
··· 554 554 555 555 enum rpcrdma_chunktype { 556 556 rpcrdma_noch = 0, 557 + rpcrdma_noch_pullup, 558 + rpcrdma_noch_mapped, 557 559 rpcrdma_readch, 558 560 rpcrdma_areadch, 559 561 rpcrdma_writech,