Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

svcrdma: Post Send WR chain

Eventually I'd like the server to post the reply's Send WR along
with any Write WRs using only a single call to ib_post_send(), in
order to reduce the NIC's doorbell rate.

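To do this, add an anchor for a WR chain (sc_wr_chain) and a count of
the Send Queue entries that chain consumes (sc_sqecount) to
svc_rdma_send_ctxt, and refactor svc_rdma_send() -- now renamed
svc_rdma_post_send() -- to post this WR chain to the Send Queue. For
the moment, the posted chain will continue to contain a single Send
WR.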

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>

+38 -19
+4 -2
include/linux/sunrpc/svc_rdma.h
··· 210 210 211 211 struct svcxprt_rdma *sc_rdma; 212 212 struct ib_send_wr sc_send_wr; 213 + struct ib_send_wr *sc_wr_chain; 214 + int sc_sqecount; 213 215 struct ib_cqe sc_cqe; 214 216 struct xdr_buf sc_hdrbuf; 215 217 struct xdr_stream sc_stream; ··· 260 258 svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma); 261 259 extern void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, 262 260 struct svc_rdma_send_ctxt *ctxt); 263 - extern int svc_rdma_send(struct svcxprt_rdma *rdma, 264 - struct svc_rdma_send_ctxt *ctxt); 261 + extern int svc_rdma_post_send(struct svcxprt_rdma *rdma, 262 + struct svc_rdma_send_ctxt *ctxt); 265 263 extern int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, 266 264 struct svc_rdma_send_ctxt *sctxt, 267 265 const struct svc_rdma_pcl *write_pcl,
+1 -1
net/sunrpc/xprtrdma/svc_rdma_backchannel.c
··· 90 90 */ 91 91 get_page(virt_to_page(rqst->rq_buffer)); 92 92 sctxt->sc_send_wr.opcode = IB_WR_SEND; 93 - return svc_rdma_send(rdma, sctxt); 93 + return svc_rdma_post_send(rdma, sctxt); 94 94 } 95 95 96 96 /* Server-side transport endpoint wants a whole page for its send
+33 -16
net/sunrpc/xprtrdma/svc_rdma_sendto.c
··· 208 208 ctxt->sc_send_wr.num_sge = 0; 209 209 ctxt->sc_cur_sge_no = 0; 210 210 ctxt->sc_page_count = 0; 211 + ctxt->sc_wr_chain = &ctxt->sc_send_wr; 212 + ctxt->sc_sqecount = 1; 213 + 211 214 return ctxt; 212 215 213 216 out_empty: ··· 296 293 struct svc_rdma_send_ctxt *ctxt = 297 294 container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe); 298 295 299 - svc_rdma_wake_send_waiters(rdma, 1); 296 + svc_rdma_wake_send_waiters(rdma, ctxt->sc_sqecount); 300 297 301 298 if (unlikely(wc->status != IB_WC_SUCCESS)) 302 299 goto flushed; ··· 315 312 } 316 313 317 314 /** 318 - * svc_rdma_send - Post a single Send WR 319 - * @rdma: transport on which to post the WR 320 - * @ctxt: send ctxt with a Send WR ready to post 315 + * svc_rdma_post_send - Post a WR chain to the Send Queue 316 + * @rdma: transport context 317 + * @ctxt: WR chain to post 321 318 * 322 319 * Copy fields in @ctxt to stack variables in order to guarantee 323 320 * that these values remain available after the ib_post_send() call. 324 321 * In some error flow cases, svc_rdma_wc_send() releases @ctxt. 325 322 * 323 + * Note there is potential for starvation when the Send Queue is 324 + * full because there is no order to when waiting threads are 325 + * awoken. The transport is typically provisioned with a deep 326 + * enough Send Queue that SQ exhaustion should be a rare event. 
327 + * 326 328 * Return values: 327 329 * %0: @ctxt's WR chain was posted successfully 328 330 * %-ENOTCONN: The connection was lost 329 331 */ 330 - int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) 332 + int svc_rdma_post_send(struct svcxprt_rdma *rdma, 333 + struct svc_rdma_send_ctxt *ctxt) 331 334 { 332 - struct ib_send_wr *wr = &ctxt->sc_send_wr; 335 + struct ib_send_wr *first_wr = ctxt->sc_wr_chain; 336 + struct ib_send_wr *send_wr = &ctxt->sc_send_wr; 337 + const struct ib_send_wr *bad_wr = first_wr; 333 338 struct rpc_rdma_cid cid = ctxt->sc_cid; 334 - int ret; 339 + int ret, sqecount = ctxt->sc_sqecount; 335 340 336 341 might_sleep(); 337 342 338 343 /* Sync the transport header buffer */ 339 344 ib_dma_sync_single_for_device(rdma->sc_pd->device, 340 - wr->sg_list[0].addr, 341 - wr->sg_list[0].length, 345 + send_wr->sg_list[0].addr, 346 + send_wr->sg_list[0].length, 342 347 DMA_TO_DEVICE); 343 348 344 349 /* If the SQ is full, wait until an SQ entry is available */ 345 350 while (!test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) { 346 - if ((atomic_dec_return(&rdma->sc_sq_avail) < 0)) { 347 - svc_rdma_wake_send_waiters(rdma, 1); 351 + if (atomic_sub_return(sqecount, &rdma->sc_sq_avail) < 0) { 352 + svc_rdma_wake_send_waiters(rdma, sqecount); 348 353 349 354 /* When the transport is torn down, assume 350 355 * ib_drain_sq() will trigger enough Send ··· 369 358 } 370 359 371 360 trace_svcrdma_post_send(ctxt); 372 - ret = ib_post_send(rdma->sc_qp, wr, NULL); 361 + ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); 373 362 if (ret) { 374 363 trace_svcrdma_sq_post_err(rdma, &cid, ret); 375 364 svc_xprt_deferred_close(&rdma->sc_xprt); 376 - svc_rdma_wake_send_waiters(rdma, 1); 377 - break; 365 + 366 + /* If even one WR was posted, there will be a 367 + * Send completion that bumps sc_sq_avail. 
368 + */ 369 + if (bad_wr == first_wr) { 370 + svc_rdma_wake_send_waiters(rdma, sqecount); 371 + break; 372 + } 378 373 } 379 374 return 0; 380 375 } ··· 901 884 sctxt->sc_send_wr.opcode = IB_WR_SEND; 902 885 } 903 886 904 - return svc_rdma_send(rdma, sctxt); 887 + return svc_rdma_post_send(rdma, sctxt); 905 888 } 906 889 907 890 /** ··· 965 948 sctxt->sc_send_wr.num_sge = 1; 966 949 sctxt->sc_send_wr.opcode = IB_WR_SEND; 967 950 sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len; 968 - if (svc_rdma_send(rdma, sctxt)) 951 + if (svc_rdma_post_send(rdma, sctxt)) 969 952 goto put_ctxt; 970 953 return; 971 954