Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'nfs-rdma-for-3.20' of git://git.linux-nfs.org/projects/anna/nfs-rdma

NFS: Client side changes for RDMA

These patches improve the scalability of the NFSoRDMA client and take large
variables off the stack. Additionally, the GFP_* flags are updated to
match what TCP uses.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>

* tag 'nfs-rdma-for-3.20' of git://git.linux-nfs.org/projects/anna/nfs-rdma: (21 commits)
xprtrdma: Update the GFP flags used in xprt_rdma_allocate()
xprtrdma: Clean up after adding regbuf management
xprtrdma: Allocate zero pad separately from rpcrdma_buffer
xprtrdma: Allocate RPC/RDMA receive buffer separately from struct rpcrdma_rep
xprtrdma: Allocate RPC/RDMA send buffer separately from struct rpcrdma_req
xprtrdma: Allocate RPC send buffer separately from struct rpcrdma_req
xprtrdma: Add struct rpcrdma_regbuf and helpers
xprtrdma: Refactor rpcrdma_buffer_create() and rpcrdma_buffer_destroy()
xprtrdma: Simplify synopsis of rpcrdma_buffer_create()
xprtrdma: Take struct ib_qp_attr and ib_qp_init_attr off the stack
xprtrdma: Take struct ib_device_attr off the stack
xprtrdma: Free the pd if ib_query_qp() fails
xprtrdma: Remove rpcrdma_ep::rep_func and ::rep_xprt
xprtrdma: Move credit update to RPC reply handler
xprtrdma: Remove rl_mr field, and the mr_chunk union
xprtrdma: Remove rpcrdma_ep::rep_ia
xprtrdma: Rename "xprt" and "rdma_connect" fields in struct rpcrdma_xprt
xprtrdma: Clean up hdrlen
xprtrdma: Display XIDs in host byte order
xprtrdma: Modernize htonl and ntohl
...

+480 -346
+13 -1
include/linux/sunrpc/rpc_rdma.h
··· 42 42 43 43 #include <linux/types.h> 44 44 45 + #define RPCRDMA_VERSION 1 46 + #define rpcrdma_version cpu_to_be32(RPCRDMA_VERSION) 47 + 45 48 struct rpcrdma_segment { 46 49 __be32 rs_handle; /* Registered memory handle */ 47 50 __be32 rs_length; /* Length of the chunk in bytes */ ··· 98 95 } rm_body; 99 96 }; 100 97 101 - #define RPCRDMA_HDRLEN_MIN 28 98 + /* 99 + * Smallest RPC/RDMA header: rm_xid through rm_type, then rm_nochunks 100 + */ 101 + #define RPCRDMA_HDRLEN_MIN (sizeof(__be32) * 7) 102 102 103 103 enum rpcrdma_errcode { 104 104 ERR_VERS = 1, ··· 120 114 RDMA_DONE = 3, /* Client signals reply completion */ 121 115 RDMA_ERROR = 4 /* An RPC RDMA encoding error */ 122 116 }; 117 + 118 + #define rdma_msg cpu_to_be32(RDMA_MSG) 119 + #define rdma_nomsg cpu_to_be32(RDMA_NOMSG) 120 + #define rdma_msgp cpu_to_be32(RDMA_MSGP) 121 + #define rdma_done cpu_to_be32(RDMA_DONE) 122 + #define rdma_error cpu_to_be32(RDMA_ERROR) 123 123 124 124 #endif /* _LINUX_SUNRPC_RPC_RDMA_H */
-2
include/linux/sunrpc/svc_rdma.h
··· 63 63 extern atomic_t rdma_stat_sq_poll; 64 64 extern atomic_t rdma_stat_sq_prod; 65 65 66 - #define RPCRDMA_VERSION 1 67 - 68 66 /* 69 67 * Contexts are built when an RDMA request is created and are a 70 68 * record of the resources that can be recovered when the request
+62 -46
net/sunrpc/xprtrdma/rpc_rdma.c
··· 209 209 if (cur_rchunk) { /* read */ 210 210 cur_rchunk->rc_discrim = xdr_one; 211 211 /* all read chunks have the same "position" */ 212 - cur_rchunk->rc_position = htonl(pos); 213 - cur_rchunk->rc_target.rs_handle = htonl(seg->mr_rkey); 214 - cur_rchunk->rc_target.rs_length = htonl(seg->mr_len); 212 + cur_rchunk->rc_position = cpu_to_be32(pos); 213 + cur_rchunk->rc_target.rs_handle = 214 + cpu_to_be32(seg->mr_rkey); 215 + cur_rchunk->rc_target.rs_length = 216 + cpu_to_be32(seg->mr_len); 215 217 xdr_encode_hyper( 216 218 (__be32 *)&cur_rchunk->rc_target.rs_offset, 217 219 seg->mr_base); ··· 224 222 cur_rchunk++; 225 223 r_xprt->rx_stats.read_chunk_count++; 226 224 } else { /* write/reply */ 227 - cur_wchunk->wc_target.rs_handle = htonl(seg->mr_rkey); 228 - cur_wchunk->wc_target.rs_length = htonl(seg->mr_len); 225 + cur_wchunk->wc_target.rs_handle = 226 + cpu_to_be32(seg->mr_rkey); 227 + cur_wchunk->wc_target.rs_length = 228 + cpu_to_be32(seg->mr_len); 229 229 xdr_encode_hyper( 230 230 (__be32 *)&cur_wchunk->wc_target.rs_offset, 231 231 seg->mr_base); ··· 261 257 *iptr++ = xdr_zero; /* encode a NULL reply chunk */ 262 258 } else { 263 259 warray->wc_discrim = xdr_one; 264 - warray->wc_nchunks = htonl(nchunks); 260 + warray->wc_nchunks = cpu_to_be32(nchunks); 265 261 iptr = (__be32 *) cur_wchunk; 266 262 if (type == rpcrdma_writech) { 267 263 *iptr++ = xdr_zero; /* finish the write chunk list */ ··· 294 290 rpcrdma_marshal_chunks(struct rpc_rqst *rqst, ssize_t result) 295 291 { 296 292 struct rpcrdma_req *req = rpcr_to_rdmar(rqst); 297 - struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)req->rl_base; 293 + struct rpcrdma_msg *headerp = rdmab_to_msg(req->rl_rdmabuf); 298 294 299 295 if (req->rl_rtype != rpcrdma_noch) 300 296 result = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf, ··· 406 402 base = rqst->rq_svec[0].iov_base; 407 403 rpclen = rqst->rq_svec[0].iov_len; 408 404 409 - /* build RDMA header in private area at front */ 410 - headerp = (struct 
rpcrdma_msg *) req->rl_base; 411 - /* don't htonl XID, it's already done in request */ 405 + headerp = rdmab_to_msg(req->rl_rdmabuf); 406 + /* don't byte-swap XID, it's already done in request */ 412 407 headerp->rm_xid = rqst->rq_xid; 413 - headerp->rm_vers = xdr_one; 414 - headerp->rm_credit = htonl(r_xprt->rx_buf.rb_max_requests); 415 - headerp->rm_type = htonl(RDMA_MSG); 408 + headerp->rm_vers = rpcrdma_version; 409 + headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests); 410 + headerp->rm_type = rdma_msg; 416 411 417 412 /* 418 413 * Chunks needed for results? ··· 471 468 return -EIO; 472 469 } 473 470 474 - hdrlen = 28; /*sizeof *headerp;*/ 471 + hdrlen = RPCRDMA_HDRLEN_MIN; 475 472 padlen = 0; 476 473 477 474 /* ··· 485 482 RPCRDMA_INLINE_PAD_VALUE(rqst)); 486 483 487 484 if (padlen) { 488 - headerp->rm_type = htonl(RDMA_MSGP); 485 + headerp->rm_type = rdma_msgp; 489 486 headerp->rm_body.rm_padded.rm_align = 490 - htonl(RPCRDMA_INLINE_PAD_VALUE(rqst)); 487 + cpu_to_be32(RPCRDMA_INLINE_PAD_VALUE(rqst)); 491 488 headerp->rm_body.rm_padded.rm_thresh = 492 - htonl(RPCRDMA_INLINE_PAD_THRESH); 489 + cpu_to_be32(RPCRDMA_INLINE_PAD_THRESH); 493 490 headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero; 494 491 headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; 495 492 headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; ··· 527 524 dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" 528 525 " headerp 0x%p base 0x%p lkey 0x%x\n", 529 526 __func__, transfertypes[req->rl_wtype], hdrlen, rpclen, padlen, 530 - headerp, base, req->rl_iov.lkey); 527 + headerp, base, rdmab_lkey(req->rl_rdmabuf)); 531 528 532 529 /* 533 530 * initialize send_iov's - normally only two: rdma chunk header and ··· 536 533 * header and any write data. In all non-rdma cases, any following 537 534 * data has been copied into the RPC header buffer. 
538 535 */ 539 - req->rl_send_iov[0].addr = req->rl_iov.addr; 536 + req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf); 540 537 req->rl_send_iov[0].length = hdrlen; 541 - req->rl_send_iov[0].lkey = req->rl_iov.lkey; 538 + req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf); 542 539 543 - req->rl_send_iov[1].addr = req->rl_iov.addr + (base - req->rl_base); 540 + req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf); 544 541 req->rl_send_iov[1].length = rpclen; 545 - req->rl_send_iov[1].lkey = req->rl_iov.lkey; 542 + req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf); 546 543 547 544 req->rl_niovs = 2; 548 545 549 546 if (padlen) { 550 547 struct rpcrdma_ep *ep = &r_xprt->rx_ep; 551 548 552 - req->rl_send_iov[2].addr = ep->rep_pad.addr; 549 + req->rl_send_iov[2].addr = rdmab_addr(ep->rep_padbuf); 553 550 req->rl_send_iov[2].length = padlen; 554 - req->rl_send_iov[2].lkey = ep->rep_pad.lkey; 551 + req->rl_send_iov[2].lkey = rdmab_lkey(ep->rep_padbuf); 555 552 556 553 req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen; 557 554 req->rl_send_iov[3].length = rqst->rq_slen - rpclen; 558 - req->rl_send_iov[3].lkey = req->rl_iov.lkey; 555 + req->rl_send_iov[3].lkey = rdmab_lkey(req->rl_sendbuf); 559 556 560 557 req->rl_niovs = 4; 561 558 } ··· 572 569 { 573 570 unsigned int i, total_len; 574 571 struct rpcrdma_write_chunk *cur_wchunk; 572 + char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf); 575 573 576 - i = ntohl(**iptrp); /* get array count */ 574 + i = be32_to_cpu(**iptrp); 577 575 if (i > max) 578 576 return -1; 579 577 cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1); ··· 586 582 xdr_decode_hyper((__be32 *)&seg->rs_offset, &off); 587 583 dprintk("RPC: %s: chunk %d@0x%llx:0x%x\n", 588 584 __func__, 589 - ntohl(seg->rs_length), 585 + be32_to_cpu(seg->rs_length), 590 586 (unsigned long long)off, 591 - ntohl(seg->rs_handle)); 587 + be32_to_cpu(seg->rs_handle)); 592 588 } 593 - total_len += ntohl(seg->rs_length); 589 + total_len += 
be32_to_cpu(seg->rs_length); 594 590 ++cur_wchunk; 595 591 } 596 592 /* check and adjust for properly terminated write chunk */ ··· 600 596 return -1; 601 597 cur_wchunk = (struct rpcrdma_write_chunk *) w; 602 598 } 603 - if ((char *) cur_wchunk > rep->rr_base + rep->rr_len) 599 + if ((char *)cur_wchunk > base + rep->rr_len) 604 600 return -1; 605 601 606 602 *iptrp = (__be32 *) cur_wchunk; ··· 695 691 { 696 692 struct rpcrdma_ep *ep = 697 693 container_of(work, struct rpcrdma_ep, rep_connect_worker.work); 698 - struct rpc_xprt *xprt = ep->rep_xprt; 694 + struct rpcrdma_xprt *r_xprt = 695 + container_of(ep, struct rpcrdma_xprt, rx_ep); 696 + struct rpc_xprt *xprt = &r_xprt->rx_xprt; 699 697 700 698 spin_lock_bh(&xprt->transport_lock); 701 699 if (++xprt->connect_cookie == 0) /* maintain a reserved value */ ··· 738 732 struct rpc_xprt *xprt = rep->rr_xprt; 739 733 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 740 734 __be32 *iptr; 741 - int rdmalen, status; 735 + int credits, rdmalen, status; 742 736 unsigned long cwnd; 743 737 744 738 /* Check status. 
If bad, signal disconnect and return rep to pool */ ··· 750 744 } 751 745 return; 752 746 } 753 - if (rep->rr_len < 28) { 747 + if (rep->rr_len < RPCRDMA_HDRLEN_MIN) { 754 748 dprintk("RPC: %s: short/invalid reply\n", __func__); 755 749 goto repost; 756 750 } 757 - headerp = (struct rpcrdma_msg *) rep->rr_base; 758 - if (headerp->rm_vers != xdr_one) { 751 + headerp = rdmab_to_msg(rep->rr_rdmabuf); 752 + if (headerp->rm_vers != rpcrdma_version) { 759 753 dprintk("RPC: %s: invalid version %d\n", 760 - __func__, ntohl(headerp->rm_vers)); 754 + __func__, be32_to_cpu(headerp->rm_vers)); 761 755 goto repost; 762 756 } 763 757 ··· 768 762 spin_unlock(&xprt->transport_lock); 769 763 dprintk("RPC: %s: reply 0x%p failed " 770 764 "to match any request xid 0x%08x len %d\n", 771 - __func__, rep, headerp->rm_xid, rep->rr_len); 765 + __func__, rep, be32_to_cpu(headerp->rm_xid), 766 + rep->rr_len); 772 767 repost: 773 768 r_xprt->rx_stats.bad_reply_count++; 774 769 rep->rr_func = rpcrdma_reply_handler; ··· 785 778 spin_unlock(&xprt->transport_lock); 786 779 dprintk("RPC: %s: duplicate reply 0x%p to RPC " 787 780 "request 0x%p: xid 0x%08x\n", __func__, rep, req, 788 - headerp->rm_xid); 781 + be32_to_cpu(headerp->rm_xid)); 789 782 goto repost; 790 783 } 791 784 792 785 dprintk("RPC: %s: reply 0x%p completes request 0x%p\n" 793 786 " RPC request 0x%p xid 0x%08x\n", 794 - __func__, rep, req, rqst, headerp->rm_xid); 787 + __func__, rep, req, rqst, 788 + be32_to_cpu(headerp->rm_xid)); 795 789 796 790 /* from here on, the reply is no longer an orphan */ 797 791 req->rl_reply = rep; ··· 801 793 /* check for expected message types */ 802 794 /* The order of some of these tests is important. 
*/ 803 795 switch (headerp->rm_type) { 804 - case htonl(RDMA_MSG): 796 + case rdma_msg: 805 797 /* never expect read chunks */ 806 798 /* never expect reply chunks (two ways to check) */ 807 799 /* never expect write chunks without having offered RDMA */ ··· 832 824 } else { 833 825 /* else ordinary inline */ 834 826 rdmalen = 0; 835 - iptr = (__be32 *)((unsigned char *)headerp + 28); 836 - rep->rr_len -= 28; /*sizeof *headerp;*/ 827 + iptr = (__be32 *)((unsigned char *)headerp + 828 + RPCRDMA_HDRLEN_MIN); 829 + rep->rr_len -= RPCRDMA_HDRLEN_MIN; 837 830 status = rep->rr_len; 838 831 } 839 832 /* Fix up the rpc results for upper layer */ 840 833 rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen); 841 834 break; 842 835 843 - case htonl(RDMA_NOMSG): 836 + case rdma_nomsg: 844 837 /* never expect read or write chunks, always reply chunks */ 845 838 if (headerp->rm_body.rm_chunks[0] != xdr_zero || 846 839 headerp->rm_body.rm_chunks[1] != xdr_zero || 847 840 headerp->rm_body.rm_chunks[2] != xdr_one || 848 841 req->rl_nchunks == 0) 849 842 goto badheader; 850 - iptr = (__be32 *)((unsigned char *)headerp + 28); 843 + iptr = (__be32 *)((unsigned char *)headerp + 844 + RPCRDMA_HDRLEN_MIN); 851 845 rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr); 852 846 if (rdmalen < 0) 853 847 goto badheader; ··· 863 853 dprintk("%s: invalid rpcrdma reply header (type %d):" 864 854 " chunks[012] == %d %d %d" 865 855 " expected chunks <= %d\n", 866 - __func__, ntohl(headerp->rm_type), 856 + __func__, be32_to_cpu(headerp->rm_type), 867 857 headerp->rm_body.rm_chunks[0], 868 858 headerp->rm_body.rm_chunks[1], 869 859 headerp->rm_body.rm_chunks[2], ··· 873 863 break; 874 864 } 875 865 866 + credits = be32_to_cpu(headerp->rm_credit); 867 + if (credits == 0) 868 + credits = 1; /* don't deadlock */ 869 + else if (credits > r_xprt->rx_buf.rb_max_requests) 870 + credits = r_xprt->rx_buf.rb_max_requests; 871 + 876 872 cwnd = xprt->cwnd; 877 - xprt->cwnd = 
atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT; 873 + xprt->cwnd = credits << RPC_CWNDSHIFT; 878 874 if (xprt->cwnd > cwnd) 879 875 xprt_release_rqst_cong(rqst->rq_task); 880 876
+80 -98
net/sunrpc/xprtrdma/transport.c
··· 200 200 static void 201 201 xprt_rdma_connect_worker(struct work_struct *work) 202 202 { 203 - struct rpcrdma_xprt *r_xprt = 204 - container_of(work, struct rpcrdma_xprt, rdma_connect.work); 205 - struct rpc_xprt *xprt = &r_xprt->xprt; 203 + struct rpcrdma_xprt *r_xprt = container_of(work, struct rpcrdma_xprt, 204 + rx_connect_worker.work); 205 + struct rpc_xprt *xprt = &r_xprt->rx_xprt; 206 206 int rc = 0; 207 207 208 208 xprt_clear_connected(xprt); ··· 235 235 236 236 dprintk("RPC: %s: called\n", __func__); 237 237 238 - cancel_delayed_work_sync(&r_xprt->rdma_connect); 238 + cancel_delayed_work_sync(&r_xprt->rx_connect_worker); 239 239 240 240 xprt_clear_connected(xprt); 241 241 ··· 364 364 * any inline data. Also specify any padding which will be provided 365 365 * from a preregistered zero buffer. 366 366 */ 367 - rc = rpcrdma_buffer_create(&new_xprt->rx_buf, new_ep, &new_xprt->rx_ia, 368 - &new_xprt->rx_data); 367 + rc = rpcrdma_buffer_create(new_xprt); 369 368 if (rc) 370 369 goto out3; 371 370 ··· 373 374 * connection loss notification is async. We also catch connection loss 374 375 * when reaping receives. 
375 376 */ 376 - INIT_DELAYED_WORK(&new_xprt->rdma_connect, xprt_rdma_connect_worker); 377 - new_ep->rep_func = rpcrdma_conn_func; 378 - new_ep->rep_xprt = xprt; 377 + INIT_DELAYED_WORK(&new_xprt->rx_connect_worker, 378 + xprt_rdma_connect_worker); 379 379 380 380 xprt_rdma_format_addresses(xprt); 381 381 xprt->max_payload = rpcrdma_max_payload(new_xprt); ··· 432 434 433 435 if (r_xprt->rx_ep.rep_connected != 0) { 434 436 /* Reconnect */ 435 - schedule_delayed_work(&r_xprt->rdma_connect, 436 - xprt->reestablish_timeout); 437 + schedule_delayed_work(&r_xprt->rx_connect_worker, 438 + xprt->reestablish_timeout); 437 439 xprt->reestablish_timeout <<= 1; 438 440 if (xprt->reestablish_timeout > RPCRDMA_MAX_REEST_TO) 439 441 xprt->reestablish_timeout = RPCRDMA_MAX_REEST_TO; 440 442 else if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO) 441 443 xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; 442 444 } else { 443 - schedule_delayed_work(&r_xprt->rdma_connect, 0); 445 + schedule_delayed_work(&r_xprt->rx_connect_worker, 0); 444 446 if (!RPC_IS_ASYNC(task)) 445 - flush_delayed_work(&r_xprt->rdma_connect); 447 + flush_delayed_work(&r_xprt->rx_connect_worker); 446 448 } 447 449 } 448 450 449 451 /* 450 452 * The RDMA allocate/free functions need the task structure as a place 451 453 * to hide the struct rpcrdma_req, which is necessary for the actual send/recv 452 - * sequence. For this reason, the recv buffers are attached to send 453 - * buffers for portions of the RPC. Note that the RPC layer allocates 454 - * both send and receive buffers in the same call. We may register 455 - * the receive buffer portion when using reply chunks. 454 + * sequence. 455 + * 456 + * The RPC layer allocates both send and receive buffers in the same call 457 + * (rq_send_buf and rq_rcv_buf are both part of a single contiguous buffer). 458 + * We may register rq_rcv_buf when using reply chunks. 
456 459 */ 457 460 static void * 458 461 xprt_rdma_allocate(struct rpc_task *task, size_t size) 459 462 { 460 463 struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; 461 - struct rpcrdma_req *req, *nreq; 464 + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 465 + struct rpcrdma_regbuf *rb; 466 + struct rpcrdma_req *req; 467 + size_t min_size; 468 + gfp_t flags; 462 469 463 - req = rpcrdma_buffer_get(&rpcx_to_rdmax(xprt)->rx_buf); 470 + req = rpcrdma_buffer_get(&r_xprt->rx_buf); 464 471 if (req == NULL) 465 472 return NULL; 466 473 467 - if (size > req->rl_size) { 468 - dprintk("RPC: %s: size %zd too large for buffer[%zd]: " 469 - "prog %d vers %d proc %d\n", 470 - __func__, size, req->rl_size, 471 - task->tk_client->cl_prog, task->tk_client->cl_vers, 472 - task->tk_msg.rpc_proc->p_proc); 473 - /* 474 - * Outgoing length shortage. Our inline write max must have 475 - * been configured to perform direct i/o. 476 - * 477 - * This is therefore a large metadata operation, and the 478 - * allocate call was made on the maximum possible message, 479 - * e.g. containing long filename(s) or symlink data. In 480 - * fact, while these metadata operations *might* carry 481 - * large outgoing payloads, they rarely *do*. However, we 482 - * have to commit to the request here, so reallocate and 483 - * register it now. The data path will never require this 484 - * reallocation. 485 - * 486 - * If the allocation or registration fails, the RPC framework 487 - * will (doggedly) retry. 
488 - */ 489 - if (task->tk_flags & RPC_TASK_SWAPPER) 490 - nreq = kmalloc(sizeof *req + size, GFP_ATOMIC); 491 - else 492 - nreq = kmalloc(sizeof *req + size, GFP_NOFS); 493 - if (nreq == NULL) 494 - goto outfail; 474 + flags = GFP_NOIO | __GFP_NOWARN; 475 + if (RPC_IS_SWAPPER(task)) 476 + flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN; 495 477 496 - if (rpcrdma_register_internal(&rpcx_to_rdmax(xprt)->rx_ia, 497 - nreq->rl_base, size + sizeof(struct rpcrdma_req) 498 - - offsetof(struct rpcrdma_req, rl_base), 499 - &nreq->rl_handle, &nreq->rl_iov)) { 500 - kfree(nreq); 501 - goto outfail; 502 - } 503 - rpcx_to_rdmax(xprt)->rx_stats.hardway_register_count += size; 504 - nreq->rl_size = size; 505 - nreq->rl_niovs = 0; 506 - nreq->rl_nchunks = 0; 507 - nreq->rl_buffer = (struct rpcrdma_buffer *)req; 508 - nreq->rl_reply = req->rl_reply; 509 - memcpy(nreq->rl_segments, 510 - req->rl_segments, sizeof nreq->rl_segments); 511 - /* flag the swap with an unused field */ 512 - nreq->rl_iov.length = 0; 513 - req->rl_reply = NULL; 514 - req = nreq; 515 - } 478 + if (req->rl_rdmabuf == NULL) 479 + goto out_rdmabuf; 480 + if (req->rl_sendbuf == NULL) 481 + goto out_sendbuf; 482 + if (size > req->rl_sendbuf->rg_size) 483 + goto out_sendbuf; 484 + 485 + out: 516 486 dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); 517 487 req->rl_connect_cookie = 0; /* our reserved value */ 518 - return req->rl_xdr_buf; 488 + return req->rl_sendbuf->rg_base; 519 489 520 - outfail: 490 + out_rdmabuf: 491 + min_size = RPCRDMA_INLINE_WRITE_THRESHOLD(task->tk_rqstp); 492 + rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, min_size, flags); 493 + if (IS_ERR(rb)) 494 + goto out_fail; 495 + req->rl_rdmabuf = rb; 496 + 497 + out_sendbuf: 498 + /* XDR encoding and RPC/RDMA marshaling of this request has not 499 + * yet occurred. Thus a lower bound is needed to prevent buffer 500 + * overrun during marshaling. 
501 + * 502 + * RPC/RDMA marshaling may choose to send payload bearing ops 503 + * inline, if the result is smaller than the inline threshold. 504 + * The value of the "size" argument accounts for header 505 + * requirements but not for the payload in these cases. 506 + * 507 + * Likewise, allocate enough space to receive a reply up to the 508 + * size of the inline threshold. 509 + * 510 + * It's unlikely that both the send header and the received 511 + * reply will be large, but slush is provided here to allow 512 + * flexibility when marshaling. 513 + */ 514 + min_size = RPCRDMA_INLINE_READ_THRESHOLD(task->tk_rqstp); 515 + min_size += RPCRDMA_INLINE_WRITE_THRESHOLD(task->tk_rqstp); 516 + if (size < min_size) 517 + size = min_size; 518 + 519 + rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, size, flags); 520 + if (IS_ERR(rb)) 521 + goto out_fail; 522 + rb->rg_owner = req; 523 + 524 + r_xprt->rx_stats.hardway_register_count += size; 525 + rpcrdma_free_regbuf(&r_xprt->rx_ia, req->rl_sendbuf); 526 + req->rl_sendbuf = rb; 527 + goto out; 528 + 529 + out_fail: 521 530 rpcrdma_buffer_put(req); 522 - rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++; 531 + r_xprt->rx_stats.failed_marshal_count++; 523 532 return NULL; 524 533 } 525 534 ··· 538 533 { 539 534 struct rpcrdma_req *req; 540 535 struct rpcrdma_xprt *r_xprt; 541 - struct rpcrdma_rep *rep; 536 + struct rpcrdma_regbuf *rb; 542 537 int i; 543 538 544 539 if (buffer == NULL) 545 540 return; 546 541 547 - req = container_of(buffer, struct rpcrdma_req, rl_xdr_buf[0]); 548 - if (req->rl_iov.length == 0) { /* see allocate above */ 549 - r_xprt = container_of(((struct rpcrdma_req *) req->rl_buffer)->rl_buffer, 550 - struct rpcrdma_xprt, rx_buf); 551 - } else 552 - r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf); 553 - rep = req->rl_reply; 542 + rb = container_of(buffer, struct rpcrdma_regbuf, rg_base[0]); 543 + req = rb->rg_owner; 544 + r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf); 
554 545 555 - dprintk("RPC: %s: called on 0x%p%s\n", 556 - __func__, rep, (rep && rep->rr_func) ? " (with waiter)" : ""); 546 + dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); 557 547 558 - /* 559 - * Finish the deregistration. The process is considered 560 - * complete when the rr_func vector becomes NULL - this 561 - * was put in place during rpcrdma_reply_handler() - the wait 562 - * call below will not block if the dereg is "done". If 563 - * interrupted, our framework will clean up. 564 - */ 565 548 for (i = 0; req->rl_nchunks;) { 566 549 --req->rl_nchunks; 567 550 i += rpcrdma_deregister_external( 568 551 &req->rl_segments[i], r_xprt); 569 552 } 570 553 571 - if (req->rl_iov.length == 0) { /* see allocate above */ 572 - struct rpcrdma_req *oreq = (struct rpcrdma_req *)req->rl_buffer; 573 - oreq->rl_reply = req->rl_reply; 574 - (void) rpcrdma_deregister_internal(&r_xprt->rx_ia, 575 - req->rl_handle, 576 - &req->rl_iov); 577 - kfree(req); 578 - req = oreq; 579 - } 580 - 581 - /* Put back request+reply buffers */ 582 554 rpcrdma_buffer_put(req); 583 555 } 584 556
+255 -158
net/sunrpc/xprtrdma/verbs.c
··· 49 49 50 50 #include <linux/interrupt.h> 51 51 #include <linux/slab.h> 52 + #include <linux/prefetch.h> 52 53 #include <asm/bitops.h> 53 54 54 55 #include "xprt_rdma.h" ··· 154 153 event->device->name, context); 155 154 if (ep->rep_connected == 1) { 156 155 ep->rep_connected = -EIO; 157 - ep->rep_func(ep); 156 + rpcrdma_conn_func(ep); 158 157 wake_up_all(&ep->rep_connect_wait); 159 158 } 160 159 } ··· 169 168 event->device->name, context); 170 169 if (ep->rep_connected == 1) { 171 170 ep->rep_connected = -EIO; 172 - ep->rep_func(ep); 171 + rpcrdma_conn_func(ep); 173 172 wake_up_all(&ep->rep_connect_wait); 174 173 } 175 174 } 176 175 176 + static const char * const wc_status[] = { 177 + "success", 178 + "local length error", 179 + "local QP operation error", 180 + "local EE context operation error", 181 + "local protection error", 182 + "WR flushed", 183 + "memory management operation error", 184 + "bad response error", 185 + "local access error", 186 + "remote invalid request error", 187 + "remote access error", 188 + "remote operation error", 189 + "transport retry counter exceeded", 190 + "RNR retrycounter exceeded", 191 + "local RDD violation error", 192 + "remove invalid RD request", 193 + "operation aborted", 194 + "invalid EE context number", 195 + "invalid EE context state", 196 + "fatal error", 197 + "response timeout error", 198 + "general error", 199 + }; 200 + 201 + #define COMPLETION_MSG(status) \ 202 + ((status) < ARRAY_SIZE(wc_status) ? 
\ 203 + wc_status[(status)] : "unexpected completion error") 204 + 177 205 static void 178 206 rpcrdma_sendcq_process_wc(struct ib_wc *wc) 179 207 { 180 - struct rpcrdma_mw *frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; 181 - 182 - dprintk("RPC: %s: frmr %p status %X opcode %d\n", 183 - __func__, frmr, wc->status, wc->opcode); 184 - 185 - if (wc->wr_id == 0ULL) 208 + if (likely(wc->status == IB_WC_SUCCESS)) 186 209 return; 187 - if (wc->status != IB_WC_SUCCESS) 188 - frmr->r.frmr.fr_state = FRMR_IS_STALE; 210 + 211 + /* WARNING: Only wr_id and status are reliable at this point */ 212 + if (wc->wr_id == 0ULL) { 213 + if (wc->status != IB_WC_WR_FLUSH_ERR) 214 + pr_err("RPC: %s: SEND: %s\n", 215 + __func__, COMPLETION_MSG(wc->status)); 216 + } else { 217 + struct rpcrdma_mw *r; 218 + 219 + r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; 220 + r->r.frmr.fr_state = FRMR_IS_STALE; 221 + pr_err("RPC: %s: frmr %p (stale): %s\n", 222 + __func__, r, COMPLETION_MSG(wc->status)); 223 + } 189 224 } 190 225 191 226 static int ··· 285 248 struct rpcrdma_rep *rep = 286 249 (struct rpcrdma_rep *)(unsigned long)wc->wr_id; 287 250 288 - dprintk("RPC: %s: rep %p status %X opcode %X length %u\n", 289 - __func__, rep, wc->status, wc->opcode, wc->byte_len); 251 + /* WARNING: Only wr_id and status are reliable at this point */ 252 + if (wc->status != IB_WC_SUCCESS) 253 + goto out_fail; 290 254 291 - if (wc->status != IB_WC_SUCCESS) { 292 - rep->rr_len = ~0U; 293 - goto out_schedule; 294 - } 255 + /* status == SUCCESS means all fields in wc are trustworthy */ 295 256 if (wc->opcode != IB_WC_RECV) 296 257 return; 297 258 259 + dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n", 260 + __func__, rep, wc->byte_len); 261 + 298 262 rep->rr_len = wc->byte_len; 299 263 ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device, 300 - rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE); 301 - 302 - if (rep->rr_len >= 16) { 303 - struct rpcrdma_msg *p = (struct 
rpcrdma_msg *)rep->rr_base; 304 - unsigned int credits = ntohl(p->rm_credit); 305 - 306 - if (credits == 0) 307 - credits = 1; /* don't deadlock */ 308 - else if (credits > rep->rr_buffer->rb_max_requests) 309 - credits = rep->rr_buffer->rb_max_requests; 310 - atomic_set(&rep->rr_buffer->rb_credits, credits); 311 - } 264 + rdmab_addr(rep->rr_rdmabuf), 265 + rep->rr_len, DMA_FROM_DEVICE); 266 + prefetch(rdmab_to_msg(rep->rr_rdmabuf)); 312 267 313 268 out_schedule: 314 269 list_add_tail(&rep->rr_list, sched_list); 270 + return; 271 + out_fail: 272 + if (wc->status != IB_WC_WR_FLUSH_ERR) 273 + pr_err("RPC: %s: rep %p: %s\n", 274 + __func__, rep, COMPLETION_MSG(wc->status)); 275 + rep->rr_len = ~0U; 276 + goto out_schedule; 315 277 } 316 278 317 279 static int ··· 426 390 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) 427 391 struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr; 428 392 #endif 429 - struct ib_qp_attr attr; 430 - struct ib_qp_init_attr iattr; 393 + struct ib_qp_attr *attr = &ia->ri_qp_attr; 394 + struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr; 431 395 int connstate = 0; 432 396 433 397 switch (event->event) { ··· 450 414 break; 451 415 case RDMA_CM_EVENT_ESTABLISHED: 452 416 connstate = 1; 453 - ib_query_qp(ia->ri_id->qp, &attr, 454 - IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC, 455 - &iattr); 417 + ib_query_qp(ia->ri_id->qp, attr, 418 + IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC, 419 + iattr); 456 420 dprintk("RPC: %s: %d responder resources" 457 421 " (%d initiator)\n", 458 - __func__, attr.max_dest_rd_atomic, attr.max_rd_atomic); 422 + __func__, attr->max_dest_rd_atomic, 423 + attr->max_rd_atomic); 459 424 goto connected; 460 425 case RDMA_CM_EVENT_CONNECT_ERROR: 461 426 connstate = -ENOTCONN; ··· 473 436 case RDMA_CM_EVENT_DEVICE_REMOVAL: 474 437 connstate = -ENODEV; 475 438 connected: 476 - atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1); 477 439 dprintk("RPC: %s: %sconnected\n", 478 440 __func__, connstate 
> 0 ? "" : "dis"); 479 441 ep->rep_connected = connstate; 480 - ep->rep_func(ep); 442 + rpcrdma_conn_func(ep); 481 443 wake_up_all(&ep->rep_connect_wait); 482 444 /*FALLTHROUGH*/ 483 445 default: ··· 489 453 490 454 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) 491 455 if (connstate == 1) { 492 - int ird = attr.max_dest_rd_atomic; 456 + int ird = attr->max_dest_rd_atomic; 493 457 int tird = ep->rep_remote_cma.responder_resources; 494 458 printk(KERN_INFO "rpcrdma: connection to %pI4:%u " 495 459 "on %s, memreg %d slots %d ird %d%s\n", ··· 590 554 rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) 591 555 { 592 556 int rc, mem_priv; 593 - struct ib_device_attr devattr; 594 557 struct rpcrdma_ia *ia = &xprt->rx_ia; 558 + struct ib_device_attr *devattr = &ia->ri_devattr; 595 559 596 560 ia->ri_id = rpcrdma_create_id(xprt, ia, addr); 597 561 if (IS_ERR(ia->ri_id)) { ··· 607 571 goto out2; 608 572 } 609 573 610 - /* 611 - * Query the device to determine if the requested memory 612 - * registration strategy is supported. If it isn't, set the 613 - * strategy to a globally supported model. 
614 - */ 615 - rc = ib_query_device(ia->ri_id->device, &devattr); 574 + rc = ib_query_device(ia->ri_id->device, devattr); 616 575 if (rc) { 617 576 dprintk("RPC: %s: ib_query_device failed %d\n", 618 577 __func__, rc); 619 - goto out2; 578 + goto out3; 620 579 } 621 580 622 - if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) { 581 + if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) { 623 582 ia->ri_have_dma_lkey = 1; 624 583 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey; 625 584 } 626 585 627 586 if (memreg == RPCRDMA_FRMR) { 628 587 /* Requires both frmr reg and local dma lkey */ 629 - if ((devattr.device_cap_flags & 588 + if ((devattr->device_cap_flags & 630 589 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != 631 590 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) { 632 591 dprintk("RPC: %s: FRMR registration " ··· 631 600 /* Mind the ia limit on FRMR page list depth */ 632 601 ia->ri_max_frmr_depth = min_t(unsigned int, 633 602 RPCRDMA_MAX_DATA_SEGS, 634 - devattr.max_fast_reg_page_list_len); 603 + devattr->max_fast_reg_page_list_len); 635 604 } 636 605 } 637 606 if (memreg == RPCRDMA_MTHCAFMR) { ··· 669 638 "phys register failed with %lX\n", 670 639 __func__, PTR_ERR(ia->ri_bind_mem)); 671 640 rc = -ENOMEM; 672 - goto out2; 641 + goto out3; 673 642 } 674 643 break; 675 644 default: 676 645 printk(KERN_ERR "RPC: Unsupported memory " 677 646 "registration mode: %d\n", memreg); 678 647 rc = -ENOMEM; 679 - goto out2; 648 + goto out3; 680 649 } 681 650 dprintk("RPC: %s: memory registration strategy is %d\n", 682 651 __func__, memreg); ··· 686 655 687 656 rwlock_init(&ia->ri_qplock); 688 657 return 0; 658 + 659 + out3: 660 + ib_dealloc_pd(ia->ri_pd); 661 + ia->ri_pd = NULL; 689 662 out2: 690 663 rdma_destroy_id(ia->ri_id); 691 664 ia->ri_id = NULL; ··· 733 698 rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, 734 699 struct rpcrdma_create_data_internal *cdata) 735 700 { 736 - struct ib_device_attr devattr; 701 + 
struct ib_device_attr *devattr = &ia->ri_devattr; 737 702 struct ib_cq *sendcq, *recvcq; 738 703 int rc, err; 739 704 740 - rc = ib_query_device(ia->ri_id->device, &devattr); 741 - if (rc) { 742 - dprintk("RPC: %s: ib_query_device failed %d\n", 743 - __func__, rc); 744 - return rc; 745 - } 746 - 747 705 /* check provider's send/recv wr limits */ 748 - if (cdata->max_requests > devattr.max_qp_wr) 749 - cdata->max_requests = devattr.max_qp_wr; 706 + if (cdata->max_requests > devattr->max_qp_wr) 707 + cdata->max_requests = devattr->max_qp_wr; 750 708 751 709 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall; 752 710 ep->rep_attr.qp_context = ep; ··· 774 746 775 747 } 776 748 ep->rep_attr.cap.max_send_wr *= depth; 777 - if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) { 778 - cdata->max_requests = devattr.max_qp_wr / depth; 749 + if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) { 750 + cdata->max_requests = devattr->max_qp_wr / depth; 779 751 if (!cdata->max_requests) 780 752 return -EINVAL; 781 753 ep->rep_attr.cap.max_send_wr = cdata->max_requests * ··· 794 766 ep->rep_attr.qp_type = IB_QPT_RC; 795 767 ep->rep_attr.port_num = ~0; 796 768 769 + if (cdata->padding) { 770 + ep->rep_padbuf = rpcrdma_alloc_regbuf(ia, cdata->padding, 771 + GFP_KERNEL); 772 + if (IS_ERR(ep->rep_padbuf)) 773 + return PTR_ERR(ep->rep_padbuf); 774 + } else 775 + ep->rep_padbuf = NULL; 776 + 797 777 dprintk("RPC: %s: requested max: dtos: send %d recv %d; " 798 778 "iovs: send %d recv %d\n", 799 779 __func__, ··· 817 781 else if (ep->rep_cqinit <= 2) 818 782 ep->rep_cqinit = 0; 819 783 INIT_CQCOUNT(ep); 820 - ep->rep_ia = ia; 821 784 init_waitqueue_head(&ep->rep_connect_wait); 822 785 INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); 823 786 ··· 866 831 867 832 /* Client offers RDMA Read but does not initiate */ 868 833 ep->rep_remote_cma.initiator_depth = 0; 869 - if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */ 834 + if (devattr->max_qp_rd_atom 
> 32) /* arbitrary but <= 255 */ 870 835 ep->rep_remote_cma.responder_resources = 32; 871 836 else 872 - ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom; 837 + ep->rep_remote_cma.responder_resources = 838 + devattr->max_qp_rd_atom; 873 839 874 840 ep->rep_remote_cma.retry_count = 7; 875 841 ep->rep_remote_cma.flow_control = 0; ··· 884 848 dprintk("RPC: %s: ib_destroy_cq returned %i\n", 885 849 __func__, err); 886 850 out1: 851 + rpcrdma_free_regbuf(ia, ep->rep_padbuf); 887 852 return rc; 888 853 } 889 854 ··· 911 874 ia->ri_id->qp = NULL; 912 875 } 913 876 914 - /* padding - could be done in rpcrdma_buffer_destroy... */ 915 - if (ep->rep_pad_mr) { 916 - rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad); 917 - ep->rep_pad_mr = NULL; 918 - } 877 + rpcrdma_free_regbuf(ia, ep->rep_padbuf); 919 878 920 879 rpcrdma_clean_cq(ep->rep_attr.recv_cq); 921 880 rc = ib_destroy_cq(ep->rep_attr.recv_cq); ··· 1081 1048 } 1082 1049 } 1083 1050 1051 + static struct rpcrdma_req * 1052 + rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) 1053 + { 1054 + struct rpcrdma_req *req; 1055 + 1056 + req = kzalloc(sizeof(*req), GFP_KERNEL); 1057 + if (req == NULL) 1058 + return ERR_PTR(-ENOMEM); 1059 + 1060 + req->rl_buffer = &r_xprt->rx_buf; 1061 + return req; 1062 + } 1063 + 1064 + static struct rpcrdma_rep * 1065 + rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) 1066 + { 1067 + struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; 1068 + struct rpcrdma_ia *ia = &r_xprt->rx_ia; 1069 + struct rpcrdma_rep *rep; 1070 + int rc; 1071 + 1072 + rc = -ENOMEM; 1073 + rep = kzalloc(sizeof(*rep), GFP_KERNEL); 1074 + if (rep == NULL) 1075 + goto out; 1076 + 1077 + rep->rr_rdmabuf = rpcrdma_alloc_regbuf(ia, cdata->inline_rsize, 1078 + GFP_KERNEL); 1079 + if (IS_ERR(rep->rr_rdmabuf)) { 1080 + rc = PTR_ERR(rep->rr_rdmabuf); 1081 + goto out_free; 1082 + } 1083 + 1084 + rep->rr_buffer = &r_xprt->rx_buf; 1085 + return rep; 1086 + 1087 + out_free: 1088 + kfree(rep); 1089 + 
out: 1090 + return ERR_PTR(rc); 1091 + } 1092 + 1084 1093 static int 1085 1094 rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf) 1086 1095 { ··· 1209 1134 } 1210 1135 1211 1136 int 1212 - rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, 1213 - struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) 1137 + rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) 1214 1138 { 1139 + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 1140 + struct rpcrdma_ia *ia = &r_xprt->rx_ia; 1141 + struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; 1215 1142 char *p; 1216 - size_t len, rlen, wlen; 1143 + size_t len; 1217 1144 int i, rc; 1218 1145 1219 1146 buf->rb_max_requests = cdata->max_requests; 1220 1147 spin_lock_init(&buf->rb_lock); 1221 - atomic_set(&buf->rb_credits, 1); 1222 1148 1223 1149 /* Need to allocate: 1224 1150 * 1. arrays for send and recv pointers 1225 1151 * 2. arrays of struct rpcrdma_req to fill in pointers 1226 1152 * 3. array of struct rpcrdma_rep for replies 1227 - * 4. padding, if any 1228 1153 * Send/recv buffers in req/rep need to be registered 1229 1154 */ 1230 1155 len = buf->rb_max_requests * 1231 1156 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *)); 1232 - len += cdata->padding; 1233 1157 1234 1158 p = kzalloc(len, GFP_KERNEL); 1235 1159 if (p == NULL) { ··· 1243 1169 p = (char *) &buf->rb_send_bufs[buf->rb_max_requests]; 1244 1170 buf->rb_recv_bufs = (struct rpcrdma_rep **) p; 1245 1171 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests]; 1246 - 1247 - /* 1248 - * Register the zeroed pad buffer, if any. 
1249 - */ 1250 - if (cdata->padding) { 1251 - rc = rpcrdma_register_internal(ia, p, cdata->padding, 1252 - &ep->rep_pad_mr, &ep->rep_pad); 1253 - if (rc) 1254 - goto out; 1255 - } 1256 - p += cdata->padding; 1257 1172 1258 1173 INIT_LIST_HEAD(&buf->rb_mws); 1259 1174 INIT_LIST_HEAD(&buf->rb_all); ··· 1261 1198 break; 1262 1199 } 1263 1200 1264 - /* 1265 - * Allocate/init the request/reply buffers. Doing this 1266 - * using kmalloc for now -- one for each buf. 1267 - */ 1268 - wlen = 1 << fls(cdata->inline_wsize + sizeof(struct rpcrdma_req)); 1269 - rlen = 1 << fls(cdata->inline_rsize + sizeof(struct rpcrdma_rep)); 1270 - dprintk("RPC: %s: wlen = %zu, rlen = %zu\n", 1271 - __func__, wlen, rlen); 1272 - 1273 1201 for (i = 0; i < buf->rb_max_requests; i++) { 1274 1202 struct rpcrdma_req *req; 1275 1203 struct rpcrdma_rep *rep; 1276 1204 1277 - req = kmalloc(wlen, GFP_KERNEL); 1278 - if (req == NULL) { 1205 + req = rpcrdma_create_req(r_xprt); 1206 + if (IS_ERR(req)) { 1279 1207 dprintk("RPC: %s: request buffer %d alloc" 1280 1208 " failed\n", __func__, i); 1281 - rc = -ENOMEM; 1209 + rc = PTR_ERR(req); 1282 1210 goto out; 1283 1211 } 1284 - memset(req, 0, sizeof(struct rpcrdma_req)); 1285 1212 buf->rb_send_bufs[i] = req; 1286 - buf->rb_send_bufs[i]->rl_buffer = buf; 1287 1213 1288 - rc = rpcrdma_register_internal(ia, req->rl_base, 1289 - wlen - offsetof(struct rpcrdma_req, rl_base), 1290 - &buf->rb_send_bufs[i]->rl_handle, 1291 - &buf->rb_send_bufs[i]->rl_iov); 1292 - if (rc) 1293 - goto out; 1294 - 1295 - buf->rb_send_bufs[i]->rl_size = wlen - 1296 - sizeof(struct rpcrdma_req); 1297 - 1298 - rep = kmalloc(rlen, GFP_KERNEL); 1299 - if (rep == NULL) { 1214 + rep = rpcrdma_create_rep(r_xprt); 1215 + if (IS_ERR(rep)) { 1300 1216 dprintk("RPC: %s: reply buffer %d alloc failed\n", 1301 1217 __func__, i); 1302 - rc = -ENOMEM; 1218 + rc = PTR_ERR(rep); 1303 1219 goto out; 1304 1220 } 1305 - memset(rep, 0, sizeof(struct rpcrdma_rep)); 1306 1221 buf->rb_recv_bufs[i] = rep; 1307 
- buf->rb_recv_bufs[i]->rr_buffer = buf; 1308 - 1309 - rc = rpcrdma_register_internal(ia, rep->rr_base, 1310 - rlen - offsetof(struct rpcrdma_rep, rr_base), 1311 - &buf->rb_recv_bufs[i]->rr_handle, 1312 - &buf->rb_recv_bufs[i]->rr_iov); 1313 - if (rc) 1314 - goto out; 1315 - 1316 1222 } 1317 - dprintk("RPC: %s: max_requests %d\n", 1318 - __func__, buf->rb_max_requests); 1319 - /* done */ 1223 + 1320 1224 return 0; 1321 1225 out: 1322 1226 rpcrdma_buffer_destroy(buf); 1323 1227 return rc; 1228 + } 1229 + 1230 + static void 1231 + rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep) 1232 + { 1233 + if (!rep) 1234 + return; 1235 + 1236 + rpcrdma_free_regbuf(ia, rep->rr_rdmabuf); 1237 + kfree(rep); 1238 + } 1239 + 1240 + static void 1241 + rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req) 1242 + { 1243 + if (!req) 1244 + return; 1245 + 1246 + rpcrdma_free_regbuf(ia, req->rl_sendbuf); 1247 + rpcrdma_free_regbuf(ia, req->rl_rdmabuf); 1248 + kfree(req); 1324 1249 } 1325 1250 1326 1251 static void ··· 1366 1315 dprintk("RPC: %s: entering\n", __func__); 1367 1316 1368 1317 for (i = 0; i < buf->rb_max_requests; i++) { 1369 - if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) { 1370 - rpcrdma_deregister_internal(ia, 1371 - buf->rb_recv_bufs[i]->rr_handle, 1372 - &buf->rb_recv_bufs[i]->rr_iov); 1373 - kfree(buf->rb_recv_bufs[i]); 1374 - } 1375 - if (buf->rb_send_bufs && buf->rb_send_bufs[i]) { 1376 - rpcrdma_deregister_internal(ia, 1377 - buf->rb_send_bufs[i]->rl_handle, 1378 - &buf->rb_send_bufs[i]->rl_iov); 1379 - kfree(buf->rb_send_bufs[i]); 1380 - } 1318 + if (buf->rb_recv_bufs) 1319 + rpcrdma_destroy_rep(ia, buf->rb_recv_bufs[i]); 1320 + if (buf->rb_send_bufs) 1321 + rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]); 1381 1322 } 1382 1323 1383 1324 switch (ia->ri_memreg_strategy) { ··· 1493 1450 int i; 1494 1451 1495 1452 for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++) 1496 - rpcrdma_buffer_put_mr(&seg->mr_chunk.rl_mw, buf); 1497 - 
rpcrdma_buffer_put_mr(&seg1->mr_chunk.rl_mw, buf); 1453 + rpcrdma_buffer_put_mr(&seg->rl_mw, buf); 1454 + rpcrdma_buffer_put_mr(&seg1->rl_mw, buf); 1498 1455 } 1499 1456 1500 1457 static void ··· 1580 1537 list_add(&r->mw_list, stale); 1581 1538 continue; 1582 1539 } 1583 - req->rl_segments[i].mr_chunk.rl_mw = r; 1540 + req->rl_segments[i].rl_mw = r; 1584 1541 if (unlikely(i-- == 0)) 1585 1542 return req; /* Success */ 1586 1543 } ··· 1602 1559 r = list_entry(buf->rb_mws.next, 1603 1560 struct rpcrdma_mw, mw_list); 1604 1561 list_del(&r->mw_list); 1605 - req->rl_segments[i].mr_chunk.rl_mw = r; 1562 + req->rl_segments[i].rl_mw = r; 1606 1563 if (unlikely(i-- == 0)) 1607 1564 return req; /* Success */ 1608 1565 } ··· 1701 1658 struct rpcrdma_buffer *buffers = req->rl_buffer; 1702 1659 unsigned long flags; 1703 1660 1704 - if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */ 1705 - buffers = ((struct rpcrdma_req *) buffers)->rl_buffer; 1706 1661 spin_lock_irqsave(&buffers->rb_lock, flags); 1707 1662 if (buffers->rb_recv_index < buffers->rb_max_requests) { 1708 1663 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index]; ··· 1729 1688 * Wrappers for internal-use kmalloc memory registration, used by buffer code. 
1730 1689 */ 1731 1690 1732 - int 1691 + static int 1733 1692 rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, 1734 1693 struct ib_mr **mrp, struct ib_sge *iov) 1735 1694 { ··· 1780 1739 return rc; 1781 1740 } 1782 1741 1783 - int 1742 + static int 1784 1743 rpcrdma_deregister_internal(struct rpcrdma_ia *ia, 1785 1744 struct ib_mr *mr, struct ib_sge *iov) 1786 1745 { ··· 1796 1755 if (rc) 1797 1756 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc); 1798 1757 return rc; 1758 + } 1759 + 1760 + /** 1761 + * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers 1762 + * @ia: controlling rpcrdma_ia 1763 + * @size: size of buffer to be allocated, in bytes 1764 + * @flags: GFP flags 1765 + * 1766 + * Returns pointer to private header of an area of internally 1767 + * registered memory, or an ERR_PTR. The registered buffer follows 1768 + * the end of the private header. 1769 + * 1770 + * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for 1771 + * receiving the payload of RDMA RECV operations. regbufs are not 1772 + * used for RDMA READ/WRITE operations, thus are registered only for 1773 + * LOCAL access. 
1774 + */ 1775 + struct rpcrdma_regbuf * 1776 + rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags) 1777 + { 1778 + struct rpcrdma_regbuf *rb; 1779 + int rc; 1780 + 1781 + rc = -ENOMEM; 1782 + rb = kmalloc(sizeof(*rb) + size, flags); 1783 + if (rb == NULL) 1784 + goto out; 1785 + 1786 + rb->rg_size = size; 1787 + rb->rg_owner = NULL; 1788 + rc = rpcrdma_register_internal(ia, rb->rg_base, size, 1789 + &rb->rg_mr, &rb->rg_iov); 1790 + if (rc) 1791 + goto out_free; 1792 + 1793 + return rb; 1794 + 1795 + out_free: 1796 + kfree(rb); 1797 + out: 1798 + return ERR_PTR(rc); 1799 + } 1800 + 1801 + /** 1802 + * rpcrdma_free_regbuf - deregister and free registered buffer 1803 + * @ia: controlling rpcrdma_ia 1804 + * @rb: regbuf to be deregistered and freed 1805 + */ 1806 + void 1807 + rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) 1808 + { 1809 + if (rb) { 1810 + rpcrdma_deregister_internal(ia, rb->rg_mr, &rb->rg_iov); 1811 + kfree(rb); 1812 + } 1799 1813 } 1800 1814 1801 1815 /* ··· 1895 1799 struct rpcrdma_xprt *r_xprt) 1896 1800 { 1897 1801 struct rpcrdma_mr_seg *seg1 = seg; 1898 - struct rpcrdma_mw *mw = seg1->mr_chunk.rl_mw; 1802 + struct rpcrdma_mw *mw = seg1->rl_mw; 1899 1803 struct rpcrdma_frmr *frmr = &mw->r.frmr; 1900 1804 struct ib_mr *mr = frmr->fr_mr; 1901 1805 struct ib_send_wr fastreg_wr, *bad_wr; ··· 1984 1888 struct ib_send_wr invalidate_wr, *bad_wr; 1985 1889 int rc; 1986 1890 1987 - seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_INVALID; 1891 + seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID; 1988 1892 1989 1893 memset(&invalidate_wr, 0, sizeof invalidate_wr); 1990 - invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; 1894 + invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw; 1991 1895 invalidate_wr.opcode = IB_WR_LOCAL_INV; 1992 - invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; 1896 + invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey; 1993 1897 
DECR_CQCOUNT(&r_xprt->rx_ep); 1994 1898 1995 1899 read_lock(&ia->ri_qplock); ··· 1999 1903 read_unlock(&ia->ri_qplock); 2000 1904 if (rc) { 2001 1905 /* Force rpcrdma_buffer_get() to retry */ 2002 - seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_STALE; 1906 + seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE; 2003 1907 dprintk("RPC: %s: failed ib_post_send for invalidate," 2004 1908 " status %i\n", __func__, rc); 2005 1909 } ··· 2031 1935 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) 2032 1936 break; 2033 1937 } 2034 - rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr, 2035 - physaddrs, i, seg1->mr_dma); 1938 + rc = ib_map_phys_fmr(seg1->rl_mw->r.fmr, physaddrs, i, seg1->mr_dma); 2036 1939 if (rc) { 2037 1940 dprintk("RPC: %s: failed ib_map_phys_fmr " 2038 1941 "%u@0x%llx+%i (%d)... status %i\n", __func__, ··· 2040 1945 while (i--) 2041 1946 rpcrdma_unmap_one(ia, --seg); 2042 1947 } else { 2043 - seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey; 1948 + seg1->mr_rkey = seg1->rl_mw->r.fmr->rkey; 2044 1949 seg1->mr_base = seg1->mr_dma + pageoff; 2045 1950 seg1->mr_nsegs = i; 2046 1951 seg1->mr_len = len; ··· 2057 1962 LIST_HEAD(l); 2058 1963 int rc; 2059 1964 2060 - list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l); 1965 + list_add(&seg1->rl_mw->r.fmr->list, &l); 2061 1966 rc = ib_unmap_fmr(&l); 2062 1967 read_lock(&ia->ri_qplock); 2063 1968 while (seg1->mr_nsegs--) ··· 2199 2104 2200 2105 recv_wr.next = NULL; 2201 2106 recv_wr.wr_id = (u64) (unsigned long) rep; 2202 - recv_wr.sg_list = &rep->rr_iov; 2107 + recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; 2203 2108 recv_wr.num_sge = 1; 2204 2109 2205 2110 ib_dma_sync_single_for_cpu(ia->ri_id->device, 2206 - rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL); 2111 + rdmab_addr(rep->rr_rdmabuf), 2112 + rdmab_length(rep->rr_rdmabuf), 2113 + DMA_BIDIRECTIONAL); 2207 2114 2208 2115 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); 2209 2116
+70 -41
net/sunrpc/xprtrdma/xprt_rdma.h
··· 70 70 int ri_async_rc; 71 71 enum rpcrdma_memreg ri_memreg_strategy; 72 72 unsigned int ri_max_frmr_depth; 73 + struct ib_device_attr ri_devattr; 74 + struct ib_qp_attr ri_qp_attr; 75 + struct ib_qp_init_attr ri_qp_init_attr; 73 76 }; 74 77 75 78 /* ··· 86 83 atomic_t rep_cqcount; 87 84 int rep_cqinit; 88 85 int rep_connected; 89 - struct rpcrdma_ia *rep_ia; 90 86 struct ib_qp_init_attr rep_attr; 91 87 wait_queue_head_t rep_connect_wait; 92 - struct ib_sge rep_pad; /* holds zeroed pad */ 93 - struct ib_mr *rep_pad_mr; /* holds zeroed pad */ 94 - void (*rep_func)(struct rpcrdma_ep *); 95 - struct rpc_xprt *rep_xprt; /* for rep_func */ 88 + struct rpcrdma_regbuf *rep_padbuf; 96 89 struct rdma_conn_param rep_remote_cma; 97 90 struct sockaddr_storage rep_remote_addr; 98 91 struct delayed_work rep_connect_worker; ··· 104 105 105 106 #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) 106 107 #define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount) 108 + 109 + /* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV 110 + * 111 + * The below structure appears at the front of a large region of kmalloc'd 112 + * memory, which always starts on a good alignment boundary. 
113 + */ 114 + 115 + struct rpcrdma_regbuf { 116 + size_t rg_size; 117 + struct rpcrdma_req *rg_owner; 118 + struct ib_mr *rg_mr; 119 + struct ib_sge rg_iov; 120 + __be32 rg_base[0] __attribute__ ((aligned(256))); 121 + }; 122 + 123 + static inline u64 124 + rdmab_addr(struct rpcrdma_regbuf *rb) 125 + { 126 + return rb->rg_iov.addr; 127 + } 128 + 129 + static inline u32 130 + rdmab_length(struct rpcrdma_regbuf *rb) 131 + { 132 + return rb->rg_iov.length; 133 + } 134 + 135 + static inline u32 136 + rdmab_lkey(struct rpcrdma_regbuf *rb) 137 + { 138 + return rb->rg_iov.lkey; 139 + } 140 + 141 + static inline struct rpcrdma_msg * 142 + rdmab_to_msg(struct rpcrdma_regbuf *rb) 143 + { 144 + return (struct rpcrdma_msg *)rb->rg_base; 145 + } 107 146 108 147 enum rpcrdma_chunktype { 109 148 rpcrdma_noch = 0, ··· 171 134 /* temporary static scatter/gather max */ 172 135 #define RPCRDMA_MAX_DATA_SEGS (64) /* max scatter/gather */ 173 136 #define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */ 174 - #define MAX_RPCRDMAHDR (\ 175 - /* max supported RPC/RDMA header */ \ 176 - sizeof(struct rpcrdma_msg) + (2 * sizeof(u32)) + \ 177 - (sizeof(struct rpcrdma_read_chunk) * RPCRDMA_MAX_SEGS) + sizeof(u32)) 178 137 179 138 struct rpcrdma_buffer; 180 139 181 140 struct rpcrdma_rep { 182 - unsigned int rr_len; /* actual received reply length */ 183 - struct rpcrdma_buffer *rr_buffer; /* home base for this structure */ 184 - struct rpc_xprt *rr_xprt; /* needed for request/reply matching */ 185 - void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint */ 186 - struct list_head rr_list; /* tasklet list */ 187 - struct ib_sge rr_iov; /* for posting */ 188 - struct ib_mr *rr_handle; /* handle for mem in rr_iov */ 189 - char rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */ 141 + unsigned int rr_len; 142 + struct rpcrdma_buffer *rr_buffer; 143 + struct rpc_xprt *rr_xprt; 144 + void (*rr_func)(struct rpcrdma_rep *); 145 + struct list_head rr_list; 146 + 
struct rpcrdma_regbuf *rr_rdmabuf; 190 147 }; 191 148 192 149 /* ··· 242 211 */ 243 212 244 213 struct rpcrdma_mr_seg { /* chunk descriptors */ 245 - union { /* chunk memory handles */ 246 - struct ib_mr *rl_mr; /* if registered directly */ 247 - struct rpcrdma_mw *rl_mw; /* if registered from region */ 248 - } mr_chunk; 214 + struct rpcrdma_mw *rl_mw; /* registered MR */ 249 215 u64 mr_base; /* registration result */ 250 216 u32 mr_rkey; /* registration result */ 251 217 u32 mr_len; /* length of chunk or segment */ ··· 255 227 }; 256 228 257 229 struct rpcrdma_req { 258 - size_t rl_size; /* actual length of buffer */ 259 230 unsigned int rl_niovs; /* 0, 2 or 4 */ 260 231 unsigned int rl_nchunks; /* non-zero if chunks */ 261 232 unsigned int rl_connect_cookie; /* retry detection */ 262 233 enum rpcrdma_chunktype rl_rtype, rl_wtype; 263 234 struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ 264 235 struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ 265 - struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */ 266 236 struct ib_sge rl_send_iov[4]; /* for active requests */ 267 - struct ib_sge rl_iov; /* for posting */ 268 - struct ib_mr *rl_handle; /* handle for mem in rl_iov */ 269 - char rl_base[MAX_RPCRDMAHDR]; /* start of actual buffer */ 270 - __u32 rl_xdr_buf[0]; /* start of returned rpc rq_buffer */ 237 + struct rpcrdma_regbuf *rl_rdmabuf; 238 + struct rpcrdma_regbuf *rl_sendbuf; 239 + struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; 271 240 }; 272 - #define rpcr_to_rdmar(r) \ 273 - container_of((r)->rq_buffer, struct rpcrdma_req, rl_xdr_buf[0]) 241 + 242 + static inline struct rpcrdma_req * 243 + rpcr_to_rdmar(struct rpc_rqst *rqst) 244 + { 245 + struct rpcrdma_regbuf *rb = container_of(rqst->rq_buffer, 246 + struct rpcrdma_regbuf, 247 + rg_base[0]); 248 + return rb->rg_owner; 249 + } 274 250 275 251 /* 276 252 * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for ··· 284 252 */ 285 253 struct 
rpcrdma_buffer { 286 254 spinlock_t rb_lock; /* protects indexes */ 287 - atomic_t rb_credits; /* most recent server credits */ 288 255 int rb_max_requests;/* client max requests */ 289 256 struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */ 290 257 struct list_head rb_all; ··· 349 318 * during unmount. 350 319 */ 351 320 struct rpcrdma_xprt { 352 - struct rpc_xprt xprt; 321 + struct rpc_xprt rx_xprt; 353 322 struct rpcrdma_ia rx_ia; 354 323 struct rpcrdma_ep rx_ep; 355 324 struct rpcrdma_buffer rx_buf; 356 325 struct rpcrdma_create_data_internal rx_data; 357 - struct delayed_work rdma_connect; 326 + struct delayed_work rx_connect_worker; 358 327 struct rpcrdma_stats rx_stats; 359 328 }; 360 329 361 - #define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, xprt) 330 + #define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, rx_xprt) 362 331 #define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data) 363 332 364 333 /* Setting this to 0 ensures interoperability with early servers. 
··· 389 358 /* 390 359 * Buffer calls - xprtrdma/verbs.c 391 360 */ 392 - int rpcrdma_buffer_create(struct rpcrdma_buffer *, struct rpcrdma_ep *, 393 - struct rpcrdma_ia *, 394 - struct rpcrdma_create_data_internal *); 361 + int rpcrdma_buffer_create(struct rpcrdma_xprt *); 395 362 void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); 396 363 397 364 struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); ··· 397 368 void rpcrdma_recv_buffer_get(struct rpcrdma_req *); 398 369 void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); 399 370 400 - int rpcrdma_register_internal(struct rpcrdma_ia *, void *, int, 401 - struct ib_mr **, struct ib_sge *); 402 - int rpcrdma_deregister_internal(struct rpcrdma_ia *, 403 - struct ib_mr *, struct ib_sge *); 404 - 405 371 int rpcrdma_register_external(struct rpcrdma_mr_seg *, 406 372 int, int, struct rpcrdma_xprt *); 407 373 int rpcrdma_deregister_external(struct rpcrdma_mr_seg *, 408 374 struct rpcrdma_xprt *); 375 + 376 + struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *, 377 + size_t, gfp_t); 378 + void rpcrdma_free_regbuf(struct rpcrdma_ia *, 379 + struct rpcrdma_regbuf *); 409 380 410 381 /* 411 382 * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c