Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'from-tomtucker' into for-2.6.28

+710 -123
+26 -1
include/linux/sunrpc/svc_rdma.h
··· 72 72 */ 73 73 struct svc_rdma_op_ctxt { 74 74 struct svc_rdma_op_ctxt *read_hdr; 75 + struct svc_rdma_fastreg_mr *frmr; 75 76 int hdr_count; 76 77 struct xdr_buf arg; 77 78 struct list_head dto_q; ··· 104 103 int start; /* sge no for this chunk */ 105 104 int count; /* sge count for this chunk */ 106 105 }; 106 + struct svc_rdma_fastreg_mr { 107 + struct ib_mr *mr; 108 + void *kva; 109 + struct ib_fast_reg_page_list *page_list; 110 + int page_list_len; 111 + unsigned long access_flags; 112 + unsigned long map_len; 113 + enum dma_data_direction direction; 114 + struct list_head frmr_list; 115 + }; 107 116 struct svc_rdma_req_map { 117 + struct svc_rdma_fastreg_mr *frmr; 108 118 unsigned long count; 109 119 union { 110 120 struct kvec sge[RPCSVC_MAXPAGES]; 111 121 struct svc_rdma_chunk_sge ch[RPCSVC_MAXPAGES]; 112 122 }; 113 123 }; 114 - 124 + #define RDMACTXT_F_FAST_UNREG 1 115 125 #define RDMACTXT_F_LAST_CTXT 2 126 + 127 + #define SVCRDMA_DEVCAP_FAST_REG 1 /* fast mr registration */ 128 + #define SVCRDMA_DEVCAP_READ_W_INV 2 /* read w/ invalidate */ 116 129 117 130 struct svcxprt_rdma { 118 131 struct svc_xprt sc_xprt; /* SVC transport structure */ ··· 151 136 struct ib_cq *sc_rq_cq; 152 137 struct ib_cq *sc_sq_cq; 153 138 struct ib_mr *sc_phys_mr; /* MR for server memory */ 139 + u32 sc_dev_caps; /* distilled device caps */ 140 + u32 sc_dma_lkey; /* local dma key */ 141 + unsigned int sc_frmr_pg_list_len; 142 + struct list_head sc_frmr_q; 143 + spinlock_t sc_frmr_q_lock; 154 144 155 145 spinlock_t sc_lock; /* transport lock */ 156 146 ··· 212 192 extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *); 213 193 extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *); 214 194 extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int); 195 + extern void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt); 215 196 extern struct svc_rdma_req_map *svc_rdma_get_req_map(void); 216 197 extern void svc_rdma_put_req_map(struct 
svc_rdma_req_map *); 198 + extern int svc_rdma_fastreg(struct svcxprt_rdma *, struct svc_rdma_fastreg_mr *); 199 + extern struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *); 200 + extern void svc_rdma_put_frmr(struct svcxprt_rdma *, 201 + struct svc_rdma_fastreg_mr *); 217 202 extern void svc_sq_reap(struct svcxprt_rdma *); 218 203 extern void svc_rq_reap(struct svcxprt_rdma *); 219 204 extern struct svc_xprt_class svc_rdma_class;
+166 -21
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
··· 116 116 * 117 117 * Assumptions: 118 118 * - chunk[0]->position points to pages[0] at an offset of 0 119 - * - pages[] is not physically or virtually contigous and consists of 119 + * - pages[] is not physically or virtually contiguous and consists of 120 120 * PAGE_SIZE elements. 121 121 * 122 122 * Output: ··· 125 125 * chunk in the read list 126 126 * 127 127 */ 128 - static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt, 128 + static int map_read_chunks(struct svcxprt_rdma *xprt, 129 129 struct svc_rqst *rqstp, 130 130 struct svc_rdma_op_ctxt *head, 131 131 struct rpcrdma_msg *rmsgp, ··· 211 211 return sge_no; 212 212 } 213 213 214 - static void rdma_set_ctxt_sge(struct svcxprt_rdma *xprt, 215 - struct svc_rdma_op_ctxt *ctxt, 216 - struct kvec *vec, 217 - u64 *sgl_offset, 218 - int count) 214 + /* Map a read-chunk-list to an XDR and fast register the page-list. 215 + * 216 + * Assumptions: 217 + * - chunk[0] position points to pages[0] at an offset of 0 218 + * - pages[] will be made physically contiguous by creating a one-off memory 219 + * region using the fastreg verb. 220 + * - byte_count is # of bytes in read-chunk-list 221 + * - ch_count is # of chunks in read-chunk-list 222 + * 223 + * Output: 224 + * - sge array pointing into pages[] array. 
225 + * - chunk_sge array specifying sge index and count for each 226 + * chunk in the read list 227 + */ 228 + static int fast_reg_read_chunks(struct svcxprt_rdma *xprt, 229 + struct svc_rqst *rqstp, 230 + struct svc_rdma_op_ctxt *head, 231 + struct rpcrdma_msg *rmsgp, 232 + struct svc_rdma_req_map *rpl_map, 233 + struct svc_rdma_req_map *chl_map, 234 + int ch_count, 235 + int byte_count) 236 + { 237 + int page_no; 238 + int ch_no; 239 + u32 offset; 240 + struct rpcrdma_read_chunk *ch; 241 + struct svc_rdma_fastreg_mr *frmr; 242 + int ret = 0; 243 + 244 + frmr = svc_rdma_get_frmr(xprt); 245 + if (IS_ERR(frmr)) 246 + return -ENOMEM; 247 + 248 + head->frmr = frmr; 249 + head->arg.head[0] = rqstp->rq_arg.head[0]; 250 + head->arg.tail[0] = rqstp->rq_arg.tail[0]; 251 + head->arg.pages = &head->pages[head->count]; 252 + head->hdr_count = head->count; /* save count of hdr pages */ 253 + head->arg.page_base = 0; 254 + head->arg.page_len = byte_count; 255 + head->arg.len = rqstp->rq_arg.len + byte_count; 256 + head->arg.buflen = rqstp->rq_arg.buflen + byte_count; 257 + 258 + /* Fast register the page list */ 259 + frmr->kva = page_address(rqstp->rq_arg.pages[0]); 260 + frmr->direction = DMA_FROM_DEVICE; 261 + frmr->access_flags = (IB_ACCESS_LOCAL_WRITE|IB_ACCESS_REMOTE_WRITE); 262 + frmr->map_len = byte_count; 263 + frmr->page_list_len = PAGE_ALIGN(byte_count) >> PAGE_SHIFT; 264 + for (page_no = 0; page_no < frmr->page_list_len; page_no++) { 265 + frmr->page_list->page_list[page_no] = 266 + ib_dma_map_single(xprt->sc_cm_id->device, 267 + page_address(rqstp->rq_arg.pages[page_no]), 268 + PAGE_SIZE, DMA_TO_DEVICE); 269 + if (ib_dma_mapping_error(xprt->sc_cm_id->device, 270 + frmr->page_list->page_list[page_no])) 271 + goto fatal_err; 272 + atomic_inc(&xprt->sc_dma_used); 273 + head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no]; 274 + } 275 + head->count += page_no; 276 + 277 + /* rq_respages points one past arg pages */ 278 + rqstp->rq_respages = 
&rqstp->rq_arg.pages[page_no]; 279 + 280 + /* Create the reply and chunk maps */ 281 + offset = 0; 282 + ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; 283 + for (ch_no = 0; ch_no < ch_count; ch_no++) { 284 + rpl_map->sge[ch_no].iov_base = frmr->kva + offset; 285 + rpl_map->sge[ch_no].iov_len = ch->rc_target.rs_length; 286 + chl_map->ch[ch_no].count = 1; 287 + chl_map->ch[ch_no].start = ch_no; 288 + offset += ch->rc_target.rs_length; 289 + ch++; 290 + } 291 + 292 + ret = svc_rdma_fastreg(xprt, frmr); 293 + if (ret) 294 + goto fatal_err; 295 + 296 + return ch_no; 297 + 298 + fatal_err: 299 + printk("svcrdma: error fast registering xdr for xprt %p", xprt); 300 + svc_rdma_put_frmr(xprt, frmr); 301 + return -EIO; 302 + } 303 + 304 + static int rdma_set_ctxt_sge(struct svcxprt_rdma *xprt, 305 + struct svc_rdma_op_ctxt *ctxt, 306 + struct svc_rdma_fastreg_mr *frmr, 307 + struct kvec *vec, 308 + u64 *sgl_offset, 309 + int count) 219 310 { 220 311 int i; 221 312 222 313 ctxt->count = count; 223 314 ctxt->direction = DMA_FROM_DEVICE; 224 315 for (i = 0; i < count; i++) { 225 - atomic_inc(&xprt->sc_dma_used); 226 - ctxt->sge[i].addr = 227 - ib_dma_map_single(xprt->sc_cm_id->device, 228 - vec[i].iov_base, vec[i].iov_len, 229 - DMA_FROM_DEVICE); 316 + ctxt->sge[i].length = 0; /* in case map fails */ 317 + if (!frmr) { 318 + ctxt->sge[i].addr = 319 + ib_dma_map_single(xprt->sc_cm_id->device, 320 + vec[i].iov_base, 321 + vec[i].iov_len, 322 + DMA_FROM_DEVICE); 323 + if (ib_dma_mapping_error(xprt->sc_cm_id->device, 324 + ctxt->sge[i].addr)) 325 + return -EINVAL; 326 + ctxt->sge[i].lkey = xprt->sc_dma_lkey; 327 + atomic_inc(&xprt->sc_dma_used); 328 + } else { 329 + ctxt->sge[i].addr = (unsigned long)vec[i].iov_base; 330 + ctxt->sge[i].lkey = frmr->mr->lkey; 331 + } 230 332 ctxt->sge[i].length = vec[i].iov_len; 231 - ctxt->sge[i].lkey = xprt->sc_phys_mr->lkey; 232 333 *sgl_offset = *sgl_offset + vec[i].iov_len; 233 334 } 335 + return 0; 234 336 } 235 337 236 338 
static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) ··· 380 278 struct svc_rdma_op_ctxt *hdr_ctxt) 381 279 { 382 280 struct ib_send_wr read_wr; 281 + struct ib_send_wr inv_wr; 383 282 int err = 0; 384 283 int ch_no; 385 284 int ch_count; ··· 404 301 svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count); 405 302 if (ch_count > RPCSVC_MAXPAGES) 406 303 return -EINVAL; 407 - sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp, 408 - rpl_map, chl_map, 409 - ch_count, byte_count); 304 + 305 + if (!xprt->sc_frmr_pg_list_len) 306 + sge_count = map_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp, 307 + rpl_map, chl_map, ch_count, 308 + byte_count); 309 + else 310 + sge_count = fast_reg_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp, 311 + rpl_map, chl_map, ch_count, 312 + byte_count); 313 + if (sge_count < 0) { 314 + err = -EIO; 315 + goto out; 316 + } 317 + 410 318 sgl_offset = 0; 411 319 ch_no = 0; 412 320 ··· 426 312 next_sge: 427 313 ctxt = svc_rdma_get_context(xprt); 428 314 ctxt->direction = DMA_FROM_DEVICE; 315 + ctxt->frmr = hdr_ctxt->frmr; 316 + ctxt->read_hdr = NULL; 429 317 clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); 318 + clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); 430 319 431 320 /* Prepare READ WR */ 432 321 memset(&read_wr, 0, sizeof read_wr); 433 - ctxt->wr_op = IB_WR_RDMA_READ; 434 322 read_wr.wr_id = (unsigned long)ctxt; 435 323 read_wr.opcode = IB_WR_RDMA_READ; 324 + ctxt->wr_op = read_wr.opcode; 436 325 read_wr.send_flags = IB_SEND_SIGNALED; 437 326 read_wr.wr.rdma.rkey = ch->rc_target.rs_handle; 438 327 read_wr.wr.rdma.remote_addr = ··· 444 327 read_wr.sg_list = ctxt->sge; 445 328 read_wr.num_sge = 446 329 rdma_read_max_sge(xprt, chl_map->ch[ch_no].count); 447 - rdma_set_ctxt_sge(xprt, ctxt, 448 - &rpl_map->sge[chl_map->ch[ch_no].start], 449 - &sgl_offset, 450 - read_wr.num_sge); 330 + err = rdma_set_ctxt_sge(xprt, ctxt, hdr_ctxt->frmr, 331 + &rpl_map->sge[chl_map->ch[ch_no].start], 332 + &sgl_offset, 333 + read_wr.num_sge); 334 + if 
(err) { 335 + svc_rdma_unmap_dma(ctxt); 336 + svc_rdma_put_context(ctxt, 0); 337 + goto out; 338 + } 451 339 if (((ch+1)->rc_discrim == 0) && 452 340 (read_wr.num_sge == chl_map->ch[ch_no].count)) { 453 341 /* ··· 461 339 * the client and the RPC needs to be enqueued. 462 340 */ 463 341 set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); 342 + if (hdr_ctxt->frmr) { 343 + set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); 344 + /* 345 + * Invalidate the local MR used to map the data 346 + * sink. 347 + */ 348 + if (xprt->sc_dev_caps & 349 + SVCRDMA_DEVCAP_READ_W_INV) { 350 + read_wr.opcode = 351 + IB_WR_RDMA_READ_WITH_INV; 352 + ctxt->wr_op = read_wr.opcode; 353 + read_wr.ex.invalidate_rkey = 354 + ctxt->frmr->mr->lkey; 355 + } else { 356 + /* Prepare INVALIDATE WR */ 357 + memset(&inv_wr, 0, sizeof inv_wr); 358 + inv_wr.opcode = IB_WR_LOCAL_INV; 359 + inv_wr.send_flags = IB_SEND_SIGNALED; 360 + inv_wr.ex.invalidate_rkey = 361 + hdr_ctxt->frmr->mr->lkey; 362 + read_wr.next = &inv_wr; 363 + } 364 + } 464 365 ctxt->read_hdr = hdr_ctxt; 465 366 } 466 367 /* Post the read */
+215 -40
net/sunrpc/xprtrdma/svc_rdma_sendto.c
··· 69 69 * array is only concerned with the reply we are assured that we have 70 70 * on extra page for the RPCRMDA header. 71 71 */ 72 - static void xdr_to_sge(struct svcxprt_rdma *xprt, 73 - struct xdr_buf *xdr, 74 - struct svc_rdma_req_map *vec) 72 + int fast_reg_xdr(struct svcxprt_rdma *xprt, 73 + struct xdr_buf *xdr, 74 + struct svc_rdma_req_map *vec) 75 + { 76 + int sge_no; 77 + u32 sge_bytes; 78 + u32 page_bytes; 79 + u32 page_off; 80 + int page_no = 0; 81 + u8 *frva; 82 + struct svc_rdma_fastreg_mr *frmr; 83 + 84 + frmr = svc_rdma_get_frmr(xprt); 85 + if (IS_ERR(frmr)) 86 + return -ENOMEM; 87 + vec->frmr = frmr; 88 + 89 + /* Skip the RPCRDMA header */ 90 + sge_no = 1; 91 + 92 + /* Map the head. */ 93 + frva = (void *)((unsigned long)(xdr->head[0].iov_base) & PAGE_MASK); 94 + vec->sge[sge_no].iov_base = xdr->head[0].iov_base; 95 + vec->sge[sge_no].iov_len = xdr->head[0].iov_len; 96 + vec->count = 2; 97 + sge_no++; 98 + 99 + /* Build the FRMR */ 100 + frmr->kva = frva; 101 + frmr->direction = DMA_TO_DEVICE; 102 + frmr->access_flags = 0; 103 + frmr->map_len = PAGE_SIZE; 104 + frmr->page_list_len = 1; 105 + frmr->page_list->page_list[page_no] = 106 + ib_dma_map_single(xprt->sc_cm_id->device, 107 + (void *)xdr->head[0].iov_base, 108 + PAGE_SIZE, DMA_TO_DEVICE); 109 + if (ib_dma_mapping_error(xprt->sc_cm_id->device, 110 + frmr->page_list->page_list[page_no])) 111 + goto fatal_err; 112 + atomic_inc(&xprt->sc_dma_used); 113 + 114 + page_off = xdr->page_base; 115 + page_bytes = xdr->page_len + page_off; 116 + if (!page_bytes) 117 + goto encode_tail; 118 + 119 + /* Map the pages */ 120 + vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off; 121 + vec->sge[sge_no].iov_len = page_bytes; 122 + sge_no++; 123 + while (page_bytes) { 124 + struct page *page; 125 + 126 + page = xdr->pages[page_no++]; 127 + sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off)); 128 + page_bytes -= sge_bytes; 129 + 130 + frmr->page_list->page_list[page_no] = 131 + 
ib_dma_map_page(xprt->sc_cm_id->device, page, 0, 132 + PAGE_SIZE, DMA_TO_DEVICE); 133 + if (ib_dma_mapping_error(xprt->sc_cm_id->device, 134 + frmr->page_list->page_list[page_no])) 135 + goto fatal_err; 136 + 137 + atomic_inc(&xprt->sc_dma_used); 138 + page_off = 0; /* reset for next time through loop */ 139 + frmr->map_len += PAGE_SIZE; 140 + frmr->page_list_len++; 141 + } 142 + vec->count++; 143 + 144 + encode_tail: 145 + /* Map tail */ 146 + if (0 == xdr->tail[0].iov_len) 147 + goto done; 148 + 149 + vec->count++; 150 + vec->sge[sge_no].iov_len = xdr->tail[0].iov_len; 151 + 152 + if (((unsigned long)xdr->tail[0].iov_base & PAGE_MASK) == 153 + ((unsigned long)xdr->head[0].iov_base & PAGE_MASK)) { 154 + /* 155 + * If head and tail use the same page, we don't need 156 + * to map it again. 157 + */ 158 + vec->sge[sge_no].iov_base = xdr->tail[0].iov_base; 159 + } else { 160 + void *va; 161 + 162 + /* Map another page for the tail */ 163 + page_off = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK; 164 + va = (void *)((unsigned long)xdr->tail[0].iov_base & PAGE_MASK); 165 + vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off; 166 + 167 + frmr->page_list->page_list[page_no] = 168 + ib_dma_map_single(xprt->sc_cm_id->device, va, PAGE_SIZE, 169 + DMA_TO_DEVICE); 170 + if (ib_dma_mapping_error(xprt->sc_cm_id->device, 171 + frmr->page_list->page_list[page_no])) 172 + goto fatal_err; 173 + atomic_inc(&xprt->sc_dma_used); 174 + frmr->map_len += PAGE_SIZE; 175 + frmr->page_list_len++; 176 + } 177 + 178 + done: 179 + if (svc_rdma_fastreg(xprt, frmr)) 180 + goto fatal_err; 181 + 182 + return 0; 183 + 184 + fatal_err: 185 + printk("svcrdma: Error fast registering memory for xprt %p\n", xprt); 186 + svc_rdma_put_frmr(xprt, frmr); 187 + return -EIO; 188 + } 189 + 190 + static int map_xdr(struct svcxprt_rdma *xprt, 191 + struct xdr_buf *xdr, 192 + struct svc_rdma_req_map *vec) 75 193 { 76 194 int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3; 77 195 int sge_no; ··· 200 
82 201 83 BUG_ON(xdr->len != 202 84 (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)); 85 + 86 + if (xprt->sc_frmr_pg_list_len) 87 + return fast_reg_xdr(xprt, xdr, vec); 203 88 204 89 /* Skip the first sge, this is for the RPCRDMA header */ 205 90 sge_no = 1; ··· 237 116 238 117 BUG_ON(sge_no > sge_max); 239 118 vec->count = sge_no; 119 + return 0; 240 120 } 241 121 242 122 /* Assumptions: 123 + * - We are using FRMR 124 + * - or - 243 125 * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE 244 126 */ 245 127 static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, ··· 282 158 sge_no = 0; 283 159 284 160 /* Copy the remaining SGE */ 285 - while (bc != 0 && xdr_sge_no < vec->count) { 286 - sge[sge_no].lkey = xprt->sc_phys_mr->lkey; 287 - sge_bytes = min((size_t)bc, 288 - (size_t)(vec->sge[xdr_sge_no].iov_len-sge_off)); 161 + while (bc != 0) { 162 + sge_bytes = min_t(size_t, 163 + bc, vec->sge[xdr_sge_no].iov_len-sge_off); 289 164 sge[sge_no].length = sge_bytes; 290 - atomic_inc(&xprt->sc_dma_used); 291 - sge[sge_no].addr = 292 - ib_dma_map_single(xprt->sc_cm_id->device, 293 - (void *) 294 - vec->sge[xdr_sge_no].iov_base + sge_off, 295 - sge_bytes, DMA_TO_DEVICE); 296 - if (dma_mapping_error(xprt->sc_cm_id->device->dma_device, 297 - sge[sge_no].addr)) 298 - goto err; 165 + if (!vec->frmr) { 166 + sge[sge_no].addr = 167 + ib_dma_map_single(xprt->sc_cm_id->device, 168 + (void *) 169 + vec->sge[xdr_sge_no].iov_base + sge_off, 170 + sge_bytes, DMA_TO_DEVICE); 171 + if (ib_dma_mapping_error(xprt->sc_cm_id->device, 172 + sge[sge_no].addr)) 173 + goto err; 174 + atomic_inc(&xprt->sc_dma_used); 175 + sge[sge_no].lkey = xprt->sc_dma_lkey; 176 + } else { 177 + sge[sge_no].addr = (unsigned long) 178 + vec->sge[xdr_sge_no].iov_base + sge_off; 179 + sge[sge_no].lkey = vec->frmr->mr->lkey; 180 + } 181 + ctxt->count++; 182 + ctxt->frmr = vec->frmr; 299 183 sge_off = 0; 300 184 sge_no++; 301 - ctxt->count++; 302 185 xdr_sge_no++; 186 
+ BUG_ON(xdr_sge_no > vec->count); 303 187 bc -= sge_bytes; 304 188 } 305 - 306 - BUG_ON(bc != 0); 307 - BUG_ON(xdr_sge_no > vec->count); 308 189 309 190 /* Prepare WRITE WR */ 310 191 memset(&write_wr, 0, sizeof write_wr); ··· 355 226 res_ary = (struct rpcrdma_write_array *) 356 227 &rdma_resp->rm_body.rm_chunks[1]; 357 228 358 - max_write = xprt->sc_max_sge * PAGE_SIZE; 229 + if (vec->frmr) 230 + max_write = vec->frmr->map_len; 231 + else 232 + max_write = xprt->sc_max_sge * PAGE_SIZE; 359 233 360 234 /* Write chunks start at the pagelist */ 361 235 for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0; ··· 429 297 res_ary = (struct rpcrdma_write_array *) 430 298 &rdma_resp->rm_body.rm_chunks[2]; 431 299 432 - max_write = xprt->sc_max_sge * PAGE_SIZE; 300 + if (vec->frmr) 301 + max_write = vec->frmr->map_len; 302 + else 303 + max_write = xprt->sc_max_sge * PAGE_SIZE; 433 304 434 305 /* xdr offset starts at RPC message */ 435 306 for (xdr_off = 0, chunk_no = 0; ··· 441 306 u64 rs_offset; 442 307 ch = &arg_ary->wc_array[chunk_no].wc_target; 443 308 write_len = min(xfer_len, ch->rs_length); 444 - 445 309 446 310 /* Prepare the reply chunk given the length actually 447 311 * written */ ··· 500 366 int byte_count) 501 367 { 502 368 struct ib_send_wr send_wr; 369 + struct ib_send_wr inv_wr; 503 370 int sge_no; 504 371 int sge_bytes; 505 372 int page_no; ··· 520 385 /* Prepare the context */ 521 386 ctxt->pages[0] = page; 522 387 ctxt->count = 1; 388 + ctxt->frmr = vec->frmr; 389 + if (vec->frmr) 390 + set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); 391 + else 392 + clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); 523 393 524 394 /* Prepare the SGE for the RPCRDMA Header */ 525 - atomic_inc(&rdma->sc_dma_used); 526 395 ctxt->sge[0].addr = 527 396 ib_dma_map_page(rdma->sc_cm_id->device, 528 397 page, 0, PAGE_SIZE, DMA_TO_DEVICE); 398 + if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) 399 + goto err; 400 + atomic_inc(&rdma->sc_dma_used); 401 + 529 
402 ctxt->direction = DMA_TO_DEVICE; 403 + 530 404 ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp); 531 - ctxt->sge[0].lkey = rdma->sc_phys_mr->lkey; 405 + ctxt->sge[0].lkey = rdma->sc_dma_lkey; 532 406 533 407 /* Determine how many of our SGE are to be transmitted */ 534 408 for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) { 535 409 sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count); 536 410 byte_count -= sge_bytes; 537 - atomic_inc(&rdma->sc_dma_used); 538 - ctxt->sge[sge_no].addr = 539 - ib_dma_map_single(rdma->sc_cm_id->device, 540 - vec->sge[sge_no].iov_base, 541 - sge_bytes, DMA_TO_DEVICE); 411 + if (!vec->frmr) { 412 + ctxt->sge[sge_no].addr = 413 + ib_dma_map_single(rdma->sc_cm_id->device, 414 + vec->sge[sge_no].iov_base, 415 + sge_bytes, DMA_TO_DEVICE); 416 + if (ib_dma_mapping_error(rdma->sc_cm_id->device, 417 + ctxt->sge[sge_no].addr)) 418 + goto err; 419 + atomic_inc(&rdma->sc_dma_used); 420 + ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey; 421 + } else { 422 + ctxt->sge[sge_no].addr = (unsigned long) 423 + vec->sge[sge_no].iov_base; 424 + ctxt->sge[sge_no].lkey = vec->frmr->mr->lkey; 425 + } 542 426 ctxt->sge[sge_no].length = sge_bytes; 543 - ctxt->sge[sge_no].lkey = rdma->sc_phys_mr->lkey; 544 427 } 545 428 BUG_ON(byte_count != 0); 546 429 ··· 570 417 ctxt->pages[page_no+1] = rqstp->rq_respages[page_no]; 571 418 ctxt->count++; 572 419 rqstp->rq_respages[page_no] = NULL; 573 - /* If there are more pages than SGE, terminate SGE list */ 420 + /* 421 + * If there are more pages than SGE, terminate SGE 422 + * list so that svc_rdma_unmap_dma doesn't attempt to 423 + * unmap garbage. 
424 + */ 574 425 if (page_no+1 >= sge_no) 575 426 ctxt->sge[page_no+1].length = 0; 576 427 } 577 428 BUG_ON(sge_no > rdma->sc_max_sge); 429 + BUG_ON(sge_no > ctxt->count); 578 430 memset(&send_wr, 0, sizeof send_wr); 579 431 ctxt->wr_op = IB_WR_SEND; 580 432 send_wr.wr_id = (unsigned long)ctxt; ··· 587 429 send_wr.num_sge = sge_no; 588 430 send_wr.opcode = IB_WR_SEND; 589 431 send_wr.send_flags = IB_SEND_SIGNALED; 432 + if (vec->frmr) { 433 + /* Prepare INVALIDATE WR */ 434 + memset(&inv_wr, 0, sizeof inv_wr); 435 + inv_wr.opcode = IB_WR_LOCAL_INV; 436 + inv_wr.send_flags = IB_SEND_SIGNALED; 437 + inv_wr.ex.invalidate_rkey = 438 + vec->frmr->mr->lkey; 439 + send_wr.next = &inv_wr; 440 + } 590 441 591 442 ret = svc_rdma_send(rdma, &send_wr); 592 443 if (ret) 593 - svc_rdma_put_context(ctxt, 1); 444 + goto err; 594 445 595 - return ret; 446 + return 0; 447 + 448 + err: 449 + svc_rdma_put_frmr(rdma, vec->frmr); 450 + svc_rdma_put_context(ctxt, 1); 451 + return -EIO; 596 452 } 597 453 598 454 void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp) ··· 649 477 ctxt = svc_rdma_get_context(rdma); 650 478 ctxt->direction = DMA_TO_DEVICE; 651 479 vec = svc_rdma_get_req_map(); 652 - xdr_to_sge(rdma, &rqstp->rq_res, vec); 653 - 480 + ret = map_xdr(rdma, &rqstp->rq_res, vec); 481 + if (ret) 482 + goto err0; 654 483 inline_bytes = rqstp->rq_res.len; 655 484 656 485 /* Create the RDMA response header */ ··· 671 498 if (ret < 0) { 672 499 printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n", 673 500 ret); 674 - goto error; 501 + goto err1; 675 502 } 676 503 inline_bytes -= ret; 677 504 ··· 681 508 if (ret < 0) { 682 509 printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n", 683 510 ret); 684 - goto error; 511 + goto err1; 685 512 } 686 513 inline_bytes -= ret; 687 514 ··· 690 517 svc_rdma_put_req_map(vec); 691 518 dprintk("svcrdma: send_reply returns %d\n", ret); 692 519 return ret; 693 - error: 520 + 521 + err1: 522 + put_page(res_page); 523 + err0: 694 524 
svc_rdma_put_req_map(vec); 695 525 svc_rdma_put_context(ctxt, 0); 696 - put_page(res_page); 697 526 return ret; 698 527 }
+303 -61
net/sunrpc/xprtrdma/svc_rdma_transport.c
··· 100 100 ctxt->xprt = xprt; 101 101 INIT_LIST_HEAD(&ctxt->dto_q); 102 102 ctxt->count = 0; 103 + ctxt->frmr = NULL; 103 104 atomic_inc(&xprt->sc_ctxt_used); 104 105 return ctxt; 105 106 } 106 107 107 - static void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) 108 + void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) 108 109 { 109 110 struct svcxprt_rdma *xprt = ctxt->xprt; 110 111 int i; 111 112 for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) { 112 - atomic_dec(&xprt->sc_dma_used); 113 - ib_dma_unmap_single(xprt->sc_cm_id->device, 114 - ctxt->sge[i].addr, 115 - ctxt->sge[i].length, 116 - ctxt->direction); 113 + /* 114 + * Unmap the DMA addr in the SGE if the lkey matches 115 + * the sc_dma_lkey, otherwise, ignore it since it is 116 + * an FRMR lkey and will be unmapped later when the 117 + * last WR that uses it completes. 118 + */ 119 + if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) { 120 + atomic_dec(&xprt->sc_dma_used); 121 + ib_dma_unmap_single(xprt->sc_cm_id->device, 122 + ctxt->sge[i].addr, 123 + ctxt->sge[i].length, 124 + ctxt->direction); 125 + } 117 126 } 118 127 } 119 128 ··· 159 150 schedule_timeout_uninterruptible(msecs_to_jiffies(500)); 160 151 } 161 152 map->count = 0; 153 + map->frmr = NULL; 162 154 return map; 163 155 } 164 156 ··· 326 316 } 327 317 328 318 /* 319 + * Processs a completion context 320 + */ 321 + static void process_context(struct svcxprt_rdma *xprt, 322 + struct svc_rdma_op_ctxt *ctxt) 323 + { 324 + svc_rdma_unmap_dma(ctxt); 325 + 326 + switch (ctxt->wr_op) { 327 + case IB_WR_SEND: 328 + if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags)) 329 + svc_rdma_put_frmr(xprt, ctxt->frmr); 330 + svc_rdma_put_context(ctxt, 1); 331 + break; 332 + 333 + case IB_WR_RDMA_WRITE: 334 + svc_rdma_put_context(ctxt, 0); 335 + break; 336 + 337 + case IB_WR_RDMA_READ: 338 + case IB_WR_RDMA_READ_WITH_INV: 339 + if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { 340 + struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; 341 + 
BUG_ON(!read_hdr); 342 + if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags)) 343 + svc_rdma_put_frmr(xprt, ctxt->frmr); 344 + spin_lock_bh(&xprt->sc_rq_dto_lock); 345 + set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); 346 + list_add_tail(&read_hdr->dto_q, 347 + &xprt->sc_read_complete_q); 348 + spin_unlock_bh(&xprt->sc_rq_dto_lock); 349 + svc_xprt_enqueue(&xprt->sc_xprt); 350 + } 351 + svc_rdma_put_context(ctxt, 0); 352 + break; 353 + 354 + default: 355 + printk(KERN_ERR "svcrdma: unexpected completion type, " 356 + "opcode=%d\n", 357 + ctxt->wr_op); 358 + break; 359 + } 360 + } 361 + 362 + /* 329 363 * Send Queue Completion Handler - potentially called on interrupt context. 330 364 * 331 365 * Note that caller must hold a transport reference. ··· 381 327 struct ib_cq *cq = xprt->sc_sq_cq; 382 328 int ret; 383 329 384 - 385 330 if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) 386 331 return; 387 332 388 333 ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP); 389 334 atomic_inc(&rdma_stat_sq_poll); 390 335 while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) { 391 - ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; 392 - xprt = ctxt->xprt; 393 - 394 - svc_rdma_unmap_dma(ctxt); 395 336 if (wc.status != IB_WC_SUCCESS) 396 337 /* Close the transport */ 397 338 set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); ··· 395 346 atomic_dec(&xprt->sc_sq_count); 396 347 wake_up(&xprt->sc_send_wait); 397 348 398 - switch (ctxt->wr_op) { 399 - case IB_WR_SEND: 400 - svc_rdma_put_context(ctxt, 1); 401 - break; 349 + ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; 350 + if (ctxt) 351 + process_context(xprt, ctxt); 402 352 403 - case IB_WR_RDMA_WRITE: 404 - svc_rdma_put_context(ctxt, 0); 405 - break; 406 - 407 - case IB_WR_RDMA_READ: 408 - if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { 409 - struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; 410 - BUG_ON(!read_hdr); 411 - spin_lock_bh(&xprt->sc_rq_dto_lock); 412 - set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); 413 - 
list_add_tail(&read_hdr->dto_q, 414 - &xprt->sc_read_complete_q); 415 - spin_unlock_bh(&xprt->sc_rq_dto_lock); 416 - svc_xprt_enqueue(&xprt->sc_xprt); 417 - } 418 - svc_rdma_put_context(ctxt, 0); 419 - break; 420 - 421 - default: 422 - printk(KERN_ERR "svcrdma: unexpected completion type, " 423 - "opcode=%d, status=%d\n", 424 - wc.opcode, wc.status); 425 - break; 426 - } 427 353 svc_xprt_put(&xprt->sc_xprt); 428 354 } 429 355 ··· 449 425 INIT_LIST_HEAD(&cma_xprt->sc_dto_q); 450 426 INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); 451 427 INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); 428 + INIT_LIST_HEAD(&cma_xprt->sc_frmr_q); 452 429 init_waitqueue_head(&cma_xprt->sc_send_wait); 453 430 454 431 spin_lock_init(&cma_xprt->sc_lock); 455 432 spin_lock_init(&cma_xprt->sc_rq_dto_lock); 433 + spin_lock_init(&cma_xprt->sc_frmr_q_lock); 456 434 457 435 cma_xprt->sc_ord = svcrdma_ord; 458 436 ··· 488 462 struct ib_recv_wr recv_wr, *bad_recv_wr; 489 463 struct svc_rdma_op_ctxt *ctxt; 490 464 struct page *page; 491 - unsigned long pa; 465 + dma_addr_t pa; 492 466 int sge_no; 493 467 int buflen; 494 468 int ret; ··· 500 474 BUG_ON(sge_no >= xprt->sc_max_sge); 501 475 page = svc_rdma_get_page(); 502 476 ctxt->pages[sge_no] = page; 503 - atomic_inc(&xprt->sc_dma_used); 504 477 pa = ib_dma_map_page(xprt->sc_cm_id->device, 505 478 page, 0, PAGE_SIZE, 506 479 DMA_FROM_DEVICE); 480 + if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa)) 481 + goto err_put_ctxt; 482 + atomic_inc(&xprt->sc_dma_used); 507 483 ctxt->sge[sge_no].addr = pa; 508 484 ctxt->sge[sge_no].length = PAGE_SIZE; 509 - ctxt->sge[sge_no].lkey = xprt->sc_phys_mr->lkey; 485 + ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey; 510 486 buflen += PAGE_SIZE; 511 487 } 512 488 ctxt->count = sge_no; ··· 524 496 svc_rdma_put_context(ctxt, 1); 525 497 } 526 498 return ret; 499 + 500 + err_put_ctxt: 501 + svc_rdma_put_context(ctxt, 1); 502 + return -ENOMEM; 527 503 } 528 504 529 505 /* ··· 598 566 dprintk("svcrdma: Connect request on 
cma_id=%p, xprt = %p, " 599 567 "event=%d\n", cma_id, cma_id->context, event->event); 600 568 handle_connect_req(cma_id, 601 - event->param.conn.responder_resources); 569 + event->param.conn.initiator_depth); 602 570 break; 603 571 604 572 case RDMA_CM_EVENT_ESTABLISHED: ··· 718 686 return ERR_PTR(ret); 719 687 } 720 688 689 + static struct svc_rdma_fastreg_mr *rdma_alloc_frmr(struct svcxprt_rdma *xprt) 690 + { 691 + struct ib_mr *mr; 692 + struct ib_fast_reg_page_list *pl; 693 + struct svc_rdma_fastreg_mr *frmr; 694 + 695 + frmr = kmalloc(sizeof(*frmr), GFP_KERNEL); 696 + if (!frmr) 697 + goto err; 698 + 699 + mr = ib_alloc_fast_reg_mr(xprt->sc_pd, RPCSVC_MAXPAGES); 700 + if (!mr) 701 + goto err_free_frmr; 702 + 703 + pl = ib_alloc_fast_reg_page_list(xprt->sc_cm_id->device, 704 + RPCSVC_MAXPAGES); 705 + if (!pl) 706 + goto err_free_mr; 707 + 708 + frmr->mr = mr; 709 + frmr->page_list = pl; 710 + INIT_LIST_HEAD(&frmr->frmr_list); 711 + return frmr; 712 + 713 + err_free_mr: 714 + ib_dereg_mr(mr); 715 + err_free_frmr: 716 + kfree(frmr); 717 + err: 718 + return ERR_PTR(-ENOMEM); 719 + } 720 + 721 + static void rdma_dealloc_frmr_q(struct svcxprt_rdma *xprt) 722 + { 723 + struct svc_rdma_fastreg_mr *frmr; 724 + 725 + while (!list_empty(&xprt->sc_frmr_q)) { 726 + frmr = list_entry(xprt->sc_frmr_q.next, 727 + struct svc_rdma_fastreg_mr, frmr_list); 728 + list_del_init(&frmr->frmr_list); 729 + ib_dereg_mr(frmr->mr); 730 + ib_free_fast_reg_page_list(frmr->page_list); 731 + kfree(frmr); 732 + } 733 + } 734 + 735 + struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma) 736 + { 737 + struct svc_rdma_fastreg_mr *frmr = NULL; 738 + 739 + spin_lock_bh(&rdma->sc_frmr_q_lock); 740 + if (!list_empty(&rdma->sc_frmr_q)) { 741 + frmr = list_entry(rdma->sc_frmr_q.next, 742 + struct svc_rdma_fastreg_mr, frmr_list); 743 + list_del_init(&frmr->frmr_list); 744 + frmr->map_len = 0; 745 + frmr->page_list_len = 0; 746 + } 747 + spin_unlock_bh(&rdma->sc_frmr_q_lock); 748 + if 
(frmr) 749 + return frmr; 750 + 751 + return rdma_alloc_frmr(rdma); 752 + } 753 + 754 + static void frmr_unmap_dma(struct svcxprt_rdma *xprt, 755 + struct svc_rdma_fastreg_mr *frmr) 756 + { 757 + int page_no; 758 + for (page_no = 0; page_no < frmr->page_list_len; page_no++) { 759 + dma_addr_t addr = frmr->page_list->page_list[page_no]; 760 + if (ib_dma_mapping_error(frmr->mr->device, addr)) 761 + continue; 762 + atomic_dec(&xprt->sc_dma_used); 763 + ib_dma_unmap_single(frmr->mr->device, addr, PAGE_SIZE, 764 + frmr->direction); 765 + } 766 + } 767 + 768 + void svc_rdma_put_frmr(struct svcxprt_rdma *rdma, 769 + struct svc_rdma_fastreg_mr *frmr) 770 + { 771 + if (frmr) { 772 + frmr_unmap_dma(rdma, frmr); 773 + spin_lock_bh(&rdma->sc_frmr_q_lock); 774 + BUG_ON(!list_empty(&frmr->frmr_list)); 775 + list_add(&frmr->frmr_list, &rdma->sc_frmr_q); 776 + spin_unlock_bh(&rdma->sc_frmr_q_lock); 777 + } 778 + } 779 + 721 780 /* 722 781 * This is the xpo_recvfrom function for listening endpoints. Its 723 782 * purpose is to accept incoming connections. The CMA callback handler ··· 827 704 struct rdma_conn_param conn_param; 828 705 struct ib_qp_init_attr qp_attr; 829 706 struct ib_device_attr devattr; 707 + int dma_mr_acc; 708 + int need_dma_mr; 830 709 int ret; 831 710 int i; 832 711 ··· 944 819 } 945 820 newxprt->sc_qp = newxprt->sc_cm_id->qp; 946 821 947 - /* Register all of physical memory */ 948 - newxprt->sc_phys_mr = ib_get_dma_mr(newxprt->sc_pd, 949 - IB_ACCESS_LOCAL_WRITE | 950 - IB_ACCESS_REMOTE_WRITE); 951 - if (IS_ERR(newxprt->sc_phys_mr)) { 952 - dprintk("svcrdma: Failed to create DMA MR ret=%d\n", ret); 822 + /* 823 + * Use the most secure set of MR resources based on the 824 + * transport type and available memory management features in 825 + * the device. 
Here's the table implemented below: 826 + * 827 + * Fast Global DMA Remote WR 828 + * Reg LKEY MR Access 829 + * Sup'd Sup'd Needed Needed 830 + * 831 + * IWARP N N Y Y 832 + * N Y Y Y 833 + * Y N Y N 834 + * Y Y N - 835 + * 836 + * IB N N Y N 837 + * N Y N - 838 + * Y N Y N 839 + * Y Y N - 840 + * 841 + * NB: iWARP requires remote write access for the data sink 842 + * of an RDMA_READ. IB does not. 843 + */ 844 + if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { 845 + newxprt->sc_frmr_pg_list_len = 846 + devattr.max_fast_reg_page_list_len; 847 + newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG; 848 + } 849 + 850 + /* 851 + * Determine if a DMA MR is required and if so, what privs are required 852 + */ 853 + switch (rdma_node_get_transport(newxprt->sc_cm_id->device->node_type)) { 854 + case RDMA_TRANSPORT_IWARP: 855 + newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV; 856 + if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) { 857 + need_dma_mr = 1; 858 + dma_mr_acc = 859 + (IB_ACCESS_LOCAL_WRITE | 860 + IB_ACCESS_REMOTE_WRITE); 861 + } else if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) { 862 + need_dma_mr = 1; 863 + dma_mr_acc = IB_ACCESS_LOCAL_WRITE; 864 + } else 865 + need_dma_mr = 0; 866 + break; 867 + case RDMA_TRANSPORT_IB: 868 + if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) { 869 + need_dma_mr = 1; 870 + dma_mr_acc = IB_ACCESS_LOCAL_WRITE; 871 + } else 872 + need_dma_mr = 0; 873 + break; 874 + default: 953 875 goto errout; 954 876 } 877 + 878 + /* Create the DMA MR if needed, otherwise, use the DMA LKEY */ 879 + if (need_dma_mr) { 880 + /* Register all of physical memory */ 881 + newxprt->sc_phys_mr = 882 + ib_get_dma_mr(newxprt->sc_pd, dma_mr_acc); 883 + if (IS_ERR(newxprt->sc_phys_mr)) { 884 + dprintk("svcrdma: Failed to create DMA MR ret=%d\n", 885 + ret); 886 + goto errout; 887 + } 888 + newxprt->sc_dma_lkey = newxprt->sc_phys_mr->lkey; 889 + } else 890 + newxprt->sc_dma_lkey = 891 + 
newxprt->sc_cm_id->device->local_dma_lkey; 955 892 956 893 /* Post receive buffers */ 957 894 for (i = 0; i < newxprt->sc_max_requests; i++) { ··· 1148 961 WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0); 1149 962 WARN_ON(atomic_read(&rdma->sc_dma_used) != 0); 1150 963 964 + /* De-allocate fastreg mr */ 965 + rdma_dealloc_frmr_q(rdma); 966 + 1151 967 /* Destroy the QP if present (not a listener) */ 1152 968 if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) 1153 969 ib_destroy_qp(rdma->sc_qp); ··· 1204 1014 return 1; 1205 1015 } 1206 1016

/*
 * Post a fast-register work request for the memory described by @frmr.
 *
 * The low byte of the MR's key is incremented before each registration.
 * The WR is then built from the descriptor's kva, page list, page count,
 * mapped length and access flags, and handed to svc_rdma_send().
 *
 * This function does not check device capabilities or obtain the frmr;
 * the caller passes in a descriptor that is already filled in.
 *
 * Returns the value returned by svc_rdma_send().
 */
int svc_rdma_fastreg(struct svcxprt_rdma *xprt,
		     struct svc_rdma_fastreg_mr *frmr)
{
	struct ib_send_wr fastreg_wr;
	u8 key;

	/* Bump the key */
	key = (u8)(frmr->mr->lkey & 0x000000FF);
	ib_update_fast_reg_key(frmr->mr, ++key);

	/* Prepare FASTREG WR */
	memset(&fastreg_wr, 0, sizeof fastreg_wr);
	fastreg_wr.opcode = IB_WR_FAST_REG_MR;
	/* svc_rdma_send() BUG()s unless send_flags is exactly SIGNALED */
	fastreg_wr.send_flags = IB_SEND_SIGNALED;
	fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva;
	fastreg_wr.wr.fast_reg.page_list = frmr->page_list;
	fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len;
	fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
	fastreg_wr.wr.fast_reg.length = frmr->map_len;
	fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags;
	/*
	 * NOTE(review): the rkey is taken from mr->lkey; presumably
	 * ib_update_fast_reg_key() keeps the two keys in step -- confirm.
	 */
	fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey;
	return svc_rdma_send(xprt, &fastreg_wr);
}

1207 1051 int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) 1208 1052 { 1209 - struct ib_send_wr *bad_wr; 1053 + 
struct ib_send_wr *bad_wr, *n_wr; 1054 + int wr_count; 1055 + int i; 1210 1056 int ret; 1211 1057 1212 1058 if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) 1213 1059 return -ENOTCONN; 1214 1060 1215 1061 BUG_ON(wr->send_flags != IB_SEND_SIGNALED); 1216 - BUG_ON(((struct svc_rdma_op_ctxt *)(unsigned long)wr->wr_id)->wr_op != 1217 - wr->opcode); 1062 + wr_count = 1; 1063 + for (n_wr = wr->next; n_wr; n_wr = n_wr->next) 1064 + wr_count++; 1065 + 1218 1066 /* If the SQ is full, wait until an SQ entry is available */ 1219 1067 while (1) { 1220 1068 spin_lock_bh(&xprt->sc_lock); 1221 - if (xprt->sc_sq_depth == atomic_read(&xprt->sc_sq_count)) { 1069 + if (xprt->sc_sq_depth < atomic_read(&xprt->sc_sq_count) + wr_count) { 1222 1070 spin_unlock_bh(&xprt->sc_lock); 1223 1071 atomic_inc(&rdma_stat_sq_starve); 1224 1072 ··· 1271 1043 return 0; 1272 1044 continue; 1273 1045 } 1274 - /* Bumped used SQ WR count and post */ 1275 - svc_xprt_get(&xprt->sc_xprt); 1046 + /* Take a transport ref for each WR posted */ 1047 + for (i = 0; i < wr_count; i++) 1048 + svc_xprt_get(&xprt->sc_xprt); 1049 + 1050 + /* Bump used SQ WR count and post */ 1051 + atomic_add(wr_count, &xprt->sc_sq_count); 1276 1052 ret = ib_post_send(xprt->sc_qp, wr, &bad_wr); 1277 - if (!ret) 1278 - atomic_inc(&xprt->sc_sq_count); 1279 - else { 1280 - svc_xprt_put(&xprt->sc_xprt); 1053 + if (ret) { 1054 + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); 1055 + atomic_sub(wr_count, &xprt->sc_sq_count); 1056 + for (i = 0; i < wr_count; i ++) 1057 + svc_xprt_put(&xprt->sc_xprt); 1281 1058 dprintk("svcrdma: failed to post SQ WR rc=%d, " 1282 1059 "sc_sq_count=%d, sc_sq_depth=%d\n", 1283 1060 ret, atomic_read(&xprt->sc_sq_count), 1284 1061 xprt->sc_sq_depth); 1285 1062 } 1286 1063 spin_unlock_bh(&xprt->sc_lock); 1064 + if (ret) 1065 + wake_up(&xprt->sc_send_wait); 1287 1066 break; 1288 1067 } 1289 1068 return ret; ··· 1314 1079 length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va); 1315 1080 1316 1081 /* Prepare SGE for 
local address */ 1317 - atomic_inc(&xprt->sc_dma_used); 1318 1082 sge.addr = ib_dma_map_page(xprt->sc_cm_id->device, 1319 1083 p, 0, PAGE_SIZE, DMA_FROM_DEVICE); 1320 - sge.lkey = xprt->sc_phys_mr->lkey; 1084 + if (ib_dma_mapping_error(xprt->sc_cm_id->device, sge.addr)) { 1085 + put_page(p); 1086 + return; 1087 + } 1088 + atomic_inc(&xprt->sc_dma_used); 1089 + sge.lkey = xprt->sc_dma_lkey; 1321 1090 sge.length = length; 1322 1091 1323 1092 ctxt = svc_rdma_get_context(xprt); ··· 1342 1103 if (ret) { 1343 1104 dprintk("svcrdma: Error %d posting send for protocol error\n", 1344 1105 ret); 1106 + ib_dma_unmap_page(xprt->sc_cm_id->device, 1107 + sge.addr, PAGE_SIZE, 1108 + DMA_FROM_DEVICE); 1345 1109 svc_rdma_put_context(ctxt, 1); 1346 1110 } 1347 1111 }