Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

IB/iser: Introduce fast memory registration model (FRWR)

Newer HCAs and virtual functions may not support FMRs, but may instead support a fast
registration model, which we call FRWR - "Fast Registration Work Requests".

This model was introduced in 00f7ec36c ("RDMA/core: Add memory management
extensions support") and works when the IB device supports the
IB_DEVICE_MEM_MGT_EXTENSIONS capability.

Upon creating the iser device, iser will test whether the HCA supports
FMRs. If FMRs are not supported, it checks whether IB_DEVICE_MEM_MGT_EXTENSIONS
is supported and, if so, assigns function pointers that handle fast
registration and allocation of the appropriate resources (fast_reg
descriptors).

Registration is done by posting IB_WR_FAST_REG_MR to the QP, and
invalidation by posting IB_WR_LOCAL_INV.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>

authored by

Sagi Grimberg and committed by
Roland Dreier
5587856c e657571b

+287 -12
+20 -1
drivers/infiniband/ulp/iser/iscsi_iser.h
··· 211 211 u64 va; 212 212 u64 len; 213 213 void *mem_h; 214 - int is_fmr; 214 + int is_mr; 215 215 }; 216 216 217 217 struct iser_regd_buf { ··· 277 277 enum iser_data_dir cmd_dir); 278 278 }; 279 279 280 + struct fast_reg_descriptor { 281 + struct list_head list; 282 + /* For fast registration - FRWR */ 283 + struct ib_mr *data_mr; 284 + struct ib_fast_reg_page_list *data_frpl; 285 + /* Valid for fast registration flag */ 286 + bool valid; 287 + }; 288 + 280 289 struct iser_conn { 281 290 struct iscsi_iser_conn *iser_conn; /* iser conn for upcalls */ 282 291 struct iscsi_endpoint *ep; ··· 316 307 struct iser_page_vec *page_vec; /* represents SG to fmr maps* 317 308 * maps serialized as tx is*/ 318 309 } fmr; 310 + struct { 311 + struct list_head pool; 312 + int pool_size; 313 + } frwr; 319 314 } fastreg; 320 315 }; 321 316 ··· 406 393 407 394 int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *task, 408 395 enum iser_data_dir cmd_dir); 396 + int iser_reg_rdma_mem_frwr(struct iscsi_iser_task *task, 397 + enum iser_data_dir cmd_dir); 409 398 410 399 int iser_connect(struct iser_conn *ib_conn, 411 400 struct sockaddr_in *src_addr, ··· 420 405 421 406 void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task, 422 407 enum iser_data_dir cmd_dir); 408 + void iser_unreg_mem_frwr(struct iscsi_iser_task *iser_task, 409 + enum iser_data_dir cmd_dir); 423 410 424 411 int iser_post_recvl(struct iser_conn *ib_conn); 425 412 int iser_post_recvm(struct iser_conn *ib_conn, int count); ··· 438 421 int iser_alloc_rx_descriptors(struct iser_conn *ib_conn, struct iscsi_session *session); 439 422 int iser_create_fmr_pool(struct iser_conn *ib_conn, unsigned cmds_max); 440 423 void iser_free_fmr_pool(struct iser_conn *ib_conn); 424 + int iser_create_frwr_pool(struct iser_conn *ib_conn, unsigned cmds_max); 425 + void iser_free_frwr_pool(struct iser_conn *ib_conn); 441 426 #endif
+137 -3
drivers/infiniband/ulp/iser/iser_memory.c
··· 395 395 regd_buf = &iser_task->rdma_regd[cmd_dir]; 396 396 397 397 aligned_len = iser_data_buf_aligned_len(mem, ibdev); 398 - if (aligned_len != mem->dma_nents || 399 - (!ib_conn->fastreg.fmr.pool && mem->dma_nents > 1)) { 398 + if (aligned_len != mem->dma_nents) { 400 399 err = fall_to_bounce_buf(iser_task, ibdev, 401 400 cmd_dir, aligned_len); 402 401 if (err) { ··· 413 414 regd_buf->reg.rkey = device->mr->rkey; 414 415 regd_buf->reg.len = ib_sg_dma_len(ibdev, &sg[0]); 415 416 regd_buf->reg.va = ib_sg_dma_address(ibdev, &sg[0]); 416 - regd_buf->reg.is_fmr = 0; 417 + regd_buf->reg.is_mr = 0; 417 418 418 419 iser_dbg("PHYSICAL Mem.register: lkey: 0x%08X rkey: 0x%08X " 419 420 "va: 0x%08lX sz: %ld]\n", ··· 442 443 return err; 443 444 } 444 445 return 0; 446 + } 447 + 448 + static int iser_fast_reg_mr(struct fast_reg_descriptor *desc, 449 + struct iser_conn *ib_conn, 450 + struct iser_regd_buf *regd_buf, 451 + u32 offset, unsigned int data_size, 452 + unsigned int page_list_len) 453 + { 454 + struct ib_send_wr fastreg_wr, inv_wr; 455 + struct ib_send_wr *bad_wr, *wr = NULL; 456 + u8 key; 457 + int ret; 458 + 459 + if (!desc->valid) { 460 + memset(&inv_wr, 0, sizeof(inv_wr)); 461 + inv_wr.opcode = IB_WR_LOCAL_INV; 462 + inv_wr.send_flags = IB_SEND_SIGNALED; 463 + inv_wr.ex.invalidate_rkey = desc->data_mr->rkey; 464 + wr = &inv_wr; 465 + /* Bump the key */ 466 + key = (u8)(desc->data_mr->rkey & 0x000000FF); 467 + ib_update_fast_reg_key(desc->data_mr, ++key); 468 + } 469 + 470 + /* Prepare FASTREG WR */ 471 + memset(&fastreg_wr, 0, sizeof(fastreg_wr)); 472 + fastreg_wr.opcode = IB_WR_FAST_REG_MR; 473 + fastreg_wr.send_flags = IB_SEND_SIGNALED; 474 + fastreg_wr.wr.fast_reg.iova_start = desc->data_frpl->page_list[0] + offset; 475 + fastreg_wr.wr.fast_reg.page_list = desc->data_frpl; 476 + fastreg_wr.wr.fast_reg.page_list_len = page_list_len; 477 + fastreg_wr.wr.fast_reg.page_shift = SHIFT_4K; 478 + fastreg_wr.wr.fast_reg.length = data_size; 479 + 
fastreg_wr.wr.fast_reg.rkey = desc->data_mr->rkey; 480 + fastreg_wr.wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE | 481 + IB_ACCESS_REMOTE_WRITE | 482 + IB_ACCESS_REMOTE_READ); 483 + 484 + if (!wr) { 485 + wr = &fastreg_wr; 486 + atomic_inc(&ib_conn->post_send_buf_count); 487 + } else { 488 + wr->next = &fastreg_wr; 489 + atomic_add(2, &ib_conn->post_send_buf_count); 490 + } 491 + 492 + ret = ib_post_send(ib_conn->qp, wr, &bad_wr); 493 + if (ret) { 494 + if (bad_wr->next) 495 + atomic_sub(2, &ib_conn->post_send_buf_count); 496 + else 497 + atomic_dec(&ib_conn->post_send_buf_count); 498 + iser_err("fast registration failed, ret:%d\n", ret); 499 + return ret; 500 + } 501 + desc->valid = false; 502 + 503 + regd_buf->reg.mem_h = desc; 504 + regd_buf->reg.lkey = desc->data_mr->lkey; 505 + regd_buf->reg.rkey = desc->data_mr->rkey; 506 + regd_buf->reg.va = desc->data_frpl->page_list[0] + offset; 507 + regd_buf->reg.len = data_size; 508 + regd_buf->reg.is_mr = 1; 509 + 510 + return ret; 511 + } 512 + 513 + /** 514 + * iser_reg_rdma_mem_frwr - Registers memory intended for RDMA, 515 + * using Fast Registration WR (if possible) obtaining rkey and va 516 + * 517 + * returns 0 on success, errno code on failure 518 + */ 519 + int iser_reg_rdma_mem_frwr(struct iscsi_iser_task *iser_task, 520 + enum iser_data_dir cmd_dir) 521 + { 522 + struct iser_conn *ib_conn = iser_task->iser_conn->ib_conn; 523 + struct iser_device *device = ib_conn->device; 524 + struct ib_device *ibdev = device->ib_device; 525 + struct iser_data_buf *mem = &iser_task->data[cmd_dir]; 526 + struct iser_regd_buf *regd_buf = &iser_task->rdma_regd[cmd_dir]; 527 + struct fast_reg_descriptor *desc; 528 + unsigned int data_size, page_list_len; 529 + int err, aligned_len; 530 + unsigned long flags; 531 + u32 offset; 532 + 533 + aligned_len = iser_data_buf_aligned_len(mem, ibdev); 534 + if (aligned_len != mem->dma_nents) { 535 + err = fall_to_bounce_buf(iser_task, ibdev, 536 + cmd_dir, aligned_len); 537 + if (err) 
{ 538 + iser_err("failed to allocate bounce buffer\n"); 539 + return err; 540 + } 541 + mem = &iser_task->data_copy[cmd_dir]; 542 + } 543 + 544 + /* if there a single dma entry, dma mr suffices */ 545 + if (mem->dma_nents == 1) { 546 + struct scatterlist *sg = (struct scatterlist *)mem->buf; 547 + 548 + regd_buf->reg.lkey = device->mr->lkey; 549 + regd_buf->reg.rkey = device->mr->rkey; 550 + regd_buf->reg.len = ib_sg_dma_len(ibdev, &sg[0]); 551 + regd_buf->reg.va = ib_sg_dma_address(ibdev, &sg[0]); 552 + regd_buf->reg.is_mr = 0; 553 + } else { 554 + spin_lock_irqsave(&ib_conn->lock, flags); 555 + desc = list_first_entry(&ib_conn->fastreg.frwr.pool, 556 + struct fast_reg_descriptor, list); 557 + list_del(&desc->list); 558 + spin_unlock_irqrestore(&ib_conn->lock, flags); 559 + page_list_len = iser_sg_to_page_vec(mem, device->ib_device, 560 + desc->data_frpl->page_list, 561 + &offset, &data_size); 562 + 563 + if (page_list_len * SIZE_4K < data_size) { 564 + iser_err("fast reg page_list too short to hold this SG\n"); 565 + err = -EINVAL; 566 + goto err_reg; 567 + } 568 + 569 + err = iser_fast_reg_mr(desc, ib_conn, regd_buf, 570 + offset, data_size, page_list_len); 571 + if (err) 572 + goto err_reg; 573 + } 574 + 575 + return 0; 576 + err_reg: 577 + spin_lock_irqsave(&ib_conn->lock, flags); 578 + list_add_tail(&desc->list, &ib_conn->fastreg.frwr.pool); 579 + spin_unlock_irqrestore(&ib_conn->lock, flags); 580 + return err; 445 581 }
+130 -8
drivers/infiniband/ulp/iser/iser_verbs.c
··· 73 73 { 74 74 int i, j; 75 75 struct iser_cq_desc *cq_desc; 76 + struct ib_device_attr *dev_attr; 76 77 77 - /* Assign function handles */ 78 - device->iser_alloc_rdma_reg_res = iser_create_fmr_pool; 79 - device->iser_free_rdma_reg_res = iser_free_fmr_pool; 80 - device->iser_reg_rdma_mem = iser_reg_rdma_mem_fmr; 81 - device->iser_unreg_rdma_mem = iser_unreg_mem_fmr; 78 + dev_attr = kmalloc(sizeof(*dev_attr), GFP_KERNEL); 79 + if (!dev_attr) 80 + return -ENOMEM; 81 + 82 + if (ib_query_device(device->ib_device, dev_attr)) { 83 + pr_warn("Query device failed for %s\n", device->ib_device->name); 84 + goto dev_attr_err; 85 + } 86 + 87 + /* Assign function handles - based on FMR support */ 88 + if (device->ib_device->alloc_fmr && device->ib_device->dealloc_fmr && 89 + device->ib_device->map_phys_fmr && device->ib_device->unmap_fmr) { 90 + iser_info("FMR supported, using FMR for registration\n"); 91 + device->iser_alloc_rdma_reg_res = iser_create_fmr_pool; 92 + device->iser_free_rdma_reg_res = iser_free_fmr_pool; 93 + device->iser_reg_rdma_mem = iser_reg_rdma_mem_fmr; 94 + device->iser_unreg_rdma_mem = iser_unreg_mem_fmr; 95 + } else 96 + if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { 97 + iser_info("FRWR supported, using FRWR for registration\n"); 98 + device->iser_alloc_rdma_reg_res = iser_create_frwr_pool; 99 + device->iser_free_rdma_reg_res = iser_free_frwr_pool; 100 + device->iser_reg_rdma_mem = iser_reg_rdma_mem_frwr; 101 + device->iser_unreg_rdma_mem = iser_unreg_mem_frwr; 102 + } else { 103 + iser_err("IB device does not support FMRs nor FRWRs, can't register memory\n"); 104 + goto dev_attr_err; 105 + } 82 106 83 107 device->cqs_used = min(ISER_MAX_CQ, device->ib_device->num_comp_vectors); 84 108 iser_info("using %d CQs, device %s supports %d vectors\n", ··· 158 134 if (ib_register_event_handler(&device->event_handler)) 159 135 goto handler_err; 160 136 137 + kfree(dev_attr); 161 138 return 0; 162 139 163 140 handler_err: ··· 178 153 
kfree(device->cq_desc); 179 154 cq_desc_err: 180 155 iser_err("failed to allocate an IB resource\n"); 156 + dev_attr_err: 157 + kfree(dev_attr); 181 158 return -1; 182 159 } 183 160 ··· 277 250 278 251 kfree(ib_conn->fastreg.fmr.page_vec); 279 252 ib_conn->fastreg.fmr.page_vec = NULL; 253 + } 254 + 255 + /** 256 + * iser_create_frwr_pool - Creates pool of fast_reg descriptors 257 + * for fast registration work requests. 258 + * returns 0 on success, or errno code on failure 259 + */ 260 + int iser_create_frwr_pool(struct iser_conn *ib_conn, unsigned cmds_max) 261 + { 262 + struct iser_device *device = ib_conn->device; 263 + struct fast_reg_descriptor *desc; 264 + int i, ret; 265 + 266 + INIT_LIST_HEAD(&ib_conn->fastreg.frwr.pool); 267 + ib_conn->fastreg.frwr.pool_size = 0; 268 + for (i = 0; i < cmds_max; i++) { 269 + desc = kmalloc(sizeof(*desc), GFP_KERNEL); 270 + if (!desc) { 271 + iser_err("Failed to allocate a new fast_reg descriptor\n"); 272 + ret = -ENOMEM; 273 + goto err; 274 + } 275 + 276 + desc->data_frpl = ib_alloc_fast_reg_page_list(device->ib_device, 277 + ISCSI_ISER_SG_TABLESIZE + 1); 278 + if (IS_ERR(desc->data_frpl)) { 279 + ret = PTR_ERR(desc->data_frpl); 280 + iser_err("Failed to allocate ib_fast_reg_page_list err=%d\n", ret); 281 + goto err; 282 + } 283 + 284 + desc->data_mr = ib_alloc_fast_reg_mr(device->pd, 285 + ISCSI_ISER_SG_TABLESIZE + 1); 286 + if (IS_ERR(desc->data_mr)) { 287 + ret = PTR_ERR(desc->data_mr); 288 + iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret); 289 + ib_free_fast_reg_page_list(desc->data_frpl); 290 + goto err; 291 + } 292 + desc->valid = true; 293 + list_add_tail(&desc->list, &ib_conn->fastreg.frwr.pool); 294 + ib_conn->fastreg.frwr.pool_size++; 295 + } 296 + 297 + return 0; 298 + err: 299 + iser_free_frwr_pool(ib_conn); 300 + return ret; 301 + } 302 + 303 + /** 304 + * iser_free_frwr_pool - releases the pool of fast_reg descriptors 305 + */ 306 + void iser_free_frwr_pool(struct iser_conn *ib_conn) 307 + { 308 + 
struct fast_reg_descriptor *desc, *tmp; 309 + int i = 0; 310 + 311 + if (list_empty(&ib_conn->fastreg.frwr.pool)) 312 + return; 313 + 314 + iser_info("freeing conn %p frwr pool\n", ib_conn); 315 + 316 + list_for_each_entry_safe(desc, tmp, &ib_conn->fastreg.frwr.pool, list) { 317 + list_del(&desc->list); 318 + ib_free_fast_reg_page_list(desc->data_frpl); 319 + ib_dereg_mr(desc->data_mr); 320 + kfree(desc); 321 + ++i; 322 + } 323 + 324 + if (i < ib_conn->fastreg.frwr.pool_size) 325 + iser_warn("pool still has %d regions registered\n", 326 + ib_conn->fastreg.frwr.pool_size - i); 280 327 } 281 328 282 329 /** ··· 808 707 mem_reg->rkey = mem->fmr->rkey; 809 708 mem_reg->len = page_vec->length * SIZE_4K; 810 709 mem_reg->va = io_addr; 811 - mem_reg->is_fmr = 1; 710 + mem_reg->is_mr = 1; 812 711 mem_reg->mem_h = (void *)mem; 813 712 814 713 mem_reg->va += page_vec->offset; ··· 835 734 struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg; 836 735 int ret; 837 736 838 - if (!reg->is_fmr) 737 + if (!reg->is_mr) 839 738 return; 840 739 841 740 iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n",reg->mem_h); ··· 845 744 iser_err("ib_fmr_pool_unmap failed %d\n", ret); 846 745 847 746 reg->mem_h = NULL; 747 + } 748 + 749 + void iser_unreg_mem_frwr(struct iscsi_iser_task *iser_task, 750 + enum iser_data_dir cmd_dir) 751 + { 752 + struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg; 753 + struct iser_conn *ib_conn = iser_task->iser_conn->ib_conn; 754 + struct fast_reg_descriptor *desc = reg->mem_h; 755 + 756 + if (!reg->is_mr) 757 + return; 758 + 759 + reg->mem_h = NULL; 760 + reg->is_mr = 0; 761 + spin_lock_bh(&ib_conn->lock); 762 + list_add_tail(&desc->list, &ib_conn->fastreg.frwr.pool); 763 + spin_unlock_bh(&ib_conn->lock); 848 764 } 849 765 850 766 int iser_post_recvl(struct iser_conn *ib_conn) ··· 985 867 if (wc.status == IB_WC_SUCCESS) { 986 868 if (wc.opcode == IB_WC_SEND) 987 869 iser_snd_completion(tx_desc, ib_conn); 988 - else 870 + else if (wc.opcode == 
IB_WC_LOCAL_INV || 871 + wc.opcode == IB_WC_FAST_REG_MR) { 872 + atomic_dec(&ib_conn->post_send_buf_count); 873 + continue; 874 + } else 989 875 iser_err("expected opcode %d got %d\n", 990 876 IB_WC_SEND, wc.opcode); 991 877 } else {