Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

IB/iser: Use IB_WR_REG_MR_INTEGRITY for PI handover

Using this new API reduces iSER code complexity.
It also reduces the maximum number of work requests per task and the need
to deal with multiple MRs (and their registrations and invalidations)
per task. It is done by using a single WR and a special MR type
(IB_MR_TYPE_INTEGRITY) for PI operation.

The setup of the tested benchmark:
- 2 servers with 24 cores (1 initiator and 1 target)
- 24 target sessions with 1 LUN each
- ramdisk backstore
- PI active

Performance results running fio (24 jobs, 128 iodepth) using
write_generate=0 and read_verify=0 (with/without the patch):

bs IOPS(read) IOPS(write)
---- ---------- ----------
512 1236.6K/1164.3K 1357.2K/1332.8K
1k 1196.5K/1163.8K 1348.4K/1262.7K
2k 1016.7K/921950 1003.7K/931230
4k 662728/600545 595423/501513
8k 385954/384345 333775/277090
16k 222864/222820 170317/170671
32k 116869/114896 82331/82244
64k 55205/54931 40264/40021

Using write_generate=1 and read_verify=1 (with/without the patch):

bs IOPS(read) IOPS(write)
---- ---------- ----------
512 1090.1K/1030.9K 1303.9K/1101.4K
1k 1057.7K/904583 1318.4K/988085
2k 965226/638799 1008.6K/692514
4k 555479/410151 542414/414517
8k 298675/224964 264729/237508
16k 133485/122481 164625/138647
32k 74329/67615 80143/78743
64k 35716/35519 39294/37334

We get a performance improvement at all block sizes.
The most significant improvement is when writing with a 4k block size
(almost 30% more IOPS).

Signed-off-by: Israel Rukshin <israelr@mellanox.com>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>

authored by

Israel Rukshin and committed by
Jason Gunthorpe
b76a4399 38ca87c6

+96 -194
+8 -30
drivers/infiniband/ulp/iser/iscsi_iser.h
··· 225 225 ISCSI_TX_DATAOUT 226 226 }; 227 227 228 - /* Maximum number of work requests per task: 229 - * Data memory region local invalidate + fast registration 230 - * Protection memory region local invalidate + fast registration 231 - * Signature memory region local invalidate + fast registration 232 - * PDU send 228 + /* 229 + * Maximum number of work requests per task 230 + * (invalidate, registration, send) 233 231 */ 234 - #define ISER_MAX_WRS 7 232 + #define ISER_MAX_WRS 3 235 233 236 234 /** 237 235 * struct iser_tx_desc - iSER TX descriptor ··· 245 247 * @mapped: Is the task header mapped 246 248 * @wr_idx: Current WR index 247 249 * @wrs: Array of WRs per task 248 - * @data_reg: Data buffer registration details 249 - * @prot_reg: Protection buffer registration details 250 - * @sig_attrs: Signature attributes 251 250 */ 252 251 struct iser_tx_desc { 253 252 struct iser_ctrl iser_header; ··· 259 264 union iser_wr { 260 265 struct ib_send_wr send; 261 266 struct ib_reg_wr fast_reg; 262 - struct ib_sig_handover_wr sig; 263 267 } wrs[ISER_MAX_WRS]; 264 - struct iser_mem_reg data_reg; 265 - struct iser_mem_reg prot_reg; 266 - struct ib_sig_attrs sig_attrs; 267 268 }; 268 269 269 270 #define ISER_RX_PAD_SIZE (256 - (ISER_RX_PAYLOAD_SIZE + \ ··· 379 388 * 380 389 * @mr: memory region 381 390 * @fmr_pool: pool of fmrs 391 + * @sig_mr: signature memory region 382 392 * @page_vec: fast reg page list used by fmr pool 383 393 * @mr_valid: is mr valid indicator 384 394 */ ··· 388 396 struct ib_mr *mr; 389 397 struct ib_fmr_pool *fmr_pool; 390 398 }; 399 + struct ib_mr *sig_mr; 391 400 struct iser_page_vec *page_vec; 392 401 u8 mr_valid:1; 393 - }; 394 - 395 - /** 396 - * struct iser_pi_context - Protection information context 397 - * 398 - * @rsc: protection buffer registration resources 399 - * @sig_mr: signature enable memory region 400 - * @sig_mr_valid: is sig_mr valid indicator 401 - * @sig_protected: is region protected indicator 402 - */ 403 - struct 
iser_pi_context { 404 - struct iser_reg_resources rsc; 405 - struct ib_mr *sig_mr; 406 - u8 sig_mr_valid:1; 407 - u8 sig_protected:1; 408 402 }; 409 403 410 404 /** ··· 398 420 * 399 421 * @list: entry in connection fastreg pool 400 422 * @rsc: data buffer registration resources 401 - * @pi_ctx: protection information context 423 + * @sig_protected: is region protected indicator 402 424 */ 403 425 struct iser_fr_desc { 404 426 struct list_head list; 405 427 struct iser_reg_resources rsc; 406 - struct iser_pi_context *pi_ctx; 428 + bool sig_protected; 407 429 struct list_head all_list; 408 430 }; 409 431
+7 -5
drivers/infiniband/ulp/iser/iser_initiator.c
··· 592 592 static inline int 593 593 iser_inv_desc(struct iser_fr_desc *desc, u32 rkey) 594 594 { 595 - if (likely(rkey == desc->rsc.mr->rkey)) { 596 - desc->rsc.mr_valid = 0; 597 - } else if (likely(desc->pi_ctx && rkey == desc->pi_ctx->sig_mr->rkey)) { 598 - desc->pi_ctx->sig_mr_valid = 0; 599 - } else { 595 + if (unlikely((!desc->sig_protected && rkey != desc->rsc.mr->rkey) || 596 + (desc->sig_protected && rkey != desc->rsc.sig_mr->rkey))) { 600 597 iser_err("Bogus remote invalidation for rkey %#x\n", rkey); 601 598 return -EINVAL; 602 599 } 600 + 601 + desc->rsc.mr_valid = 0; 603 602 604 603 return 0; 605 604 } ··· 748 749 749 750 iser_task->prot[ISER_DIR_IN].data_len = 0; 750 751 iser_task->prot[ISER_DIR_OUT].data_len = 0; 752 + 753 + iser_task->prot[ISER_DIR_IN].dma_nents = 0; 754 + iser_task->prot[ISER_DIR_OUT].dma_nents = 0; 751 755 752 756 memset(&iser_task->rdma_reg[ISER_DIR_IN], 0, 753 757 sizeof(struct iser_mem_reg));
+35 -63
drivers/infiniband/ulp/iser/iser_memory.c
··· 376 376 377 377 static int 378 378 iser_reg_sig_mr(struct iscsi_iser_task *iser_task, 379 - struct iser_pi_context *pi_ctx, 380 - struct iser_mem_reg *data_reg, 381 - struct iser_mem_reg *prot_reg, 379 + struct iser_data_buf *mem, 380 + struct iser_data_buf *sig_mem, 381 + struct iser_reg_resources *rsc, 382 382 struct iser_mem_reg *sig_reg) 383 383 { 384 384 struct iser_tx_desc *tx_desc = &iser_task->desc; 385 - struct ib_sig_attrs *sig_attrs = &tx_desc->sig_attrs; 386 385 struct ib_cqe *cqe = &iser_task->iser_conn->ib_conn.reg_cqe; 387 - struct ib_sig_handover_wr *wr; 388 - struct ib_mr *mr = pi_ctx->sig_mr; 386 + struct ib_mr *mr = rsc->sig_mr; 387 + struct ib_sig_attrs *sig_attrs = mr->sig_attrs; 388 + struct ib_reg_wr *wr; 389 389 int ret; 390 390 391 391 memset(sig_attrs, 0, sizeof(*sig_attrs)); ··· 395 395 396 396 iser_set_prot_checks(iser_task->sc, &sig_attrs->check_mask); 397 397 398 - if (pi_ctx->sig_mr_valid) 398 + if (rsc->mr_valid) 399 399 iser_inv_rkey(iser_tx_next_wr(tx_desc), mr, cqe); 400 400 401 401 ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey)); 402 402 403 - wr = container_of(iser_tx_next_wr(tx_desc), struct ib_sig_handover_wr, 404 - wr); 405 - wr->wr.opcode = IB_WR_REG_SIG_MR; 403 + ret = ib_map_mr_sg_pi(mr, mem->sg, mem->dma_nents, NULL, 404 + sig_mem->sg, sig_mem->dma_nents, NULL, SZ_4K); 405 + if (unlikely(ret)) { 406 + iser_err("failed to map PI sg (%d)\n", 407 + mem->dma_nents + sig_mem->dma_nents); 408 + goto err; 409 + } 410 + 411 + wr = container_of(iser_tx_next_wr(tx_desc), struct ib_reg_wr, wr); 412 + memset(wr, 0, sizeof(*wr)); 413 + wr->wr.opcode = IB_WR_REG_MR_INTEGRITY; 406 414 wr->wr.wr_cqe = cqe; 407 - wr->wr.sg_list = &data_reg->sge; 408 - wr->wr.num_sge = 1; 415 + wr->wr.num_sge = 0; 409 416 wr->wr.send_flags = 0; 410 - wr->sig_attrs = sig_attrs; 411 - wr->sig_mr = mr; 412 - if (scsi_prot_sg_count(iser_task->sc)) 413 - wr->prot = &prot_reg->sge; 414 - else 415 - wr->prot = NULL; 416 - wr->access_flags = 
IB_ACCESS_LOCAL_WRITE | 417 - IB_ACCESS_REMOTE_READ | 418 - IB_ACCESS_REMOTE_WRITE; 419 - pi_ctx->sig_mr_valid = 1; 417 + wr->mr = mr; 418 + wr->key = mr->rkey; 419 + wr->access = IB_ACCESS_LOCAL_WRITE | 420 + IB_ACCESS_REMOTE_READ | 421 + IB_ACCESS_REMOTE_WRITE; 422 + rsc->mr_valid = 1; 420 423 421 424 sig_reg->sge.lkey = mr->lkey; 422 425 sig_reg->rkey = mr->rkey; 423 - sig_reg->sge.addr = 0; 424 - sig_reg->sge.length = scsi_transfer_length(iser_task->sc); 426 + sig_reg->sge.addr = mr->iova; 427 + sig_reg->sge.length = mr->length; 425 428 426 429 iser_dbg("lkey=0x%x rkey=0x%x addr=0x%llx length=%u\n", 427 430 sig_reg->sge.lkey, sig_reg->rkey, sig_reg->sge.addr, ··· 481 478 } 482 479 483 480 static int 484 - iser_reg_prot_sg(struct iscsi_iser_task *task, 485 - struct iser_data_buf *mem, 486 - struct iser_fr_desc *desc, 487 - bool use_dma_key, 488 - struct iser_mem_reg *reg) 489 - { 490 - struct iser_device *device = task->iser_conn->ib_conn.device; 491 - 492 - if (use_dma_key) 493 - return iser_reg_dma(device, mem, reg); 494 - 495 - return device->reg_ops->reg_mem(task, mem, &desc->pi_ctx->rsc, reg); 496 - } 497 - 498 - static int 499 481 iser_reg_data_sg(struct iscsi_iser_task *task, 500 482 struct iser_data_buf *mem, 501 483 struct iser_fr_desc *desc, ··· 503 515 struct iser_device *device = ib_conn->device; 504 516 struct iser_data_buf *mem = &task->data[dir]; 505 517 struct iser_mem_reg *reg = &task->rdma_reg[dir]; 506 - struct iser_mem_reg *data_reg; 507 518 struct iser_fr_desc *desc = NULL; 508 519 bool use_dma_key; 509 520 int err; ··· 515 528 reg->mem_h = desc; 516 529 } 517 530 518 - if (scsi_get_prot_op(task->sc) == SCSI_PROT_NORMAL) 519 - data_reg = reg; 520 - else 521 - data_reg = &task->desc.data_reg; 522 - 523 - err = iser_reg_data_sg(task, mem, desc, use_dma_key, data_reg); 524 - if (unlikely(err)) 525 - goto err_reg; 526 - 527 - if (scsi_get_prot_op(task->sc) != SCSI_PROT_NORMAL) { 528 - struct iser_mem_reg *prot_reg = &task->desc.prot_reg; 529 - 
530 - if (scsi_prot_sg_count(task->sc)) { 531 - mem = &task->prot[dir]; 532 - err = iser_reg_prot_sg(task, mem, desc, 533 - use_dma_key, prot_reg); 534 - if (unlikely(err)) 535 - goto err_reg; 536 - } 537 - 538 - err = iser_reg_sig_mr(task, desc->pi_ctx, data_reg, 539 - prot_reg, reg); 531 + if (scsi_get_prot_op(task->sc) == SCSI_PROT_NORMAL) { 532 + err = iser_reg_data_sg(task, mem, desc, use_dma_key, reg); 533 + if (unlikely(err)) 534 + goto err_reg; 535 + } else { 536 + err = iser_reg_sig_mr(task, mem, &task->prot[dir], 537 + &desc->rsc, reg); 540 538 if (unlikely(err)) 541 539 goto err_reg; 542 540 543 - desc->pi_ctx->sig_protected = 1; 541 + desc->sig_protected = 1; 544 542 } 545 543 546 544 return 0;
+46 -96
drivers/infiniband/ulp/iser/iser_verbs.c
··· 233 233 kfree(desc); 234 234 } 235 235 236 - static int 237 - iser_alloc_reg_res(struct iser_device *device, 238 - struct ib_pd *pd, 239 - struct iser_reg_resources *res, 240 - unsigned int size) 241 - { 242 - struct ib_device *ib_dev = device->ib_device; 243 - enum ib_mr_type mr_type; 244 - int ret; 245 - 246 - if (ib_dev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG) 247 - mr_type = IB_MR_TYPE_SG_GAPS; 248 - else 249 - mr_type = IB_MR_TYPE_MEM_REG; 250 - 251 - res->mr = ib_alloc_mr(pd, mr_type, size); 252 - if (IS_ERR(res->mr)) { 253 - ret = PTR_ERR(res->mr); 254 - iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret); 255 - return ret; 256 - } 257 - res->mr_valid = 0; 258 - 259 - return 0; 260 - } 261 - 262 - static void 263 - iser_free_reg_res(struct iser_reg_resources *rsc) 264 - { 265 - ib_dereg_mr(rsc->mr); 266 - } 267 - 268 - static int 269 - iser_alloc_pi_ctx(struct iser_device *device, 270 - struct ib_pd *pd, 271 - struct iser_fr_desc *desc, 272 - unsigned int size) 273 - { 274 - struct iser_pi_context *pi_ctx = NULL; 275 - int ret; 276 - 277 - desc->pi_ctx = kzalloc(sizeof(*desc->pi_ctx), GFP_KERNEL); 278 - if (!desc->pi_ctx) 279 - return -ENOMEM; 280 - 281 - pi_ctx = desc->pi_ctx; 282 - 283 - ret = iser_alloc_reg_res(device, pd, &pi_ctx->rsc, size); 284 - if (ret) { 285 - iser_err("failed to allocate reg_resources\n"); 286 - goto alloc_reg_res_err; 287 - } 288 - 289 - pi_ctx->sig_mr = ib_alloc_mr(pd, IB_MR_TYPE_SIGNATURE, 2); 290 - if (IS_ERR(pi_ctx->sig_mr)) { 291 - ret = PTR_ERR(pi_ctx->sig_mr); 292 - goto sig_mr_failure; 293 - } 294 - pi_ctx->sig_mr_valid = 0; 295 - desc->pi_ctx->sig_protected = 0; 296 - 297 - return 0; 298 - 299 - sig_mr_failure: 300 - iser_free_reg_res(&pi_ctx->rsc); 301 - alloc_reg_res_err: 302 - kfree(desc->pi_ctx); 303 - 304 - return ret; 305 - } 306 - 307 - static void 308 - iser_free_pi_ctx(struct iser_pi_context *pi_ctx) 309 - { 310 - iser_free_reg_res(&pi_ctx->rsc); 311 - ib_dereg_mr(pi_ctx->sig_mr); 312 - 
kfree(pi_ctx); 313 - } 314 - 315 236 static struct iser_fr_desc * 316 237 iser_create_fastreg_desc(struct iser_device *device, 317 238 struct ib_pd *pd, ··· 240 319 unsigned int size) 241 320 { 242 321 struct iser_fr_desc *desc; 322 + struct ib_device *ib_dev = device->ib_device; 323 + enum ib_mr_type mr_type; 243 324 int ret; 244 325 245 326 desc = kzalloc(sizeof(*desc), GFP_KERNEL); 246 327 if (!desc) 247 328 return ERR_PTR(-ENOMEM); 248 329 249 - ret = iser_alloc_reg_res(device, pd, &desc->rsc, size); 250 - if (ret) 251 - goto reg_res_alloc_failure; 330 + if (ib_dev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG) 331 + mr_type = IB_MR_TYPE_SG_GAPS; 332 + else 333 + mr_type = IB_MR_TYPE_MEM_REG; 334 + 335 + desc->rsc.mr = ib_alloc_mr(pd, mr_type, size); 336 + if (IS_ERR(desc->rsc.mr)) { 337 + ret = PTR_ERR(desc->rsc.mr); 338 + iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret); 339 + goto err_alloc_mr; 340 + } 252 341 253 342 if (pi_enable) { 254 - ret = iser_alloc_pi_ctx(device, pd, desc, size); 255 - if (ret) 256 - goto pi_ctx_alloc_failure; 343 + desc->rsc.sig_mr = ib_alloc_mr_integrity(pd, size, size); 344 + if (IS_ERR(desc->rsc.sig_mr)) { 345 + ret = PTR_ERR(desc->rsc.sig_mr); 346 + iser_err("Failed to allocate sig_mr err=%d\n", ret); 347 + goto err_alloc_mr_integrity; 348 + } 257 349 } 350 + desc->rsc.mr_valid = 0; 258 351 259 352 return desc; 260 353 261 - pi_ctx_alloc_failure: 262 - iser_free_reg_res(&desc->rsc); 263 - reg_res_alloc_failure: 354 + err_alloc_mr_integrity: 355 + ib_dereg_mr(desc->rsc.mr); 356 + err_alloc_mr: 264 357 kfree(desc); 265 358 266 359 return ERR_PTR(ret); 360 + } 361 + 362 + static void iser_destroy_fastreg_desc(struct iser_fr_desc *desc) 363 + { 364 + struct iser_reg_resources *res = &desc->rsc; 365 + 366 + ib_dereg_mr(res->mr); 367 + if (res->sig_mr) { 368 + ib_dereg_mr(res->sig_mr); 369 + res->sig_mr = NULL; 370 + } 371 + kfree(desc); 267 372 } 268 373 269 374 /** ··· 346 399 347 400 list_for_each_entry_safe(desc, 
tmp, &fr_pool->all_list, all_list) { 348 401 list_del(&desc->all_list); 349 - iser_free_reg_res(&desc->rsc); 350 - if (desc->pi_ctx) 351 - iser_free_pi_ctx(desc->pi_ctx); 352 - kfree(desc); 402 + iser_destroy_fastreg_desc(desc); 353 403 ++i; 354 404 } 355 405 ··· 651 707 struct ib_device_attr *attr = &device->ib_device->attrs; 652 708 unsigned short sg_tablesize, sup_sg_tablesize; 653 709 unsigned short reserved_mr_pages; 710 + u32 max_num_sg; 654 711 655 712 /* 656 713 * FRs without SG_GAPS or FMRs can only map up to a (device) page per ··· 665 720 else 666 721 reserved_mr_pages = 1; 667 722 723 + if (iser_conn->ib_conn.pi_support) 724 + max_num_sg = attr->max_pi_fast_reg_page_list_len; 725 + else 726 + max_num_sg = attr->max_fast_reg_page_list_len; 727 + 668 728 sg_tablesize = DIV_ROUND_UP(max_sectors * 512, SIZE_4K); 669 729 if (attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) 670 730 sup_sg_tablesize = 671 731 min_t( 672 732 uint, ISCSI_ISER_MAX_SG_TABLESIZE, 673 - attr->max_fast_reg_page_list_len - reserved_mr_pages); 733 + max_num_sg - reserved_mr_pages); 674 734 else 675 735 sup_sg_tablesize = ISCSI_ISER_MAX_SG_TABLESIZE; 676 736 ··· 1068 1118 struct ib_mr_status mr_status; 1069 1119 int ret; 1070 1120 1071 - if (desc && desc->pi_ctx->sig_protected) { 1072 - desc->pi_ctx->sig_protected = 0; 1073 - ret = ib_check_mr_status(desc->pi_ctx->sig_mr, 1121 + if (desc && desc->sig_protected) { 1122 + desc->sig_protected = 0; 1123 + ret = ib_check_mr_status(desc->rsc.sig_mr, 1074 1124 IB_MR_CHECK_SIG_STATUS, &mr_status); 1075 1125 if (ret) { 1076 1126 pr_err("ib_check_mr_status failed, ret %d\n", ret);