Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

IB/rdmavt: Handle dereg of inuse MRs properly

A destroy of an MR prior to destroying the QP can cause the following
diagnostic if the QP is referencing the MR being de-registered:

hfi1 0000:05:00.0: hfi1_0: rvt_dereg_mr timeout mr ffff880856210800 pd ffff880859b20b00

The solution is that when a non-zero refcount is encountered when the
MR is destroyed, the QPs need to be iterated, looking for QPs in the
same PD as the MR. If rvt_qp_mr_clean() detects that any such QP
references the rkey/lkey, the QP needs to be put into an error state
via a call to rvt_error_qp(), which will trigger the cleanup of any
stuck references.

This solution is as specified in IBTA 1.3 Volume 1 11.2.10.5.

[This is reproduced with the 0.4.9 version of qperf and the rc_bw test]

Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>

authored by

Mike Marciniszyn and committed by
Doug Ledford
0208da90 557fafe1

+216 -21
+104 -17
drivers/infiniband/sw/rdmavt/mr.c
··· 441 441 } 442 442 443 443 /** 444 + * rvt_dereg_clean_qp_cb - callback from iterator 445 + * @qp - the qp 446 + * @v - the mregion (as u64) 447 + * 448 + * This routine fields the callback for all QPs and 449 + * for QPs in the same PD as the MR will call the 450 + * rvt_qp_mr_clean() to potentially cleanup references. 451 + */ 452 + static void rvt_dereg_clean_qp_cb(struct rvt_qp *qp, u64 v) 453 + { 454 + struct rvt_mregion *mr = (struct rvt_mregion *)v; 455 + 456 + /* skip PDs that are not ours */ 457 + if (mr->pd != qp->ibqp.pd) 458 + return; 459 + rvt_qp_mr_clean(qp, mr->lkey); 460 + } 461 + 462 + /** 463 + * rvt_dereg_clean_qps - find QPs for reference cleanup 464 + * @mr - the MR that is being deregistered 465 + * 466 + * This routine iterates RC QPs looking for references 467 + * to the lkey noted in mr. 468 + */ 469 + static void rvt_dereg_clean_qps(struct rvt_mregion *mr) 470 + { 471 + struct rvt_dev_info *rdi = ib_to_rvt(mr->pd->device); 472 + 473 + rvt_qp_iter(rdi, (u64)mr, rvt_dereg_clean_qp_cb); 474 + } 475 + 476 + /** 477 + * rvt_check_refs - check references 478 + * @mr - the megion 479 + * @t - the caller identification 480 + * 481 + * This routine checks MRs holding a reference during 482 + * when being de-registered. 483 + * 484 + * If the count is non-zero, the code calls a clean routine then 485 + * waits for the timeout for the count to zero. 
486 + */ 487 + static int rvt_check_refs(struct rvt_mregion *mr, const char *t) 488 + { 489 + unsigned long timeout; 490 + struct rvt_dev_info *rdi = ib_to_rvt(mr->pd->device); 491 + 492 + if (percpu_ref_is_zero(&mr->refcount)) 493 + return 0; 494 + /* avoid dma mr */ 495 + if (mr->lkey) 496 + rvt_dereg_clean_qps(mr); 497 + timeout = wait_for_completion_timeout(&mr->comp, 5 * HZ); 498 + if (!timeout) { 499 + rvt_pr_err(rdi, 500 + "%s timeout mr %p pd %p lkey %x refcount %ld\n", 501 + t, mr, mr->pd, mr->lkey, 502 + atomic_long_read(&mr->refcount.count)); 503 + rvt_get_mr(mr); 504 + return -EBUSY; 505 + } 506 + return 0; 507 + } 508 + 509 + /** 510 + * rvt_mr_has_lkey - is MR 511 + * @mr - the mregion 512 + * @lkey - the lkey 513 + */ 514 + bool rvt_mr_has_lkey(struct rvt_mregion *mr, u32 lkey) 515 + { 516 + return mr && lkey == mr->lkey; 517 + } 518 + 519 + /** 520 + * rvt_ss_has_lkey - is mr in sge tests 521 + * @ss - the sge state 522 + * @lkey 523 + * 524 + * This code tests for an MR in the indicated 525 + * sge state. 
526 + */ 527 + bool rvt_ss_has_lkey(struct rvt_sge_state *ss, u32 lkey) 528 + { 529 + int i; 530 + bool rval = false; 531 + 532 + if (!ss->num_sge) 533 + return rval; 534 + /* first one */ 535 + rval = rvt_mr_has_lkey(ss->sge.mr, lkey); 536 + /* any others */ 537 + for (i = 0; !rval && i < ss->num_sge - 1; i++) 538 + rval = rvt_mr_has_lkey(ss->sg_list[i].mr, lkey); 539 + return rval; 540 + } 541 + 542 + /** 444 543 * rvt_dereg_mr - unregister and free a memory region 445 544 * @ibmr: the memory region to free 446 545 * ··· 552 453 int rvt_dereg_mr(struct ib_mr *ibmr) 553 454 { 554 455 struct rvt_mr *mr = to_imr(ibmr); 555 - struct rvt_dev_info *rdi = ib_to_rvt(ibmr->pd->device); 556 - int ret = 0; 557 - unsigned long timeout; 456 + int ret; 558 457 559 458 rvt_free_lkey(&mr->mr); 560 459 561 460 rvt_put_mr(&mr->mr); /* will set completion if last */ 562 - timeout = wait_for_completion_timeout(&mr->mr.comp, 5 * HZ); 563 - if (!timeout) { 564 - rvt_pr_err(rdi, 565 - "rvt_dereg_mr timeout mr %p pd %p\n", 566 - mr, mr->mr.pd); 567 - rvt_get_mr(&mr->mr); 568 - ret = -EBUSY; 461 + ret = rvt_check_refs(&mr->mr, __func__); 462 + if (ret) 569 463 goto out; 570 - } 571 464 rvt_deinit_mregion(&mr->mr); 572 465 if (mr->umem) 573 466 ib_umem_release(mr->umem); ··· 852 761 { 853 762 struct rvt_fmr *fmr = to_ifmr(ibfmr); 854 763 int ret = 0; 855 - unsigned long timeout; 856 764 857 765 rvt_free_lkey(&fmr->mr); 858 766 rvt_put_mr(&fmr->mr); /* will set completion if last */ 859 - timeout = wait_for_completion_timeout(&fmr->mr.comp, 5 * HZ); 860 - if (!timeout) { 861 - rvt_get_mr(&fmr->mr); 862 - ret = -EBUSY; 767 + ret = rvt_check_refs(&fmr->mr, __func__); 768 + if (ret) 863 769 goto out; 864 - } 865 770 rvt_deinit_mregion(&fmr->mr); 866 771 kfree(fmr); 867 772 out:
+108 -4
drivers/infiniband/sw/rdmavt/qp.c
··· 458 458 } 459 459 } 460 460 461 - if (qp->ibqp.qp_type != IB_QPT_RC) 462 - return; 463 - 464 - for (n = 0; n < rvt_max_atomic(rdi); n++) { 461 + for (n = 0; qp->s_ack_queue && n < rvt_max_atomic(rdi); n++) { 465 462 struct rvt_ack_entry *e = &qp->s_ack_queue[n]; 466 463 467 464 if (e->rdma_sge.mr) { 468 465 rvt_put_mr(e->rdma_sge.mr); 469 466 e->rdma_sge.mr = NULL; 470 467 } 468 + } 469 + } 470 + 471 + /** 472 + * rvt_swqe_has_lkey - return true if lkey is used by swqe 473 + * @wqe - the send wqe 474 + * @lkey - the lkey 475 + * 476 + * Test the swqe for using lkey 477 + */ 478 + static bool rvt_swqe_has_lkey(struct rvt_swqe *wqe, u32 lkey) 479 + { 480 + int i; 481 + 482 + for (i = 0; i < wqe->wr.num_sge; i++) { 483 + struct rvt_sge *sge = &wqe->sg_list[i]; 484 + 485 + if (rvt_mr_has_lkey(sge->mr, lkey)) 486 + return true; 487 + } 488 + return false; 489 + } 490 + 491 + /** 492 + * rvt_qp_sends_has_lkey - return true is qp sends use lkey 493 + * @qp - the rvt_qp 494 + * @lkey - the lkey 495 + */ 496 + static bool rvt_qp_sends_has_lkey(struct rvt_qp *qp, u32 lkey) 497 + { 498 + u32 s_last = qp->s_last; 499 + 500 + while (s_last != qp->s_head) { 501 + struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, s_last); 502 + 503 + if (rvt_swqe_has_lkey(wqe, lkey)) 504 + return true; 505 + 506 + if (++s_last >= qp->s_size) 507 + s_last = 0; 508 + } 509 + if (qp->s_rdma_mr) 510 + if (rvt_mr_has_lkey(qp->s_rdma_mr, lkey)) 511 + return true; 512 + return false; 513 + } 514 + 515 + /** 516 + * rvt_qp_acks_has_lkey - return true if acks have lkey 517 + * @qp - the qp 518 + * @lkey - the lkey 519 + */ 520 + static bool rvt_qp_acks_has_lkey(struct rvt_qp *qp, u32 lkey) 521 + { 522 + int i; 523 + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); 524 + 525 + for (i = 0; qp->s_ack_queue && i < rvt_max_atomic(rdi); i++) { 526 + struct rvt_ack_entry *e = &qp->s_ack_queue[i]; 527 + 528 + if (rvt_mr_has_lkey(e->rdma_sge.mr, lkey)) 529 + return true; 530 + } 531 + return false; 532 + } 533 
+ 534 + /* 535 + * rvt_qp_mr_clean - clean up remote ops for lkey 536 + * @qp - the qp 537 + * @lkey - the lkey that is being de-registered 538 + * 539 + * This routine checks if the lkey is being used by 540 + * the qp. 541 + * 542 + * If so, the qp is put into an error state to elminate 543 + * any references from the qp. 544 + */ 545 + void rvt_qp_mr_clean(struct rvt_qp *qp, u32 lkey) 546 + { 547 + bool lastwqe = false; 548 + 549 + if (qp->ibqp.qp_type == IB_QPT_SMI || 550 + qp->ibqp.qp_type == IB_QPT_GSI) 551 + /* avoid special QPs */ 552 + return; 553 + spin_lock_irq(&qp->r_lock); 554 + spin_lock(&qp->s_hlock); 555 + spin_lock(&qp->s_lock); 556 + 557 + if (qp->state == IB_QPS_ERR || qp->state == IB_QPS_RESET) 558 + goto check_lwqe; 559 + 560 + if (rvt_ss_has_lkey(&qp->r_sge, lkey) || 561 + rvt_qp_sends_has_lkey(qp, lkey) || 562 + rvt_qp_acks_has_lkey(qp, lkey)) 563 + lastwqe = rvt_error_qp(qp, IB_WC_LOC_PROT_ERR); 564 + check_lwqe: 565 + spin_unlock(&qp->s_lock); 566 + spin_unlock(&qp->s_hlock); 567 + spin_unlock_irq(&qp->r_lock); 568 + if (lastwqe) { 569 + struct ib_event ev; 570 + 571 + ev.device = qp->ibqp.device; 572 + ev.element.qp = &qp->ibqp; 573 + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; 574 + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); 471 575 } 472 576 } 473 577
+3
include/rdma/rdmavt_mr.h
··· 191 191 } 192 192 } 193 193 194 + bool rvt_ss_has_lkey(struct rvt_sge_state *ss, u32 lkey); 195 + bool rvt_mr_has_lkey(struct rvt_mregion *mr, u32 lkey); 196 + 194 197 #endif /* DEF_RDMAVT_INCMRH */
+1
include/rdma/rdmavt_qp.h
··· 702 702 void rvt_qp_iter(struct rvt_dev_info *rdi, 703 703 u64 v, 704 704 void (*cb)(struct rvt_qp *qp, u64 v)); 705 + void rvt_qp_mr_clean(struct rvt_qp *qp, u32 lkey); 705 706 #endif /* DEF_RDMAVT_INCQP_H */