Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

RDMA/bnxt_re: Synchronize the qp-handle table array

There is a race between the CREQ tasklet and the destroy-QP path when
accessing the qp-handle table. There is a chance that the CREQ tasklet
handler reads a valid qp-handle while the QP is already proceeding with
its destruction.

Fix this race by implementing a table lock to synchronize the access.

Fixes: f218d67ef004 ("RDMA/bnxt_re: Allow posting when QPs are in error")
Fixes: 84cf229f4001 ("RDMA/bnxt_re: Fix the qp table indexing")
Link: https://patch.msgid.link/r/1728912975-19346-3-git-send-email-selvin.xavier@broadcom.com
Signed-off-by: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>

authored by

Selvin Xavier and committed by
Jason Gunthorpe
76d3ddff d71f4acd

+15 -4
+4
drivers/infiniband/hw/bnxt_re/qplib_fp.c
··· 1532 1532 u32 tbl_indx; 1533 1533 int rc; 1534 1534 1535 + spin_lock_bh(&rcfw->tbl_lock); 1535 1536 tbl_indx = map_qp_id_to_tbl_indx(qp->id, rcfw); 1536 1537 rcfw->qp_tbl[tbl_indx].qp_id = BNXT_QPLIB_QP_ID_INVALID; 1537 1538 rcfw->qp_tbl[tbl_indx].qp_handle = NULL; 1539 + spin_unlock_bh(&rcfw->tbl_lock); 1538 1540 1539 1541 bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, 1540 1542 CMDQ_BASE_OPCODE_DESTROY_QP, ··· 1547 1545 sizeof(resp), 0); 1548 1546 rc = bnxt_qplib_rcfw_send_message(rcfw, &msg); 1549 1547 if (rc) { 1548 + spin_lock_bh(&rcfw->tbl_lock); 1550 1549 rcfw->qp_tbl[tbl_indx].qp_id = qp->id; 1551 1550 rcfw->qp_tbl[tbl_indx].qp_handle = qp; 1551 + spin_unlock_bh(&rcfw->tbl_lock); 1552 1552 return rc; 1553 1553 } 1554 1554
+9 -4
drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
··· 634 634 case CREQ_QP_EVENT_EVENT_QP_ERROR_NOTIFICATION: 635 635 err_event = (struct creq_qp_error_notification *)qp_event; 636 636 qp_id = le32_to_cpu(err_event->xid); 637 + spin_lock(&rcfw->tbl_lock); 637 638 tbl_indx = map_qp_id_to_tbl_indx(qp_id, rcfw); 638 639 qp = rcfw->qp_tbl[tbl_indx].qp_handle; 640 + if (!qp) { 641 + spin_unlock(&rcfw->tbl_lock); 642 + break; 643 + } 644 + bnxt_qplib_mark_qp_error(qp); 645 + rc = rcfw->creq.aeq_handler(rcfw, qp_event, qp); 646 + spin_unlock(&rcfw->tbl_lock); 639 647 dev_dbg(&pdev->dev, "Received QP error notification\n"); 640 648 dev_dbg(&pdev->dev, 641 649 "qpid 0x%x, req_err=0x%x, resp_err=0x%x\n", 642 650 qp_id, err_event->req_err_state_reason, 643 651 err_event->res_err_state_reason); 644 - if (!qp) 645 - break; 646 - bnxt_qplib_mark_qp_error(qp); 647 - rc = rcfw->creq.aeq_handler(rcfw, qp_event, qp); 648 652 break; 649 653 default: 650 654 /* ··· 977 973 GFP_KERNEL); 978 974 if (!rcfw->qp_tbl) 979 975 goto fail; 976 + spin_lock_init(&rcfw->tbl_lock); 980 977 981 978 rcfw->max_timeout = res->cctx->hwrm_cmd_max_timeout; 982 979
+2
drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
··· 224 224 struct bnxt_qplib_crsqe *crsqe_tbl; 225 225 int qp_tbl_size; 226 226 struct bnxt_qplib_qp_node *qp_tbl; 227 + /* To synchronize the qp-handle hash table */ 228 + spinlock_t tbl_lock; 227 229 u64 oos_prev; 228 230 u32 init_oos_stats; 229 231 u32 cmdq_depth;