Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net/smc: fix kernel panic caused by race of smc_sock

A crash occurs when smc_cdc_tx_handler() tries to access smc_sock
but smc_release() has already freed it.

[ 4570.695099] BUG: unable to handle page fault for address: 000000002eae9e88
[ 4570.696048] #PF: supervisor write access in kernel mode
[ 4570.696728] #PF: error_code(0x0002) - not-present page
[ 4570.697401] PGD 0 P4D 0
[ 4570.697716] Oops: 0002 [#1] PREEMPT SMP NOPTI
[ 4570.698228] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.16.0-rc4+ #111
[ 4570.699013] Hardware name: Alibaba Cloud Alibaba Cloud ECS, BIOS 8c24b4c 04/0
[ 4570.699933] RIP: 0010:_raw_spin_lock+0x1a/0x30
<...>
[ 4570.711446] Call Trace:
[ 4570.711746] <IRQ>
[ 4570.711992] smc_cdc_tx_handler+0x41/0xc0
[ 4570.712470] smc_wr_tx_tasklet_fn+0x213/0x560
[ 4570.712981] ? smc_cdc_tx_dismisser+0x10/0x10
[ 4570.713489] tasklet_action_common.isra.17+0x66/0x140
[ 4570.714083] __do_softirq+0x123/0x2f4
[ 4570.714521] irq_exit_rcu+0xc4/0xf0
[ 4570.714934] common_interrupt+0xba/0xe0

Though smc_cdc_tx_handler() checked the existence of the smc
connection, smc_release() may have already dismissed and released the
smc socket before smc_cdc_tx_handler() further accessed it.

smc_cdc_tx_handler() |smc_release()
if (!conn) |
|
|smc_cdc_tx_dismiss_slots()
| smc_cdc_tx_dismisser()
|
|sock_put(&smc->sk) <- last sock_put,
| smc_sock freed
bh_lock_sock(&smc->sk) (panic) |

To make sure we won't receive any CDC messages after we free the
smc_sock, add a refcount on the smc_connection for inflight CDC
messages (posted to the QP but whose related CQE has not yet been
received), and don't release the smc_connection until all the inflight
CDC messages have been done, whether successful or failed.

Using a refcount on CDC messages brings another problem: when the link
is going to be destroyed, smcr_link_clear() will reset the QP, which
then removes all the pending CQEs related to the QP from the CQ. To make
sure all the CQEs will always come back so the refcount on the
smc_connection can always reach 0, smc_ib_modify_qp_reset() was replaced
by smc_ib_modify_qp_error().
And remove the timeout in smc_wr_tx_wait_no_pending_sends() since we
need to wait for all pending WQEs to complete, or we may encounter a
use-after-free when handling CQEs.

For the IB device removal routine, we need to wait for all the QPs on
that device to be destroyed before we can destroy the CQs on the device,
or the refcount on smc_connection won't reach 0 and smc_sock cannot be
released.

Fixes: 5f08318f617b ("smc: connection data control (CDC)")
Reported-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: Dust Li <dust.li@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Dust Li and committed by
David S. Miller
349d4312 90cee52f

+57 -76
+5
net/smc/smc.h
··· 180 180 u16 tx_cdc_seq; /* sequence # for CDC send */ 181 181 u16 tx_cdc_seq_fin; /* sequence # - tx completed */ 182 182 spinlock_t send_lock; /* protect wr_sends */ 183 + atomic_t cdc_pend_tx_wr; /* number of pending tx CDC wqe 184 + * - inc when post wqe, 185 + * - dec on polled tx cqe 186 + */ 187 + wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/ 183 188 struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ 184 189 u32 tx_off; /* base offset in peer rmb */ 185 190
+24 -28
net/smc/smc_cdc.c
··· 31 31 struct smc_sock *smc; 32 32 int diff; 33 33 34 - if (!conn) 35 - /* already dismissed */ 36 - return; 37 - 38 34 smc = container_of(conn, struct smc_sock, conn); 39 35 bh_lock_sock(&smc->sk); 40 36 if (!wc_status) { ··· 47 51 conn); 48 52 conn->tx_cdc_seq_fin = cdcpend->ctrl_seq; 49 53 } 54 + 55 + if (atomic_dec_and_test(&conn->cdc_pend_tx_wr) && 56 + unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq))) 57 + wake_up(&conn->cdc_pend_tx_wq); 58 + WARN_ON(atomic_read(&conn->cdc_pend_tx_wr) < 0); 59 + 50 60 smc_tx_sndbuf_nonfull(smc); 51 61 bh_unlock_sock(&smc->sk); 52 62 } ··· 109 107 conn->tx_cdc_seq++; 110 108 conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; 111 109 smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, conn, &cfed); 110 + 111 + atomic_inc(&conn->cdc_pend_tx_wr); 112 + smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */ 113 + 112 114 rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); 113 115 if (!rc) { 114 116 smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn); ··· 120 114 } else { 121 115 conn->tx_cdc_seq--; 122 116 conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; 117 + atomic_dec(&conn->cdc_pend_tx_wr); 123 118 } 124 119 125 120 return rc; ··· 143 136 peer->token = htonl(local->token); 144 137 peer->prod_flags.failover_validation = 1; 145 138 139 + /* We need to set pend->conn here to make sure smc_cdc_tx_handler() 140 + * can handle properly 141 + */ 142 + smc_cdc_add_pending_send(conn, pend); 143 + 144 + atomic_inc(&conn->cdc_pend_tx_wr); 145 + smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */ 146 + 146 147 rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); 148 + if (unlikely(rc)) 149 + atomic_dec(&conn->cdc_pend_tx_wr); 150 + 147 151 return rc; 148 152 } 149 153 ··· 211 193 return rc; 212 194 } 213 195 214 - static bool smc_cdc_tx_filter(struct smc_wr_tx_pend_priv *tx_pend, 215 - unsigned long data) 196 + void smc_cdc_wait_pend_tx_wr(struct smc_connection *conn) 216 197 { 217 - 
struct smc_connection *conn = (struct smc_connection *)data; 218 - struct smc_cdc_tx_pend *cdc_pend = 219 - (struct smc_cdc_tx_pend *)tx_pend; 220 - 221 - return cdc_pend->conn == conn; 222 - } 223 - 224 - static void smc_cdc_tx_dismisser(struct smc_wr_tx_pend_priv *tx_pend) 225 - { 226 - struct smc_cdc_tx_pend *cdc_pend = 227 - (struct smc_cdc_tx_pend *)tx_pend; 228 - 229 - cdc_pend->conn = NULL; 230 - } 231 - 232 - void smc_cdc_tx_dismiss_slots(struct smc_connection *conn) 233 - { 234 - struct smc_link *link = conn->lnk; 235 - 236 - smc_wr_tx_dismiss_slots(link, SMC_CDC_MSG_TYPE, 237 - smc_cdc_tx_filter, smc_cdc_tx_dismisser, 238 - (unsigned long)conn); 198 + wait_event(conn->cdc_pend_tx_wq, !atomic_read(&conn->cdc_pend_tx_wr)); 239 199 } 240 200 241 201 /* Send a SMC-D CDC header.
+1 -1
net/smc/smc_cdc.h
··· 291 291 struct smc_wr_buf **wr_buf, 292 292 struct smc_rdma_wr **wr_rdma_buf, 293 293 struct smc_cdc_tx_pend **pend); 294 - void smc_cdc_tx_dismiss_slots(struct smc_connection *conn); 294 + void smc_cdc_wait_pend_tx_wr(struct smc_connection *conn); 295 295 int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, 296 296 struct smc_cdc_tx_pend *pend); 297 297 int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn);
+20 -5
net/smc/smc_core.c
··· 1127 1127 smc_ism_unset_conn(conn); 1128 1128 tasklet_kill(&conn->rx_tsklet); 1129 1129 } else { 1130 - smc_cdc_tx_dismiss_slots(conn); 1130 + smc_cdc_wait_pend_tx_wr(conn); 1131 1131 if (current_work() != &conn->abort_work) 1132 1132 cancel_work_sync(&conn->abort_work); 1133 1133 } ··· 1204 1204 smc_llc_link_clear(lnk, log); 1205 1205 smcr_buf_unmap_lgr(lnk); 1206 1206 smcr_rtoken_clear_link(lnk); 1207 - smc_ib_modify_qp_reset(lnk); 1207 + smc_ib_modify_qp_error(lnk); 1208 1208 smc_wr_free_link(lnk); 1209 1209 smc_ib_destroy_queue_pair(lnk); 1210 1210 smc_ib_dealloc_protection_domain(lnk); ··· 1336 1336 else 1337 1337 tasklet_unlock_wait(&conn->rx_tsklet); 1338 1338 } else { 1339 - smc_cdc_tx_dismiss_slots(conn); 1339 + smc_cdc_wait_pend_tx_wr(conn); 1340 1340 } 1341 1341 smc_lgr_unregister_conn(conn); 1342 1342 smc_close_active_abort(smc); ··· 1459 1459 /* Called when an SMCR device is removed or the smc module is unloaded. 1460 1460 * If smcibdev is given, all SMCR link groups using this device are terminated. 1461 1461 * If smcibdev is NULL, all SMCR link groups are terminated. 1462 + * 1463 + * We must wait here for QPs been destroyed before we destroy the CQs, 1464 + * or we won't received any CQEs and cdc_pend_tx_wr cannot reach 0 thus 1465 + * smc_sock cannot be released. 
1462 1466 */ 1463 1467 void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) 1464 1468 { 1465 1469 struct smc_link_group *lgr, *lg; 1466 1470 LIST_HEAD(lgr_free_list); 1471 + LIST_HEAD(lgr_linkdown_list); 1467 1472 int i; 1468 1473 1469 1474 spin_lock_bh(&smc_lgr_list.lock); ··· 1480 1475 list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { 1481 1476 for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { 1482 1477 if (lgr->lnk[i].smcibdev == smcibdev) 1483 - smcr_link_down_cond_sched(&lgr->lnk[i]); 1478 + list_move_tail(&lgr->list, &lgr_linkdown_list); 1484 1479 } 1485 1480 } 1486 1481 } ··· 1490 1485 list_del_init(&lgr->list); 1491 1486 smc_llc_set_termination_rsn(lgr, SMC_LLC_DEL_OP_INIT_TERM); 1492 1487 __smc_lgr_terminate(lgr, false); 1488 + } 1489 + 1490 + list_for_each_entry_safe(lgr, lg, &lgr_linkdown_list, list) { 1491 + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { 1492 + if (lgr->lnk[i].smcibdev == smcibdev) { 1493 + mutex_lock(&lgr->llc_conf_mutex); 1494 + smcr_link_down_cond(&lgr->lnk[i]); 1495 + mutex_unlock(&lgr->llc_conf_mutex); 1496 + } 1497 + } 1493 1498 } 1494 1499 1495 1500 if (smcibdev) { ··· 1601 1586 if (!lgr || lnk->state == SMC_LNK_UNUSED || list_empty(&lgr->list)) 1602 1587 return; 1603 1588 1604 - smc_ib_modify_qp_reset(lnk); 1605 1589 to_lnk = smc_switch_conns(lgr, lnk, true); 1606 1590 if (!to_lnk) { /* no backup link available */ 1607 1591 smcr_link_clear(lnk, true); ··· 1838 1824 conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; 1839 1825 conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; 1840 1826 conn->urg_state = SMC_URG_READ; 1827 + init_waitqueue_head(&conn->cdc_pend_tx_wq); 1841 1828 INIT_WORK(&smc->conn.abort_work, smc_conn_abort_work); 1842 1829 if (ini->is_smcd) { 1843 1830 conn->rx_off = sizeof(struct smcd_cdc_msg);
+2 -2
net/smc/smc_ib.c
··· 109 109 IB_QP_MAX_QP_RD_ATOMIC); 110 110 } 111 111 112 - int smc_ib_modify_qp_reset(struct smc_link *lnk) 112 + int smc_ib_modify_qp_error(struct smc_link *lnk) 113 113 { 114 114 struct ib_qp_attr qp_attr; 115 115 116 116 memset(&qp_attr, 0, sizeof(qp_attr)); 117 - qp_attr.qp_state = IB_QPS_RESET; 117 + qp_attr.qp_state = IB_QPS_ERR; 118 118 return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE); 119 119 } 120 120
+1
net/smc/smc_ib.h
··· 90 90 int smc_ib_ready_link(struct smc_link *lnk); 91 91 int smc_ib_modify_qp_rts(struct smc_link *lnk); 92 92 int smc_ib_modify_qp_reset(struct smc_link *lnk); 93 + int smc_ib_modify_qp_error(struct smc_link *lnk); 93 94 long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev); 94 95 int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, 95 96 struct smc_buf_desc *buf_slot, u8 link_idx);
+3 -38
net/smc/smc_wr.c
··· 62 62 } 63 63 64 64 /* wait till all pending tx work requests on the given link are completed */ 65 - int smc_wr_tx_wait_no_pending_sends(struct smc_link *link) 65 + void smc_wr_tx_wait_no_pending_sends(struct smc_link *link) 66 66 { 67 - if (wait_event_timeout(link->wr_tx_wait, !smc_wr_is_tx_pend(link), 68 - SMC_WR_TX_WAIT_PENDING_TIME)) 69 - return 0; 70 - else /* timeout */ 71 - return -EPIPE; 67 + wait_event(link->wr_tx_wait, !smc_wr_is_tx_pend(link)); 72 68 } 73 69 74 70 static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id) ··· 83 87 struct smc_wr_tx_pend pnd_snd; 84 88 struct smc_link *link; 85 89 u32 pnd_snd_idx; 86 - int i; 87 90 88 91 link = wc->qp->qp_context; 89 92 ··· 123 128 } 124 129 125 130 if (wc->status) { 126 - for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) { 127 - /* clear full struct smc_wr_tx_pend including .priv */ 128 - memset(&link->wr_tx_pends[i], 0, 129 - sizeof(link->wr_tx_pends[i])); 130 - memset(&link->wr_tx_bufs[i], 0, 131 - sizeof(link->wr_tx_bufs[i])); 132 - clear_bit(i, link->wr_tx_mask); 133 - } 134 131 if (link->lgr->smc_version == SMC_V2) { 135 132 memset(link->wr_tx_v2_pend, 0, 136 133 sizeof(*link->wr_tx_v2_pend)); ··· 408 421 return rc; 409 422 } 410 423 411 - void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_tx_hdr_type, 412 - smc_wr_tx_filter filter, 413 - smc_wr_tx_dismisser dismisser, 414 - unsigned long data) 415 - { 416 - struct smc_wr_tx_pend_priv *tx_pend; 417 - struct smc_wr_rx_hdr *wr_tx; 418 - int i; 419 - 420 - for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) { 421 - wr_tx = (struct smc_wr_rx_hdr *)&link->wr_tx_bufs[i]; 422 - if (wr_tx->type != wr_tx_hdr_type) 423 - continue; 424 - tx_pend = &link->wr_tx_pends[i].priv; 425 - if (filter(tx_pend, data)) 426 - dismisser(tx_pend); 427 - } 428 - } 429 - 430 424 /****************************** receive queue ********************************/ 431 425 432 426 int smc_wr_rx_register_handler(struct smc_wr_rx_handler 
*handler) ··· 643 675 smc_wr_wakeup_reg_wait(lnk); 644 676 smc_wr_wakeup_tx_wait(lnk); 645 677 646 - if (smc_wr_tx_wait_no_pending_sends(lnk)) 647 - memset(lnk->wr_tx_mask, 0, 648 - BITS_TO_LONGS(SMC_WR_BUF_CNT) * 649 - sizeof(*lnk->wr_tx_mask)); 678 + smc_wr_tx_wait_no_pending_sends(lnk); 650 679 wait_event(lnk->wr_reg_wait, (!atomic_read(&lnk->wr_reg_refcnt))); 651 680 wait_event(lnk->wr_tx_wait, (!atomic_read(&lnk->wr_tx_refcnt))); 652 681
+1 -2
net/smc/smc_wr.h
··· 22 22 #define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */ 23 23 24 24 #define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ) 25 - #define SMC_WR_TX_WAIT_PENDING_TIME (5 * HZ) 26 25 27 26 #define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */ 28 27 ··· 129 130 smc_wr_tx_filter filter, 130 131 smc_wr_tx_dismisser dismisser, 131 132 unsigned long data); 132 - int smc_wr_tx_wait_no_pending_sends(struct smc_link *link); 133 + void smc_wr_tx_wait_no_pending_sends(struct smc_link *link); 133 134 134 135 int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler); 135 136 int smc_wr_rx_post_init(struct smc_link *link);