Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net/smc: switch connections to alternate link

Add smc_switch_conns() to switch all connections from a link that is
going down. Find another link to switch the connections to, and
switch each connection to the new link. smc_switch_cursor() updates the
cursors of a connection to the state of the last successfully sent CDC
message. When there is no link to switch to, terminate the link group.
Call smc_switch_conns() when a link is going down.
Since the link of a connection can now change, adapt the CDC and TX
functions to detect and handle link switches.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Reviewed-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Karsten Graul and committed by
David S. Miller
c6f02ebe f0ec4f1d

+162 -9
+16 -2
net/smc/smc_cdc.c
··· 56 56 } 57 57 58 58 int smc_cdc_get_free_slot(struct smc_connection *conn, 59 + struct smc_link *link, 59 60 struct smc_wr_buf **wr_buf, 60 61 struct smc_rdma_wr **wr_rdma_buf, 61 62 struct smc_cdc_tx_pend **pend) 62 63 { 63 - struct smc_link *link = conn->lnk; 64 64 int rc; 65 65 66 66 rc = smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf, ··· 119 119 { 120 120 struct smc_cdc_tx_pend *pend; 121 121 struct smc_wr_buf *wr_buf; 122 + struct smc_link *link; 123 + bool again = false; 122 124 int rc; 123 125 124 - rc = smc_cdc_get_free_slot(conn, &wr_buf, NULL, &pend); 126 + again: 127 + link = conn->lnk; 128 + rc = smc_cdc_get_free_slot(conn, link, &wr_buf, NULL, &pend); 125 129 if (rc) 126 130 return rc; 127 131 128 132 spin_lock_bh(&conn->send_lock); 133 + if (link != conn->lnk) { 134 + /* link of connection changed, try again one time*/ 135 + spin_unlock_bh(&conn->send_lock); 136 + smc_wr_tx_put_slot(link, 137 + (struct smc_wr_tx_pend_priv *)pend); 138 + if (again) 139 + return -ENOLINK; 140 + again = true; 141 + goto again; 142 + } 129 143 rc = smc_cdc_msg_send(conn, wr_buf, pend); 130 144 spin_unlock_bh(&conn->send_lock); 131 145 return rc;
+1
net/smc/smc_cdc.h
··· 304 304 }; 305 305 306 306 int smc_cdc_get_free_slot(struct smc_connection *conn, 307 + struct smc_link *link, 307 308 struct smc_wr_buf **wr_buf, 308 309 struct smc_rdma_wr **wr_rdma_buf, 309 310 struct smc_cdc_tx_pend **pend);
+130 -2
net/smc/smc_core.c
··· 432 432 return rc; 433 433 } 434 434 435 + static int smc_write_space(struct smc_connection *conn) 436 + { 437 + int buffer_len = conn->peer_rmbe_size; 438 + union smc_host_cursor prod; 439 + union smc_host_cursor cons; 440 + int space; 441 + 442 + smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn); 443 + smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn); 444 + /* determine rx_buf space */ 445 + space = buffer_len - smc_curs_diff(buffer_len, &cons, &prod); 446 + return space; 447 + } 448 + 449 + static int smc_switch_cursor(struct smc_sock *smc) 450 + { 451 + struct smc_connection *conn = &smc->conn; 452 + union smc_host_cursor cons, fin; 453 + int rc = 0; 454 + int diff; 455 + 456 + smc_curs_copy(&conn->tx_curs_sent, &conn->tx_curs_fin, conn); 457 + smc_curs_copy(&fin, &conn->local_tx_ctrl_fin, conn); 458 + /* set prod cursor to old state, enforce tx_rdma_writes() */ 459 + smc_curs_copy(&conn->local_tx_ctrl.prod, &fin, conn); 460 + smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn); 461 + 462 + if (smc_curs_comp(conn->peer_rmbe_size, &cons, &fin) < 0) { 463 + /* cons cursor advanced more than fin, and prod was set 464 + * fin above, so now prod is smaller than cons. Fix that. 
465 + */ 466 + diff = smc_curs_diff(conn->peer_rmbe_size, &fin, &cons); 467 + smc_curs_add(conn->sndbuf_desc->len, 468 + &conn->tx_curs_sent, diff); 469 + smc_curs_add(conn->sndbuf_desc->len, 470 + &conn->tx_curs_fin, diff); 471 + 472 + smp_mb__before_atomic(); 473 + atomic_add(diff, &conn->sndbuf_space); 474 + smp_mb__after_atomic(); 475 + 476 + smc_curs_add(conn->peer_rmbe_size, 477 + &conn->local_tx_ctrl.prod, diff); 478 + smc_curs_add(conn->peer_rmbe_size, 479 + &conn->local_tx_ctrl_fin, diff); 480 + } 481 + /* recalculate, value is used by tx_rdma_writes() */ 482 + atomic_set(&smc->conn.peer_rmbe_space, smc_write_space(conn)); 483 + 484 + if (smc->sk.sk_state != SMC_INIT && 485 + smc->sk.sk_state != SMC_CLOSED) { 486 + /* tbd: call rc = smc_cdc_get_slot_and_msg_send(conn); */ 487 + if (!rc) { 488 + schedule_delayed_work(&conn->tx_work, 0); 489 + smc->sk.sk_data_ready(&smc->sk); 490 + } 491 + } 492 + return rc; 493 + } 494 + 495 + struct smc_link *smc_switch_conns(struct smc_link_group *lgr, 496 + struct smc_link *from_lnk, bool is_dev_err) 497 + { 498 + struct smc_link *to_lnk = NULL; 499 + struct smc_connection *conn; 500 + struct smc_sock *smc; 501 + struct rb_node *node; 502 + int i, rc = 0; 503 + 504 + /* link is inactive, wake up tx waiters */ 505 + smc_wr_wakeup_tx_wait(from_lnk); 506 + 507 + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { 508 + if (lgr->lnk[i].state != SMC_LNK_ACTIVE || 509 + i == from_lnk->link_idx) 510 + continue; 511 + if (is_dev_err && from_lnk->smcibdev == lgr->lnk[i].smcibdev && 512 + from_lnk->ibport == lgr->lnk[i].ibport) { 513 + continue; 514 + } 515 + to_lnk = &lgr->lnk[i]; 516 + break; 517 + } 518 + if (!to_lnk) { 519 + smc_lgr_terminate_sched(lgr); 520 + return NULL; 521 + } 522 + again: 523 + read_lock_bh(&lgr->conns_lock); 524 + for (node = rb_first(&lgr->conns_all); node; node = rb_next(node)) { 525 + conn = rb_entry(node, struct smc_connection, alert_node); 526 + if (conn->lnk != from_lnk) 527 + continue; 528 + smc = 
container_of(conn, struct smc_sock, conn); 529 + /* conn->lnk not yet set in SMC_INIT state */ 530 + if (smc->sk.sk_state == SMC_INIT) 531 + continue; 532 + if (smc->sk.sk_state == SMC_CLOSED || 533 + smc->sk.sk_state == SMC_PEERCLOSEWAIT1 || 534 + smc->sk.sk_state == SMC_PEERCLOSEWAIT2 || 535 + smc->sk.sk_state == SMC_APPFINCLOSEWAIT || 536 + smc->sk.sk_state == SMC_APPCLOSEWAIT1 || 537 + smc->sk.sk_state == SMC_APPCLOSEWAIT2 || 538 + smc->sk.sk_state == SMC_PEERFINCLOSEWAIT || 539 + smc->sk.sk_state == SMC_PEERABORTWAIT || 540 + smc->sk.sk_state == SMC_PROCESSABORT) { 541 + spin_lock_bh(&conn->send_lock); 542 + conn->lnk = to_lnk; 543 + spin_unlock_bh(&conn->send_lock); 544 + continue; 545 + } 546 + sock_hold(&smc->sk); 547 + read_unlock_bh(&lgr->conns_lock); 548 + /* avoid race with smcr_tx_sndbuf_nonempty() */ 549 + spin_lock_bh(&conn->send_lock); 550 + conn->lnk = to_lnk; 551 + rc = smc_switch_cursor(smc); 552 + spin_unlock_bh(&conn->send_lock); 553 + sock_put(&smc->sk); 554 + if (rc) { 555 + smcr_link_down_cond_sched(to_lnk); 556 + return NULL; 557 + } 558 + goto again; 559 + } 560 + read_unlock_bh(&lgr->conns_lock); 561 + return to_lnk; 562 + } 563 + 435 564 static void smcr_buf_unuse(struct smc_buf_desc *rmb_desc, 436 565 struct smc_link_group *lgr) 437 566 { ··· 1072 943 return; 1073 944 1074 945 smc_ib_modify_qp_reset(lnk); 1075 - to_lnk = NULL; 1076 - /* tbd: call to_lnk = smc_switch_conns(lgr, lnk, true); */ 946 + to_lnk = smc_switch_conns(lgr, lnk, true); 1077 947 if (!to_lnk) { /* no backup link available */ 1078 948 smcr_link_clear(lnk); 1079 949 return;
+2
net/smc/smc_core.h
··· 380 380 int smcr_buf_map_lgr(struct smc_link *lnk); 381 381 int smcr_buf_reg_lgr(struct smc_link *lnk); 382 382 int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc); 383 + struct smc_link *smc_switch_conns(struct smc_link_group *lgr, 384 + struct smc_link *from_lnk, bool is_dev_err); 383 385 void smcr_link_down_cond(struct smc_link *lnk); 384 386 void smcr_link_down_cond_sched(struct smc_link *lnk); 385 387
+3 -3
net/smc/smc_llc.c
··· 933 933 return; /* no asymmetric link */ 934 934 if (!smc_link_downing(&lnk_asym->state)) 935 935 return; 936 - /* tbd: lnk_new = smc_switch_conns(lgr, lnk_asym, false); */ 936 + lnk_new = smc_switch_conns(lgr, lnk_asym, false); 937 937 smc_wr_tx_wait_no_pending_sends(lnk_asym); 938 938 if (!lnk_new) 939 939 goto out_free; ··· 1195 1195 smc_llc_send_message(lnk, &qentry->msg); /* response */ 1196 1196 1197 1197 if (smc_link_downing(&lnk_del->state)) { 1198 - /* tbd: call smc_switch_conns(lgr, lnk_del, false); */ 1198 + smc_switch_conns(lgr, lnk_del, false); 1199 1199 smc_wr_tx_wait_no_pending_sends(lnk_del); 1200 1200 } 1201 1201 smcr_link_clear(lnk_del); ··· 1245 1245 goto out; /* asymmetric link already deleted */ 1246 1246 1247 1247 if (smc_link_downing(&lnk_del->state)) { 1248 - /* tbd: call smc_switch_conns(lgr, lnk_del, false); */ 1248 + smc_switch_conns(lgr, lnk_del, false); 1249 1249 smc_wr_tx_wait_no_pending_sends(lnk_del); 1250 1250 } 1251 1251 if (!list_empty(&lgr->list)) {
+10 -2
net/smc/smc_tx.c
··· 482 482 static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) 483 483 { 484 484 struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags; 485 + struct smc_link *link = conn->lnk; 485 486 struct smc_rdma_wr *wr_rdma_buf; 486 487 struct smc_cdc_tx_pend *pend; 487 488 struct smc_wr_buf *wr_buf; 488 489 int rc; 489 490 490 - rc = smc_cdc_get_free_slot(conn, &wr_buf, &wr_rdma_buf, &pend); 491 + rc = smc_cdc_get_free_slot(conn, link, &wr_buf, &wr_rdma_buf, &pend); 491 492 if (rc < 0) { 492 493 if (rc == -EBUSY) { 493 494 struct smc_sock *smc = ··· 506 505 } 507 506 508 507 spin_lock_bh(&conn->send_lock); 508 + if (link != conn->lnk) { 509 + /* link of connection changed, tx_work will restart */ 510 + smc_wr_tx_put_slot(link, 511 + (struct smc_wr_tx_pend_priv *)pend); 512 + rc = -ENOLINK; 513 + goto out_unlock; 514 + } 509 515 if (!pflags->urg_data_present) { 510 516 rc = smc_tx_rdma_writes(conn, wr_rdma_buf); 511 517 if (rc) { 512 - smc_wr_tx_put_slot(conn->lnk, 518 + smc_wr_tx_put_slot(link, 513 519 (struct smc_wr_tx_pend_priv *)pend); 514 520 goto out_unlock; 515 521 }