IB/srp: Retry stale connections

When a host just goes away (crash, power loss, etc.) without tearing
down its IB connections, it can get stale connection errors when it
tries to reconnect to targets upon rebooting. Retrying the connection
a few times will prevent sysadmins from playing the "which disk(s)
went missing?" game.

This change would have made tracking down some of the recent bugs
slightly quicker, and it also helps quite a bit when you've got a
large number of targets hanging off a wedged server.

Signed-off-by: David Dillow <dillowda@ornl.gov>
Signed-off-by: Roland Dreier <rolandd@cisco.com>

authored by David Dillow and committed by Roland Dreier 9fe4bcf4 893da759

+42 -12
+41 -12
drivers/infiniband/ulp/srp/ib_srp.c
··· 204 204 return ret; 205 205 } 206 206 207 + static int srp_new_cm_id(struct srp_target_port *target) 208 + { 209 + struct ib_cm_id *new_cm_id; 210 + 211 + new_cm_id = ib_create_cm_id(target->srp_host->dev->dev, 212 + srp_cm_handler, target); 213 + if (IS_ERR(new_cm_id)) 214 + return PTR_ERR(new_cm_id); 215 + 216 + if (target->cm_id) 217 + ib_destroy_cm_id(target->cm_id); 218 + target->cm_id = new_cm_id; 219 + 220 + return 0; 221 + } 222 + 207 223 static int srp_create_target_ib(struct srp_target_port *target) 208 224 { 209 225 struct ib_qp_init_attr *init_attr; ··· 452 436 453 437 static int srp_connect_target(struct srp_target_port *target) 454 438 { 439 + int retries = 3; 455 440 int ret; 456 441 457 442 ret = srp_lookup_path(target); ··· 483 466 break; 484 467 485 468 case SRP_DLID_REDIRECT: 469 + break; 470 + 471 + case SRP_STALE_CONN: 472 + /* Our current CM id was stale, and is now in timewait. 473 + * Try to reconnect with a new one. 474 + */ 475 + if (!retries-- || srp_new_cm_id(target)) { 476 + shost_printk(KERN_ERR, target->scsi_host, PFX 477 + "giving up on stale connection\n"); 478 + target->status = -ECONNRESET; 479 + return target->status; 480 + } 481 + 482 + shost_printk(KERN_ERR, target->scsi_host, PFX 483 + "retrying stale connection\n"); 486 484 break; 487 485 488 486 default: ··· 539 507 540 508 static int srp_reconnect_target(struct srp_target_port *target) 541 509 { 542 - struct ib_cm_id *new_cm_id; 543 510 struct ib_qp_attr qp_attr; 544 511 struct srp_request *req, *tmp; 545 512 struct ib_wc wc; ··· 557 526 * Now get a new local CM ID so that we avoid confusing the 558 527 * target in case things are really fouled up. 
559 528 */ 560 - new_cm_id = ib_create_cm_id(target->srp_host->dev->dev, 561 - srp_cm_handler, target); 562 - if (IS_ERR(new_cm_id)) { 563 - ret = PTR_ERR(new_cm_id); 529 + ret = srp_new_cm_id(target); 530 + if (ret) 564 531 goto err; 565 - } 566 - ib_destroy_cm_id(target->cm_id); 567 - target->cm_id = new_cm_id; 568 532 569 533 qp_attr.qp_state = IB_QPS_RESET; 570 534 ret = ib_modify_qp(target->qp, &qp_attr, IB_QP_STATE); ··· 1195 1169 " REJ reason: IB_CM_REJ_CONSUMER_DEFINED," 1196 1170 " opcode 0x%02x\n", opcode); 1197 1171 target->status = -ECONNRESET; 1172 + break; 1173 + 1174 + case IB_CM_REJ_STALE_CONN: 1175 + shost_printk(KERN_WARNING, shost, " REJ reason: stale connection\n"); 1176 + target->status = SRP_STALE_CONN; 1198 1177 break; 1199 1178 1200 1179 default: ··· 1893 1862 if (ret) 1894 1863 goto err; 1895 1864 1896 - target->cm_id = ib_create_cm_id(host->dev->dev, srp_cm_handler, target); 1897 - if (IS_ERR(target->cm_id)) { 1898 - ret = PTR_ERR(target->cm_id); 1865 + ret = srp_new_cm_id(target); 1866 + if (ret) 1899 1867 goto err_free; 1900 - } 1901 1868 1902 1869 target->qp_in_error = 0; 1903 1870 ret = srp_connect_target(target);
+1
drivers/infiniband/ulp/srp/ib_srp.h
··· 54 54 55 55 SRP_PORT_REDIRECT = 1, 56 56 SRP_DLID_REDIRECT = 2, 57 + SRP_STALE_CONN = 3, 57 58 58 59 SRP_MAX_LUN = 512, 59 60 SRP_DEF_SG_TABLESIZE = 12,