IB/srp: Retry stale connections

When a host just goes away (crash, power loss, etc.) without tearing
down its IB connections, it can get stale connection errors (the target
rejects the request with IB_CM_REJ_STALE_CONN) when it tries to
reconnect to targets upon rebooting. Retrying the connection up to
three times, each time with a fresh CM ID, will prevent sysadmins from
playing the "which disk(s) went missing?" game.

This would have made things slightly quicker when tracking down some
of the recent bugs, but it also helps quite a bit when you've got a
large number of targets hanging off a wedged server.

Signed-off-by: David Dillow <dillowda@ornl.gov>
Signed-off-by: Roland Dreier <rolandd@cisco.com>

authored by David Dillow and committed by Roland Dreier 9fe4bcf4 893da759

+42 -12
+41 -12
drivers/infiniband/ulp/srp/ib_srp.c
··· 204 return ret; 205 } 206 207 static int srp_create_target_ib(struct srp_target_port *target) 208 { 209 struct ib_qp_init_attr *init_attr; ··· 452 453 static int srp_connect_target(struct srp_target_port *target) 454 { 455 int ret; 456 457 ret = srp_lookup_path(target); ··· 483 break; 484 485 case SRP_DLID_REDIRECT: 486 break; 487 488 default: ··· 539 540 static int srp_reconnect_target(struct srp_target_port *target) 541 { 542 - struct ib_cm_id *new_cm_id; 543 struct ib_qp_attr qp_attr; 544 struct srp_request *req, *tmp; 545 struct ib_wc wc; ··· 557 * Now get a new local CM ID so that we avoid confusing the 558 * target in case things are really fouled up. 559 */ 560 - new_cm_id = ib_create_cm_id(target->srp_host->dev->dev, 561 - srp_cm_handler, target); 562 - if (IS_ERR(new_cm_id)) { 563 - ret = PTR_ERR(new_cm_id); 564 goto err; 565 - } 566 - ib_destroy_cm_id(target->cm_id); 567 - target->cm_id = new_cm_id; 568 569 qp_attr.qp_state = IB_QPS_RESET; 570 ret = ib_modify_qp(target->qp, &qp_attr, IB_QP_STATE); ··· 1195 " REJ reason: IB_CM_REJ_CONSUMER_DEFINED," 1196 " opcode 0x%02x\n", opcode); 1197 target->status = -ECONNRESET; 1198 break; 1199 1200 default: ··· 1893 if (ret) 1894 goto err; 1895 1896 - target->cm_id = ib_create_cm_id(host->dev->dev, srp_cm_handler, target); 1897 - if (IS_ERR(target->cm_id)) { 1898 - ret = PTR_ERR(target->cm_id); 1899 goto err_free; 1900 - } 1901 1902 target->qp_in_error = 0; 1903 ret = srp_connect_target(target);
··· 204 return ret; 205 } 206 207 + static int srp_new_cm_id(struct srp_target_port *target) 208 + { 209 + struct ib_cm_id *new_cm_id; 210 + 211 + new_cm_id = ib_create_cm_id(target->srp_host->dev->dev, 212 + srp_cm_handler, target); 213 + if (IS_ERR(new_cm_id)) 214 + return PTR_ERR(new_cm_id); 215 + 216 + if (target->cm_id) 217 + ib_destroy_cm_id(target->cm_id); 218 + target->cm_id = new_cm_id; 219 + 220 + return 0; 221 + } 222 + 223 static int srp_create_target_ib(struct srp_target_port *target) 224 { 225 struct ib_qp_init_attr *init_attr; ··· 436 437 static int srp_connect_target(struct srp_target_port *target) 438 { 439 + int retries = 3; 440 int ret; 441 442 ret = srp_lookup_path(target); ··· 466 break; 467 468 case SRP_DLID_REDIRECT: 469 + break; 470 + 471 + case SRP_STALE_CONN: 472 + /* Our current CM id was stale, and is now in timewait. 473 + * Try to reconnect with a new one. 474 + */ 475 + if (!retries-- || srp_new_cm_id(target)) { 476 + shost_printk(KERN_ERR, target->scsi_host, PFX 477 + "giving up on stale connection\n"); 478 + target->status = -ECONNRESET; 479 + return target->status; 480 + } 481 + 482 + shost_printk(KERN_ERR, target->scsi_host, PFX 483 + "retrying stale connection\n"); 484 break; 485 486 default: ··· 507 508 static int srp_reconnect_target(struct srp_target_port *target) 509 { 510 struct ib_qp_attr qp_attr; 511 struct srp_request *req, *tmp; 512 struct ib_wc wc; ··· 526 * Now get a new local CM ID so that we avoid confusing the 527 * target in case things are really fouled up. 
528 */ 529 + ret = srp_new_cm_id(target); 530 + if (ret) 531 goto err; 532 533 qp_attr.qp_state = IB_QPS_RESET; 534 ret = ib_modify_qp(target->qp, &qp_attr, IB_QP_STATE); ··· 1169 " REJ reason: IB_CM_REJ_CONSUMER_DEFINED," 1170 " opcode 0x%02x\n", opcode); 1171 target->status = -ECONNRESET; 1172 + break; 1173 + 1174 + case IB_CM_REJ_STALE_CONN: 1175 + shost_printk(KERN_WARNING, shost, " REJ reason: stale connection\n"); 1176 + target->status = SRP_STALE_CONN; 1177 break; 1178 1179 default: ··· 1862 if (ret) 1863 goto err; 1864 1865 + ret = srp_new_cm_id(target); 1866 + if (ret) 1867 goto err_free; 1868 1869 target->qp_in_error = 0; 1870 ret = srp_connect_target(target);
+1
drivers/infiniband/ulp/srp/ib_srp.h
··· 54 55 SRP_PORT_REDIRECT = 1, 56 SRP_DLID_REDIRECT = 2, 57 58 SRP_MAX_LUN = 512, 59 SRP_DEF_SG_TABLESIZE = 12,
··· 54 55 SRP_PORT_REDIRECT = 1, 56 SRP_DLID_REDIRECT = 2, 57 + SRP_STALE_CONN = 3, 58 59 SRP_MAX_LUN = 512, 60 SRP_DEF_SG_TABLESIZE = 12,