[SCSI] fnic: Hitting BUG_ON(io_req->abts_done) in fnic_rport_exch_reset

We hit BUG_ON(io_req->abts_done) in fnic_rport_exch_reset() because of a
timing issue, and to some extent a locking issue, when the abort and the
terminate are issued at around the same time.

The code changes update CMD_STATE(sc) and io_req->abts_done together, under
the same per-tag io_req lock, so the two can no longer get out of sync.
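
For illustration only (not part of the patch): the locking pattern the change
moves to can be sketched in plain user-space C with pthreads. The names
io_lock_tag(), io_table, io_req.state and the state value used below are made
up for this sketch, but the ordering mirrors the patch: take the per-tag lock
first, then look up the command, and only then touch the state and the
abts_done pointer, so both are always read and written under one lock.

    /* Minimal user-space sketch of the "lock by tag before lookup" pattern. */
    #include <pthread.h>
    #include <stddef.h>

    #define IO_LOCKS   64            /* power of two, like FNIC_IO_LOCKS */
    #define MAX_IO_REQ 2048

    struct io_req {
            int state;                /* stands in for CMD_STATE(sc)     */
            void *abts_done;          /* stands in for io_req->abts_done */
    };

    static pthread_mutex_t io_req_lock[IO_LOCKS];
    static struct io_req *io_table[MAX_IO_REQ];

    /* analogue of the new fnic_io_lock_tag(): hash on the tag, not on sc */
    static pthread_mutex_t *io_lock_tag(int tag)
    {
            return &io_req_lock[tag & (IO_LOCKS - 1)];
    }

    static void exch_reset_sketch(void)
    {
            for (int tag = 0; tag < MAX_IO_REQ; tag++) {
                    pthread_mutex_t *lock = io_lock_tag(tag);

                    pthread_mutex_lock(lock);              /* lock first ...   */
                    struct io_req *io_req = io_table[tag]; /* ... then look up */
                    if (!io_req) {
                            pthread_mutex_unlock(lock);
                            continue;
                    }
                    /*
                     * state and abts_done are checked and updated under the
                     * same lock the abort-completion path must also take, so
                     * they cannot be observed out of sync.
                     */
                    io_req->state = 1;        /* e.g. "abort pending" */
                    io_req->abts_done = NULL;
                    pthread_mutex_unlock(lock);
            }
    }

    int main(void)
    {
            for (int i = 0; i < IO_LOCKS; i++)
                    pthread_mutex_init(&io_req_lock[i], NULL);
            exch_reset_sketch();
            return 0;
    }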

Signed-off-by: Sesidhar Beddel <sebaddel@cisco.com>
Signed-off-by: Hiral Patel <hiralpat@cisco.com>
Signed-off-by: James Bottomley <JBottomley@Parallels.com>

authored by Sesidhar Beddel and committed by James Bottomley 1259c5dc 318c7c43

+45 -31
drivers/scsi/fnic/fnic_scsi.c
···
 	return &fnic->io_req_lock[hash];
 }
 
+static inline spinlock_t *fnic_io_lock_tag(struct fnic *fnic,
+					    int tag)
+{
+	return &fnic->io_req_lock[tag & (FNIC_IO_LOCKS - 1)];
+}
+
 /*
  * Unmap the data buffer and sense buffer for an io_req,
  * also unmap and free the device-private scatter/gather list.
···
 			spin_unlock_irqrestore(io_lock, flags);
 			return;
 		}
-		CMD_STATE(sc) = FNIC_IOREQ_ABTS_COMPLETE;
 		CMD_ABTS_STATUS(sc) = hdr_status;
-
 		CMD_FLAGS(sc) |= FNIC_IO_ABT_TERM_DONE;
 		FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host,
 			      "abts cmpl recd. id %d status %s\n",
···
 
 static void fnic_cleanup_io(struct fnic *fnic, int exclude_id)
 {
-	unsigned int i;
+	int i;
 	struct fnic_io_req *io_req;
 	unsigned long flags = 0;
 	struct scsi_cmnd *sc;
···
 		if (i == exclude_id)
 			continue;
 
-		sc = scsi_host_find_tag(fnic->lport->host, i);
-		if (!sc)
-			continue;
-
-		io_lock = fnic_io_lock_hash(fnic, sc);
+		io_lock = fnic_io_lock_tag(fnic, i);
 		spin_lock_irqsave(io_lock, flags);
+		sc = scsi_host_find_tag(fnic->lport->host, i);
+		if (!sc) {
+			spin_unlock_irqrestore(io_lock, flags);
+			continue;
+		}
+
 		io_req = (struct fnic_io_req *)CMD_SP(sc);
 		if ((CMD_FLAGS(sc) & FNIC_DEVICE_RESET) &&
 		    !(CMD_FLAGS(sc) & FNIC_DEV_RST_DONE)) {
···
 
 	for (tag = 0; tag < FNIC_MAX_IO_REQ; tag++) {
 		abt_tag = tag;
-		sc = scsi_host_find_tag(fnic->lport->host, tag);
-		if (!sc)
-			continue;
-
-		io_lock = fnic_io_lock_hash(fnic, sc);
+		io_lock = fnic_io_lock_tag(fnic, tag);
 		spin_lock_irqsave(io_lock, flags);
+		sc = scsi_host_find_tag(fnic->lport->host, tag);
+		if (!sc) {
+			spin_unlock_irqrestore(io_lock, flags);
+			continue;
+		}
 
 		io_req = (struct fnic_io_req *)CMD_SP(sc);
 
···
 
 	for (tag = 0; tag < FNIC_MAX_IO_REQ; tag++) {
 		abt_tag = tag;
+		io_lock = fnic_io_lock_tag(fnic, tag);
+		spin_lock_irqsave(io_lock, flags);
 		sc = scsi_host_find_tag(fnic->lport->host, tag);
-		if (!sc)
+		if (!sc) {
+			spin_unlock_irqrestore(io_lock, flags);
 			continue;
+		}
 
 		cmd_rport = starget_to_rport(scsi_target(sc->device));
-		if (rport != cmd_rport)
+		if (rport != cmd_rport) {
+			spin_unlock_irqrestore(io_lock, flags);
 			continue;
-
-		io_lock = fnic_io_lock_hash(fnic, sc);
-		spin_lock_irqsave(io_lock, flags);
+		}
 
 		io_req = (struct fnic_io_req *)CMD_SP(sc);
 
···
 	io_req->abts_done = NULL;
 
 	/* fw did not complete abort, timed out */
-	if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING) {
+	if (CMD_ABTS_STATUS(sc) == FCPIO_INVALID_CODE) {
 		spin_unlock_irqrestore(io_lock, flags);
 		CMD_FLAGS(sc) |= FNIC_IO_ABT_TERM_TIMED_OUT;
 		ret = FAILED;
 		goto fnic_abort_cmd_end;
 	}
+
+	CMD_STATE(sc) = FNIC_IOREQ_ABTS_COMPLETE;
 
 	/*
 	 * firmware completed the abort, check the status,
···
 	enum fnic_ioreq_state old_ioreq_state;
 
 	for (tag = 0; tag < FNIC_MAX_IO_REQ; tag++) {
+		io_lock = fnic_io_lock_tag(fnic, tag);
+		spin_lock_irqsave(io_lock, flags);
 		sc = scsi_host_find_tag(fnic->lport->host, tag);
 		/*
 		 * ignore this lun reset cmd or cmds that do not belong to
 		 * this lun
 		 */
-		if (!sc || sc == lr_sc || sc->device != lun_dev)
+		if (!sc || sc == lr_sc || sc->device != lun_dev) {
+			spin_unlock_irqrestore(io_lock, flags);
 			continue;
-
-		io_lock = fnic_io_lock_hash(fnic, sc);
-		spin_lock_irqsave(io_lock, flags);
+		}
 
 		io_req = (struct fnic_io_req *)CMD_SP(sc);
 
···
 			spin_unlock_irqrestore(io_lock, flags);
 			continue;
 		}
+
+		if (io_req->abts_done)
+			shost_printk(KERN_ERR, fnic->lport->host,
+				"%s: io_req->abts_done is set state is %s\n",
+				__func__, fnic_ioreq_state_to_str(CMD_STATE(sc)));
 		old_ioreq_state = CMD_STATE(sc);
 		/*
 		 * Any pending IO issued prior to reset is expected to be
···
 		 * handled in this function.
 		 */
 		CMD_STATE(sc) = FNIC_IOREQ_ABTS_PENDING;
-
-		if (io_req->abts_done)
-			shost_printk(KERN_ERR, fnic->lport->host,
-				"%s: io_req->abts_done is set state is %s\n",
-				__func__, fnic_ioreq_state_to_str(CMD_STATE(sc)));
 
 		BUG_ON(io_req->abts_done);
 
···
 		io_req->abts_done = NULL;
 
 		/* if abort is still pending with fw, fail */
-		if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING) {
+		if (CMD_ABTS_STATUS(sc) == FCPIO_INVALID_CODE) {
 			spin_unlock_irqrestore(io_lock, flags);
 			CMD_FLAGS(sc) |= FNIC_IO_ABT_TERM_DONE;
 			ret = 1;
 			goto clean_pending_aborts_end;
 		}
+		CMD_STATE(sc) = FNIC_IOREQ_ABTS_COMPLETE;
 		CMD_SP(sc) = NULL;
 		spin_unlock_irqrestore(io_lock, flags);
 
···
 	spin_unlock_irqrestore(io_lock, flags);
 	int_to_scsilun(sc->device->lun, &fc_lun);
 	/*
-	 * Issue abort and terminate on the device reset request.
-	 * If q'ing of the abort fails, retry issue it after a delay.
+	 * Issue abort and terminate on device reset request.
+	 * If q'ing of terminate fails, retry it after a delay.
 	 */
 	while (1) {
 		spin_lock_irqsave(io_lock, flags);