Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

scsi: lpfc: Coordinate adapter error handling with offline handling

The driver periodically checks for adapter errors in a background thread. If
the thread detects an error, the adapter will be reset including the
deletion and reallocation of workqueues on the adapter. Simultaneously,
there may be a user-space request to offline the adapter which may try to
do many of the same steps, in parallel, on a different thread. Because
memory was unexpectedly deallocated out from under it, the parallel offline
request hit a bad pointer.

Add coordination between the two threads. The error recovery thread has
precedence. So, when an error is detected, a flag is set on the adapter to
indicate the error thread is terminating the adapter. Before doing that
work, however, it looks for a flag set by the offline flow and, if it is
set, waits for the offline flow to complete before processing the error
handling path. Similarly, the offline thread first checks whether the
error thread is resetting the adapter and, if so, waits for the error
thread to finish. Only after it has finished will the offline thread set
its flag and offline the adapter.

Signed-off-by: Dick Kennedy <dick.kennedy@broadcom.com>
Signed-off-by: James Smart <jsmart2021@gmail.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>

authored by

James Smart and committed by
Martin K. Petersen
4645f7b5 32a93100

+45 -3
+19
drivers/scsi/lpfc/lpfc_attr.c
··· 1204 1204 1205 1205 psli = &phba->sli; 1206 1206 1207 + /* 1208 + * If freeing the queues have already started, don't access them. 1209 + * Otherwise set FREE_WAIT to indicate that queues are being used 1210 + * to hold the freeing process until we finish. 1211 + */ 1212 + spin_lock_irq(&phba->hbalock); 1213 + if (!(psli->sli_flag & LPFC_QUEUE_FREE_INIT)) { 1214 + psli->sli_flag |= LPFC_QUEUE_FREE_WAIT; 1215 + } else { 1216 + spin_unlock_irq(&phba->hbalock); 1217 + goto skip_wait; 1218 + } 1219 + spin_unlock_irq(&phba->hbalock); 1220 + 1207 1221 /* Wait a little for things to settle down, but not 1208 1222 * long enough for dev loss timeout to expire. 1209 1223 */ ··· 1239 1225 } 1240 1226 } 1241 1227 out: 1228 + spin_lock_irq(&phba->hbalock); 1229 + psli->sli_flag &= ~LPFC_QUEUE_FREE_WAIT; 1230 + spin_unlock_irq(&phba->hbalock); 1231 + 1232 + skip_wait: 1242 1233 init_completion(&online_compl); 1243 1234 rc = lpfc_workq_post_event(phba, &status, &online_compl, type); 1244 1235 if (rc == 0)
+19
drivers/scsi/lpfc/lpfc_init.c
··· 9135 9135 void 9136 9136 lpfc_sli4_queue_destroy(struct lpfc_hba *phba) 9137 9137 { 9138 + /* 9139 + * Set FREE_INIT before beginning to free the queues. 9140 + * Wait until the users of queues to acknowledge to 9141 + * release queues by clearing FREE_WAIT. 9142 + */ 9143 + spin_lock_irq(&phba->hbalock); 9144 + phba->sli.sli_flag |= LPFC_QUEUE_FREE_INIT; 9145 + while (phba->sli.sli_flag & LPFC_QUEUE_FREE_WAIT) { 9146 + spin_unlock_irq(&phba->hbalock); 9147 + msleep(20); 9148 + spin_lock_irq(&phba->hbalock); 9149 + } 9150 + spin_unlock_irq(&phba->hbalock); 9151 + 9138 9152 /* Release HBA eqs */ 9139 9153 if (phba->sli4_hba.hdwq) 9140 9154 lpfc_sli4_release_hdwq(phba); ··· 9187 9173 9188 9174 /* Everything on this list has been freed */ 9189 9175 INIT_LIST_HEAD(&phba->sli4_hba.lpfc_wq_list); 9176 + 9177 + /* Done with freeing the queues */ 9178 + spin_lock_irq(&phba->hbalock); 9179 + phba->sli.sli_flag &= ~LPFC_QUEUE_FREE_INIT; 9180 + spin_unlock_irq(&phba->hbalock); 9190 9181 } 9191 9182 9192 9183 int
+3 -3
drivers/scsi/lpfc/lpfc_sli.c
··· 14417 14417 if (!queue) 14418 14418 return; 14419 14419 14420 + if (!list_empty(&queue->wq_list)) 14421 + list_del(&queue->wq_list); 14422 + 14420 14423 while (!list_empty(&queue->page_list)) { 14421 14424 list_remove_head(&queue->page_list, dmabuf, struct lpfc_dmabuf, 14422 14425 list); ··· 14434 14431 14435 14432 if (!list_empty(&queue->cpu_list)) 14436 14433 list_del(&queue->cpu_list); 14437 - 14438 - if (!list_empty(&queue->wq_list)) 14439 - list_del(&queue->wq_list); 14440 14434 14441 14435 kfree(queue); 14442 14436 return;
+4
drivers/scsi/lpfc/lpfc_sli.h
··· 327 327 #define LPFC_SLI_ASYNC_MBX_BLK 0x2000 /* Async mailbox is blocked */ 328 328 #define LPFC_SLI_SUPPRESS_RSP 0x4000 /* Suppress RSP feature is supported */ 329 329 #define LPFC_SLI_USE_EQDR 0x8000 /* EQ Delay Register is supported */ 330 + #define LPFC_QUEUE_FREE_INIT 0x10000 /* Queue freeing is in progress */ 331 + #define LPFC_QUEUE_FREE_WAIT 0x20000 /* Hold Queue free as it is being 332 + * used outside worker thread 333 + */ 330 334 331 335 struct lpfc_sli_ring *sli3_ring; 332 336