Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

scsi: sd: Revert "Rework asynchronous resume support"

Although commit 88f1669019bd ("scsi: sd: Rework asynchronous resume support")
eliminates a delay for some ATA disks after resume, it causes resume of ATA
disks to fail on other setups. See also:

* "Resume process hangs for 5-6 seconds starting sometime in 5.16"
(https://bugzilla.kernel.org/show_bug.cgi?id=215880).

* Geert's regression report
(https://lore.kernel.org/linux-scsi/alpine.DEB.2.22.394.2207191125130.1006766@ramsan.of.borg/).

This is what I understand about this issue:

* During resume, ata_port_pm_resume() starts the SCSI error handler. This
changes the SCSI host state into SHOST_RECOVERY and causes
scsi_queue_rq() to return BLK_STS_RESOURCE.

* sd_resume() calls sd_start_stop_device() for ATA devices. That function
in turn calls sd_submit_start() which tries to submit a START STOP UNIT
command. That command can only be submitted after the SCSI error handler
has changed the SCSI host state back to SHOST_RUNNING.

* The SCSI error handler runs on its own thread and calls
schedule_work(&(ap->scsi_rescan_task)). That causes
ata_scsi_dev_rescan() to be called from the context of a kernel
workqueue. That call hangs in blk_mq_get_tag(). I'm not sure why - maybe
because all available tags have been allocated by sd_submit_start()
calls (this is a guess).

Link: https://lore.kernel.org/r/20220816172638.538734-1-bvanassche@acm.org
Fixes: 88f1669019bd ("scsi: sd: Rework asynchronous resume support")
Cc: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Cc: Hannes Reinecke <hare@suse.de>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: gzhqyz@gmail.com
Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Reported-by: gzhqyz@gmail.com
Reported-and-tested-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: John Garry <john.garry@huawei.com>
Tested-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>

Authored by Bart Van Assche and committed by Martin K. Petersen.
785538bf fac8e558

+18 -71
+18 -66
drivers/scsi/sd.c
··· 103 103 static void sd_config_write_same(struct scsi_disk *); 104 104 static int sd_revalidate_disk(struct gendisk *); 105 105 static void sd_unlock_native_capacity(struct gendisk *disk); 106 - static void sd_start_done_work(struct work_struct *work); 107 106 static int sd_probe(struct device *); 108 107 static int sd_remove(struct device *); 109 108 static void sd_shutdown(struct device *); ··· 3470 3471 sdkp->max_retries = SD_MAX_RETRIES; 3471 3472 atomic_set(&sdkp->openers, 0); 3472 3473 atomic_set(&sdkp->device->ioerr_cnt, 0); 3473 - INIT_WORK(&sdkp->start_done_work, sd_start_done_work); 3474 3474 3475 3475 if (!sdp->request_queue->rq_timeout) { 3476 3476 if (sdp->type != TYPE_MOD) ··· 3592 3594 kfree(sdkp); 3593 3595 } 3594 3596 3595 - /* Process sense data after a START command finished. */ 3596 - static void sd_start_done_work(struct work_struct *work) 3597 - { 3598 - struct scsi_disk *sdkp = container_of(work, typeof(*sdkp), 3599 - start_done_work); 3600 - struct scsi_sense_hdr sshdr; 3601 - int res = sdkp->start_result; 3602 - 3603 - if (res == 0) 3604 - return; 3605 - 3606 - sd_print_result(sdkp, "Start/Stop Unit failed", res); 3607 - 3608 - if (res < 0) 3609 - return; 3610 - 3611 - if (scsi_normalize_sense(sdkp->start_sense_buffer, 3612 - sdkp->start_sense_len, &sshdr)) 3613 - sd_print_sense_hdr(sdkp, &sshdr); 3614 - } 3615 - 3616 - /* A START command finished. May be called from interrupt context. 
*/ 3617 - static void sd_start_done(struct request *req, blk_status_t status) 3618 - { 3619 - const struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req); 3620 - struct scsi_disk *sdkp = scsi_disk(req->q->disk); 3621 - 3622 - sdkp->start_result = scmd->result; 3623 - WARN_ON_ONCE(scmd->sense_len > SCSI_SENSE_BUFFERSIZE); 3624 - sdkp->start_sense_len = scmd->sense_len; 3625 - memcpy(sdkp->start_sense_buffer, scmd->sense_buffer, 3626 - ARRAY_SIZE(sdkp->start_sense_buffer)); 3627 - WARN_ON_ONCE(!schedule_work(&sdkp->start_done_work)); 3628 - } 3629 - 3630 - /* Submit a START command asynchronously. */ 3631 - static int sd_submit_start(struct scsi_disk *sdkp, u8 cmd[], u8 cmd_len) 3632 - { 3633 - struct scsi_device *sdev = sdkp->device; 3634 - struct request_queue *q = sdev->request_queue; 3635 - struct request *req; 3636 - struct scsi_cmnd *scmd; 3637 - 3638 - req = scsi_alloc_request(q, REQ_OP_DRV_IN, BLK_MQ_REQ_PM); 3639 - if (IS_ERR(req)) 3640 - return PTR_ERR(req); 3641 - 3642 - scmd = blk_mq_rq_to_pdu(req); 3643 - scmd->cmd_len = cmd_len; 3644 - memcpy(scmd->cmnd, cmd, cmd_len); 3645 - scmd->allowed = sdkp->max_retries; 3646 - req->timeout = SD_TIMEOUT; 3647 - req->rq_flags |= RQF_PM | RQF_QUIET; 3648 - req->end_io = sd_start_done; 3649 - blk_execute_rq_nowait(req, /*at_head=*/true); 3650 - 3651 - return 0; 3652 - } 3653 - 3654 3597 static int sd_start_stop_device(struct scsi_disk *sdkp, int start) 3655 3598 { 3656 3599 unsigned char cmd[6] = { START_STOP }; /* START_VALID */ 3600 + struct scsi_sense_hdr sshdr; 3657 3601 struct scsi_device *sdp = sdkp->device; 3602 + int res; 3658 3603 3659 3604 if (start) 3660 3605 cmd[4] |= 1; /* START */ ··· 3608 3667 if (!scsi_device_online(sdp)) 3609 3668 return -ENODEV; 3610 3669 3611 - /* Wait until processing of sense data has finished. 
*/ 3612 - flush_work(&sdkp->start_done_work); 3670 + res = scsi_execute(sdp, cmd, DMA_NONE, NULL, 0, NULL, &sshdr, 3671 + SD_TIMEOUT, sdkp->max_retries, 0, RQF_PM, NULL); 3672 + if (res) { 3673 + sd_print_result(sdkp, "Start/Stop Unit failed", res); 3674 + if (res > 0 && scsi_sense_valid(&sshdr)) { 3675 + sd_print_sense_hdr(sdkp, &sshdr); 3676 + /* 0x3a is medium not present */ 3677 + if (sshdr.asc == 0x3a) 3678 + res = 0; 3679 + } 3680 + } 3613 3681 3614 - return sd_submit_start(sdkp, cmd, sizeof(cmd)); 3682 + /* SCSI error codes must not go to the generic layer */ 3683 + if (res) 3684 + return -EIO; 3685 + 3686 + return 0; 3615 3687 } 3616 3688 3617 3689 /* ··· 3651 3697 sd_printk(KERN_NOTICE, sdkp, "Stopping disk\n"); 3652 3698 sd_start_stop_device(sdkp, 0); 3653 3699 } 3654 - 3655 - flush_work(&sdkp->start_done_work); 3656 3700 } 3657 3701 3658 3702 static int sd_suspend_common(struct device *dev, bool ignore_stop_errors)
-5
drivers/scsi/sd.h
··· 150 150 unsigned urswrz : 1; 151 151 unsigned security : 1; 152 152 unsigned ignore_medium_access_errors : 1; 153 - 154 - int start_result; 155 - u32 start_sense_len; 156 - u8 start_sense_buffer[SCSI_SENSE_BUFFERSIZE]; 157 - struct work_struct start_done_work; 158 153 }; 159 154 #define to_scsi_disk(obj) container_of(obj, struct scsi_disk, disk_dev) 160 155