[PATCH] USB: Fix USB suspend/resume crasher (#2)

This patch closes the IRQ race and makes various other OHCI & EHCI code
paths safer vs. suspend/resume.
I've been able to (finally!) successfully suspend and resume various
Mac models, with or without a USB mouse plugged in, or plugging while
asleep, or unplugging while asleep etc... all without a crash.

Alan, please verify the UHCI bit I did, I only verified that it builds.
It's very simple so I wouldn't expect any issue there. If you aren't
confident, then just drop the hunks that change uhci-hcd.c

I also made the patch a little bit "safer" by making sure the store
to the interrupt register that disables interrupts is not posted before
I set the flag and drop the spinlock.

Without this patch, you cannot reliably sleep/wakeup any recent Mac, and
I suspect PCs have some more sneaky issues too (they don't frankly crash
with machine checks because x86 tends to silently swallow PCI errors but
that won't last afaik, at least PCI Express will blow up in those
situations, but the USB code may still misbehave).

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

authored by Benjamin Herrenschmidt and committed by Greg Kroah-Hartman 8de98402 d3420ba4

+132 -25
+2 -1
drivers/usb/core/hcd-pci.c
··· 219 219 goto done; 220 220 } 221 221 } 222 + synchronize_irq(dev->irq); 222 223 223 224 /* FIXME until the generic PM interfaces change a lot more, this 224 225 * can't use PCI D1 and D2 states. For example, the confusion ··· 393 392 394 393 dev->dev.power.power_state = PMSG_ON; 395 394 396 - hcd->saw_irq = 0; 395 + clear_bit(HCD_FLAG_SAW_IRQ, &hcd->flags); 397 396 398 397 if (hcd->driver->resume) { 399 398 retval = hcd->driver->resume(hcd);
+10 -5
drivers/usb/core/hcd.c
··· 1315 1315 * finish unlinking the initial failed usb_set_address() 1316 1316 * or device descriptor fetch. 1317 1317 */ 1318 - if (!hcd->saw_irq && hcd->self.root_hub != urb->dev) { 1318 + if (!test_bit(HCD_FLAG_SAW_IRQ, &hcd->flags) 1319 + && hcd->self.root_hub != urb->dev) { 1319 1320 dev_warn (hcd->self.controller, "Unlink after no-IRQ? " 1320 1321 "Controller is probably using the wrong IRQ." 1321 1322 "\n"); 1322 - hcd->saw_irq = 1; 1323 + set_bit(HCD_FLAG_SAW_IRQ, &hcd->flags); 1323 1324 } 1324 1325 1325 1326 urb->status = status; ··· 1650 1649 struct usb_hcd *hcd = __hcd; 1651 1650 int start = hcd->state; 1652 1651 1653 - if (start == HC_STATE_HALT) 1652 + if (unlikely(start == HC_STATE_HALT || 1653 + !test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags))) 1654 1654 return IRQ_NONE; 1655 1655 if (hcd->driver->irq (hcd, r) == IRQ_NONE) 1656 1656 return IRQ_NONE; 1657 1657 1658 - hcd->saw_irq = 1; 1659 - if (hcd->state == HC_STATE_HALT) 1658 + set_bit(HCD_FLAG_SAW_IRQ, &hcd->flags); 1659 + 1660 + if (unlikely(hcd->state == HC_STATE_HALT)) 1660 1661 usb_hc_died (hcd); 1661 1662 return IRQ_HANDLED; 1662 1663 } ··· 1770 1767 struct usb_device *rhdev; 1771 1768 1772 1769 dev_info(hcd->self.controller, "%s\n", hcd->product_desc); 1770 + 1771 + set_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags); 1773 1772 1774 1773 /* till now HC has been in an indeterminate state ... */ 1775 1774 if (hcd->driver->reset && (retval = hcd->driver->reset(hcd)) < 0) {
+6 -1
drivers/usb/core/hcd.h
··· 72 72 * hardware info/state 73 73 */ 74 74 const struct hc_driver *driver; /* hw-specific hooks */ 75 - unsigned saw_irq : 1; 75 + 76 + /* Flags that need to be manipulated atomically */ 77 + unsigned long flags; 78 + #define HCD_FLAG_HW_ACCESSIBLE 0x00000001 79 + #define HCD_FLAG_SAW_IRQ 0x00000002 80 + 76 81 unsigned can_wakeup:1; /* hw supports wakeup? */ 77 82 unsigned remote_wakeup:1;/* sw should use wakeup? */ 78 83 unsigned rh_registered:1;/* is root hub registered? */
+26 -1
drivers/usb/host/ehci-pci.c
··· 228 228 static int ehci_pci_suspend(struct usb_hcd *hcd, pm_message_t message) 229 229 { 230 230 struct ehci_hcd *ehci = hcd_to_ehci(hcd); 231 + unsigned long flags; 232 + int rc = 0; 231 233 232 234 if (time_before(jiffies, ehci->next_statechange)) 233 235 msleep(10); 234 236 237 + /* Root hub was already suspended. Disable irq emission and 238 + * mark HW unaccessible, bail out if RH has been resumed. Use 239 + * the spinlock to properly synchronize with possible pending 240 + * RH suspend or resume activity. 241 + * 242 + * This is still racy as hcd->state is manipulated outside of 243 + * any locks =P But that will be a different fix. 244 + */ 245 + spin_lock_irqsave (&ehci->lock, flags); 246 + if (hcd->state != HC_STATE_SUSPENDED) { 247 + rc = -EINVAL; 248 + goto bail; 249 + } 250 + writel (0, &ehci->regs->intr_enable); 251 + (void)readl(&ehci->regs->intr_enable); 252 + 253 + clear_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags); 254 + bail: 255 + spin_unlock_irqrestore (&ehci->lock, flags); 256 + 235 257 // could save FLADJ in case of Vaux power loss 236 258 // ... we'd only use it to handle clock skew 237 259 238 - return 0; 260 + return rc; 239 261 } 240 262 241 263 static int ehci_pci_resume(struct usb_hcd *hcd) ··· 272 250 273 251 if (time_before(jiffies, ehci->next_statechange)) 274 252 msleep(100); 253 + 254 + /* Mark hardware accessible again as we are out of D3 state by now */ 255 + set_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags); 275 256 276 257 /* If CF is clear, we lost PCI Vaux power and need to restart. */ 277 258 if (readl(&ehci->regs->configured_flag) != FLAG_CF)
+16 -8
drivers/usb/host/ehci-q.c
··· 912 912 int epnum; 913 913 unsigned long flags; 914 914 struct ehci_qh *qh = NULL; 915 + int rc = 0; 915 916 916 917 qtd = list_entry (qtd_list->next, struct ehci_qtd, qtd_list); 917 918 epnum = ep->desc.bEndpointAddress; ··· 927 926 #endif 928 927 929 928 spin_lock_irqsave (&ehci->lock, flags); 929 + if (unlikely(!test_bit(HCD_FLAG_HW_ACCESSIBLE, 930 + &ehci_to_hcd(ehci)->flags))) { 931 + rc = -ESHUTDOWN; 932 + goto done; 933 + } 934 + 930 935 qh = qh_append_tds (ehci, urb, qtd_list, epnum, &ep->hcpriv); 936 + if (unlikely(qh == NULL)) { 937 + rc = -ENOMEM; 938 + goto done; 939 + } 931 940 932 941 /* Control/bulk operations through TTs don't need scheduling, 933 942 * the HC and TT handle it when the TT has a buffer ready. 934 943 */ 935 - if (likely (qh != NULL)) { 936 - if (likely (qh->qh_state == QH_STATE_IDLE)) 937 - qh_link_async (ehci, qh_get (qh)); 938 - } 944 + if (likely (qh->qh_state == QH_STATE_IDLE)) 945 + qh_link_async (ehci, qh_get (qh)); 946 + done: 939 947 spin_unlock_irqrestore (&ehci->lock, flags); 940 - if (unlikely (qh == NULL)) { 948 + if (unlikely (qh == NULL)) 941 949 qtd_list_free (ehci, urb, qtd_list); 942 - return -ENOMEM; 943 - } 944 - return 0; 950 + return rc; 945 951 } 946 952 947 953 /*-------------------------------------------------------------------------*/
+16 -2
drivers/usb/host/ehci-sched.c
··· 602 602 603 603 spin_lock_irqsave (&ehci->lock, flags); 604 604 605 + if (unlikely(!test_bit(HCD_FLAG_HW_ACCESSIBLE, 606 + &ehci_to_hcd(ehci)->flags))) { 607 + status = -ESHUTDOWN; 608 + goto done; 609 + } 610 + 605 611 /* get qh and force any scheduling errors */ 606 612 INIT_LIST_HEAD (&empty); 607 613 qh = qh_append_tds (ehci, urb, &empty, epnum, &ep->hcpriv); ··· 1462 1456 1463 1457 /* schedule ... need to lock */ 1464 1458 spin_lock_irqsave (&ehci->lock, flags); 1465 - status = iso_stream_schedule (ehci, urb, stream); 1459 + if (unlikely(!test_bit(HCD_FLAG_HW_ACCESSIBLE, 1460 + &ehci_to_hcd(ehci)->flags))) 1461 + status = -ESHUTDOWN; 1462 + else 1463 + status = iso_stream_schedule (ehci, urb, stream); 1466 1464 if (likely (status == 0)) 1467 1465 itd_link_urb (ehci, urb, ehci->periodic_size << 3, stream); 1468 1466 spin_unlock_irqrestore (&ehci->lock, flags); ··· 1825 1815 1826 1816 /* schedule ... need to lock */ 1827 1817 spin_lock_irqsave (&ehci->lock, flags); 1828 - status = iso_stream_schedule (ehci, urb, stream); 1818 + if (unlikely(!test_bit(HCD_FLAG_HW_ACCESSIBLE, 1819 + &ehci_to_hcd(ehci)->flags))) 1820 + status = -ESHUTDOWN; 1821 + else 1822 + status = iso_stream_schedule (ehci, urb, stream); 1829 1823 if (status == 0) 1830 1824 sitd_link_urb (ehci, urb, ehci->periodic_size << 3, stream); 1831 1825 spin_unlock_irqrestore (&ehci->lock, flags);
+5 -1
drivers/usb/host/ohci-hcd.c
··· 115 115 116 116 /*-------------------------------------------------------------------------*/ 117 117 118 - // #define OHCI_VERBOSE_DEBUG /* not always helpful */ 118 + #undef OHCI_VERBOSE_DEBUG /* not always helpful */ 119 119 120 120 /* For initializing controller (mask in an HCFS mode too) */ 121 121 #define OHCI_CONTROL_INIT OHCI_CTRL_CBSR ··· 253 253 spin_lock_irqsave (&ohci->lock, flags); 254 254 255 255 /* don't submit to a dead HC */ 256 + if (!test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags)) { 257 + retval = -ENODEV; 258 + goto fail; 259 + } 256 260 if (!HC_IS_RUNNING(hcd->state)) { 257 261 retval = -ENODEV; 258 262 goto fail;
+20 -4
drivers/usb/host/ohci-hub.c
··· 53 53 54 54 spin_lock_irqsave (&ohci->lock, flags); 55 55 56 + if (unlikely(!test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags))) { 57 + spin_unlock_irqrestore (&ohci->lock, flags); 58 + return -ESHUTDOWN; 59 + } 60 + 56 61 ohci->hc_control = ohci_readl (ohci, &ohci->regs->control); 57 62 switch (ohci->hc_control & OHCI_CTRL_HCFS) { 58 63 case OHCI_USB_RESUME: ··· 145 140 struct ohci_hcd *ohci = hcd_to_ohci (hcd); 146 141 u32 temp, enables; 147 142 int status = -EINPROGRESS; 143 + unsigned long flags; 148 144 149 145 if (time_before (jiffies, ohci->next_statechange)) 150 146 msleep(5); 151 147 152 - spin_lock_irq (&ohci->lock); 148 + spin_lock_irqsave (&ohci->lock, flags); 149 + 150 + if (unlikely(!test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags))) { 151 + spin_unlock_irqrestore (&ohci->lock, flags); 152 + return -ESHUTDOWN; 153 + } 154 + 155 + 153 156 ohci->hc_control = ohci_readl (ohci, &ohci->regs->control); 154 157 155 158 if (ohci->hc_control & (OHCI_CTRL_IR | OHCI_SCHED_ENABLES)) { ··· 192 179 ohci_dbg (ohci, "lost power\n"); 193 180 status = -EBUSY; 194 181 } 195 - spin_unlock_irq (&ohci->lock); 182 + spin_unlock_irqrestore (&ohci->lock, flags); 196 183 if (status == -EBUSY) { 197 184 (void) ohci_init (ohci); 198 185 return ohci_restart (ohci); ··· 310 297 /* handle autosuspended root: finish resuming before 311 298 * letting khubd or root hub timer see state changes. 312 299 */ 313 - if ((ohci->hc_control & OHCI_CTRL_HCFS) != OHCI_USB_OPER 314 - || !HC_IS_RUNNING(hcd->state)) { 300 + if (unlikely((ohci->hc_control & OHCI_CTRL_HCFS) != OHCI_USB_OPER 301 + || !HC_IS_RUNNING(hcd->state))) { 315 302 can_suspend = 0; 316 303 goto done; 317 304 } ··· 520 507 int ports = hcd_to_bus (hcd)->root_hub->maxchild; 521 508 u32 temp; 522 509 int retval = 0; 510 + 511 + if (unlikely(!test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags))) 512 + return -ESHUTDOWN; 523 513 524 514 switch (typeReq) { 525 515 case ClearHubFeature:
+25 -2
drivers/usb/host/ohci-pci.c
··· 105 105 106 106 static int ohci_pci_suspend (struct usb_hcd *hcd, pm_message_t message) 107 107 { 108 - /* root hub was already suspended */ 109 - return 0; 108 + struct ohci_hcd *ohci = hcd_to_ohci (hcd); 109 + unsigned long flags; 110 + int rc = 0; 111 + 112 + /* Root hub was already suspended. Disable irq emission and 113 + * mark HW unaccessible, bail out if RH has been resumed. Use 114 + * the spinlock to properly synchronize with possible pending 115 + * RH suspend or resume activity. 116 + * 117 + * This is still racy as hcd->state is manipulated outside of 118 + * any locks =P But that will be a different fix. 119 + */ 120 + spin_lock_irqsave (&ohci->lock, flags); 121 + if (hcd->state != HC_STATE_SUSPENDED) { 122 + rc = -EINVAL; 123 + goto bail; 124 + } 125 + ohci_writel(ohci, OHCI_INTR_MIE, &ohci->regs->intrdisable); 126 + (void)ohci_readl(ohci, &ohci->regs->intrdisable); 127 + clear_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags); 128 + bail: 129 + spin_unlock_irqrestore (&ohci->lock, flags); 130 + 131 + return rc; 110 132 } 111 133 112 134 113 135 static int ohci_pci_resume (struct usb_hcd *hcd) 114 136 { 137 + set_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags); 115 138 usb_hcd_resume_root_hub(hcd); 116 139 return 0; 117 140 }
+6
drivers/usb/host/uhci-hcd.c
··· 717 717 * at the source, so we must turn off PIRQ. 718 718 */ 719 719 pci_write_config_word(to_pci_dev(uhci_dev(uhci)), USBLEGSUP, 0); 720 + clear_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags); 720 721 uhci->hc_inaccessible = 1; 721 722 hcd->poll_rh = 0; 722 723 ··· 733 732 struct uhci_hcd *uhci = hcd_to_uhci(hcd); 734 733 735 734 dev_dbg(uhci_dev(uhci), "%s\n", __FUNCTION__); 735 + 736 + /* We aren't in D3 state anymore, we do that even if dead as I 737 + * really don't want to keep a stale HCD_FLAG_HW_ACCESSIBLE=0 738 + */ 739 + set_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags); 736 740 737 741 if (uhci->rh_state == UHCI_RH_RESET) /* Dead */ 738 742 return 0;