Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

PCI: PCIe AER: add aer_recover_queue

In addition to native PCIe AER, APEI (ACPI Platform Error
Interface) GHES (Generic Hardware Error Source) can now be used to
report PCIe AER errors. To support PCIe AER recovery for APEI GHES,
aer_recover_queue is added to export the recovery function of the
native PCIe AER driver.

Recoverable PCIe AER errors are reported via NMI in APEI GHES. APEI
GHES then uses irq_work to defer the error processing to an IRQ
handler. But PCIe AER recovery can be very time-consuming, so
aer_recover_queue, which can be called from an IRQ handler, defers the
actual recovery action to process context, that is, to a work queue.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>

authored by

Huang Ying and committed by
Jesse Barnes
0918472c 0aba496f

+74 -8
+69 -7
drivers/pci/pcie/aer/aerdrv_core.c
··· 24 24 #include <linux/suspend.h> 25 25 #include <linux/delay.h> 26 26 #include <linux/slab.h> 27 + #include <linux/kfifo.h> 27 28 #include "aerdrv.h" 28 29 29 30 static int forceload; ··· 446 445 return drv; 447 446 } 448 447 449 - static pci_ers_result_t reset_link(struct pcie_device *aerdev, 450 - struct pci_dev *dev) 448 + static pci_ers_result_t reset_link(struct pci_dev *dev) 451 449 { 452 450 struct pci_dev *udev; 453 451 pci_ers_result_t status; ··· 486 486 487 487 /** 488 488 * do_recovery - handle nonfatal/fatal error recovery process 489 - * @aerdev: pointer to a pcie_device data structure of root port 490 489 * @dev: pointer to a pci_dev data structure of agent detecting an error 491 490 * @severity: error severity type 492 491 * ··· 493 494 * error detected message to all downstream drivers within a hierarchy in 494 495 * question and return the returned code. 495 496 */ 496 - static void do_recovery(struct pcie_device *aerdev, struct pci_dev *dev, 497 - int severity) 497 + static void do_recovery(struct pci_dev *dev, int severity) 498 498 { 499 499 pci_ers_result_t status, result = PCI_ERS_RESULT_RECOVERED; 500 500 enum pci_channel_state state; ··· 509 511 report_error_detected); 510 512 511 513 if (severity == AER_FATAL) { 512 - result = reset_link(aerdev, dev); 514 + result = reset_link(dev); 513 515 if (result != PCI_ERS_RESULT_RECOVERED) 514 516 goto failed; 515 517 } ··· 574 576 pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS, 575 577 info->status); 576 578 } else 577 - do_recovery(aerdev, dev, info->severity); 579 + do_recovery(dev, info->severity); 578 580 } 581 + 582 + #ifdef CONFIG_ACPI_APEI_PCIEAER 583 + static void aer_recover_work_func(struct work_struct *work); 584 + 585 + #define AER_RECOVER_RING_ORDER 4 586 + #define AER_RECOVER_RING_SIZE (1 << AER_RECOVER_RING_ORDER) 587 + 588 + struct aer_recover_entry 589 + { 590 + u8 bus; 591 + u8 devfn; 592 + u16 domain; 593 + int severity; 594 + }; 595 + 596 + static 
DEFINE_KFIFO(aer_recover_ring, struct aer_recover_entry, 597 + AER_RECOVER_RING_SIZE); 598 + /* 599 + * Mutual exclusion for writers of aer_recover_ring, reader side don't 600 + * need lock, because there is only one reader and lock is not needed 601 + * between reader and writer. 602 + */ 603 + static DEFINE_SPINLOCK(aer_recover_ring_lock); 604 + static DECLARE_WORK(aer_recover_work, aer_recover_work_func); 605 + 606 + void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn, 607 + int severity) 608 + { 609 + unsigned long flags; 610 + struct aer_recover_entry entry = { 611 + .bus = bus, 612 + .devfn = devfn, 613 + .domain = domain, 614 + .severity = severity, 615 + }; 616 + 617 + spin_lock_irqsave(&aer_recover_ring_lock, flags); 618 + if (kfifo_put(&aer_recover_ring, &entry)) 619 + schedule_work(&aer_recover_work); 620 + else 621 + pr_err("AER recover: Buffer overflow when recovering AER for %04x:%02x:%02x:%x\n", 622 + domain, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); 623 + spin_unlock_irqrestore(&aer_recover_ring_lock, flags); 624 + } 625 + EXPORT_SYMBOL_GPL(aer_recover_queue); 626 + 627 + static void aer_recover_work_func(struct work_struct *work) 628 + { 629 + struct aer_recover_entry entry; 630 + struct pci_dev *pdev; 631 + 632 + while (kfifo_get(&aer_recover_ring, &entry)) { 633 + pdev = pci_get_domain_bus_and_slot(entry.domain, entry.bus, 634 + entry.devfn); 635 + if (!pdev) { 636 + pr_err("AER recover: Can not find pci_dev for %04x:%02x:%02x:%x\n", 637 + entry.domain, entry.bus, 638 + PCI_SLOT(entry.devfn), PCI_FUNC(entry.devfn)); 639 + continue; 640 + } 641 + do_recovery(pdev, entry.severity); 642 + } 643 + } 644 + #endif 579 645 580 646 /** 581 647 * get_device_error_info - read error status from dev and store it to info
+2 -1
drivers/pci/pcie/aer/aerdrv_errprint.c
··· 204 204 } 205 205 206 206 #ifdef CONFIG_ACPI_APEI_PCIEAER 207 - static int cper_severity_to_aer(int cper_severity) 207 + int cper_severity_to_aer(int cper_severity) 208 208 { 209 209 switch (cper_severity) { 210 210 case CPER_SEV_RECOVERABLE: ··· 215 215 return AER_CORRECTABLE; 216 216 } 217 217 } 218 + EXPORT_SYMBOL_GPL(cper_severity_to_aer); 218 219 219 220 void cper_print_aer(const char *prefix, int cper_severity, 220 221 struct aer_capability_regs *aer)
+3
include/linux/aer.h
··· 51 51 52 52 extern void cper_print_aer(const char *prefix, int cper_severity, 53 53 struct aer_capability_regs *aer); 54 + extern int cper_severity_to_aer(int cper_severity); 55 + extern void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn, 56 + int severity); 54 57 #endif //_AER_H_ 55 58