Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

PCI AER: support invalid error source IDs

When the bus number part of the error source ID is 0, or when nosourceid=1
is set, make the kernel probe the AER status registers of all devices under
the root port to find the initial error reporter.

Reviewed-by: Andrew Patterson <andrew.patterson@hp.com>
Signed-off-by: Zhang Yanmin <yanmin_zhang@linux.intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>

authored by

Zhang, Yanmin and committed by
Jesse Barnes
28eb27cf 70298c6e

+122 -60
+4
Documentation/PCI/pcieaer-howto.txt
··· 61 61 walkaround, pls. add aerdriver.forceload=y to kernel boot parameter line 62 62 when booting kernel. Note that forceload=n by default. 63 63 64 + nosourceid, another parameter of type bool, can be used when broken 65 + hardware (mostly chipsets) has root ports that cannot obtain the reporting 66 + source ID. nosourceid=n by default. 67 + 64 68 2.3 AER error output 65 69 When a PCI-E AER error is captured, an error message will be outputed to 66 70 console. If it's a correctable error, it is outputed as a warning.
+2
drivers/pci/pcie/aer/aerdrv.h
··· 58 58 }; 59 59 60 60 struct aer_err_info { 61 + struct pci_dev *dev; 62 + u16 id; 61 63 int severity; /* 0:NONFATAL | 1:FATAL | 2:COR */ 62 64 int flags; 63 65 unsigned int status; /* COR/UNCOR Error Status */
+116 -60
drivers/pci/pcie/aer/aerdrv_core.c
··· 26 26 #include "aerdrv.h" 27 27 28 28 static int forceload; 29 + static int nosourceid; 29 30 module_param(forceload, bool, 0); 31 + module_param(nosourceid, bool, 0); 30 32 31 33 int pci_enable_pcie_error_reporting(struct pci_dev *dev) 32 34 { ··· 145 143 pci_walk_bus(dev->subordinate, set_device_error_reporting, &enable); 146 144 } 147 145 148 - static int find_device_iter(struct device *device, void *data) 146 + static inline int compare_device_id(struct pci_dev *dev, 147 + struct aer_err_info *e_info) 149 148 { 150 - struct pci_dev *dev; 151 - u16 id = *(unsigned long *)data; 152 - u8 secondary, subordinate, d_bus = id >> 8; 149 + if (e_info->id == ((dev->bus->number << 8) | dev->devfn)) { 150 + /* 151 + * Device ID match 152 + */ 153 + return 1; 154 + } 153 155 154 - if (device->bus == &pci_bus_type) { 155 - dev = to_pci_dev(device); 156 - if (id == ((dev->bus->number << 8) | dev->devfn)) { 157 - /* 158 - * Device ID match 159 - */ 160 - *(unsigned long*)data = (unsigned long)device; 156 + return 0; 157 + } 158 + 159 + #define PCI_BUS(x) (((x) >> 8) & 0xff) 160 + 161 + static int find_device_iter(struct pci_dev *dev, void *data) 162 + { 163 + int pos; 164 + u32 status; 165 + u32 mask; 166 + u16 reg16; 167 + int result; 168 + struct aer_err_info *e_info = (struct aer_err_info *)data; 169 + 170 + /* 171 + * When bus id is equal to 0, it might be a bad id 172 + * reported by root port. 173 + */ 174 + if (!nosourceid && (PCI_BUS(e_info->id) != 0)) { 175 + result = compare_device_id(dev, e_info); 176 + if (result) 177 + e_info->dev = dev; 178 + return result; 179 + } 180 + 181 + /* 182 + * Next is to check when bus id is equal to 0 or 183 + * nosourceid==y. Some ports might lose the bus 184 + * id of error source id. We check AER status 185 + * registers to find the initial reporter. 
186 + */ 187 + if (atomic_read(&dev->enable_cnt) == 0) 188 + return 0; 189 + pos = pci_find_capability(dev, PCI_CAP_ID_EXP); 190 + if (!pos) 191 + return 0; 192 + /* Check if AER is enabled */ 193 + pci_read_config_word(dev, pos+PCI_EXP_DEVCTL, &reg16); 194 + if (!(reg16 & ( 195 + PCI_EXP_DEVCTL_CERE | 196 + PCI_EXP_DEVCTL_NFERE | 197 + PCI_EXP_DEVCTL_FERE | 198 + PCI_EXP_DEVCTL_URRE))) 199 + return 0; 200 + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); 201 + if (!pos) 202 + return 0; 203 + 204 + status = 0; 205 + mask = 0; 206 + if (e_info->severity == AER_CORRECTABLE) { 207 + pci_read_config_dword(dev, 208 + pos + PCI_ERR_COR_STATUS, 209 + &status); 210 + pci_read_config_dword(dev, 211 + pos + PCI_ERR_COR_MASK, 212 + &mask); 213 + if (status & ERR_CORRECTABLE_ERROR_MASK & ~mask) { 214 + e_info->dev = dev; 161 215 return 1; 162 216 } 163 - 164 - /* 165 - * If device is P2P, check if it is an upstream? 166 - */ 167 - if (dev->hdr_type & PCI_HEADER_TYPE_BRIDGE) { 168 - pci_read_config_byte(dev, PCI_SECONDARY_BUS, 169 - &secondary); 170 - pci_read_config_byte(dev, PCI_SUBORDINATE_BUS, 171 - &subordinate); 172 - if (d_bus >= secondary && d_bus <= subordinate) { 173 - *(unsigned long*)data = (unsigned long)device; 174 - return 1; 175 - } 217 + } else { 218 + pci_read_config_dword(dev, 219 + pos + PCI_ERR_UNCOR_STATUS, 220 + &status); 221 + pci_read_config_dword(dev, 222 + pos + PCI_ERR_UNCOR_MASK, 223 + &mask); 224 + if (status & ERR_UNCORRECTABLE_ERROR_MASK & ~mask) { 225 + e_info->dev = dev; 226 + return 1; 176 227 } 177 228 } 178 229 ··· 235 180 /** 236 181 * find_source_device - search through device hierarchy for source device 237 182 * @parent: pointer to Root Port pci_dev data structure 238 - * @id: device ID of agent who sends an error message to this Root Port 183 + * @err_info: including detailed error information such like id 239 184 * 240 185 * Invoked when error is detected at the Root Port. 
241 186 */ 242 - static struct device* find_source_device(struct pci_dev *parent, u16 id) 187 + static void find_source_device(struct pci_dev *parent, 188 + struct aer_err_info *e_info) 243 189 { 244 190 struct pci_dev *dev = parent; 245 - struct device *device; 246 - unsigned long device_addr; 247 - int status; 191 + int result; 248 192 249 193 /* Is Root Port an agent that sends error message? */ 250 - if (id == ((dev->bus->number << 8) | dev->devfn)) 251 - return &dev->dev; 194 + result = find_device_iter(dev, e_info); 195 + if (result) 196 + return; 252 197 253 - do { 254 - device_addr = id; 255 - if ((status = device_for_each_child(&dev->dev, 256 - &device_addr, find_device_iter))) { 257 - device = (struct device*)device_addr; 258 - dev = to_pci_dev(device); 259 - if (id == ((dev->bus->number << 8) | dev->devfn)) 260 - return device; 261 - } 262 - }while (status); 263 - 264 - return NULL; 198 + pci_walk_bus(parent->subordinate, find_device_iter, e_info); 265 199 } 266 200 267 201 static int report_error_detected(struct pci_dev *dev, void *data) ··· 545 501 */ 546 502 static void handle_error_source(struct pcie_device * aerdev, 547 503 struct pci_dev *dev, 548 - struct aer_err_info info) 504 + struct aer_err_info *info) 549 505 { 550 506 pci_ers_result_t status = 0; 551 507 int pos; 552 508 553 - if (info.severity == AER_CORRECTABLE) { 509 + if (info->severity == AER_CORRECTABLE) { 554 510 /* 555 511 * Correctable error does not need software intevention. 556 512 * No need to go through error recovery process. 
··· 558 514 pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); 559 515 if (pos) 560 516 pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS, 561 - info.status); 517 + info->status); 562 518 } else { 563 - status = do_recovery(aerdev, dev, info.severity); 519 + status = do_recovery(aerdev, dev, info->severity); 564 520 if (status == PCI_ERS_RESULT_RECOVERED) { 565 521 dev_printk(KERN_DEBUG, &dev->dev, "AER driver " 566 522 "successfully recovered\n"); ··· 717 673 static void aer_isr_one_error(struct pcie_device *p_device, 718 674 struct aer_err_source *e_src) 719 675 { 720 - struct device *s_device; 721 - struct aer_err_info e_info = {0, 0, 0,}; 676 + struct aer_err_info *e_info; 722 677 int i; 723 - u16 id; 678 + 679 + /* struct aer_err_info might be big, so we allocate it with slab */ 680 + e_info = kmalloc(sizeof(struct aer_err_info), GFP_KERNEL); 681 + if (e_info == NULL) { 682 + dev_printk(KERN_DEBUG, &p_device->port->dev, 683 + "Can't allocate mem when processing AER errors\n"); 684 + return; 685 + } 724 686 725 687 /* 726 688 * There is a possibility that both correctable error and ··· 738 688 if (!(e_src->status & i)) 739 689 continue; 740 690 691 + memset(e_info, 0, sizeof(struct aer_err_info)); 692 + 741 693 /* Init comprehensive error information */ 742 694 if (i & PCI_ERR_ROOT_COR_RCV) { 743 - id = ERR_COR_ID(e_src->id); 744 - e_info.severity = AER_CORRECTABLE; 695 + e_info->id = ERR_COR_ID(e_src->id); 696 + e_info->severity = AER_CORRECTABLE; 745 697 } else { 746 - id = ERR_UNCOR_ID(e_src->id); 747 - e_info.severity = ((e_src->status >> 6) & 1); 698 + e_info->id = ERR_UNCOR_ID(e_src->id); 699 + e_info->severity = ((e_src->status >> 6) & 1); 748 700 } 749 701 if (e_src->status & 750 702 (PCI_ERR_ROOT_MULTI_COR_RCV | 751 703 PCI_ERR_ROOT_MULTI_UNCOR_RCV)) 752 - e_info.flags |= AER_MULTI_ERROR_VALID_FLAG; 753 - if (!(s_device = find_source_device(p_device->port, id))) { 704 + e_info->flags |= AER_MULTI_ERROR_VALID_FLAG; 705 + 706 + 
find_source_device(p_device->port, e_info); 707 + if (e_info->dev == NULL) { 754 708 printk(KERN_DEBUG "%s->can't find device of ID%04x\n", 755 - __func__, id); 709 + __func__, e_info->id); 756 710 continue; 757 711 } 758 - if (get_device_error_info(to_pci_dev(s_device), &e_info) == 712 + if (get_device_error_info(e_info->dev, e_info) == 759 713 AER_SUCCESS) { 760 - aer_print_error(to_pci_dev(s_device), &e_info); 714 + aer_print_error(e_info->dev, e_info); 761 715 handle_error_source(p_device, 762 - to_pci_dev(s_device), 716 + e_info->dev, 763 717 e_info); 764 718 } 765 719 } 720 + 721 + kfree(e_info); 766 722 } 767 723 768 724 /**