Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

PCI/bwctrl: Re-add BW notification portdrv as PCIe BW controller

This mostly reverts commit b4c7d2076b4e ("PCI/LINK: Remove bandwidth
notification"). An upcoming commit extends this driver, building a PCIe
bandwidth controller on top of it.

PCIe bandwidth notifications were first added in the commit e8303bb7a75c
("PCI/LINK: Report degraded links via link bandwidth notification") but
later had to be removed. The significant changes compared with the old
bandwidth notification driver include:

1) Don't print the notifications to the kernel log, just keep the Link
Speed cached in struct pci_bus updated. While somewhat unfortunate,
the log spam was the source of complaints that eventually led to
the removal of the bandwidth notifications driver (see the links
below for further information).

2) Besides the Link Bandwidth Management Interrupt, also enable Link
Autonomous Bandwidth Interrupt to cover the other source of bandwidth
changes.

3) Handle Link Speed updates robustly. Refresh the cached Link Speed
when enabling Bandwidth Notification Interrupts, and solve the race
between Link Speed read and LBMS/LABS update in
pcie_bwnotif_irq().

4) Use concurrency-safe LNKCTL RMW operations.

5) The driver is now called PCIe bwctrl (bandwidth controller) instead
of just bandwidth notifications because of increased scope and
functionality within the driver.

6) Coexist with the Target Link Speed quirk in pcie_failed_link_retrain().
Provide LBMS counting API for it.

7) Tweaks to variable/function names for consistency and length reasons.

Bandwidth Notifications enable the cur_bus_speed in struct pci_bus to
keep track of PCIe Link Speed changes.

[bhelgaas: This is based on previous work by Alexandru Gagniuc
<mr.nuke.me@gmail.com>; see e8303bb7a75c ("PCI/LINK: Report degraded links
via link bandwidth notification")]

Link: https://lore.kernel.org/r/20241018144755.7875-7-ilpo.jarvinen@linux.intel.com
Link: https://lore.kernel.org/all/20190429185611.121751-1-helgaas@kernel.org/
Link: https://lore.kernel.org/linux-pci/20190501142942.26972-1-keith.busch@intel.com/
Link: https://lore.kernel.org/linux-pci/20200115221008.GA191037@google.com/
Suggested-by: Lukas Wunner <lukas@wunner.de> # Building bwctrl on top of bwnotif
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
[bhelgaas: squash fix to drop IRQF_ONESHOT and convert to hardirq handler:
https://lore.kernel.org/r/20241115165717.15233-1-ilpo.jarvinen@linux.intel.com]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Tested-by: Stefan Wahren <wahrenst@gmx.net>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>

authored by

Ilpo Järvinen and committed by
Bjorn Helgaas
665745f2 3491f509

+229 -9
+6
MAINTAINERS
··· 17933 17933 F: include/linux/pci* 17934 17934 F: include/uapi/linux/pci* 17935 17935 17936 + PCIE BANDWIDTH CONTROLLER 17937 + M: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com> 17938 + L: linux-pci@vger.kernel.org 17939 + S: Supported 17940 + F: drivers/pci/pcie/bwctrl.c 17941 + 17936 17942 PCIE DRIVER FOR AMAZON ANNAPURNA LABS 17937 17943 M: Jonathan Chocron <jonnyc@amazon.com> 17938 17944 L: linux-pci@vger.kernel.org
+5
drivers/pci/hotplug/pciehp_ctrl.c
··· 19 19 #include <linux/types.h> 20 20 #include <linux/pm_runtime.h> 21 21 #include <linux/pci.h> 22 + 23 + #include "../pci.h" 22 24 #include "pciehp.h" 23 25 24 26 /* The following routines constitute the bulk of the ··· 129 127 130 128 pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF, 131 129 INDICATOR_NOOP); 130 + 131 + /* Don't carry LBMS indications across */ 132 + pcie_reset_lbms_count(ctrl->pcie->port); 132 133 } 133 134 134 135 static int pciehp_enable_slot(struct controller *ctrl);
+1 -1
drivers/pci/pci.c
··· 4740 4740 * to track link speed or width changes made by hardware itself 4741 4741 * in attempt to correct unreliable link operation. 4742 4742 */ 4743 - pcie_capability_write_word(pdev, PCI_EXP_LNKSTA, PCI_EXP_LNKSTA_LBMS); 4743 + pcie_reset_lbms_count(pdev); 4744 4744 return rc; 4745 4745 } 4746 4746
+11
drivers/pci/pci.h
··· 698 698 static inline void pcie_ecrc_get_policy(char *str) { } 699 699 #endif 700 700 701 + #ifdef CONFIG_PCIEPORTBUS 702 + void pcie_reset_lbms_count(struct pci_dev *port); 703 + int pcie_lbms_count(struct pci_dev *port, unsigned long *val); 704 + #else 705 + static inline void pcie_reset_lbms_count(struct pci_dev *port) {} 706 + static inline int pcie_lbms_count(struct pci_dev *port, unsigned long *val) 707 + { 708 + return -EOPNOTSUPP; 709 + } 710 + #endif 711 + 701 712 struct pci_dev_reset_methods { 702 713 u16 vendor; 703 714 u16 device;
+1 -1
drivers/pci/pcie/Makefile
··· 4 4 5 5 pcieportdrv-y := portdrv.o rcec.o 6 6 7 - obj-$(CONFIG_PCIEPORTBUS) += pcieportdrv.o 7 + obj-$(CONFIG_PCIEPORTBUS) += pcieportdrv.o bwctrl.o 8 8 9 9 obj-y += aspm.o 10 10 obj-$(CONFIG_PCIEAER) += aer.o err.o
+186
drivers/pci/pcie/bwctrl.c
··· 1 + // SPDX-License-Identifier: GPL-2.0+ 2 + /* 3 + * PCIe bandwidth controller 4 + * 5 + * Author: Alexandru Gagniuc <mr.nuke.me@gmail.com> 6 + * 7 + * Copyright (C) 2019 Dell Inc 8 + * Copyright (C) 2023-2024 Intel Corporation 9 + * 10 + * This service port driver hooks into the Bandwidth Notification interrupt 11 + * watching for changes or links becoming degraded in operation. It updates 12 + * the cached Current Link Speed that is exposed to user space through sysfs. 13 + */ 14 + 15 + #define dev_fmt(fmt) "bwctrl: " fmt 16 + 17 + #include <linux/atomic.h> 18 + #include <linux/cleanup.h> 19 + #include <linux/errno.h> 20 + #include <linux/interrupt.h> 21 + #include <linux/pci.h> 22 + #include <linux/rwsem.h> 23 + #include <linux/slab.h> 24 + #include <linux/types.h> 25 + 26 + #include "../pci.h" 27 + #include "portdrv.h" 28 + 29 + /** 30 + * struct pcie_bwctrl_data - PCIe bandwidth controller 31 + * @lbms_count: Count for LBMS (since last reset) 32 + */ 33 + struct pcie_bwctrl_data { 34 + atomic_t lbms_count; 35 + }; 36 + 37 + /* Prevents port removal during LBMS count accessors */ 38 + static DECLARE_RWSEM(pcie_bwctrl_lbms_rwsem); 39 + 40 + static void pcie_bwnotif_enable(struct pcie_device *srv) 41 + { 42 + struct pcie_bwctrl_data *data = srv->port->link_bwctrl; 43 + struct pci_dev *port = srv->port; 44 + u16 link_status; 45 + int ret; 46 + 47 + /* Count LBMS seen so far as one */ 48 + ret = pcie_capability_read_word(port, PCI_EXP_LNKSTA, &link_status); 49 + if (ret == PCIBIOS_SUCCESSFUL && link_status & PCI_EXP_LNKSTA_LBMS) 50 + atomic_inc(&data->lbms_count); 51 + 52 + pcie_capability_set_word(port, PCI_EXP_LNKCTL, 53 + PCI_EXP_LNKCTL_LBMIE | PCI_EXP_LNKCTL_LABIE); 54 + pcie_capability_write_word(port, PCI_EXP_LNKSTA, 55 + PCI_EXP_LNKSTA_LBMS | PCI_EXP_LNKSTA_LABS); 56 + 57 + /* 58 + * Update after enabling notifications & clearing status bits ensures 59 + * link speed is up to date. 
60 + */ 61 + pcie_update_link_speed(port->subordinate); 62 + } 63 + 64 + static void pcie_bwnotif_disable(struct pci_dev *port) 65 + { 66 + pcie_capability_clear_word(port, PCI_EXP_LNKCTL, 67 + PCI_EXP_LNKCTL_LBMIE | PCI_EXP_LNKCTL_LABIE); 68 + } 69 + 70 + static irqreturn_t pcie_bwnotif_irq(int irq, void *context) 71 + { 72 + struct pcie_device *srv = context; 73 + struct pcie_bwctrl_data *data = srv->port->link_bwctrl; 74 + struct pci_dev *port = srv->port; 75 + u16 link_status, events; 76 + int ret; 77 + 78 + ret = pcie_capability_read_word(port, PCI_EXP_LNKSTA, &link_status); 79 + if (ret != PCIBIOS_SUCCESSFUL) 80 + return IRQ_NONE; 81 + 82 + events = link_status & (PCI_EXP_LNKSTA_LBMS | PCI_EXP_LNKSTA_LABS); 83 + if (!events) 84 + return IRQ_NONE; 85 + 86 + if (events & PCI_EXP_LNKSTA_LBMS) 87 + atomic_inc(&data->lbms_count); 88 + 89 + pcie_capability_write_word(port, PCI_EXP_LNKSTA, events); 90 + 91 + /* 92 + * Interrupts will not be triggered from any further Link Speed 93 + * change until LBMS is cleared by the write. Therefore, re-read the 94 + * speed (inside pcie_update_link_speed()) after LBMS has been 95 + * cleared to avoid missing link speed changes. 
96 + */ 97 + pcie_update_link_speed(port->subordinate); 98 + 99 + return IRQ_HANDLED; 100 + } 101 + 102 + void pcie_reset_lbms_count(struct pci_dev *port) 103 + { 104 + struct pcie_bwctrl_data *data; 105 + 106 + guard(rwsem_read)(&pcie_bwctrl_lbms_rwsem); 107 + data = port->link_bwctrl; 108 + if (data) 109 + atomic_set(&data->lbms_count, 0); 110 + else 111 + pcie_capability_write_word(port, PCI_EXP_LNKSTA, 112 + PCI_EXP_LNKSTA_LBMS); 113 + } 114 + 115 + int pcie_lbms_count(struct pci_dev *port, unsigned long *val) 116 + { 117 + struct pcie_bwctrl_data *data; 118 + 119 + guard(rwsem_read)(&pcie_bwctrl_lbms_rwsem); 120 + data = port->link_bwctrl; 121 + if (!data) 122 + return -ENOTTY; 123 + 124 + *val = atomic_read(&data->lbms_count); 125 + 126 + return 0; 127 + } 128 + 129 + static int pcie_bwnotif_probe(struct pcie_device *srv) 130 + { 131 + struct pci_dev *port = srv->port; 132 + int ret; 133 + 134 + struct pcie_bwctrl_data *data = devm_kzalloc(&srv->device, 135 + sizeof(*data), GFP_KERNEL); 136 + if (!data) 137 + return -ENOMEM; 138 + 139 + ret = devm_request_irq(&srv->device, srv->irq, pcie_bwnotif_irq, 140 + IRQF_SHARED, "PCIe bwctrl", srv); 141 + if (ret) 142 + return ret; 143 + 144 + scoped_guard(rwsem_write, &pcie_bwctrl_lbms_rwsem) { 145 + port->link_bwctrl = no_free_ptr(data); 146 + pcie_bwnotif_enable(srv); 147 + } 148 + 149 + pci_dbg(port, "enabled with IRQ %d\n", srv->irq); 150 + 151 + return 0; 152 + } 153 + 154 + static void pcie_bwnotif_remove(struct pcie_device *srv) 155 + { 156 + pcie_bwnotif_disable(srv->port); 157 + scoped_guard(rwsem_write, &pcie_bwctrl_lbms_rwsem) 158 + srv->port->link_bwctrl = NULL; 159 + } 160 + 161 + static int pcie_bwnotif_suspend(struct pcie_device *srv) 162 + { 163 + pcie_bwnotif_disable(srv->port); 164 + return 0; 165 + } 166 + 167 + static int pcie_bwnotif_resume(struct pcie_device *srv) 168 + { 169 + pcie_bwnotif_enable(srv); 170 + return 0; 171 + } 172 + 173 + static struct pcie_port_service_driver pcie_bwctrl_driver 
= { 174 + .name = "pcie_bwctrl", 175 + .port_type = PCIE_ANY_PORT, 176 + .service = PCIE_PORT_SERVICE_BWCTRL, 177 + .probe = pcie_bwnotif_probe, 178 + .suspend = pcie_bwnotif_suspend, 179 + .resume = pcie_bwnotif_resume, 180 + .remove = pcie_bwnotif_remove, 181 + }; 182 + 183 + int __init pcie_bwctrl_init(void) 184 + { 185 + return pcie_port_service_register(&pcie_bwctrl_driver); 186 + }
+5 -4
drivers/pci/pcie/portdrv.c
··· 68 68 */ 69 69 70 70 if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP | 71 - PCIE_PORT_SERVICE_BWNOTIF)) { 71 + PCIE_PORT_SERVICE_BWCTRL)) { 72 72 pcie_capability_read_word(dev, PCI_EXP_FLAGS, &reg16); 73 73 *pme = FIELD_GET(PCI_EXP_FLAGS_IRQ, reg16); 74 74 nvec = *pme + 1; ··· 150 150 151 151 /* PME, hotplug and bandwidth notification share an MSI/MSI-X vector */ 152 152 if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP | 153 - PCIE_PORT_SERVICE_BWNOTIF)) { 153 + PCIE_PORT_SERVICE_BWCTRL)) { 154 154 pcie_irq = pci_irq_vector(dev, pme); 155 155 irqs[PCIE_PORT_SERVICE_PME_SHIFT] = pcie_irq; 156 156 irqs[PCIE_PORT_SERVICE_HP_SHIFT] = pcie_irq; 157 - irqs[PCIE_PORT_SERVICE_BWNOTIF_SHIFT] = pcie_irq; 157 + irqs[PCIE_PORT_SERVICE_BWCTRL_SHIFT] = pcie_irq; 158 158 } 159 159 160 160 if (mask & PCIE_PORT_SERVICE_AER) ··· 271 271 272 272 pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &linkcap); 273 273 if (linkcap & PCI_EXP_LNKCAP_LBNC) 274 - services |= PCIE_PORT_SERVICE_BWNOTIF; 274 + services |= PCIE_PORT_SERVICE_BWCTRL; 275 275 } 276 276 277 277 return services; ··· 828 828 pcie_aer_init(); 829 829 pcie_pme_init(); 830 830 pcie_dpc_init(); 831 + pcie_bwctrl_init(); 831 832 pcie_hp_init(); 832 833 } 833 834
+4 -2
drivers/pci/pcie/portdrv.h
··· 20 20 #define PCIE_PORT_SERVICE_HP (1 << PCIE_PORT_SERVICE_HP_SHIFT) 21 21 #define PCIE_PORT_SERVICE_DPC_SHIFT 3 /* Downstream Port Containment */ 22 22 #define PCIE_PORT_SERVICE_DPC (1 << PCIE_PORT_SERVICE_DPC_SHIFT) 23 - #define PCIE_PORT_SERVICE_BWNOTIF_SHIFT 4 /* Bandwidth notification */ 24 - #define PCIE_PORT_SERVICE_BWNOTIF (1 << PCIE_PORT_SERVICE_BWNOTIF_SHIFT) 23 + #define PCIE_PORT_SERVICE_BWCTRL_SHIFT 4 /* Bandwidth Controller (notifications) */ 24 + #define PCIE_PORT_SERVICE_BWCTRL (1 << PCIE_PORT_SERVICE_BWCTRL_SHIFT) 25 25 26 26 #define PCIE_PORT_DEVICE_MAXSERVICES 5 27 27 ··· 50 50 #else 51 51 static inline int pcie_dpc_init(void) { return 0; } 52 52 #endif 53 + 54 + int pcie_bwctrl_init(void); 53 55 54 56 /* Port Type */ 55 57 #define PCIE_ANY_PORT (~0)
+8 -1
drivers/pci/quirks.c
··· 35 35 36 36 static bool pcie_lbms_seen(struct pci_dev *dev, u16 lnksta) 37 37 { 38 - return lnksta & PCI_EXP_LNKSTA_LBMS; 38 + unsigned long count; 39 + int ret; 40 + 41 + ret = pcie_lbms_count(dev, &count); 42 + if (ret < 0) 43 + return lnksta & PCI_EXP_LNKSTA_LBMS; 44 + 45 + return count > 0; 39 46 } 40 47 41 48 /*
+2
include/linux/pci.h
··· 313 313 }; 314 314 315 315 struct irq_affinity; 316 + struct pcie_bwctrl_data; 316 317 struct pcie_link_state; 317 318 struct pci_sriov; 318 319 struct pci_p2pdma; ··· 503 502 unsigned int dpc_rp_extensions:1; 504 503 u8 dpc_rp_log_size; 505 504 #endif 505 + struct pcie_bwctrl_data *link_bwctrl; 506 506 #ifdef CONFIG_PCI_ATS 507 507 union { 508 508 struct pci_sriov *sriov; /* PF: SR-IOV info */