Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

PCI/bwctrl: Add pcie_set_target_speed() to set PCIe Link Speed

Currently, PCIe Link Speeds are adjusted by custom code rather than in a
common function provided in PCI core. The PCIe bandwidth controller
(bwctrl) introduces an in-kernel API, pcie_set_target_speed(), to set PCIe
Link Speed.

Convert the Target Speed quirk to use the new API. The Target Speed quirk
runs very early, when bwctrl is not yet probed for a Port, and can also run
later, when bwctrl is already set up for the Port. This requires the
per-Port mutex (set_speed_mutex) to be taken only if the bwctrl setup is
already complete.

The new API is also intended to be used in an upcoming commit that adds a
thermal cooling device to throttle PCIe bandwidth when thermal thresholds
are reached.

The PCIe bandwidth control procedure is as follows. The highest speed that
is supported by both the Port and the PCIe device and that is not higher
than the requested speed is selected and written into the Target Link Speed
field in the Link Control 2 Register. Then the bandwidth controller retrains
the PCIe Link.

Bandwidth Notifications enable the cur_bus_speed in the struct pci_bus to
keep track of PCIe Link Speed changes. While Bandwidth Notifications should
also be generated when the bandwidth controller alters the PCIe Link Speed,
a few platforms do not deliver the LBMS interrupt after Link Training as
expected. Thus, after changing the Link Speed, the bandwidth controller
makes an additional read of the Link Status Register to ensure
cur_bus_speed is consistent with the new PCIe Link Speed.

Link: https://lore.kernel.org/r/20241018144755.7875-8-ilpo.jarvinen@linux.intel.com
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
[bhelgaas: squash devm_mutex_init() error checking from
https://lore.kernel.org/r/20241030163139.2111689-1-andriy.shevchenko@linux.intel.com,
drop export of pcie_set_target_speed()]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>

authored by

Ilpo Järvinen and committed by
Bjorn Helgaas
de9a6c8d 665745f2

+208 -19
+20
drivers/pci/pci.h
··· 331 331 struct pci_bus *pci_bus_get(struct pci_bus *bus); 332 332 void pci_bus_put(struct pci_bus *bus); 333 333 334 + #define PCIE_LNKCAP_SLS2SPEED(lnkcap) \ 335 + ({ \ 336 + ((lnkcap) == PCI_EXP_LNKCAP_SLS_64_0GB ? PCIE_SPEED_64_0GT : \ 337 + (lnkcap) == PCI_EXP_LNKCAP_SLS_32_0GB ? PCIE_SPEED_32_0GT : \ 338 + (lnkcap) == PCI_EXP_LNKCAP_SLS_16_0GB ? PCIE_SPEED_16_0GT : \ 339 + (lnkcap) == PCI_EXP_LNKCAP_SLS_8_0GB ? PCIE_SPEED_8_0GT : \ 340 + (lnkcap) == PCI_EXP_LNKCAP_SLS_5_0GB ? PCIE_SPEED_5_0GT : \ 341 + (lnkcap) == PCI_EXP_LNKCAP_SLS_2_5GB ? PCIE_SPEED_2_5GT : \ 342 + PCI_SPEED_UNKNOWN); \ 343 + }) 344 + 334 345 /* PCIe link information from Link Capabilities 2 */ 335 346 #define PCIE_LNKCAP2_SLS2SPEED(lnkcap2) \ 336 347 ((lnkcap2) & PCI_EXP_LNKCAP2_SLS_64_0GB ? PCIE_SPEED_64_0GT : \ ··· 350 339 (lnkcap2) & PCI_EXP_LNKCAP2_SLS_8_0GB ? PCIE_SPEED_8_0GT : \ 351 340 (lnkcap2) & PCI_EXP_LNKCAP2_SLS_5_0GB ? PCIE_SPEED_5_0GT : \ 352 341 (lnkcap2) & PCI_EXP_LNKCAP2_SLS_2_5GB ? PCIE_SPEED_2_5GT : \ 342 + PCI_SPEED_UNKNOWN) 343 + 344 + #define PCIE_LNKCTL2_TLS2SPEED(lnkctl2) \ 345 + ((lnkctl2) == PCI_EXP_LNKCTL2_TLS_64_0GT ? PCIE_SPEED_64_0GT : \ 346 + (lnkctl2) == PCI_EXP_LNKCTL2_TLS_32_0GT ? PCIE_SPEED_32_0GT : \ 347 + (lnkctl2) == PCI_EXP_LNKCTL2_TLS_16_0GT ? PCIE_SPEED_16_0GT : \ 348 + (lnkctl2) == PCI_EXP_LNKCTL2_TLS_8_0GT ? PCIE_SPEED_8_0GT : \ 349 + (lnkctl2) == PCI_EXP_LNKCTL2_TLS_5_0GT ? PCIE_SPEED_5_0GT : \ 350 + (lnkctl2) == PCI_EXP_LNKCTL2_TLS_2_5GT ? PCIE_SPEED_2_5GT : \ 353 351 PCI_SPEED_UNKNOWN) 354 352 355 353 /* PCIe speed to Mb/s reduced by encoding overhead */
+174 -6
drivers/pci/pcie/bwctrl.c
··· 7 7 * Copyright (C) 2019 Dell Inc 8 8 * Copyright (C) 2023-2024 Intel Corporation 9 9 * 10 + * The PCIe bandwidth controller provides a way to alter PCIe Link Speeds 11 + * and notify the operating system when the Link Width or Speed changes. The 12 + * notification capability is required for all Root Ports and Downstream 13 + * Ports supporting Link Width wider than x1 and/or multiple Link Speeds. 14 + * 10 15 * This service port driver hooks into the Bandwidth Notification interrupt 11 16 * watching for changes or links becoming degraded in operation. It updates 12 17 * the cached Current Link Speed that is exposed to user space through sysfs. ··· 20 15 #define dev_fmt(fmt) "bwctrl: " fmt 21 16 22 17 #include <linux/atomic.h> 18 + #include <linux/bitops.h> 19 + #include <linux/bits.h> 23 20 #include <linux/cleanup.h> 24 21 #include <linux/errno.h> 25 22 #include <linux/interrupt.h> 23 + #include <linux/mutex.h> 26 24 #include <linux/pci.h> 27 25 #include <linux/rwsem.h> 28 26 #include <linux/slab.h> ··· 36 28 37 29 /** 38 30 * struct pcie_bwctrl_data - PCIe bandwidth controller 31 + * @set_speed_mutex: Serializes link speed changes 39 32 * @lbms_count: Count for LBMS (since last reset) 40 33 */ 41 34 struct pcie_bwctrl_data { 35 + struct mutex set_speed_mutex; 42 36 atomic_t lbms_count; 43 37 }; 44 38 45 - /* Prevents port removal during LBMS count accessors */ 39 + /* 40 + * Prevent port removal during LBMS count accessors and Link Speed changes. 41 + * 42 + * These have to be differentiated because pcie_bwctrl_change_speed() calls 43 + * pcie_retrain_link() which uses LBMS count reset accessor on success 44 + * (using just one rwsem triggers "possible recursive locking detected" 45 + * warning). 
46 + */ 46 47 static DECLARE_RWSEM(pcie_bwctrl_lbms_rwsem); 48 + static DECLARE_RWSEM(pcie_bwctrl_setspeed_rwsem); 49 + 50 + static bool pcie_valid_speed(enum pci_bus_speed speed) 51 + { 52 + return (speed >= PCIE_SPEED_2_5GT) && (speed <= PCIE_SPEED_64_0GT); 53 + } 54 + 55 + static u16 pci_bus_speed2lnkctl2(enum pci_bus_speed speed) 56 + { 57 + static const u8 speed_conv[] = { 58 + [PCIE_SPEED_2_5GT] = PCI_EXP_LNKCTL2_TLS_2_5GT, 59 + [PCIE_SPEED_5_0GT] = PCI_EXP_LNKCTL2_TLS_5_0GT, 60 + [PCIE_SPEED_8_0GT] = PCI_EXP_LNKCTL2_TLS_8_0GT, 61 + [PCIE_SPEED_16_0GT] = PCI_EXP_LNKCTL2_TLS_16_0GT, 62 + [PCIE_SPEED_32_0GT] = PCI_EXP_LNKCTL2_TLS_32_0GT, 63 + [PCIE_SPEED_64_0GT] = PCI_EXP_LNKCTL2_TLS_64_0GT, 64 + }; 65 + 66 + if (WARN_ON_ONCE(!pcie_valid_speed(speed))) 67 + return 0; 68 + 69 + return speed_conv[speed]; 70 + } 71 + 72 + static inline u16 pcie_supported_speeds2target_speed(u8 supported_speeds) 73 + { 74 + return __fls(supported_speeds); 75 + } 76 + 77 + /** 78 + * pcie_bwctrl_select_speed - Select Target Link Speed 79 + * @port: PCIe Port 80 + * @speed_req: Requested PCIe Link Speed 81 + * 82 + * Select Target Link Speed by take into account Supported Link Speeds of 83 + * both the Root Port and the Endpoint. 84 + * 85 + * Return: Target Link Speed (1=2.5GT/s, 2=5GT/s, 3=8GT/s, etc.) 
86 + */ 87 + static u16 pcie_bwctrl_select_speed(struct pci_dev *port, enum pci_bus_speed speed_req) 88 + { 89 + struct pci_bus *bus = port->subordinate; 90 + u8 desired_speeds, supported_speeds; 91 + struct pci_dev *dev; 92 + 93 + desired_speeds = GENMASK(pci_bus_speed2lnkctl2(speed_req), 94 + __fls(PCI_EXP_LNKCAP2_SLS_2_5GB)); 95 + 96 + supported_speeds = port->supported_speeds; 97 + if (bus) { 98 + down_read(&pci_bus_sem); 99 + dev = list_first_entry_or_null(&bus->devices, struct pci_dev, bus_list); 100 + if (dev) 101 + supported_speeds &= dev->supported_speeds; 102 + up_read(&pci_bus_sem); 103 + } 104 + if (!supported_speeds) 105 + return PCI_EXP_LNKCAP2_SLS_2_5GB; 106 + 107 + return pcie_supported_speeds2target_speed(supported_speeds & desired_speeds); 108 + } 109 + 110 + static int pcie_bwctrl_change_speed(struct pci_dev *port, u16 target_speed, bool use_lt) 111 + { 112 + int ret; 113 + 114 + ret = pcie_capability_clear_and_set_word(port, PCI_EXP_LNKCTL2, 115 + PCI_EXP_LNKCTL2_TLS, target_speed); 116 + if (ret != PCIBIOS_SUCCESSFUL) 117 + return pcibios_err_to_errno(ret); 118 + 119 + ret = pcie_retrain_link(port, use_lt); 120 + if (ret < 0) 121 + return ret; 122 + 123 + /* 124 + * Ensure link speed updates also with platforms that have problems 125 + * with notifications. 126 + */ 127 + if (port->subordinate) 128 + pcie_update_link_speed(port->subordinate); 129 + 130 + return 0; 131 + } 132 + 133 + /** 134 + * pcie_set_target_speed - Set downstream Link Speed for PCIe Port 135 + * @port: PCIe Port 136 + * @speed_req: Requested PCIe Link Speed 137 + * @use_lt: Wait for the LT or DLLLA bit to detect the end of link training 138 + * 139 + * Attempt to set PCIe Port Link Speed to @speed_req. @speed_req may be 140 + * adjusted downwards to the best speed supported by both the Port and PCIe 141 + * Device underneath it. 
142 + * 143 + * Return: 144 + * * 0 - on success 145 + * * -EINVAL - @speed_req is not a PCIe Link Speed 146 + * * -ENODEV - @port is not controllable 147 + * * -ETIMEDOUT - changing Link Speed took too long 148 + * * -EAGAIN - Link Speed was changed but @speed_req was not achieved 149 + */ 150 + int pcie_set_target_speed(struct pci_dev *port, enum pci_bus_speed speed_req, 151 + bool use_lt) 152 + { 153 + struct pci_bus *bus = port->subordinate; 154 + u16 target_speed; 155 + int ret; 156 + 157 + if (WARN_ON_ONCE(!pcie_valid_speed(speed_req))) 158 + return -EINVAL; 159 + 160 + if (bus && bus->cur_bus_speed == speed_req) 161 + return 0; 162 + 163 + target_speed = pcie_bwctrl_select_speed(port, speed_req); 164 + 165 + scoped_guard(rwsem_read, &pcie_bwctrl_setspeed_rwsem) { 166 + struct pcie_bwctrl_data *data = port->link_bwctrl; 167 + 168 + /* 169 + * port->link_bwctrl is NULL during initial scan when called 170 + * e.g. from the Target Speed quirk. 171 + */ 172 + if (data) 173 + mutex_lock(&data->set_speed_mutex); 174 + 175 + ret = pcie_bwctrl_change_speed(port, target_speed, use_lt); 176 + 177 + if (data) 178 + mutex_unlock(&data->set_speed_mutex); 179 + } 180 + 181 + /* 182 + * Despite setting higher speed into the Target Link Speed, empty 183 + * bus won't train to 5GT+ speeds. 
184 + */ 185 + if (!ret && bus && bus->cur_bus_speed != speed_req && 186 + !list_empty(&bus->devices)) 187 + ret = -EAGAIN; 188 + 189 + return ret; 190 + } 47 191 48 192 static void pcie_bwnotif_enable(struct pcie_device *srv) 49 193 { ··· 296 136 if (!data) 297 137 return -ENOMEM; 298 138 139 + ret = devm_mutex_init(&srv->device, &data->set_speed_mutex); 140 + if (ret) 141 + return ret; 142 + 299 143 ret = devm_request_irq(&srv->device, srv->irq, pcie_bwnotif_irq, 300 144 IRQF_SHARED, "PCIe bwctrl", srv); 301 145 if (ret) 302 146 return ret; 303 147 304 - scoped_guard(rwsem_write, &pcie_bwctrl_lbms_rwsem) { 305 - port->link_bwctrl = no_free_ptr(data); 306 - pcie_bwnotif_enable(srv); 148 + scoped_guard(rwsem_write, &pcie_bwctrl_setspeed_rwsem) { 149 + scoped_guard(rwsem_write, &pcie_bwctrl_lbms_rwsem) { 150 + port->link_bwctrl = no_free_ptr(data); 151 + pcie_bwnotif_enable(srv); 152 + } 307 153 } 308 154 309 155 pci_dbg(port, "enabled with IRQ %d\n", srv->irq); ··· 320 154 static void pcie_bwnotif_remove(struct pcie_device *srv) 321 155 { 322 156 pcie_bwnotif_disable(srv->port); 323 - scoped_guard(rwsem_write, &pcie_bwctrl_lbms_rwsem) 324 - srv->port->link_bwctrl = NULL; 157 + 158 + scoped_guard(rwsem_write, &pcie_bwctrl_setspeed_rwsem) 159 + scoped_guard(rwsem_write, &pcie_bwctrl_lbms_rwsem) 160 + srv->port->link_bwctrl = NULL; 325 161 } 326 162 327 163 static int pcie_bwnotif_suspend(struct pcie_device *srv)
+4 -13
drivers/pci/quirks.c
··· 113 113 114 114 pci_info(dev, "broken device, retraining non-functional downstream link at 2.5GT/s\n"); 115 115 116 - lnkctl2 &= ~PCI_EXP_LNKCTL2_TLS; 117 - lnkctl2 |= PCI_EXP_LNKCTL2_TLS_2_5GT; 118 - pcie_capability_write_word(dev, PCI_EXP_LNKCTL2, lnkctl2); 119 - 120 - ret = pcie_retrain_link(dev, false); 116 + ret = pcie_set_target_speed(dev, PCIE_SPEED_2_5GT, false); 121 117 if (ret) { 122 118 pci_info(dev, "retraining failed\n"); 123 - pcie_capability_write_word(dev, PCI_EXP_LNKCTL2, 124 - oldlnkctl2); 125 - pcie_retrain_link(dev, true); 119 + pcie_set_target_speed(dev, PCIE_LNKCTL2_TLS2SPEED(oldlnkctl2), 120 + true); 126 121 return ret; 127 122 } 128 123 ··· 131 136 132 137 pci_info(dev, "removing 2.5GT/s downstream link speed restriction\n"); 133 138 pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap); 134 - lnkctl2 &= ~PCI_EXP_LNKCTL2_TLS; 135 - lnkctl2 |= lnkcap & PCI_EXP_LNKCAP_SLS; 136 - pcie_capability_write_word(dev, PCI_EXP_LNKCTL2, lnkctl2); 137 - 138 - ret = pcie_retrain_link(dev, false); 139 + ret = pcie_set_target_speed(dev, PCIE_LNKCAP_SLS2SPEED(lnkcap), false); 139 140 if (ret) { 140 141 pci_info(dev, "retraining failed\n"); 141 142 return ret;
+10
include/linux/pci.h
··· 1797 1797 #ifdef CONFIG_PCIEPORTBUS 1798 1798 extern bool pcie_ports_disabled; 1799 1799 extern bool pcie_ports_native; 1800 + 1801 + int pcie_set_target_speed(struct pci_dev *port, enum pci_bus_speed speed_req, 1802 + bool use_lt); 1800 1803 #else 1801 1804 #define pcie_ports_disabled true 1802 1805 #define pcie_ports_native false 1806 + 1807 + static inline int pcie_set_target_speed(struct pci_dev *port, 1808 + enum pci_bus_speed speed_req, 1809 + bool use_lt) 1810 + { 1811 + return -EOPNOTSUPP; 1812 + } 1803 1813 #endif 1804 1814 1805 1815 #define PCIE_LINK_STATE_L0S (BIT(0) | BIT(1)) /* Upstr/dwnstr L0s */