Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'net-mlx5e-add-support-for-pcie-congestion-events'

Tariq Toukan says:

====================
net/mlx5e: Add support for PCIe congestion events

Dragos says:

PCIe congestion events are events generated by the firmware when the
device side has sustained PCIe inbound or outbound traffic above
certain thresholds. The high and low thresholds are hysteresis thresholds
to prevent flapping: once the high threshold has been reached, a low
threshold event will be triggered only after the bandwidth usage went
below the low threshold.

This series adds support for receiving and exposing such events as
ethtool counters.

Two new pairs of counters are exposed: pci_bw_in/outbound_high/low. These
should help the user understand if the device PCI is under pressure.

Planned followup patches:
- Allow configuration of thresholds through devlink.
- Add ethtool counter for wakeups which did not result in any state
change.
====================

Link: https://patch.msgid.link/1752589821-145787-1-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+381 -1
+32
Documentation/networking/device_drivers/ethernet/mellanox/mlx5/counters.rst
··· 1341 1341 - The number of times the device owned queue had not enough buffers 1342 1342 allocated. 1343 1343 - Error 1344 + 1345 + * - `pci_bw_inbound_high` 1346 + - The number of times the device crossed the high inbound PCIe bandwidth 1347 + threshold. To be compared to pci_bw_inbound_low to check if the device 1348 + is in a congested state. 1349 + If pci_bw_inbound_high == pci_bw_inbound_low then the device is not congested. 1350 + If pci_bw_inbound_high > pci_bw_inbound_low then the device is congested. 1351 + - Informative 1352 + 1353 + * - `pci_bw_inbound_low` 1354 + - The number of times the device crossed the low inbound PCIe bandwidth 1355 + threshold. To be compared to pci_bw_inbound_high to check if the device 1356 + is in a congested state. 1357 + If pci_bw_inbound_high == pci_bw_inbound_low then the device is not congested. 1358 + If pci_bw_inbound_high > pci_bw_inbound_low then the device is congested. 1359 + - Informative 1360 + 1361 + * - `pci_bw_outbound_high` 1362 + - The number of times the device crossed the high outbound PCIe bandwidth 1363 + threshold. To be compared to pci_bw_outbound_low to check if the device 1364 + is in a congested state. 1365 + If pci_bw_outbound_high == pci_bw_outbound_low then the device is not congested. 1366 + If pci_bw_outbound_high > pci_bw_outbound_low then the device is congested. 1367 + - Informative 1368 + 1369 + * - `pci_bw_outbound_low` 1370 + - The number of times the device crossed the low outbound PCIe bandwidth 1371 + threshold. To be compared to pci_bw_outbound_high to check if the device 1372 + is in a congested state. 1373 + If pci_bw_outbound_high == pci_bw_outbound_low then the device is not congested. 1374 + If pci_bw_outbound_high > pci_bw_outbound_low then the device is congested. 1375 + - Informative
+1 -1
drivers/net/ethernet/mellanox/mlx5/core/Makefile
··· 29 29 en/reporter_tx.o en/reporter_rx.o en/params.o en/xsk/pool.o \ 30 30 en/xsk/setup.o en/xsk/rx.o en/xsk/tx.o en/devlink.o en/ptp.o \ 31 31 en/qos.o en/htb.o en/trap.o en/fs_tt_redirect.o en/selq.o \ 32 - lib/crypto.o lib/sd.o 32 + lib/crypto.o lib/sd.o en/pcie_cong_event.o 33 33 34 34 # 35 35 # Netdev extra
+2
drivers/net/ethernet/mellanox/mlx5/core/en.h
··· 920 920 struct notifier_block events_nb; 921 921 struct notifier_block blocking_events_nb; 922 922 923 + struct mlx5e_pcie_cong_event *cong_event; 924 + 923 925 struct udp_tunnel_nic_info nic_info; 924 926 #ifdef CONFIG_MLX5_CORE_EN_DCB 925 927 struct mlx5e_dcbx dcbx;
+315
drivers/net/ethernet/mellanox/mlx5/core/en/pcie_cong_event.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB 2 + // Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 3 + 4 + #include "en.h" 5 + #include "pcie_cong_event.h" 6 + 7 + #define MLX5E_CONG_HIGH_STATE 0x7 8 + 9 + enum { 10 + MLX5E_INBOUND_CONG = BIT(0), 11 + MLX5E_OUTBOUND_CONG = BIT(1), 12 + }; 13 + 14 + struct mlx5e_pcie_cong_thresh { 15 + u16 inbound_high; 16 + u16 inbound_low; 17 + u16 outbound_high; 18 + u16 outbound_low; 19 + }; 20 + 21 + struct mlx5e_pcie_cong_stats { 22 + u32 pci_bw_inbound_high; 23 + u32 pci_bw_inbound_low; 24 + u32 pci_bw_outbound_high; 25 + u32 pci_bw_outbound_low; 26 + }; 27 + 28 + struct mlx5e_pcie_cong_event { 29 + u64 obj_id; 30 + 31 + struct mlx5e_priv *priv; 32 + 33 + /* For event notifier and workqueue. */ 34 + struct work_struct work; 35 + struct mlx5_nb nb; 36 + 37 + /* Stores last read state. */ 38 + u8 state; 39 + 40 + /* For ethtool stats group. */ 41 + struct mlx5e_pcie_cong_stats stats; 42 + }; 43 + 44 + /* In units of 0.01 % */ 45 + static const struct mlx5e_pcie_cong_thresh default_thresh_config = { 46 + .inbound_high = 9000, 47 + .inbound_low = 7500, 48 + .outbound_high = 9000, 49 + .outbound_low = 7500, 50 + }; 51 + 52 + static const struct counter_desc mlx5e_pcie_cong_stats_desc[] = { 53 + { MLX5E_DECLARE_STAT(struct mlx5e_pcie_cong_stats, 54 + pci_bw_inbound_high) }, 55 + { MLX5E_DECLARE_STAT(struct mlx5e_pcie_cong_stats, 56 + pci_bw_inbound_low) }, 57 + { MLX5E_DECLARE_STAT(struct mlx5e_pcie_cong_stats, 58 + pci_bw_outbound_high) }, 59 + { MLX5E_DECLARE_STAT(struct mlx5e_pcie_cong_stats, 60 + pci_bw_outbound_low) }, 61 + }; 62 + 63 + #define NUM_PCIE_CONG_COUNTERS ARRAY_SIZE(mlx5e_pcie_cong_stats_desc) 64 + 65 + static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(pcie_cong) 66 + { 67 + return priv->cong_event ? 
NUM_PCIE_CONG_COUNTERS : 0; 68 + } 69 + 70 + static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(pcie_cong) {} 71 + 72 + static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(pcie_cong) 73 + { 74 + if (!priv->cong_event) 75 + return; 76 + 77 + for (int i = 0; i < NUM_PCIE_CONG_COUNTERS; i++) 78 + ethtool_puts(data, mlx5e_pcie_cong_stats_desc[i].format); 79 + } 80 + 81 + static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(pcie_cong) 82 + { 83 + if (!priv->cong_event) 84 + return; 85 + 86 + for (int i = 0; i < NUM_PCIE_CONG_COUNTERS; i++) { 87 + u32 ctr = MLX5E_READ_CTR32_CPU(&priv->cong_event->stats, 88 + mlx5e_pcie_cong_stats_desc, 89 + i); 90 + 91 + mlx5e_ethtool_put_stat(data, ctr); 92 + } 93 + } 94 + 95 + MLX5E_DEFINE_STATS_GRP(pcie_cong, 0); 96 + 97 + static int 98 + mlx5_cmd_pcie_cong_event_set(struct mlx5_core_dev *dev, 99 + const struct mlx5e_pcie_cong_thresh *config, 100 + u64 *obj_id) 101 + { 102 + u32 in[MLX5_ST_SZ_DW(pcie_cong_event_cmd_in)] = {}; 103 + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; 104 + void *cong_obj; 105 + void *hdr; 106 + int err; 107 + 108 + hdr = MLX5_ADDR_OF(pcie_cong_event_cmd_in, in, hdr); 109 + cong_obj = MLX5_ADDR_OF(pcie_cong_event_cmd_in, in, cong_obj); 110 + 111 + MLX5_SET(general_obj_in_cmd_hdr, hdr, opcode, 112 + MLX5_CMD_OP_CREATE_GENERAL_OBJECT); 113 + 114 + MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_type, 115 + MLX5_GENERAL_OBJECT_TYPES_PCIE_CONG_EVENT); 116 + 117 + MLX5_SET(pcie_cong_event_obj, cong_obj, inbound_event_en, 1); 118 + MLX5_SET(pcie_cong_event_obj, cong_obj, outbound_event_en, 1); 119 + 120 + MLX5_SET(pcie_cong_event_obj, cong_obj, 121 + inbound_cong_high_threshold, config->inbound_high); 122 + MLX5_SET(pcie_cong_event_obj, cong_obj, 123 + inbound_cong_low_threshold, config->inbound_low); 124 + 125 + MLX5_SET(pcie_cong_event_obj, cong_obj, 126 + outbound_cong_high_threshold, config->outbound_high); 127 + MLX5_SET(pcie_cong_event_obj, cong_obj, 128 + outbound_cong_low_threshold, config->outbound_low); 129 + 130 + err = 
mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); 131 + if (err) 132 + return err; 133 + 134 + *obj_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); 135 + 136 + mlx5_core_dbg(dev, "PCIe congestion event (obj_id=%llu) created. Config: in: [%u, %u], out: [%u, %u]\n", 137 + *obj_id, 138 + config->inbound_high, config->inbound_low, 139 + config->outbound_high, config->outbound_low); 140 + 141 + return 0; 142 + } 143 + 144 + static int mlx5_cmd_pcie_cong_event_destroy(struct mlx5_core_dev *dev, 145 + u64 obj_id) 146 + { 147 + u32 in[MLX5_ST_SZ_DW(pcie_cong_event_cmd_in)] = {}; 148 + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; 149 + void *hdr; 150 + 151 + hdr = MLX5_ADDR_OF(pcie_cong_event_cmd_in, in, hdr); 152 + MLX5_SET(general_obj_in_cmd_hdr, hdr, opcode, 153 + MLX5_CMD_OP_DESTROY_GENERAL_OBJECT); 154 + MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_type, 155 + MLX5_GENERAL_OBJECT_TYPES_PCIE_CONG_EVENT); 156 + MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_id, obj_id); 157 + 158 + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); 159 + } 160 + 161 + static int mlx5_cmd_pcie_cong_event_query(struct mlx5_core_dev *dev, 162 + u64 obj_id, 163 + u32 *state) 164 + { 165 + u32 in[MLX5_ST_SZ_DW(pcie_cong_event_cmd_in)] = {}; 166 + u32 out[MLX5_ST_SZ_DW(pcie_cong_event_cmd_out)]; 167 + void *obj; 168 + void *hdr; 169 + u8 cong; 170 + int err; 171 + 172 + hdr = MLX5_ADDR_OF(pcie_cong_event_cmd_in, in, hdr); 173 + 174 + MLX5_SET(general_obj_in_cmd_hdr, hdr, opcode, 175 + MLX5_CMD_OP_QUERY_GENERAL_OBJECT); 176 + MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_type, 177 + MLX5_GENERAL_OBJECT_TYPES_PCIE_CONG_EVENT); 178 + MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_id, obj_id); 179 + 180 + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); 181 + if (err) 182 + return err; 183 + 184 + obj = MLX5_ADDR_OF(pcie_cong_event_cmd_out, out, cong_obj); 185 + 186 + if (state) { 187 + cong = MLX5_GET(pcie_cong_event_obj, obj, inbound_cong_state); 188 + if (cong == 
MLX5E_CONG_HIGH_STATE) 189 + *state |= MLX5E_INBOUND_CONG; 190 + 191 + cong = MLX5_GET(pcie_cong_event_obj, obj, outbound_cong_state); 192 + if (cong == MLX5E_CONG_HIGH_STATE) 193 + *state |= MLX5E_OUTBOUND_CONG; 194 + } 195 + 196 + return 0; 197 + } 198 + 199 + static void mlx5e_pcie_cong_event_work(struct work_struct *work) 200 + { 201 + struct mlx5e_pcie_cong_event *cong_event; 202 + struct mlx5_core_dev *dev; 203 + struct mlx5e_priv *priv; 204 + u32 new_cong_state = 0; 205 + u32 changes; 206 + int err; 207 + 208 + cong_event = container_of(work, struct mlx5e_pcie_cong_event, work); 209 + priv = cong_event->priv; 210 + dev = priv->mdev; 211 + 212 + err = mlx5_cmd_pcie_cong_event_query(dev, cong_event->obj_id, 213 + &new_cong_state); 214 + if (err) { 215 + mlx5_core_warn(dev, "Error %d when querying PCIe cong event object (obj_id=%llu).\n", 216 + err, cong_event->obj_id); 217 + return; 218 + } 219 + 220 + changes = cong_event->state ^ new_cong_state; 221 + if (!changes) 222 + return; 223 + 224 + cong_event->state = new_cong_state; 225 + 226 + if (changes & MLX5E_INBOUND_CONG) { 227 + if (new_cong_state & MLX5E_INBOUND_CONG) 228 + cong_event->stats.pci_bw_inbound_high++; 229 + else 230 + cong_event->stats.pci_bw_inbound_low++; 231 + } 232 + 233 + if (changes & MLX5E_OUTBOUND_CONG) { 234 + if (new_cong_state & MLX5E_OUTBOUND_CONG) 235 + cong_event->stats.pci_bw_outbound_high++; 236 + else 237 + cong_event->stats.pci_bw_outbound_low++; 238 + } 239 + } 240 + 241 + static int mlx5e_pcie_cong_event_handler(struct notifier_block *nb, 242 + unsigned long event, void *eqe) 243 + { 244 + struct mlx5e_pcie_cong_event *cong_event; 245 + 246 + cong_event = mlx5_nb_cof(nb, struct mlx5e_pcie_cong_event, nb); 247 + queue_work(cong_event->priv->wq, &cong_event->work); 248 + 249 + return NOTIFY_OK; 250 + } 251 + 252 + int mlx5e_pcie_cong_event_init(struct mlx5e_priv *priv) 253 + { 254 + struct mlx5e_pcie_cong_event *cong_event; 255 + struct mlx5_core_dev *mdev = priv->mdev; 256 + 
int err; 257 + 258 + if (!mlx5_pcie_cong_event_supported(mdev)) 259 + return 0; 260 + 261 + cong_event = kvzalloc_node(sizeof(*cong_event), GFP_KERNEL, 262 + mdev->priv.numa_node); 263 + if (!cong_event) 264 + return -ENOMEM; 265 + 266 + INIT_WORK(&cong_event->work, mlx5e_pcie_cong_event_work); 267 + MLX5_NB_INIT(&cong_event->nb, mlx5e_pcie_cong_event_handler, 268 + OBJECT_CHANGE); 269 + 270 + cong_event->priv = priv; 271 + 272 + err = mlx5_cmd_pcie_cong_event_set(mdev, &default_thresh_config, 273 + &cong_event->obj_id); 274 + if (err) { 275 + mlx5_core_warn(mdev, "Error creating a PCIe congestion event object\n"); 276 + goto err_free; 277 + } 278 + 279 + err = mlx5_eq_notifier_register(mdev, &cong_event->nb); 280 + if (err) { 281 + mlx5_core_warn(mdev, "Error registering notifier for the PCIe congestion event\n"); 282 + goto err_obj_destroy; 283 + } 284 + 285 + priv->cong_event = cong_event; 286 + 287 + return 0; 288 + 289 + err_obj_destroy: 290 + mlx5_cmd_pcie_cong_event_destroy(mdev, cong_event->obj_id); 291 + err_free: 292 + kvfree(cong_event); 293 + 294 + return err; 295 + } 296 + 297 + void mlx5e_pcie_cong_event_cleanup(struct mlx5e_priv *priv) 298 + { 299 + struct mlx5e_pcie_cong_event *cong_event = priv->cong_event; 300 + struct mlx5_core_dev *mdev = priv->mdev; 301 + 302 + if (!cong_event) 303 + return; 304 + 305 + priv->cong_event = NULL; 306 + 307 + mlx5_eq_notifier_unregister(mdev, &cong_event->nb); 308 + cancel_work_sync(&cong_event->work); 309 + 310 + if (mlx5_cmd_pcie_cong_event_destroy(mdev, cong_event->obj_id)) 311 + mlx5_core_warn(mdev, "Error destroying PCIe congestion event (obj_id=%llu)\n", 312 + cong_event->obj_id); 313 + 314 + kvfree(cong_event); 315 + }
+10
drivers/net/ethernet/mellanox/mlx5/core/en/pcie_cong_event.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ 2 + /* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. */ 3 + 4 + #ifndef __MLX5_PCIE_CONG_EVENT_H__ 5 + #define __MLX5_PCIE_CONG_EVENT_H__ 6 + 7 + int mlx5e_pcie_cong_event_init(struct mlx5e_priv *priv); 8 + void mlx5e_pcie_cong_event_cleanup(struct mlx5e_priv *priv); 9 + 10 + #endif /* __MLX5_PCIE_CONG_EVENT_H__ */
+3
drivers/net/ethernet/mellanox/mlx5/core/en_main.c
··· 76 76 #include "en/trap.h" 77 77 #include "lib/devcom.h" 78 78 #include "lib/sd.h" 79 + #include "en/pcie_cong_event.h" 79 80 80 81 static bool mlx5e_hw_gro_supported(struct mlx5_core_dev *mdev) 81 82 { ··· 5990 5989 if (mlx5e_monitor_counter_supported(priv)) 5991 5990 mlx5e_monitor_counter_init(priv); 5992 5991 5992 + mlx5e_pcie_cong_event_init(priv); 5993 5993 mlx5e_hv_vhca_stats_create(priv); 5994 5994 if (netdev->reg_state != NETREG_REGISTERED) 5995 5995 return; ··· 6030 6028 6031 6029 mlx5e_nic_set_rx_mode(priv); 6032 6030 6031 + mlx5e_pcie_cong_event_cleanup(priv); 6033 6032 mlx5e_hv_vhca_stats_destroy(priv); 6034 6033 if (mlx5e_monitor_counter_supported(priv)) 6035 6034 mlx5e_monitor_counter_cleanup(priv);
+1
drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
··· 2612 2612 #ifdef CONFIG_MLX5_MACSEC 2613 2613 &MLX5E_STATS_GRP(macsec_hw), 2614 2614 #endif 2615 + &MLX5E_STATS_GRP(pcie_cong), 2615 2616 }; 2616 2617 2617 2618 unsigned int mlx5e_nic_stats_grps_num(struct mlx5e_priv *priv)
+1
drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
··· 535 535 extern MLX5E_DECLARE_STATS_GRP(ipsec_sw); 536 536 extern MLX5E_DECLARE_STATS_GRP(ptp); 537 537 extern MLX5E_DECLARE_STATS_GRP(macsec_hw); 538 + extern MLX5E_DECLARE_STATS_GRP(pcie_cong); 538 539 539 540 #endif /* __MLX5_EN_STATS_H__ */
+3
drivers/net/ethernet/mellanox/mlx5/core/eq.c
··· 585 585 async_event_mask |= 586 586 (1ull << MLX5_EVENT_TYPE_OBJECT_CHANGE); 587 587 588 + if (mlx5_pcie_cong_event_supported(dev)) 589 + async_event_mask |= (1ull << MLX5_EVENT_TYPE_OBJECT_CHANGE); 590 + 588 591 mask[0] = async_event_mask; 589 592 590 593 if (MLX5_CAP_GEN(dev, event_cap))
+13
drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
··· 495 495 496 496 return 1 << MLX5_CAP_GEN(dev, log_max_eq); 497 497 } 498 + 499 + static inline bool mlx5_pcie_cong_event_supported(struct mlx5_core_dev *dev) 500 + { 501 + u64 features = MLX5_CAP_GEN_2_64(dev, general_obj_types_127_64); 502 + 503 + if (!(features & MLX5_HCA_CAP_2_GENERAL_OBJECT_TYPES_PCIE_CONG_EVENT)) 504 + return false; 505 + 506 + if (dev->sd) 507 + return false; 508 + 509 + return true; 510 + } 498 511 #endif /* __MLX5_CORE_H__ */