Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

RDMA/mlx5: Use IB set_netdev and get_netdev functions

The IB layer provides a common interface to store and get net
devices associated with an IB device port (ib_device_set_netdev()
and ib_device_get_netdev()).
Previously, mlx5_ib stored and managed the associated net devices
internally.

Replace internal net device management in mlx5_ib with
ib_device_set_netdev() when attaching/detaching a net device and
ib_device_get_netdev() when retrieving the net device.

Export ib_device_get_netdev().

For mlx5 representors/PFs/VFs and lag creation we replace the netdev
assignments with the IB set/get netdev functions.

In active-backup lag mode, the active slave net device is stored in
the lag itself. To ensure that the net device stored in a lag bond IB
device is the active slave, we implement the following:
- mlx5_core: when modifying the slave of a bond we send the internal driver event
MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE.
- mlx5_ib: when catching the event, call ib_device_set_netdev()

This patch also ensures the correct IB events are sent in switchdev lag.

While at it, when in multiport eswitch mode, only a single IB device is
created for all ports. This IB device will receive all netdev events
of its VFs once loaded; thus, to avoid overwriting the mapping of PF IB
device to PF netdev, ignore NETDEV_REGISTER events if the IB device has
already been mapped to a netdev.

Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com>
Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
Link: https://patch.msgid.link/20240909173025.30422-6-michaelgur@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>

authored by

Chiara Meiohas and committed by
Leon Romanovsky
8d159eb2 5f8ca04f

+191 -103
+4
drivers/infiniband/core/device.c
··· 2236 2236 if (!rdma_is_port_valid(ib_dev, port)) 2237 2237 return NULL; 2238 2238 2239 + if (!ib_dev->port_data) 2240 + return NULL; 2241 + 2239 2242 pdata = &ib_dev->port_data[port]; 2240 2243 2241 2244 /* ··· 2257 2254 2258 2255 return res; 2259 2256 } 2257 + EXPORT_SYMBOL(ib_device_get_netdev); 2260 2258 2261 2259 /** 2262 2260 * ib_device_get_by_netdev - Find an IB device associated with a netdev
+12 -11
drivers/infiniband/hw/mlx5/ib_rep.c
··· 13 13 int vport_index) 14 14 { 15 15 struct mlx5_ib_dev *ibdev; 16 + struct net_device *ndev; 16 17 17 18 ibdev = mlx5_eswitch_uplink_get_proto_dev(dev->priv.eswitch, REP_IB); 18 19 if (!ibdev) ··· 21 20 22 21 ibdev->port[vport_index].rep = rep; 23 22 rep->rep_data[REP_IB].priv = ibdev; 24 - write_lock(&ibdev->port[vport_index].roce.netdev_lock); 25 - ibdev->port[vport_index].roce.netdev = 26 - mlx5_ib_get_rep_netdev(rep->esw, rep->vport); 27 - write_unlock(&ibdev->port[vport_index].roce.netdev_lock); 23 + ndev = mlx5_ib_get_rep_netdev(rep->esw, rep->vport); 28 24 29 - return 0; 25 + return ib_device_set_netdev(&ibdev->ib_dev, ndev, vport_index + 1); 30 26 } 31 27 32 28 static void mlx5_ib_register_peer_vport_reps(struct mlx5_core_dev *mdev); ··· 102 104 ibdev->is_rep = true; 103 105 vport_index = rep->vport_index; 104 106 ibdev->port[vport_index].rep = rep; 105 - ibdev->ib_dev.phys_port_cnt = num_ports; 106 - ibdev->port[vport_index].roce.netdev = 107 - mlx5_ib_get_rep_netdev(lag_master->priv.eswitch, rep->vport); 108 107 ibdev->mdev = lag_master; 109 108 ibdev->num_ports = num_ports; 109 + ibdev->ib_dev.phys_port_cnt = num_ports; 110 + ret = ib_device_set_netdev(&ibdev->ib_dev, 111 + mlx5_ib_get_rep_netdev(lag_master->priv.eswitch, 112 + rep->vport), 113 + vport_index + 1); 114 + if (ret) 115 + goto fail_add; 110 116 111 117 ret = __mlx5_ib_add(ibdev, profile); 112 118 if (ret) ··· 163 161 } 164 162 165 163 port = &dev->port[vport_index]; 166 - write_lock(&port->roce.netdev_lock); 167 - port->roce.netdev = NULL; 168 - write_unlock(&port->roce.netdev_lock); 164 + 165 + ib_device_set_netdev(&dev->ib_dev, NULL, vport_index + 1); 169 166 rep->rep_data[REP_IB].priv = NULL; 170 167 port->rep = NULL; 171 168
+129 -54
drivers/infiniband/hw/mlx5/main.c
··· 147 147 148 148 if (upper && port->rep->vport == MLX5_VPORT_UPLINK) 149 149 continue; 150 - 151 - read_lock(&port->roce.netdev_lock); 152 - rep_ndev = mlx5_ib_get_rep_netdev(port->rep->esw, 153 - port->rep->vport); 154 - if (rep_ndev == ndev) { 155 - read_unlock(&port->roce.netdev_lock); 150 + rep_ndev = ib_device_get_netdev(&dev->ib_dev, i + 1); 151 + if (rep_ndev && rep_ndev == ndev) { 152 + dev_put(rep_ndev); 156 153 *port_num = i + 1; 157 154 return &port->roce; 158 155 } 159 - read_unlock(&port->roce.netdev_lock); 156 + 157 + dev_put(rep_ndev); 158 + } 159 + 160 + return NULL; 161 + } 162 + 163 + static bool mlx5_netdev_send_event(struct mlx5_ib_dev *dev, 164 + struct net_device *ndev, 165 + struct net_device *upper, 166 + struct net_device *ib_ndev) 167 + { 168 + if (!dev->ib_active) 169 + return false; 170 + 171 + /* Event is about our upper device */ 172 + if (upper == ndev) 173 + return true; 174 + 175 + /* RDMA device is not in lag and not in switchdev */ 176 + if (!dev->is_rep && !upper && ndev == ib_ndev) 177 + return true; 178 + 179 + /* RDMA devie is in switchdev */ 180 + if (dev->is_rep && ndev == ib_ndev) 181 + return true; 182 + 183 + return false; 184 + } 185 + 186 + static struct net_device *mlx5_ib_get_rep_uplink_netdev(struct mlx5_ib_dev *ibdev) 187 + { 188 + struct mlx5_ib_port *port; 189 + int i; 190 + 191 + for (i = 0; i < ibdev->num_ports; i++) { 192 + port = &ibdev->port[i]; 193 + if (port->rep && port->rep->vport == MLX5_VPORT_UPLINK) { 194 + return ib_device_get_netdev(&ibdev->ib_dev, i + 1); 195 + } 160 196 } 161 197 162 198 return NULL; ··· 204 168 struct mlx5_roce *roce = container_of(this, struct mlx5_roce, nb); 205 169 struct net_device *ndev = netdev_notifier_info_to_dev(ptr); 206 170 u32 port_num = roce->native_port_num; 171 + struct net_device *ib_ndev = NULL; 207 172 struct mlx5_core_dev *mdev; 208 173 struct mlx5_ib_dev *ibdev; 209 174 ··· 218 181 /* Should already be registered during the load */ 219 182 if (ibdev->is_rep) 
220 183 break; 221 - write_lock(&roce->netdev_lock); 184 + 185 + ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num); 186 + /* Exit if already registered */ 187 + if (ib_ndev) 188 + goto put_ndev; 189 + 222 190 if (ndev->dev.parent == mdev->device) 223 - roce->netdev = ndev; 224 - write_unlock(&roce->netdev_lock); 191 + ib_device_set_netdev(&ibdev->ib_dev, ndev, port_num); 225 192 break; 226 193 227 194 case NETDEV_UNREGISTER: 228 195 /* In case of reps, ib device goes away before the netdevs */ 229 - write_lock(&roce->netdev_lock); 230 - if (roce->netdev == ndev) 231 - roce->netdev = NULL; 232 - write_unlock(&roce->netdev_lock); 233 - break; 196 + if (ibdev->is_rep) 197 + break; 198 + ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num); 199 + if (ib_ndev == ndev) 200 + ib_device_set_netdev(&ibdev->ib_dev, NULL, port_num); 201 + goto put_ndev; 234 202 235 203 case NETDEV_CHANGE: 236 204 case NETDEV_UP: 237 205 case NETDEV_DOWN: { 238 206 struct net_device *upper = NULL; 239 207 240 - if (mlx5_lag_is_roce(mdev)) { 208 + if (mlx5_lag_is_roce(mdev) || mlx5_lag_is_sriov(mdev)) { 241 209 struct net_device *lag_ndev; 242 210 243 - lag_ndev = mlx5_lag_get_roce_netdev(mdev); 211 + if(mlx5_lag_is_roce(mdev)) 212 + lag_ndev = ib_device_get_netdev(&ibdev->ib_dev, 1); 213 + else /* sriov lag */ 214 + lag_ndev = mlx5_ib_get_rep_uplink_netdev(ibdev); 215 + 244 216 if (lag_ndev) { 245 217 upper = netdev_master_upper_dev_get(lag_ndev); 246 218 dev_put(lag_ndev); ··· 262 216 roce = mlx5_get_rep_roce(ibdev, ndev, upper, &port_num); 263 217 if (!roce) 264 218 return NOTIFY_DONE; 265 - if ((upper == ndev || 266 - ((!upper || ibdev->is_rep) && ndev == roce->netdev)) && 267 - ibdev->ib_active) { 219 + 220 + ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num); 221 + 222 + if (mlx5_netdev_send_event(ibdev, ndev, upper, ib_ndev)) { 268 223 struct ib_event ibev = { }; 269 224 enum ib_port_state port_state; 270 225 271 226 if (get_port_state(&ibdev->ib_dev, port_num, 272 227 
&port_state)) 273 - goto done; 228 + goto put_ndev; 274 229 275 230 if (roce->last_port_state == port_state) 276 - goto done; 231 + goto put_ndev; 277 232 278 233 roce->last_port_state = port_state; 279 234 ibev.device = &ibdev->ib_dev; ··· 283 236 else if (port_state == IB_PORT_ACTIVE) 284 237 ibev.event = IB_EVENT_PORT_ACTIVE; 285 238 else 286 - goto done; 239 + goto put_ndev; 287 240 288 241 ibev.element.port_num = port_num; 289 242 ib_dispatch_event(&ibev); ··· 294 247 default: 295 248 break; 296 249 } 250 + put_ndev: 251 + dev_put(ib_ndev); 297 252 done: 298 253 mlx5_ib_put_native_port_mdev(ibdev, port_num); 299 254 return NOTIFY_DONE; 300 - } 301 - 302 - static struct net_device *mlx5_ib_get_netdev(struct ib_device *device, 303 - u32 port_num) 304 - { 305 - struct mlx5_ib_dev *ibdev = to_mdev(device); 306 - struct net_device *ndev; 307 - struct mlx5_core_dev *mdev; 308 - 309 - mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL); 310 - if (!mdev) 311 - return NULL; 312 - 313 - if (mlx5_lag_is_roce(mdev)) { 314 - ndev = mlx5_lag_get_roce_netdev(mdev); 315 - goto out; 316 - } 317 - 318 - /* Ensure ndev does not disappear before we invoke dev_hold() 319 - */ 320 - read_lock(&ibdev->port[port_num - 1].roce.netdev_lock); 321 - ndev = ibdev->port[port_num - 1].roce.netdev; 322 - dev_hold(ndev); 323 - read_unlock(&ibdev->port[port_num - 1].roce.netdev_lock); 324 - 325 - out: 326 - mlx5_ib_put_native_port_mdev(ibdev, port_num); 327 - return ndev; 328 255 } 329 256 330 257 struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev, ··· 575 554 if (!put_mdev) 576 555 goto out; 577 556 578 - ndev = mlx5_ib_get_netdev(device, port_num); 557 + ndev = ib_device_get_netdev(device, port_num); 579 558 if (!ndev) 580 559 goto out; 581 560 ··· 3206 3185 fw_rev_sub(dev->mdev)); 3207 3186 } 3208 3187 3188 + static int lag_event(struct notifier_block *nb, unsigned long event, void *data) 3189 + { 3190 + struct mlx5_ib_dev *dev = container_of(nb, struct 
mlx5_ib_dev, 3191 + lag_events); 3192 + struct mlx5_core_dev *mdev = dev->mdev; 3193 + struct mlx5_ib_port *port; 3194 + struct net_device *ndev; 3195 + int i, err; 3196 + int portnum; 3197 + 3198 + portnum = 0; 3199 + switch (event) { 3200 + case MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE: 3201 + ndev = data; 3202 + if (ndev) { 3203 + if (!mlx5_lag_is_roce(mdev)) { 3204 + // sriov lag 3205 + for (i = 0; i < dev->num_ports; i++) { 3206 + port = &dev->port[i]; 3207 + if (port->rep && port->rep->vport == 3208 + MLX5_VPORT_UPLINK) { 3209 + portnum = i; 3210 + break; 3211 + } 3212 + } 3213 + } 3214 + err = ib_device_set_netdev(&dev->ib_dev, ndev, 3215 + portnum + 1); 3216 + dev_put(ndev); 3217 + if (err) 3218 + return err; 3219 + /* Rescan gids after new netdev assignment */ 3220 + rdma_roce_rescan_device(&dev->ib_dev); 3221 + } 3222 + break; 3223 + default: 3224 + return NOTIFY_DONE; 3225 + } 3226 + return NOTIFY_OK; 3227 + } 3228 + 3229 + static void mlx5e_lag_event_register(struct mlx5_ib_dev *dev) 3230 + { 3231 + dev->lag_events.notifier_call = lag_event; 3232 + blocking_notifier_chain_register(&dev->mdev->priv.lag_nh, 3233 + &dev->lag_events); 3234 + } 3235 + 3236 + static void mlx5e_lag_event_unregister(struct mlx5_ib_dev *dev) 3237 + { 3238 + blocking_notifier_chain_unregister(&dev->mdev->priv.lag_nh, 3239 + &dev->lag_events); 3240 + } 3241 + 3209 3242 static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev) 3210 3243 { 3211 3244 struct mlx5_core_dev *mdev = dev->mdev; ··· 3281 3206 goto err_destroy_vport_lag; 3282 3207 } 3283 3208 3209 + mlx5e_lag_event_register(dev); 3284 3210 dev->flow_db->lag_demux_ft = ft; 3285 3211 dev->lag_ports = mlx5_lag_get_num_ports(mdev); 3286 3212 dev->lag_active = true; ··· 3299 3223 if (dev->lag_active) { 3300 3224 dev->lag_active = false; 3301 3225 3226 + mlx5e_lag_event_unregister(dev); 3302 3227 mlx5_destroy_flow_table(dev->flow_db->lag_demux_ft); 3303 3228 dev->flow_db->lag_demux_ft = NULL; 3304 3229 ··· 4016 3939 4017 
3940 for (i = 0; i < dev->num_ports; i++) { 4018 3941 spin_lock_init(&dev->port[i].mp.mpi_lock); 4019 - rwlock_init(&dev->port[i].roce.netdev_lock); 4020 3942 dev->port[i].roce.dev = dev; 4021 3943 dev->port[i].roce.native_port_num = i + 1; 4022 3944 dev->port[i].roce.last_port_state = IB_PORT_DOWN; ··· 4280 4204 .create_wq = mlx5_ib_create_wq, 4281 4205 .destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table, 4282 4206 .destroy_wq = mlx5_ib_destroy_wq, 4283 - .get_netdev = mlx5_ib_get_netdev, 4284 4207 .modify_wq = mlx5_ib_modify_wq, 4285 4208 4286 4209 INIT_RDMA_OBJ_SIZE(ib_rwq_ind_table, mlx5_ib_rwq_ind_table,
+1 -2
drivers/infiniband/hw/mlx5/mlx5_ib.h
··· 888 888 /* Protect mlx5_ib_get_netdev from invoking dev_hold() with a NULL 889 889 * netdev pointer 890 890 */ 891 - rwlock_t netdev_lock; 892 - struct net_device *netdev; 893 891 struct notifier_block nb; 894 892 struct netdev_net_notifier nn; 895 893 struct notifier_block mdev_nb; ··· 1136 1138 /* protect accessing data_direct_dev */ 1137 1139 struct mutex data_direct_lock; 1138 1140 struct notifier_block mdev_events; 1141 + struct notifier_block lag_events; 1139 1142 int num_ports; 1140 1143 /* serialize update of capability mask 1141 1144 */
+41 -35
drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
··· 445 445 return mlx5_cmd_modify_lag(dev0, ldev->ports, ports); 446 446 } 447 447 448 + static struct net_device *mlx5_lag_active_backup_get_netdev(struct mlx5_core_dev *dev) 449 + { 450 + struct net_device *ndev = NULL; 451 + struct mlx5_lag *ldev; 452 + unsigned long flags; 453 + int i; 454 + 455 + spin_lock_irqsave(&lag_lock, flags); 456 + ldev = mlx5_lag_dev(dev); 457 + 458 + if (!ldev) 459 + goto unlock; 460 + 461 + for (i = 0; i < ldev->ports; i++) 462 + if (ldev->tracker.netdev_state[i].tx_enabled) 463 + ndev = ldev->pf[i].netdev; 464 + if (!ndev) 465 + ndev = ldev->pf[ldev->ports - 1].netdev; 466 + 467 + if (ndev) 468 + dev_hold(ndev); 469 + 470 + unlock: 471 + spin_unlock_irqrestore(&lag_lock, flags); 472 + 473 + return ndev; 474 + } 475 + 448 476 void mlx5_modify_lag(struct mlx5_lag *ldev, 449 477 struct lag_tracker *tracker) 450 478 { ··· 505 477 } 506 478 } 507 479 508 - if (tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP && 509 - !(ldev->mode == MLX5_LAG_MODE_ROCE)) 510 - mlx5_lag_drop_rule_setup(ldev, tracker); 480 + if (tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) { 481 + struct net_device *ndev = mlx5_lag_active_backup_get_netdev(dev0); 482 + 483 + if(!(ldev->mode == MLX5_LAG_MODE_ROCE)) 484 + mlx5_lag_drop_rule_setup(ldev, tracker); 485 + /** Only sriov and roce lag should have tracker->tx_type set so 486 + * no need to check the mode 487 + */ 488 + blocking_notifier_call_chain(&dev0->priv.lag_nh, 489 + MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE, 490 + ndev); 491 + } 511 492 } 512 493 513 494 static int mlx5_lag_set_port_sel_mode_roce(struct mlx5_lag *ldev, ··· 650 613 mlx5_core_err(dev0, 651 614 "Failed to deactivate RoCE LAG; driver restart required\n"); 652 615 } 616 + BLOCKING_INIT_NOTIFIER_HEAD(&dev0->priv.lag_nh); 653 617 654 618 return err; 655 619 } ··· 1529 1491 mutex_unlock(&ldev->lock); 1530 1492 mlx5_queue_bond_work(ldev, 0); 1531 1493 } 1532 - 1533 - struct net_device *mlx5_lag_get_roce_netdev(struct 
mlx5_core_dev *dev) 1534 - { 1535 - struct net_device *ndev = NULL; 1536 - struct mlx5_lag *ldev; 1537 - unsigned long flags; 1538 - int i; 1539 - 1540 - spin_lock_irqsave(&lag_lock, flags); 1541 - ldev = mlx5_lag_dev(dev); 1542 - 1543 - if (!(ldev && __mlx5_lag_is_roce(ldev))) 1544 - goto unlock; 1545 - 1546 - if (ldev->tracker.tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) { 1547 - for (i = 0; i < ldev->ports; i++) 1548 - if (ldev->tracker.netdev_state[i].tx_enabled) 1549 - ndev = ldev->pf[i].netdev; 1550 - if (!ndev) 1551 - ndev = ldev->pf[ldev->ports - 1].netdev; 1552 - } else { 1553 - ndev = ldev->pf[MLX5_LAG_P1].netdev; 1554 - } 1555 - if (ndev) 1556 - dev_hold(ndev); 1557 - 1558 - unlock: 1559 - spin_unlock_irqrestore(&lag_lock, flags); 1560 - 1561 - return ndev; 1562 - } 1563 - EXPORT_SYMBOL(mlx5_lag_get_roce_netdev); 1564 1494 1565 1495 u8 mlx5_lag_get_slave_port(struct mlx5_core_dev *dev, 1566 1496 struct net_device *slave)
+1
include/linux/mlx5/device.h
··· 371 371 MLX5_DRIVER_EVENT_SF_PEER_DEVLINK, 372 372 MLX5_DRIVER_EVENT_AFFILIATION_DONE, 373 373 MLX5_DRIVER_EVENT_AFFILIATION_REMOVED, 374 + MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE, 374 375 }; 375 376 376 377 enum {
+1 -1
include/linux/mlx5/driver.h
··· 643 643 struct mlx5_sf_hw_table *sf_hw_table; 644 644 struct mlx5_sf_table *sf_table; 645 645 #endif 646 + struct blocking_notifier_head lag_nh; 646 647 }; 647 648 648 649 enum mlx5_device_state { ··· 1182 1181 bool mlx5_lag_is_master(struct mlx5_core_dev *dev); 1183 1182 bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev); 1184 1183 bool mlx5_lag_is_mpesw(struct mlx5_core_dev *dev); 1185 - struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev); 1186 1184 u8 mlx5_lag_get_slave_port(struct mlx5_core_dev *dev, 1187 1185 struct net_device *slave); 1188 1186 int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev,
+2
include/rdma/ib_verbs.h
··· 4453 4453 const struct sockaddr *addr); 4454 4454 int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, 4455 4455 unsigned int port); 4456 + struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, 4457 + u32 port); 4456 4458 struct ib_wq *ib_create_wq(struct ib_pd *pd, 4457 4459 struct ib_wq_init_attr *init_attr); 4458 4460 int ib_destroy_wq_user(struct ib_wq *wq, struct ib_udata *udata);