Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net/mlx5: Lag, Control MultiPort E-Switch single FDB mode

MultiPort E-Switch builds on newer hardware's capabilities and introduces
a mode where a single E-Switch is used and all the vports and physical
ports on the NIC are connected to it.

The new mode will allow in the future a decrease in the memory used by the
driver and advanced features that aren't possible today.

This represents a big change in the current E-Switch implementation in mlx5.
Currently, by default, each E-Switch manager manages its E-Switch.
Steering rules in each E-Switch can only forward traffic to the native
physical port associated with that E-Switch. While there are ways to target
non-native physical ports, for example using a bond or via special TC
rules, none of them allows a user to configure the driver
to operate in such a mode by default, nor can the driver decide
to move to this mode on its own, as it is user-configuration-driven right now.

While MultiPort E-Switch single FDB mode is the preferred mode, older
generations of ConnectX hardware couldn't support this mode so it was never
implemented. Now that there is capable hardware present, start the
transition to having this mode by default.

Introduce a devlink parameter to control MultiPort E-Switch single FDB mode.
This will allow users to select this mode on their system right now
and in the future will allow the driver to move to this mode by default.

Example:
$ devlink dev param set pci/0000:00:0b.0 name esw_multiport value 1 \
cmode runtime

Signed-off-by: Roi Dayan <roid@nvidia.com>
Reviewed-by: Maor Dickman <maord@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>

authored by

Roi Dayan and committed by
Saeed Mahameed
a32327a3 2edd9257

+101 -76
+18
Documentation/networking/devlink/mlx5.rst
··· 54 54 - Control the number of large groups (size > 1) in the FDB table. 55 55 56 56 * The default value is 15, and the range is between 1 and 1024. 57 + * - ``esw_multiport`` 58 + - Boolean 59 + - runtime 60 + - Control MultiPort E-Switch shared fdb mode. 61 + 62 + An experimental mode where a single E-Switch is used and all the vports 63 + and physical ports on the NIC are connected to it. 64 + 65 + An example is to send traffic from a VF that is created on PF0 to an 66 + uplink that is natively associated with the uplink of PF1 67 + 68 + Note: Future devices, ConnectX-8 and onward, will eventually have this 69 + as the default to allow forwarding between all NIC ports in a single 70 + E-switch environment and the dual E-switch mode will likely get 71 + deprecated. 72 + 73 + Default: disabled 74 + 57 75 58 76 The ``mlx5`` driver supports reloading via ``DEVLINK_CMD_RELOAD`` 59 77
+54
drivers/net/ethernet/mellanox/mlx5/core/devlink.c
··· 7 7 #include "fw_reset.h" 8 8 #include "fs_core.h" 9 9 #include "eswitch.h" 10 + #include "lag/lag.h" 10 11 #include "esw/qos.h" 11 12 #include "sf/dev/dev.h" 12 13 #include "sf/sf.h" ··· 438 437 return 0; 439 438 } 440 439 440 + static int mlx5_devlink_esw_multiport_set(struct devlink *devlink, u32 id, 441 + struct devlink_param_gset_ctx *ctx) 442 + { 443 + struct mlx5_core_dev *dev = devlink_priv(devlink); 444 + 445 + if (!MLX5_ESWITCH_MANAGER(dev)) 446 + return -EOPNOTSUPP; 447 + 448 + if (ctx->val.vbool) 449 + return mlx5_lag_mpesw_enable(dev); 450 + 451 + mlx5_lag_mpesw_disable(dev); 452 + return 0; 453 + } 454 + 455 + static int mlx5_devlink_esw_multiport_get(struct devlink *devlink, u32 id, 456 + struct devlink_param_gset_ctx *ctx) 457 + { 458 + struct mlx5_core_dev *dev = devlink_priv(devlink); 459 + 460 + if (!MLX5_ESWITCH_MANAGER(dev)) 461 + return -EOPNOTSUPP; 462 + 463 + ctx->val.vbool = mlx5_lag_mpesw_is_activated(dev); 464 + return 0; 465 + } 466 + 467 + static int mlx5_devlink_esw_multiport_validate(struct devlink *devlink, u32 id, 468 + union devlink_param_value val, 469 + struct netlink_ext_ack *extack) 470 + { 471 + struct mlx5_core_dev *dev = devlink_priv(devlink); 472 + 473 + if (!MLX5_ESWITCH_MANAGER(dev)) { 474 + NL_SET_ERR_MSG_MOD(extack, "E-Switch is unsupported"); 475 + return -EOPNOTSUPP; 476 + } 477 + 478 + if (mlx5_eswitch_mode(dev) != MLX5_ESWITCH_OFFLOADS) { 479 + NL_SET_ERR_MSG_MOD(extack, 480 + "E-Switch must be in switchdev mode"); 481 + return -EBUSY; 482 + } 483 + 484 + return 0; 485 + } 486 + 441 487 #endif 442 488 443 489 static int mlx5_devlink_eq_depth_validate(struct devlink *devlink, u32 id, ··· 503 455 BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), 504 456 NULL, NULL, 505 457 mlx5_devlink_large_group_num_validate), 458 + DEVLINK_PARAM_DRIVER(MLX5_DEVLINK_PARAM_ID_ESW_MULTIPORT, 459 + "esw_multiport", DEVLINK_PARAM_TYPE_BOOL, 460 + BIT(DEVLINK_PARAM_CMODE_RUNTIME), 461 + mlx5_devlink_esw_multiport_get, 462 + 
mlx5_devlink_esw_multiport_set, 463 + mlx5_devlink_esw_multiport_validate), 506 464 #endif 507 465 DEVLINK_PARAM_GENERIC(IO_EQ_SIZE, BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), 508 466 NULL, NULL, mlx5_devlink_eq_depth_validate),
+1
drivers/net/ethernet/mellanox/mlx5/core/devlink.h
··· 11 11 MLX5_DEVLINK_PARAM_ID_FLOW_STEERING_MODE, 12 12 MLX5_DEVLINK_PARAM_ID_ESW_LARGE_GROUP_NUM, 13 13 MLX5_DEVLINK_PARAM_ID_ESW_PORT_METADATA, 14 + MLX5_DEVLINK_PARAM_ID_ESW_MULTIPORT, 14 15 }; 15 16 16 17 struct mlx5_trap_ctx {
-9
drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred.c
··· 216 216 struct net_device *uplink_dev; 217 217 struct mlx5e_priv *out_priv; 218 218 struct mlx5_eswitch *esw; 219 - bool is_uplink_rep; 220 219 int *ifindexes; 221 220 int if_count; 222 221 int err; ··· 230 231 231 232 parse_state->ifindexes[if_count] = out_dev->ifindex; 232 233 parse_state->if_count++; 233 - is_uplink_rep = mlx5e_eswitch_uplink_rep(out_dev); 234 234 235 235 if (mlx5_lag_mpesw_do_mirred(priv->mdev, out_dev, extack)) 236 236 return -EOPNOTSUPP; ··· 272 274 rpriv = out_priv->ppriv; 273 275 esw_attr->dests[esw_attr->out_count].rep = rpriv->rep; 274 276 esw_attr->dests[esw_attr->out_count].mdev = out_priv->mdev; 275 - 276 - /* If output device is bond master then rules are not explicit 277 - * so we don't attempt to count them. 278 - */ 279 - if (is_uplink_rep && MLX5_CAP_PORT_SELECTION(priv->mdev, port_select_flow_table) && 280 - MLX5_CAP_GEN(priv->mdev, create_lag_when_not_master_up)) 281 - attr->lag.count = true; 282 277 283 278 esw_attr->out_count++; 284 279
+2 -20
drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
··· 2152 2152 free_branch_attr(flow, attr->branch_true); 2153 2153 free_branch_attr(flow, attr->branch_false); 2154 2154 2155 - if (flow->attr->lag.count) 2156 - mlx5_lag_del_mpesw_rule(esw->dev); 2157 - 2158 2155 kvfree(attr->esw_attr->rx_tun_attr); 2159 2156 kvfree(attr->parse_attr); 2160 2157 kfree(flow->attr); ··· 4311 4314 4312 4315 static bool is_multiport_eligible(struct mlx5e_priv *priv, struct net_device *out_dev) 4313 4316 { 4314 - if (same_hw_reps(priv, out_dev) && 4315 - MLX5_CAP_PORT_SELECTION(priv->mdev, port_select_flow_table) && 4316 - MLX5_CAP_GEN(priv->mdev, create_lag_when_not_master_up)) 4317 - return true; 4318 - 4319 - return false; 4317 + return same_hw_reps(priv, out_dev) && mlx5_lag_mpesw_is_activated(priv->mdev); 4320 4318 } 4321 4319 4322 4320 bool mlx5e_is_valid_eswitch_fwd_dev(struct mlx5e_priv *priv, ··· 4613 4621 struct mlx5_core_dev *in_mdev) 4614 4622 { 4615 4623 struct flow_rule *rule = flow_cls_offload_flow_rule(f); 4616 - struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; 4617 4624 struct netlink_ext_ack *extack = f->common.extack; 4618 4625 struct mlx5e_tc_flow_parse_attr *parse_attr; 4619 4626 struct mlx5e_tc_flow *flow; ··· 4645 4654 if (err) 4646 4655 goto err_free; 4647 4656 4648 - if (flow->attr->lag.count) { 4649 - err = mlx5_lag_add_mpesw_rule(esw->dev); 4650 - if (err) 4651 - goto err_free; 4652 - } 4653 - 4654 4657 err = mlx5e_tc_add_fdb_flow(priv, flow, extack); 4655 4658 complete_all(&flow->init_done); 4656 4659 if (err) { 4657 4660 if (!(err == -ENETUNREACH && mlx5_lag_is_multipath(in_mdev))) 4658 - goto err_lag; 4661 + goto err_free; 4659 4662 4660 4663 add_unready_flow(flow); 4661 4664 } 4662 4665 4663 4666 return flow; 4664 4667 4665 - err_lag: 4666 - if (flow->attr->lag.count) 4667 - mlx5_lag_del_mpesw_rule(esw->dev); 4668 4668 err_free: 4669 4669 mlx5e_flow_put(priv, flow); 4670 4670 out:
-6
drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
··· 92 92 u32 exe_aso_type; 93 93 struct list_head list; 94 94 struct mlx5e_post_act_handle *post_act_handle; 95 - struct { 96 - /* Indicate whether the parsed flow should be counted for lag mode decision 97 - * making 98 - */ 99 - bool count; 100 - } lag; 101 95 struct mlx5_flow_attr *branch_true; 102 96 struct mlx5_flow_attr *branch_false; 103 97 struct mlx5_flow_attr *jumping_attr;
+1 -3
drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
··· 230 230 mlx5_lag_mp_cleanup(ldev); 231 231 cancel_delayed_work_sync(&ldev->bond_work); 232 232 destroy_workqueue(ldev->wq); 233 - mlx5_lag_mpesw_cleanup(ldev); 234 233 mutex_destroy(&ldev->lock); 235 234 kfree(ldev); 236 235 } ··· 275 276 mlx5_core_err(dev, "Failed to init multipath lag err=%d\n", 276 277 err); 277 278 278 - mlx5_lag_mpesw_init(ldev); 279 279 ldev->ports = MLX5_CAP_GEN(dev, num_lag_ports); 280 280 ldev->buckets = 1; 281 281 ··· 686 688 } 687 689 688 690 #define MLX5_LAG_OFFLOADS_SUPPORTED_PORTS 2 689 - static bool mlx5_lag_check_prereq(struct mlx5_lag *ldev) 691 + bool mlx5_lag_check_prereq(struct mlx5_lag *ldev) 690 692 { 691 693 #ifdef CONFIG_MLX5_ESWITCH 692 694 struct mlx5_core_dev *dev;
+1
drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
··· 102 102 return test_bit(MLX5_LAG_FLAG_NDEVS_READY, &ldev->state_flags); 103 103 } 104 104 105 + bool mlx5_lag_check_prereq(struct mlx5_lag *ldev); 105 106 void mlx5_modify_lag(struct mlx5_lag *ldev, 106 107 struct lag_tracker *tracker); 107 108 int mlx5_activate_lag(struct mlx5_lag *ldev,
+22 -28
drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c
··· 7 7 #include "eswitch.h" 8 8 #include "lib/mlx5.h" 9 9 10 - static int add_mpesw_rule(struct mlx5_lag *ldev) 10 + static int enable_mpesw(struct mlx5_lag *ldev) 11 11 { 12 12 struct mlx5_core_dev *dev = ldev->pf[MLX5_LAG_P1].dev; 13 13 int err; 14 14 15 - if (atomic_add_return(1, &ldev->lag_mpesw.mpesw_rule_count) != 1) 16 - return 0; 15 + if (ldev->mode != MLX5_LAG_MODE_NONE) 16 + return -EINVAL; 17 17 18 - if (ldev->mode != MLX5_LAG_MODE_NONE) { 19 - err = -EINVAL; 20 - goto out_err; 21 - } 18 + if (mlx5_eswitch_mode(dev) != MLX5_ESWITCH_OFFLOADS || 19 + !MLX5_CAP_PORT_SELECTION(dev, port_select_flow_table) || 20 + !MLX5_CAP_GEN(dev, create_lag_when_not_master_up) || 21 + !mlx5_lag_check_prereq(ldev)) 22 + return -EOPNOTSUPP; 22 23 23 24 err = mlx5_activate_lag(ldev, NULL, MLX5_LAG_MODE_MPESW, false); 24 25 if (err) { ··· 30 29 return 0; 31 30 32 31 out_err: 33 - atomic_dec(&ldev->lag_mpesw.mpesw_rule_count); 34 32 return err; 35 33 } 36 34 37 - static void del_mpesw_rule(struct mlx5_lag *ldev) 35 + static void disable_mpesw(struct mlx5_lag *ldev) 38 36 { 39 - if (!atomic_dec_return(&ldev->lag_mpesw.mpesw_rule_count) && 40 - ldev->mode == MLX5_LAG_MODE_MPESW) 37 + if (ldev->mode == MLX5_LAG_MODE_MPESW) 41 38 mlx5_disable_lag(ldev); 42 39 } 43 40 ··· 45 46 struct mlx5_lag *ldev = mpesww->lag; 46 47 47 48 mutex_lock(&ldev->lock); 48 - if (mpesww->op == MLX5_MPESW_OP_ENABLE) 49 - mpesww->result = add_mpesw_rule(ldev); 50 - else if (mpesww->op == MLX5_MPESW_OP_DISABLE) 51 - del_mpesw_rule(ldev); 52 - mutex_unlock(&ldev->lock); 49 + if (ldev->mode_changes_in_progress) { 50 + mpesww->result = -EAGAIN; 51 + goto unlock; 52 + } 53 53 54 + if (mpesww->op == MLX5_MPESW_OP_ENABLE) 55 + mpesww->result = enable_mpesw(ldev); 56 + else if (mpesww->op == MLX5_MPESW_OP_DISABLE) 57 + disable_mpesw(ldev); 58 + unlock: 59 + mutex_unlock(&ldev->lock); 54 60 complete(&mpesww->comp); 55 61 } 56 62 ··· 90 86 return err; 91 87 } 92 88 93 - void mlx5_lag_del_mpesw_rule(struct 
mlx5_core_dev *dev) 89 + void mlx5_lag_mpesw_disable(struct mlx5_core_dev *dev) 94 90 { 95 91 mlx5_lag_mpesw_queue_work(dev, MLX5_MPESW_OP_DISABLE); 96 92 } 97 93 98 - int mlx5_lag_add_mpesw_rule(struct mlx5_core_dev *dev) 94 + int mlx5_lag_mpesw_enable(struct mlx5_core_dev *dev) 99 95 { 100 96 return mlx5_lag_mpesw_queue_work(dev, MLX5_MPESW_OP_ENABLE); 101 97 } ··· 121 117 struct mlx5_lag *ldev = mlx5_lag_dev(dev); 122 118 123 119 return ldev && ldev->mode == MLX5_LAG_MODE_MPESW; 124 - } 125 - 126 - void mlx5_lag_mpesw_init(struct mlx5_lag *ldev) 127 - { 128 - atomic_set(&ldev->lag_mpesw.mpesw_rule_count, 0); 129 - } 130 - 131 - void mlx5_lag_mpesw_cleanup(struct mlx5_lag *ldev) 132 - { 133 - WARN_ON(atomic_read(&ldev->lag_mpesw.mpesw_rule_count)); 134 120 }
+2 -10
drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h
··· 9 9 10 10 struct lag_mpesw { 11 11 struct work_struct mpesw_work; 12 - atomic_t mpesw_rule_count; 13 12 }; 14 13 15 14 enum mpesw_op { ··· 28 29 struct net_device *out_dev, 29 30 struct netlink_ext_ack *extack); 30 31 bool mlx5_lag_mpesw_is_activated(struct mlx5_core_dev *dev); 31 - void mlx5_lag_del_mpesw_rule(struct mlx5_core_dev *dev); 32 - int mlx5_lag_add_mpesw_rule(struct mlx5_core_dev *dev); 33 - #if IS_ENABLED(CONFIG_MLX5_ESWITCH) 34 - void mlx5_lag_mpesw_init(struct mlx5_lag *ldev); 35 - void mlx5_lag_mpesw_cleanup(struct mlx5_lag *ldev); 36 - #else 37 - static inline void mlx5_lag_mpesw_init(struct mlx5_lag *ldev) {} 38 - static inline void mlx5_lag_mpesw_cleanup(struct mlx5_lag *ldev) {} 39 - #endif 32 + void mlx5_lag_mpesw_disable(struct mlx5_core_dev *dev); 33 + int mlx5_lag_mpesw_enable(struct mlx5_core_dev *dev); 40 34 41 35 #endif /* __MLX5_LAG_MPESW_H__ */