Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'mlxsw-new-reset-flow'

Petr Machata says:

====================
mlxsw: Add support for new reset flow

Ido Schimmel writes:

This patchset changes mlxsw to issue a PCI reset during probe and
devlink reload so that the PCI firmware could be upgraded without a
reboot.

Unlike the old version of this patchset [1], in this version the driver
no longer tries to issue a PCI reset by triggering a PCI link toggle on
its own, but instead calls the PCI core to issue the reset.

The PCI APIs require the device lock to be held, which is why patches #1-#6
add the devlink infrastructure needed to hold the device lock during reload
operations.

Patch #7 adds a reset method quirk for NVIDIA Spectrum devices.

Patch #8 adds a debug level print in PCI core so that device ready delay
will be printed even if it is shorter than one second.

Patches #9-#11 are straightforward preparations in mlxsw.

Patch #12 finally implements the new reset flow in mlxsw.

Patch #13 adds PCI reset handlers in mlxsw to prevent user space from
resetting the device from underneath an unaware driver. Instead, the
driver is gracefully de-initialized before the PCI reset and then
initialized again after it.

Patch #14 adds a PCI reset selftest to make sure this code path does not
regress.

[1] https://lore.kernel.org/netdev/cover.1679502371.git.petrm@nvidia.com/
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+247 -30
+2 -2
Documentation/netlink/specs/devlink.yaml
··· 1484 1484 dont-validate: [ strict ] 1485 1485 flags: [ admin-perm ] 1486 1486 do: 1487 - pre: devlink-nl-pre-doit 1488 - post: devlink-nl-post-doit 1487 + pre: devlink-nl-pre-doit-dev-lock 1488 + post: devlink-nl-post-doit-dev-lock 1489 1489 request: 1490 1490 attributes: 1491 1491 - bus-name
+83 -7
drivers/net/ethernet/mellanox/mlxsw/pci.c
··· 130 130 const struct pci_device_id *id; 131 131 enum mlxsw_pci_cqe_v max_cqe_ver; /* Maximal supported CQE version */ 132 132 u8 num_sdq_cqs; /* Number of CQs used for SDQs */ 133 + bool skip_reset; 133 134 }; 134 135 135 136 static void mlxsw_pci_queue_tasklet_schedule(struct mlxsw_pci_queue *q) ··· 1477 1476 return -EBUSY; 1478 1477 } 1479 1478 1480 - static int mlxsw_pci_sw_reset(struct mlxsw_pci *mlxsw_pci, 1481 - const struct pci_device_id *id) 1479 + static int mlxsw_pci_reset_at_pci_disable(struct mlxsw_pci *mlxsw_pci) 1482 1480 { 1483 1481 struct pci_dev *pdev = mlxsw_pci->pdev; 1484 1482 char mrsr_pl[MLXSW_REG_MRSR_LEN]; 1483 + int err; 1484 + 1485 + mlxsw_reg_mrsr_pack(mrsr_pl, 1486 + MLXSW_REG_MRSR_COMMAND_RESET_AT_PCI_DISABLE); 1487 + err = mlxsw_reg_write(mlxsw_pci->core, MLXSW_REG(mrsr), mrsr_pl); 1488 + if (err) 1489 + return err; 1490 + 1491 + device_lock_assert(&pdev->dev); 1492 + 1493 + pci_cfg_access_lock(pdev); 1494 + pci_save_state(pdev); 1495 + 1496 + err = __pci_reset_function_locked(pdev); 1497 + if (err) 1498 + pci_err(pdev, "PCI function reset failed with %d\n", err); 1499 + 1500 + pci_restore_state(pdev); 1501 + pci_cfg_access_unlock(pdev); 1502 + 1503 + return err; 1504 + } 1505 + 1506 + static int mlxsw_pci_reset_sw(struct mlxsw_pci *mlxsw_pci) 1507 + { 1508 + char mrsr_pl[MLXSW_REG_MRSR_LEN]; 1509 + 1510 + mlxsw_reg_mrsr_pack(mrsr_pl, MLXSW_REG_MRSR_COMMAND_SOFTWARE_RESET); 1511 + return mlxsw_reg_write(mlxsw_pci->core, MLXSW_REG(mrsr), mrsr_pl); 1512 + } 1513 + 1514 + static int 1515 + mlxsw_pci_reset(struct mlxsw_pci *mlxsw_pci, const struct pci_device_id *id) 1516 + { 1517 + struct pci_dev *pdev = mlxsw_pci->pdev; 1518 + char mcam_pl[MLXSW_REG_MCAM_LEN]; 1519 + bool pci_reset_supported; 1485 1520 u32 sys_status; 1486 1521 int err; 1487 1522 ··· 1528 1491 return err; 1529 1492 } 1530 1493 1531 - mlxsw_reg_mrsr_pack(mrsr_pl); 1532 - err = mlxsw_reg_write(mlxsw_pci->core, MLXSW_REG(mrsr), mrsr_pl); 1494 + /* PCI core already issued 
a PCI reset, do not issue another reset. */ 1495 + if (mlxsw_pci->skip_reset) 1496 + return 0; 1497 + 1498 + mlxsw_reg_mcam_pack(mcam_pl, 1499 + MLXSW_REG_MCAM_FEATURE_GROUP_ENHANCED_FEATURES); 1500 + err = mlxsw_reg_query(mlxsw_pci->core, MLXSW_REG(mcam), mcam_pl); 1533 1501 if (err) 1534 1502 return err; 1503 + 1504 + mlxsw_reg_mcam_unpack(mcam_pl, MLXSW_REG_MCAM_PCI_RESET, 1505 + &pci_reset_supported); 1506 + 1507 + if (pci_reset_supported) { 1508 + pci_dbg(pdev, "Starting PCI reset flow\n"); 1509 + err = mlxsw_pci_reset_at_pci_disable(mlxsw_pci); 1510 + } else { 1511 + pci_dbg(pdev, "Starting software reset flow\n"); 1512 + err = mlxsw_pci_reset_sw(mlxsw_pci); 1513 + } 1535 1514 1536 1515 err = mlxsw_pci_sys_ready_wait(mlxsw_pci, id, &sys_status); 1537 1516 if (err) { ··· 1590 1537 if (!mbox) 1591 1538 return -ENOMEM; 1592 1539 1593 - err = mlxsw_pci_sw_reset(mlxsw_pci, mlxsw_pci->id); 1540 + err = mlxsw_pci_reset(mlxsw_pci, mlxsw_pci->id); 1594 1541 if (err) 1595 - goto err_sw_reset; 1542 + goto err_reset; 1596 1543 1597 1544 err = mlxsw_pci_alloc_irq_vectors(mlxsw_pci); 1598 1545 if (err < 0) { ··· 1725 1672 err_query_fw: 1726 1673 mlxsw_pci_free_irq_vectors(mlxsw_pci); 1727 1674 err_alloc_irq: 1728 - err_sw_reset: 1675 + err_reset: 1729 1676 mbox_put: 1730 1677 mlxsw_cmd_mbox_free(mbox); 1731 1678 return err; ··· 2112 2059 kfree(mlxsw_pci); 2113 2060 } 2114 2061 2062 + static void mlxsw_pci_reset_prepare(struct pci_dev *pdev) 2063 + { 2064 + struct mlxsw_pci *mlxsw_pci = pci_get_drvdata(pdev); 2065 + 2066 + mlxsw_core_bus_device_unregister(mlxsw_pci->core, false); 2067 + } 2068 + 2069 + static void mlxsw_pci_reset_done(struct pci_dev *pdev) 2070 + { 2071 + struct mlxsw_pci *mlxsw_pci = pci_get_drvdata(pdev); 2072 + 2073 + mlxsw_pci->skip_reset = true; 2074 + mlxsw_core_bus_device_register(&mlxsw_pci->bus_info, &mlxsw_pci_bus, 2075 + mlxsw_pci, false, NULL, NULL); 2076 + mlxsw_pci->skip_reset = false; 2077 + } 2078 + 2079 + static const struct 
pci_error_handlers mlxsw_pci_err_handler = { 2080 + .reset_prepare = mlxsw_pci_reset_prepare, 2081 + .reset_done = mlxsw_pci_reset_done, 2082 + }; 2083 + 2115 2084 int mlxsw_pci_driver_register(struct pci_driver *pci_driver) 2116 2085 { 2117 2086 pci_driver->probe = mlxsw_pci_probe; 2118 2087 pci_driver->remove = mlxsw_pci_remove; 2119 2088 pci_driver->shutdown = mlxsw_pci_remove; 2089 + pci_driver->err_handler = &mlxsw_pci_err_handler; 2120 2090 return pci_register_driver(pci_driver); 2121 2091 } 2122 2092 EXPORT_SYMBOL(mlxsw_pci_driver_register);
+14 -2
drivers/net/ethernet/mellanox/mlxsw/reg.h
··· 10122 10122 10123 10123 MLXSW_REG_DEFINE(mrsr, MLXSW_REG_MRSR_ID, MLXSW_REG_MRSR_LEN); 10124 10124 10125 + enum mlxsw_reg_mrsr_command { 10126 + /* Switch soft reset, does not reset PCI firmware. */ 10127 + MLXSW_REG_MRSR_COMMAND_SOFTWARE_RESET = 1, 10128 + /* Reset will be done when PCI link will be disabled. 10129 + * This command will reset PCI firmware also. 10130 + */ 10131 + MLXSW_REG_MRSR_COMMAND_RESET_AT_PCI_DISABLE = 6, 10132 + }; 10133 + 10125 10134 /* reg_mrsr_command 10126 10135 * Reset/shutdown command 10127 10136 * 0 - do nothing ··· 10139 10130 */ 10140 10131 MLXSW_ITEM32(reg, mrsr, command, 0x00, 0, 4); 10141 10132 10142 - static inline void mlxsw_reg_mrsr_pack(char *payload) 10133 + static inline void mlxsw_reg_mrsr_pack(char *payload, 10134 + enum mlxsw_reg_mrsr_command command) 10143 10135 { 10144 10136 MLXSW_REG_ZERO(mrsr, payload); 10145 - mlxsw_reg_mrsr_command_set(payload, 1); 10137 + mlxsw_reg_mrsr_command_set(payload, command); 10146 10138 } 10147 10139 10148 10140 /* MLCR - Management LED Control Register ··· 10594 10584 enum mlxsw_reg_mcam_mng_feature_cap_mask_bits { 10595 10585 /* If set, MCIA supports 128 bytes payloads. Otherwise, 48 bytes. */ 10596 10586 MLXSW_REG_MCAM_MCIA_128B = 34, 10587 + /* If set, MRSR.command=6 is supported. */ 10588 + MLXSW_REG_MCAM_PCI_RESET = 48, 10597 10589 }; 10598 10590 10599 10591 #define MLXSW_REG_BYTES_PER_DWORD 0x4
+3
drivers/pci/pci.c
··· 1219 1219 if (delay > PCI_RESET_WAIT) 1220 1220 pci_info(dev, "ready %dms after %s\n", delay - 1, 1221 1221 reset_type); 1222 + else 1223 + pci_dbg(dev, "ready %dms after %s\n", delay - 1, 1224 + reset_type); 1222 1225 1223 1226 return 0; 1224 1227 }
+13
drivers/pci/quirks.c
··· 3787 3787 PCI_CLASS_DISPLAY_VGA, 8, quirk_no_pm_reset); 3788 3788 3789 3789 /* 3790 + * Spectrum-{1,2,3,4} devices report that a D3hot->D0 transition causes a reset 3791 + * (i.e., they advertise NoSoftRst-). However, this transition does not have 3792 + * any effect on the device: It continues to be operational and network ports 3793 + * remain up. Advertising this support makes it seem as if a PM reset is viable 3794 + * for these devices. Mark it as unavailable to skip it when testing reset 3795 + * methods. 3796 + */ 3797 + DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MELLANOX, 0xcb84, quirk_no_pm_reset); 3798 + DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MELLANOX, 0xcf6c, quirk_no_pm_reset); 3799 + DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MELLANOX, 0xcf70, quirk_no_pm_reset); 3800 + DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MELLANOX, 0xcf80, quirk_no_pm_reset); 3801 + 3802 + /* 3790 3803 * Thunderbolt controllers with broken MSI hotplug signaling: 3791 3804 * Entire 1st generation (Light Ridge, Eagle Ridge, Light Peak) and part 3792 3805 * of the 2nd generation (Cactus Ridge 4C up to revision 1, Port Ridge).
+2 -2
net/devlink/core.c
··· 503 503 * all devlink instances from this namespace into init_net. 504 504 */ 505 505 devlinks_xa_for_each_registered_get(net, index, devlink) { 506 - devl_lock(devlink); 506 + devl_dev_lock(devlink, true); 507 507 err = 0; 508 508 if (devl_is_registered(devlink)) 509 509 err = devlink_reload(devlink, &init_net, 510 510 DEVLINK_RELOAD_ACTION_DRIVER_REINIT, 511 511 DEVLINK_RELOAD_LIMIT_UNSPEC, 512 512 &actions_performed, NULL); 513 - devl_unlock(devlink); 513 + devl_dev_unlock(devlink, true); 514 514 devlink_put(devlink); 515 515 if (err && err != -EOPNOTSUPP) 516 516 pr_warn("Failed to reload devlink instance into init_net\n");
+8
net/devlink/dev.c
··· 4 4 * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> 5 5 */ 6 6 7 + #include <linux/device.h> 7 8 #include <net/genetlink.h> 8 9 #include <net/sock.h> 9 10 #include "devl_internal.h" ··· 433 432 u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; 434 433 struct net *curr_net; 435 434 int err; 435 + 436 + /* Make sure the reload operations are invoked with the device lock 437 + * held to allow drivers to trigger functionality that expects it 438 + * (e.g., PCI reset) and to close possible races between these 439 + * operations and probe/remove. 440 + */ 441 + device_lock_assert(devlink->dev); 436 442 437 443 memcpy(remote_reload_stats, devlink->stats.remote_reload_stats, 438 444 sizeof(remote_reload_stats));
+17 -4
net/devlink/devl_internal.h
··· 3 3 * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> 4 4 */ 5 5 6 + #include <linux/device.h> 6 7 #include <linux/etherdevice.h> 7 8 #include <linux/mutex.h> 8 9 #include <linux/netdevice.h> ··· 97 96 return xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); 98 97 } 99 98 99 + static inline void devl_dev_lock(struct devlink *devlink, bool dev_lock) 100 + { 101 + if (dev_lock) 102 + device_lock(devlink->dev); 103 + devl_lock(devlink); 104 + } 105 + 106 + static inline void devl_dev_unlock(struct devlink *devlink, bool dev_lock) 107 + { 108 + devl_unlock(devlink); 109 + if (dev_lock) 110 + device_unlock(devlink->dev); 111 + } 112 + 100 113 typedef void devlink_rel_notify_cb_t(struct devlink *devlink, u32 obj_index); 101 114 typedef void devlink_rel_cleanup_cb_t(struct devlink *devlink, u32 obj_index, 102 115 u32 rel_index); ··· 126 111 bool *msg_updated); 127 112 128 113 /* Netlink */ 129 - #define DEVLINK_NL_FLAG_NEED_PORT BIT(0) 130 - #define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1) 131 - 132 114 enum devlink_multicast_groups { 133 115 DEVLINK_MCGRP_CONFIG, 134 116 }; ··· 152 140 int flags); 153 141 154 142 struct devlink * 155 - devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs); 143 + devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs, 144 + bool dev_lock); 156 145 157 146 int devlink_nl_dumpit(struct sk_buff *msg, struct netlink_callback *cb, 158 147 devlink_nl_dump_one_func_t *dump_one);
+2 -1
net/devlink/health.c
··· 1151 1151 struct nlattr **attrs = info->attrs; 1152 1152 struct devlink *devlink; 1153 1153 1154 - devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs); 1154 + devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs, 1155 + false); 1155 1156 if (IS_ERR(devlink)) 1156 1157 return NULL; 1157 1158
+36 -9
net/devlink/netlink.c
··· 9 9 10 10 #include "devl_internal.h" 11 11 12 + #define DEVLINK_NL_FLAG_NEED_PORT BIT(0) 13 + #define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1) 14 + #define DEVLINK_NL_FLAG_NEED_DEV_LOCK BIT(2) 15 + 12 16 static const struct genl_multicast_group devlink_nl_mcgrps[] = { 13 17 [DEVLINK_MCGRP_CONFIG] = { .name = DEVLINK_GENL_MCGRP_CONFIG_NAME }, 14 18 }; ··· 65 61 } 66 62 67 63 struct devlink * 68 - devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs) 64 + devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs, 65 + bool dev_lock) 69 66 { 70 67 struct devlink *devlink; 71 68 unsigned long index; ··· 80 75 devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]); 81 76 82 77 devlinks_xa_for_each_registered_get(net, index, devlink) { 83 - devl_lock(devlink); 78 + devl_dev_lock(devlink, dev_lock); 84 79 if (devl_is_registered(devlink) && 85 80 strcmp(devlink->dev->bus->name, busname) == 0 && 86 81 strcmp(dev_name(devlink->dev), devname) == 0) 87 82 return devlink; 88 - devl_unlock(devlink); 83 + devl_dev_unlock(devlink, dev_lock); 89 84 devlink_put(devlink); 90 85 } 91 86 ··· 95 90 static int __devlink_nl_pre_doit(struct sk_buff *skb, struct genl_info *info, 96 91 u8 flags) 97 92 { 93 + bool dev_lock = flags & DEVLINK_NL_FLAG_NEED_DEV_LOCK; 98 94 struct devlink_port *devlink_port; 99 95 struct devlink *devlink; 100 96 int err; 101 97 102 - devlink = devlink_get_from_attrs_lock(genl_info_net(info), info->attrs); 98 + devlink = devlink_get_from_attrs_lock(genl_info_net(info), info->attrs, 99 + dev_lock); 103 100 if (IS_ERR(devlink)) 104 101 return PTR_ERR(devlink); 105 102 ··· 121 114 return 0; 122 115 123 116 unlock: 124 - devl_unlock(devlink); 117 + devl_dev_unlock(devlink, dev_lock); 125 118 devlink_put(devlink); 126 119 return err; 127 120 } ··· 138 131 return __devlink_nl_pre_doit(skb, info, DEVLINK_NL_FLAG_NEED_PORT); 139 132 } 140 133 134 + int devlink_nl_pre_doit_dev_lock(const struct genl_split_ops *ops, 135 + struct sk_buff *skb, struct 
genl_info *info) 136 + { 137 + return __devlink_nl_pre_doit(skb, info, DEVLINK_NL_FLAG_NEED_DEV_LOCK); 138 + } 139 + 141 140 int devlink_nl_pre_doit_port_optional(const struct genl_split_ops *ops, 142 141 struct sk_buff *skb, 143 142 struct genl_info *info) ··· 151 138 return __devlink_nl_pre_doit(skb, info, DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT); 152 139 } 153 140 154 - void devlink_nl_post_doit(const struct genl_split_ops *ops, 155 - struct sk_buff *skb, struct genl_info *info) 141 + static void __devlink_nl_post_doit(struct sk_buff *skb, struct genl_info *info, 142 + u8 flags) 156 143 { 144 + bool dev_lock = flags & DEVLINK_NL_FLAG_NEED_DEV_LOCK; 157 145 struct devlink *devlink; 158 146 159 147 devlink = info->user_ptr[0]; 160 - devl_unlock(devlink); 148 + devl_dev_unlock(devlink, dev_lock); 161 149 devlink_put(devlink); 150 + } 151 + 152 + void devlink_nl_post_doit(const struct genl_split_ops *ops, 153 + struct sk_buff *skb, struct genl_info *info) 154 + { 155 + __devlink_nl_post_doit(skb, info, 0); 156 + } 157 + 158 + void 159 + devlink_nl_post_doit_dev_lock(const struct genl_split_ops *ops, 160 + struct sk_buff *skb, struct genl_info *info) 161 + { 162 + __devlink_nl_post_doit(skb, info, DEVLINK_NL_FLAG_NEED_DEV_LOCK); 162 163 } 163 164 164 165 static int devlink_nl_inst_single_dumpit(struct sk_buff *msg, ··· 183 156 struct devlink *devlink; 184 157 int err; 185 158 186 - devlink = devlink_get_from_attrs_lock(sock_net(msg->sk), attrs); 159 + devlink = devlink_get_from_attrs_lock(sock_net(msg->sk), attrs, false); 187 160 if (IS_ERR(devlink)) 188 161 return PTR_ERR(devlink); 189 162 err = dump_one(msg, devlink, cb, flags | NLM_F_DUMP_FILTERED);
+2 -1
net/devlink/region.c
··· 883 883 884 884 start_offset = state->start_offset; 885 885 886 - devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs); 886 + devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs, 887 + false); 887 888 if (IS_ERR(devlink)) 888 889 return PTR_ERR(devlink); 889 890
+58
tools/testing/selftests/drivers/net/mlxsw/pci_reset.sh
··· 1 + #!/bin/bash 2 + # SPDX-License-Identifier: GPL-2.0 3 + # 4 + # Test that PCI reset works correctly by verifying that only the expected reset 5 + # methods are supported and that after issuing the reset the ifindex of the 6 + # port changes. 7 + 8 + lib_dir=$(dirname $0)/../../../net/forwarding 9 + 10 + ALL_TESTS=" 11 + pci_reset_test 12 + " 13 + NUM_NETIFS=1 14 + source $lib_dir/lib.sh 15 + source $lib_dir/devlink_lib.sh 16 + 17 + pci_reset_test() 18 + { 19 + RET=0 20 + 21 + local bus=$(echo $DEVLINK_DEV | cut -d '/' -f 1) 22 + local bdf=$(echo $DEVLINK_DEV | cut -d '/' -f 2) 23 + 24 + if [ $bus != "pci" ]; then 25 + check_err 1 "devlink device is not a PCI device" 26 + log_test "pci reset" 27 + return 28 + fi 29 + 30 + if [ ! -f /sys/bus/pci/devices/$bdf/reset_method ]; then 31 + check_err 1 "reset is not supported" 32 + log_test "pci reset" 33 + return 34 + fi 35 + 36 + [[ $(cat /sys/bus/pci/devices/$bdf/reset_method) == "bus" ]] 37 + check_err $? "only \"bus\" reset method should be supported" 38 + 39 + local ifindex_pre=$(ip -j link show dev $swp1 | jq '.[]["ifindex"]') 40 + 41 + echo 1 > /sys/bus/pci/devices/$bdf/reset 42 + check_err $? "reset failed" 43 + 44 + # Wait for udev to rename newly created netdev. 45 + udevadm settle 46 + 47 + local ifindex_post=$(ip -j link show dev $swp1 | jq '.[]["ifindex"]') 48 + 49 + [[ $ifindex_pre != $ifindex_post ]] 50 + check_err $? "reset not performed" 51 + 52 + log_test "pci reset" 53 + } 54 + 55 + swp1=${NETIFS[p1]} 56 + tests_run 57 + 58 + exit $EXIT_STATUS