Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'mlx5-fixes-2020-09-30' of git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux

From: Saeed Mahameed <saeedm@nvidia.com>

====================
This series introduces some fixes to mlx5 driver.

v1->v2:
- Patch #1 Don't return while mutex is held. (Dave)

v2->v3:
- Drop patch #1, will consider a better approach (Jakub)
- use cpu_relax() instead of cond_resched() (Jakub)
- while(i--) to reverse a loop (Jakub)
- Drop old Mellanox email sign-off and change the committer email
(Jakub)

Please pull and let me know if there is any problem.

For -stable v4.15
('net/mlx5e: Fix VLAN cleanup flow')
('net/mlx5e: Fix VLAN create flow')

For -stable v4.16
('net/mlx5: Fix request_irqs error flow')

For -stable v5.4
('net/mlx5e: Add resiliency in Striding RQ mode for packets larger than MTU')
('net/mlx5: Avoid possible free of command entry while timeout comp handler')

For -stable v5.7
('net/mlx5e: Fix return status when setting unsupported FEC mode')

For -stable v5.8
('net/mlx5e: Fix race condition on nhe->n pointer in neigh update')
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+350 -119
+144 -54
drivers/net/ethernet/mellanox/mlx5/core/cmd.c
··· 69 69 MLX5_CMD_DELIVERY_STAT_CMD_DESCR_ERR = 0x10, 70 70 }; 71 71 72 - static struct mlx5_cmd_work_ent *alloc_cmd(struct mlx5_cmd *cmd, 73 - struct mlx5_cmd_msg *in, 74 - struct mlx5_cmd_msg *out, 75 - void *uout, int uout_size, 76 - mlx5_cmd_cbk_t cbk, 77 - void *context, int page_queue) 72 + static struct mlx5_cmd_work_ent * 73 + cmd_alloc_ent(struct mlx5_cmd *cmd, struct mlx5_cmd_msg *in, 74 + struct mlx5_cmd_msg *out, void *uout, int uout_size, 75 + mlx5_cmd_cbk_t cbk, void *context, int page_queue) 78 76 { 79 77 gfp_t alloc_flags = cbk ? GFP_ATOMIC : GFP_KERNEL; 80 78 struct mlx5_cmd_work_ent *ent; ··· 81 83 if (!ent) 82 84 return ERR_PTR(-ENOMEM); 83 85 86 + ent->idx = -EINVAL; 84 87 ent->in = in; 85 88 ent->out = out; 86 89 ent->uout = uout; ··· 90 91 ent->context = context; 91 92 ent->cmd = cmd; 92 93 ent->page_queue = page_queue; 94 + refcount_set(&ent->refcnt, 1); 93 95 94 96 return ent; 97 + } 98 + 99 + static void cmd_free_ent(struct mlx5_cmd_work_ent *ent) 100 + { 101 + kfree(ent); 95 102 } 96 103 97 104 static u8 alloc_token(struct mlx5_cmd *cmd) ··· 114 109 return token; 115 110 } 116 111 117 - static int alloc_ent(struct mlx5_cmd *cmd) 112 + static int cmd_alloc_index(struct mlx5_cmd *cmd) 118 113 { 119 114 unsigned long flags; 120 115 int ret; ··· 128 123 return ret < cmd->max_reg_cmds ? 
ret : -ENOMEM; 129 124 } 130 125 131 - static void free_ent(struct mlx5_cmd *cmd, int idx) 126 + static void cmd_free_index(struct mlx5_cmd *cmd, int idx) 132 127 { 133 128 unsigned long flags; 134 129 135 130 spin_lock_irqsave(&cmd->alloc_lock, flags); 136 131 set_bit(idx, &cmd->bitmask); 137 132 spin_unlock_irqrestore(&cmd->alloc_lock, flags); 133 + } 134 + 135 + static void cmd_ent_get(struct mlx5_cmd_work_ent *ent) 136 + { 137 + refcount_inc(&ent->refcnt); 138 + } 139 + 140 + static void cmd_ent_put(struct mlx5_cmd_work_ent *ent) 141 + { 142 + if (!refcount_dec_and_test(&ent->refcnt)) 143 + return; 144 + 145 + if (ent->idx >= 0) 146 + cmd_free_index(ent->cmd, ent->idx); 147 + 148 + cmd_free_ent(ent); 138 149 } 139 150 140 151 static struct mlx5_cmd_layout *get_inst(struct mlx5_cmd *cmd, int idx) ··· 238 217 } while (time_before(jiffies, poll_end)); 239 218 240 219 ent->ret = -ETIMEDOUT; 241 - } 242 - 243 - static void free_cmd(struct mlx5_cmd_work_ent *ent) 244 - { 245 - kfree(ent); 246 220 } 247 221 248 222 static int verify_signature(struct mlx5_cmd_work_ent *ent) ··· 853 837 struct mlx5_core_dev *dev = container_of(ent->cmd, struct mlx5_core_dev, 854 838 cmd); 855 839 840 + mlx5_cmd_eq_recover(dev); 841 + 842 + /* Maybe got handled by eq recover ? */ 843 + if (!test_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, &ent->state)) { 844 + mlx5_core_warn(dev, "cmd[%d]: %s(0x%x) Async, recovered after timeout\n", ent->idx, 845 + mlx5_command_str(msg_to_opcode(ent->in)), msg_to_opcode(ent->in)); 846 + goto out; /* phew, already handled */ 847 + } 848 + 856 849 ent->ret = -ETIMEDOUT; 857 - mlx5_core_warn(dev, "%s(0x%x) timeout. Will cause a leak of a command resource\n", 858 - mlx5_command_str(msg_to_opcode(ent->in)), 859 - msg_to_opcode(ent->in)); 850 + mlx5_core_warn(dev, "cmd[%d]: %s(0x%x) Async, timeout. 
Will cause a leak of a command resource\n", 851 + ent->idx, mlx5_command_str(msg_to_opcode(ent->in)), msg_to_opcode(ent->in)); 860 852 mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); 853 + 854 + out: 855 + cmd_ent_put(ent); /* for the cmd_ent_get() took on schedule delayed work */ 861 856 } 862 857 863 858 static void free_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg); ··· 881 854 return true; 882 855 883 856 return cmd->allowed_opcode == opcode; 857 + } 858 + 859 + static int cmd_alloc_index_retry(struct mlx5_cmd *cmd) 860 + { 861 + unsigned long alloc_end = jiffies + msecs_to_jiffies(1000); 862 + int idx; 863 + 864 + retry: 865 + idx = cmd_alloc_index(cmd); 866 + if (idx < 0 && time_before(jiffies, alloc_end)) { 867 + /* Index allocation can fail on heavy load of commands. This is a temporary 868 + * situation as the current command already holds the semaphore, meaning that 869 + * another command completion is being handled and it is expected to release 870 + * the entry index soon. 871 + */ 872 + cpu_relax(); 873 + goto retry; 874 + } 875 + return idx; 876 + } 877 + 878 + bool mlx5_cmd_is_down(struct mlx5_core_dev *dev) 879 + { 880 + return pci_channel_offline(dev->pdev) || 881 + dev->cmd.state != MLX5_CMDIF_STATE_UP || 882 + dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR; 884 883 } 885 884 886 885 static void cmd_work_handler(struct work_struct *work) ··· 926 873 sem = ent->page_queue ? 
&cmd->pages_sem : &cmd->sem; 927 874 down(sem); 928 875 if (!ent->page_queue) { 929 - alloc_ret = alloc_ent(cmd); 876 + alloc_ret = cmd_alloc_index_retry(cmd); 930 877 if (alloc_ret < 0) { 931 878 mlx5_core_err_rl(dev, "failed to allocate command entry\n"); 932 879 if (ent->callback) { 933 880 ent->callback(-EAGAIN, ent->context); 934 881 mlx5_free_cmd_msg(dev, ent->out); 935 882 free_msg(dev, ent->in); 936 - free_cmd(ent); 883 + cmd_ent_put(ent); 937 884 } else { 938 885 ent->ret = -EAGAIN; 939 886 complete(&ent->done); ··· 969 916 ent->ts1 = ktime_get_ns(); 970 917 cmd_mode = cmd->mode; 971 918 972 - if (ent->callback) 973 - schedule_delayed_work(&ent->cb_timeout_work, cb_timeout); 919 + if (ent->callback && schedule_delayed_work(&ent->cb_timeout_work, cb_timeout)) 920 + cmd_ent_get(ent); 974 921 set_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, &ent->state); 975 922 976 923 /* Skip sending command to fw if internal error */ 977 - if (pci_channel_offline(dev->pdev) || 978 - dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR || 979 - cmd->state != MLX5_CMDIF_STATE_UP || 980 - !opcode_allowed(&dev->cmd, ent->op)) { 924 + if (mlx5_cmd_is_down(dev) || !opcode_allowed(&dev->cmd, ent->op)) { 981 925 u8 status = 0; 982 926 u32 drv_synd; 983 927 ··· 983 933 MLX5_SET(mbox_out, ent->out, syndrome, drv_synd); 984 934 985 935 mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); 986 - /* no doorbell, no need to keep the entry */ 987 - free_ent(cmd, ent->idx); 988 - if (ent->callback) 989 - free_cmd(ent); 990 936 return; 991 937 } 992 938 939 + cmd_ent_get(ent); /* for the _real_ FW event on completion */ 993 940 /* ring doorbell after the descriptor is valid */ 994 941 mlx5_core_dbg(dev, "writing 0x%x to command doorbell\n", 1 << ent->idx); 995 942 wmb(); ··· 1030 983 } 1031 984 } 1032 985 986 + enum { 987 + MLX5_CMD_TIMEOUT_RECOVER_MSEC = 5 * 1000, 988 + }; 989 + 990 + static void wait_func_handle_exec_timeout(struct mlx5_core_dev *dev, 991 + struct mlx5_cmd_work_ent *ent) 992 + { 993 + 
unsigned long timeout = msecs_to_jiffies(MLX5_CMD_TIMEOUT_RECOVER_MSEC); 994 + 995 + mlx5_cmd_eq_recover(dev); 996 + 997 + /* Re-wait on the ent->done after executing the recovery flow. If the 998 + * recovery flow (or any other recovery flow running simultaneously) 999 + * has recovered an EQE, it should cause the entry to be completed by 1000 + * the command interface. 1001 + */ 1002 + if (wait_for_completion_timeout(&ent->done, timeout)) { 1003 + mlx5_core_warn(dev, "cmd[%d]: %s(0x%x) recovered after timeout\n", ent->idx, 1004 + mlx5_command_str(msg_to_opcode(ent->in)), msg_to_opcode(ent->in)); 1005 + return; 1006 + } 1007 + 1008 + mlx5_core_warn(dev, "cmd[%d]: %s(0x%x) No done completion\n", ent->idx, 1009 + mlx5_command_str(msg_to_opcode(ent->in)), msg_to_opcode(ent->in)); 1010 + 1011 + ent->ret = -ETIMEDOUT; 1012 + mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); 1013 + } 1014 + 1033 1015 static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent) 1034 1016 { 1035 1017 unsigned long timeout = msecs_to_jiffies(MLX5_CMD_TIMEOUT_MSEC); ··· 1070 994 ent->ret = -ECANCELED; 1071 995 goto out_err; 1072 996 } 1073 - if (cmd->mode == CMD_MODE_POLLING || ent->polling) { 997 + if (cmd->mode == CMD_MODE_POLLING || ent->polling) 1074 998 wait_for_completion(&ent->done); 1075 - } else if (!wait_for_completion_timeout(&ent->done, timeout)) { 1076 - ent->ret = -ETIMEDOUT; 1077 - mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); 1078 - } 999 + else if (!wait_for_completion_timeout(&ent->done, timeout)) 1000 + wait_func_handle_exec_timeout(dev, ent); 1079 1001 1080 1002 out_err: 1081 1003 err = ent->ret; ··· 1113 1039 if (callback && page_queue) 1114 1040 return -EINVAL; 1115 1041 1116 - ent = alloc_cmd(cmd, in, out, uout, uout_size, callback, context, 1117 - page_queue); 1042 + ent = cmd_alloc_ent(cmd, in, out, uout, uout_size, 1043 + callback, context, page_queue); 1118 1044 if (IS_ERR(ent)) 1119 1045 return PTR_ERR(ent); 1046 + 1047 + /* put for this 
ent is when consumed, depending on the use case 1048 + * 1) (!callback) blocking flow: by caller after wait_func completes 1049 + * 2) (callback) flow: by mlx5_cmd_comp_handler() when ent is handled 1050 + */ 1120 1051 1121 1052 ent->token = token; 1122 1053 ent->polling = force_polling; ··· 1141 1062 } 1142 1063 1143 1064 if (callback) 1144 - goto out; 1065 + goto out; /* mlx5_cmd_comp_handler() will put(ent) */ 1145 1066 1146 1067 err = wait_func(dev, ent); 1147 - if (err == -ETIMEDOUT) 1148 - goto out; 1149 - if (err == -ECANCELED) 1068 + if (err == -ETIMEDOUT || err == -ECANCELED) 1150 1069 goto out_free; 1151 1070 1152 1071 ds = ent->ts2 - ent->ts1; ··· 1162 1085 *status = ent->status; 1163 1086 1164 1087 out_free: 1165 - free_cmd(ent); 1088 + cmd_ent_put(ent); 1166 1089 out: 1167 1090 return err; 1168 1091 } ··· 1593 1516 if (!forced) { 1594 1517 mlx5_core_err(dev, "Command completion arrived after timeout (entry idx = %d).\n", 1595 1518 ent->idx); 1596 - free_ent(cmd, ent->idx); 1597 - free_cmd(ent); 1519 + cmd_ent_put(ent); 1598 1520 } 1599 1521 continue; 1600 1522 } 1601 1523 1602 - if (ent->callback) 1603 - cancel_delayed_work(&ent->cb_timeout_work); 1524 + if (ent->callback && cancel_delayed_work(&ent->cb_timeout_work)) 1525 + cmd_ent_put(ent); /* timeout work was canceled */ 1526 + 1527 + if (!forced || /* Real FW completion */ 1528 + pci_channel_offline(dev->pdev) || /* FW is inaccessible */ 1529 + dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) 1530 + cmd_ent_put(ent); 1531 + 1604 1532 if (ent->page_queue) 1605 1533 sem = &cmd->pages_sem; 1606 1534 else ··· 1626 1544 mlx5_core_dbg(dev, "command completed. 
ret 0x%x, delivery status %s(0x%x)\n", 1627 1545 ent->ret, deliv_status_to_str(ent->status), ent->status); 1628 1546 } 1629 - 1630 - /* only real completion will free the entry slot */ 1631 - if (!forced) 1632 - free_ent(cmd, ent->idx); 1633 1547 1634 1548 if (ent->callback) { 1635 1549 ds = ent->ts2 - ent->ts1; ··· 1654 1576 free_msg(dev, ent->in); 1655 1577 1656 1578 err = err ? err : ent->status; 1657 - if (!forced) 1658 - free_cmd(ent); 1579 + /* final consumer is done, release ent */ 1580 + cmd_ent_put(ent); 1659 1581 callback(err, context); 1660 1582 } else { 1583 + /* release wait_func() so mlx5_cmd_invoke() 1584 + * can make the final ent_put() 1585 + */ 1661 1586 complete(&ent->done); 1662 1587 } 1663 1588 up(sem); ··· 1670 1589 1671 1590 void mlx5_cmd_trigger_completions(struct mlx5_core_dev *dev) 1672 1591 { 1592 + struct mlx5_cmd *cmd = &dev->cmd; 1593 + unsigned long bitmask; 1673 1594 unsigned long flags; 1674 1595 u64 vector; 1596 + int i; 1675 1597 1676 1598 /* wait for pending handlers to complete */ 1677 1599 mlx5_eq_synchronize_cmd_irq(dev); ··· 1683 1599 if (!vector) 1684 1600 goto no_trig; 1685 1601 1602 + bitmask = vector; 1603 + /* we must increment the allocated entries refcount before triggering the completions 1604 + * to guarantee pending commands will not get freed in the meanwhile. 1605 + * For that reason, it also has to be done inside the alloc_lock. 
1606 + */ 1607 + for_each_set_bit(i, &bitmask, (1 << cmd->log_sz)) 1608 + cmd_ent_get(cmd->ent_arr[i]); 1686 1609 vector |= MLX5_TRIGGERED_CMD_COMP; 1687 1610 spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags); 1688 1611 1689 1612 mlx5_core_dbg(dev, "vector 0x%llx\n", vector); 1690 1613 mlx5_cmd_comp_handler(dev, vector, true); 1614 + for_each_set_bit(i, &bitmask, (1 << cmd->log_sz)) 1615 + cmd_ent_put(cmd->ent_arr[i]); 1691 1616 return; 1692 1617 1693 1618 no_trig: ··· 1804 1711 u8 token; 1805 1712 1806 1713 opcode = MLX5_GET(mbox_in, in, opcode); 1807 - if (pci_channel_offline(dev->pdev) || 1808 - dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR || 1809 - dev->cmd.state != MLX5_CMDIF_STATE_UP || 1810 - !opcode_allowed(&dev->cmd, opcode)) { 1714 + if (mlx5_cmd_is_down(dev) || !opcode_allowed(&dev->cmd, opcode)) { 1811 1715 err = mlx5_internal_err_ret_value(dev, opcode, &drv_synd, &status); 1812 1716 MLX5_SET(mbox_out, out, status, status); 1813 1717 MLX5_SET(mbox_out, out, syndrome, drv_synd);
+7 -1
drivers/net/ethernet/mellanox/mlx5/core/en.h
··· 91 91 #define MLX5_MPWRQ_PAGES_PER_WQE BIT(MLX5_MPWRQ_WQE_PAGE_ORDER) 92 92 93 93 #define MLX5_MTT_OCTW(npages) (ALIGN(npages, 8) / 2) 94 - #define MLX5E_REQUIRED_WQE_MTTS (ALIGN(MLX5_MPWRQ_PAGES_PER_WQE, 8)) 94 + /* Add another page to MLX5E_REQUIRED_WQE_MTTS as a buffer between 95 + * WQEs, This page will absorb write overflow by the hardware, when 96 + * receiving packets larger than MTU. These oversize packets are 97 + * dropped by the driver at a later stage. 98 + */ 99 + #define MLX5E_REQUIRED_WQE_MTTS (ALIGN(MLX5_MPWRQ_PAGES_PER_WQE + 1, 8)) 95 100 #define MLX5E_LOG_ALIGNED_MPWQE_PPW (ilog2(MLX5E_REQUIRED_WQE_MTTS)) 96 101 #define MLX5E_REQUIRED_MTTS(wqes) (wqes * MLX5E_REQUIRED_WQE_MTTS) 97 102 #define MLX5E_MAX_RQ_NUM_MTTS \ ··· 622 617 u32 rqn; 623 618 struct mlx5_core_dev *mdev; 624 619 struct mlx5_core_mkey umr_mkey; 620 + struct mlx5e_dma_info wqe_overflow; 625 621 626 622 /* XDP read-mostly */ 627 623 struct xdp_rxq_info xdp_rxq;
+3
drivers/net/ethernet/mellanox/mlx5/core/en/port.c
··· 569 569 if (fec_policy >= (1 << MLX5E_FEC_LLRS_272_257_1) && !fec_50g_per_lane) 570 570 return -EOPNOTSUPP; 571 571 572 + if (fec_policy && !mlx5e_fec_in_caps(dev, fec_policy)) 573 + return -EOPNOTSUPP; 574 + 572 575 MLX5_SET(pplm_reg, in, local_port, 1); 573 576 err = mlx5_core_access_reg(dev, in, sz, out, sz, MLX5_REG_PPLM, 0, 0); 574 577 if (err)
+50 -31
drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.c
··· 110 110 rtnl_unlock(); 111 111 } 112 112 113 + struct neigh_update_work { 114 + struct work_struct work; 115 + struct neighbour *n; 116 + struct mlx5e_neigh_hash_entry *nhe; 117 + }; 118 + 119 + static void mlx5e_release_neigh_update_work(struct neigh_update_work *update_work) 120 + { 121 + neigh_release(update_work->n); 122 + mlx5e_rep_neigh_entry_release(update_work->nhe); 123 + kfree(update_work); 124 + } 125 + 113 126 static void mlx5e_rep_neigh_update(struct work_struct *work) 114 127 { 115 - struct mlx5e_neigh_hash_entry *nhe = 116 - container_of(work, struct mlx5e_neigh_hash_entry, neigh_update_work); 117 - struct neighbour *n = nhe->n; 128 + struct neigh_update_work *update_work = container_of(work, struct neigh_update_work, 129 + work); 130 + struct mlx5e_neigh_hash_entry *nhe = update_work->nhe; 131 + struct neighbour *n = update_work->n; 118 132 struct mlx5e_encap_entry *e; 119 133 unsigned char ha[ETH_ALEN]; 120 134 struct mlx5e_priv *priv; ··· 160 146 mlx5e_rep_update_flows(priv, e, neigh_connected, ha); 161 147 mlx5e_encap_put(priv, e); 162 148 } 163 - mlx5e_rep_neigh_entry_release(nhe); 164 149 rtnl_unlock(); 165 - neigh_release(n); 150 + mlx5e_release_neigh_update_work(update_work); 166 151 } 167 152 168 - static void mlx5e_rep_queue_neigh_update_work(struct mlx5e_priv *priv, 169 - struct mlx5e_neigh_hash_entry *nhe, 170 - struct neighbour *n) 153 + static struct neigh_update_work *mlx5e_alloc_neigh_update_work(struct mlx5e_priv *priv, 154 + struct neighbour *n) 171 155 { 172 - /* Take a reference to ensure the neighbour and mlx5 encap 173 - * entry won't be destructed until we drop the reference in 174 - * delayed work. 
175 - */ 176 - neigh_hold(n); 156 + struct neigh_update_work *update_work; 157 + struct mlx5e_neigh_hash_entry *nhe; 158 + struct mlx5e_neigh m_neigh = {}; 177 159 178 - /* This assignment is valid as long as the the neigh reference 179 - * is taken 180 - */ 181 - nhe->n = n; 160 + update_work = kzalloc(sizeof(*update_work), GFP_ATOMIC); 161 + if (WARN_ON(!update_work)) 162 + return NULL; 182 163 183 - if (!queue_work(priv->wq, &nhe->neigh_update_work)) { 184 - mlx5e_rep_neigh_entry_release(nhe); 185 - neigh_release(n); 164 + m_neigh.dev = n->dev; 165 + m_neigh.family = n->ops->family; 166 + memcpy(&m_neigh.dst_ip, n->primary_key, n->tbl->key_len); 167 + 168 + /* Obtain reference to nhe as last step in order not to release it in 169 + * atomic context. 170 + */ 171 + rcu_read_lock(); 172 + nhe = mlx5e_rep_neigh_entry_lookup(priv, &m_neigh); 173 + rcu_read_unlock(); 174 + if (!nhe) { 175 + kfree(update_work); 176 + return NULL; 186 177 } 178 + 179 + INIT_WORK(&update_work->work, mlx5e_rep_neigh_update); 180 + neigh_hold(n); 181 + update_work->n = n; 182 + update_work->nhe = nhe; 183 + 184 + return update_work; 187 185 } 188 186 189 187 static int mlx5e_rep_netevent_event(struct notifier_block *nb, ··· 207 181 struct net_device *netdev = rpriv->netdev; 208 182 struct mlx5e_priv *priv = netdev_priv(netdev); 209 183 struct mlx5e_neigh_hash_entry *nhe = NULL; 210 - struct mlx5e_neigh m_neigh = {}; 184 + struct neigh_update_work *update_work; 211 185 struct neigh_parms *p; 212 186 struct neighbour *n; 213 187 bool found = false; ··· 222 196 #endif 223 197 return NOTIFY_DONE; 224 198 225 - m_neigh.dev = n->dev; 226 - m_neigh.family = n->ops->family; 227 - memcpy(&m_neigh.dst_ip, n->primary_key, n->tbl->key_len); 228 - 229 - rcu_read_lock(); 230 - nhe = mlx5e_rep_neigh_entry_lookup(priv, &m_neigh); 231 - rcu_read_unlock(); 232 - if (!nhe) 199 + update_work = mlx5e_alloc_neigh_update_work(priv, n); 200 + if (!update_work) 233 201 return NOTIFY_DONE; 234 202 235 - 
mlx5e_rep_queue_neigh_update_work(priv, nhe, n); 203 + queue_work(priv->wq, &update_work->work); 236 204 break; 237 205 238 206 case NETEVENT_DELAY_PROBE_TIME_UPDATE: ··· 372 352 373 353 (*nhe)->priv = priv; 374 354 memcpy(&(*nhe)->m_neigh, &e->m_neigh, sizeof(e->m_neigh)); 375 - INIT_WORK(&(*nhe)->neigh_update_work, mlx5e_rep_neigh_update); 376 355 spin_lock_init(&(*nhe)->encap_list_lock); 377 356 INIT_LIST_HEAD(&(*nhe)->encap_list); 378 357 refcount_set(&(*nhe)->refcnt, 1);
+3 -1
drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c
··· 246 246 case FLOW_ACT_MANGLE_HDR_TYPE_IP6: 247 247 ip6_offset = (offset - offsetof(struct ipv6hdr, saddr)); 248 248 ip6_offset /= 4; 249 - if (ip6_offset < 8) 249 + if (ip6_offset < 4) 250 250 tuple->ip.src_v6.s6_addr32[ip6_offset] = cpu_to_be32(val); 251 + else if (ip6_offset < 8) 252 + tuple->ip.dst_v6.s6_addr32[ip6_offset - 4] = cpu_to_be32(val); 251 253 else 252 254 return -EOPNOTSUPP; 253 255 break;
+10 -4
drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
··· 217 217 break; 218 218 } 219 219 220 + if (WARN_ONCE(*rule_p, "VLAN rule already exists type %d", rule_type)) 221 + return 0; 222 + 220 223 *rule_p = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1); 221 224 222 225 if (IS_ERR(*rule_p)) { ··· 400 397 for_each_set_bit(i, priv->fs.vlan.active_svlans, VLAN_N_VID) 401 398 mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_STAG_VID, i); 402 399 403 - if (priv->fs.vlan.cvlan_filter_disabled && 404 - !(priv->netdev->flags & IFF_PROMISC)) 400 + if (priv->fs.vlan.cvlan_filter_disabled) 405 401 mlx5e_add_any_vid_rules(priv); 406 402 } 407 403 ··· 417 415 for_each_set_bit(i, priv->fs.vlan.active_svlans, VLAN_N_VID) 418 416 mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_STAG_VID, i); 419 417 420 - if (priv->fs.vlan.cvlan_filter_disabled && 421 - !(priv->netdev->flags & IFF_PROMISC)) 418 + WARN_ON_ONCE(!(test_bit(MLX5E_STATE_DESTROYING, &priv->state))); 419 + 420 + /* must be called after DESTROY bit is set and 421 + * set_rx_mode is called and flushed 422 + */ 423 + if (priv->fs.vlan.cvlan_filter_disabled) 422 424 mlx5e_del_any_vid_rules(priv); 423 425 } 424 426
+85 -19
drivers/net/ethernet/mellanox/mlx5/core/en_main.c
··· 246 246 247 247 static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev, 248 248 u64 npages, u8 page_shift, 249 - struct mlx5_core_mkey *umr_mkey) 249 + struct mlx5_core_mkey *umr_mkey, 250 + dma_addr_t filler_addr) 250 251 { 251 - int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 252 + struct mlx5_mtt *mtt; 253 + int inlen; 252 254 void *mkc; 253 255 u32 *in; 254 256 int err; 257 + int i; 258 + 259 + inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + sizeof(*mtt) * npages; 255 260 256 261 in = kvzalloc(inlen, GFP_KERNEL); 257 262 if (!in) ··· 276 271 MLX5_SET(mkc, mkc, translations_octword_size, 277 272 MLX5_MTT_OCTW(npages)); 278 273 MLX5_SET(mkc, mkc, log_page_size, page_shift); 274 + MLX5_SET(create_mkey_in, in, translations_octword_actual_size, 275 + MLX5_MTT_OCTW(npages)); 276 + 277 + /* Initialize the mkey with all MTTs pointing to a default 278 + * page (filler_addr). When the channels are activated, UMR 279 + * WQEs will redirect the RX WQEs to the actual memory from 280 + * the RQ's pool, while the gaps (wqe_overflow) remain mapped 281 + * to the default page. 
282 + */ 283 + mtt = MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); 284 + for (i = 0 ; i < npages ; i++) 285 + mtt[i].ptag = cpu_to_be64(filler_addr); 279 286 280 287 err = mlx5_core_create_mkey(mdev, umr_mkey, in, inlen); 281 288 ··· 299 282 { 300 283 u64 num_mtts = MLX5E_REQUIRED_MTTS(mlx5_wq_ll_get_size(&rq->mpwqe.wq)); 301 284 302 - return mlx5e_create_umr_mkey(mdev, num_mtts, PAGE_SHIFT, &rq->umr_mkey); 285 + return mlx5e_create_umr_mkey(mdev, num_mtts, PAGE_SHIFT, &rq->umr_mkey, 286 + rq->wqe_overflow.addr); 303 287 } 304 288 305 289 static inline u64 mlx5e_get_mpwqe_offset(struct mlx5e_rq *rq, u16 wqe_ix) ··· 368 350 mlx5e_reporter_rq_cqe_err(rq); 369 351 } 370 352 353 + static int mlx5e_alloc_mpwqe_rq_drop_page(struct mlx5e_rq *rq) 354 + { 355 + rq->wqe_overflow.page = alloc_page(GFP_KERNEL); 356 + if (!rq->wqe_overflow.page) 357 + return -ENOMEM; 358 + 359 + rq->wqe_overflow.addr = dma_map_page(rq->pdev, rq->wqe_overflow.page, 0, 360 + PAGE_SIZE, rq->buff.map_dir); 361 + if (dma_mapping_error(rq->pdev, rq->wqe_overflow.addr)) { 362 + __free_page(rq->wqe_overflow.page); 363 + return -ENOMEM; 364 + } 365 + return 0; 366 + } 367 + 368 + static void mlx5e_free_mpwqe_rq_drop_page(struct mlx5e_rq *rq) 369 + { 370 + dma_unmap_page(rq->pdev, rq->wqe_overflow.addr, PAGE_SIZE, 371 + rq->buff.map_dir); 372 + __free_page(rq->wqe_overflow.page); 373 + } 374 + 371 375 static int mlx5e_alloc_rq(struct mlx5e_channel *c, 372 376 struct mlx5e_params *params, 373 377 struct mlx5e_xsk_param *xsk, ··· 436 396 rq_xdp_ix += params->num_channels * MLX5E_RQ_GROUP_XSK; 437 397 err = xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq_xdp_ix); 438 398 if (err < 0) 439 - goto err_rq_wq_destroy; 399 + goto err_rq_xdp_prog; 440 400 441 401 rq->buff.map_dir = params->xdp_prog ? 
DMA_BIDIRECTIONAL : DMA_FROM_DEVICE; 442 402 rq->buff.headroom = mlx5e_get_rq_headroom(mdev, params, xsk); ··· 446 406 case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: 447 407 err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->mpwqe.wq, 448 408 &rq->wq_ctrl); 409 + if (err) 410 + goto err_rq_xdp; 411 + 412 + err = mlx5e_alloc_mpwqe_rq_drop_page(rq); 449 413 if (err) 450 414 goto err_rq_wq_destroy; 451 415 ··· 468 424 469 425 err = mlx5e_create_rq_umr_mkey(mdev, rq); 470 426 if (err) 471 - goto err_rq_wq_destroy; 427 + goto err_rq_drop_page; 472 428 rq->mkey_be = cpu_to_be32(rq->umr_mkey.key); 473 429 474 430 err = mlx5e_rq_alloc_mpwqe_info(rq, c); 475 431 if (err) 476 - goto err_free; 432 + goto err_rq_mkey; 477 433 break; 478 434 default: /* MLX5_WQ_TYPE_CYCLIC */ 479 435 err = mlx5_wq_cyc_create(mdev, &rqp->wq, rqc_wq, &rq->wqe.wq, 480 436 &rq->wq_ctrl); 481 437 if (err) 482 - goto err_rq_wq_destroy; 438 + goto err_rq_xdp; 483 439 484 440 rq->wqe.wq.db = &rq->wqe.wq.db[MLX5_RCV_DBR]; 485 441 ··· 494 450 GFP_KERNEL, cpu_to_node(c->cpu)); 495 451 if (!rq->wqe.frags) { 496 452 err = -ENOMEM; 497 - goto err_free; 453 + goto err_rq_wq_destroy; 498 454 } 499 455 500 456 err = mlx5e_init_di_list(rq, wq_sz, c->cpu); 501 457 if (err) 502 - goto err_free; 458 + goto err_rq_frags; 503 459 504 460 rq->mkey_be = c->mkey_be; 505 461 } 506 462 507 463 err = mlx5e_rq_set_handlers(rq, params, xsk); 508 464 if (err) 509 - goto err_free; 465 + goto err_free_by_rq_type; 510 466 511 467 if (xsk) { 512 468 err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, ··· 530 486 if (IS_ERR(rq->page_pool)) { 531 487 err = PTR_ERR(rq->page_pool); 532 488 rq->page_pool = NULL; 533 - goto err_free; 489 + goto err_free_by_rq_type; 534 490 } 535 491 err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, 536 492 MEM_TYPE_PAGE_POOL, rq->page_pool); 537 493 } 538 494 if (err) 539 - goto err_free; 495 + goto err_free_by_rq_type; 540 496 541 497 for (i = 0; i < wq_sz; i++) { 542 498 if (rq->wq_type == 
MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) { ··· 586 542 587 543 return 0; 588 544 589 - err_free: 545 + err_free_by_rq_type: 590 546 switch (rq->wq_type) { 591 547 case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: 592 548 kvfree(rq->mpwqe.info); 549 + err_rq_mkey: 593 550 mlx5_core_destroy_mkey(mdev, &rq->umr_mkey); 551 + err_rq_drop_page: 552 + mlx5e_free_mpwqe_rq_drop_page(rq); 594 553 break; 595 554 default: /* MLX5_WQ_TYPE_CYCLIC */ 596 - kvfree(rq->wqe.frags); 597 555 mlx5e_free_di_list(rq); 556 + err_rq_frags: 557 + kvfree(rq->wqe.frags); 598 558 } 599 - 600 559 err_rq_wq_destroy: 560 + mlx5_wq_destroy(&rq->wq_ctrl); 561 + err_rq_xdp: 562 + xdp_rxq_info_unreg(&rq->xdp_rxq); 563 + err_rq_xdp_prog: 601 564 if (params->xdp_prog) 602 565 bpf_prog_put(params->xdp_prog); 603 - xdp_rxq_info_unreg(&rq->xdp_rxq); 604 - page_pool_destroy(rq->page_pool); 605 - mlx5_wq_destroy(&rq->wq_ctrl); 606 566 607 567 return err; 608 568 } ··· 628 580 case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: 629 581 kvfree(rq->mpwqe.info); 630 582 mlx5_core_destroy_mkey(rq->mdev, &rq->umr_mkey); 583 + mlx5e_free_mpwqe_rq_drop_page(rq); 631 584 break; 632 585 default: /* MLX5_WQ_TYPE_CYCLIC */ 633 586 kvfree(rq->wqe.frags); ··· 4226 4177 } 4227 4178 #endif 4228 4179 4180 + static bool mlx5e_gre_tunnel_inner_proto_offload_supported(struct mlx5_core_dev *mdev, 4181 + struct sk_buff *skb) 4182 + { 4183 + switch (skb->inner_protocol) { 4184 + case htons(ETH_P_IP): 4185 + case htons(ETH_P_IPV6): 4186 + case htons(ETH_P_TEB): 4187 + return true; 4188 + case htons(ETH_P_MPLS_UC): 4189 + case htons(ETH_P_MPLS_MC): 4190 + return MLX5_CAP_ETH(mdev, tunnel_stateless_mpls_over_gre); 4191 + } 4192 + return false; 4193 + } 4194 + 4229 4195 static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv, 4230 4196 struct sk_buff *skb, 4231 4197 netdev_features_t features) ··· 4263 4199 4264 4200 switch (proto) { 4265 4201 case IPPROTO_GRE: 4266 - return features; 4202 + if 
(mlx5e_gre_tunnel_inner_proto_offload_supported(priv->mdev, skb)) 4203 + return features; 4204 + break; 4267 4205 case IPPROTO_IPIP: 4268 4206 case IPPROTO_IPV6: 4269 4207 if (mlx5e_tunnel_proto_supported(priv->mdev, IPPROTO_IPIP))
-6
drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
··· 135 135 /* encap list sharing the same neigh */ 136 136 struct list_head encap_list; 137 137 138 - /* valid only when the neigh reference is taken during 139 - * neigh_update_work workqueue callback. 140 - */ 141 - struct neighbour *n; 142 - struct work_struct neigh_update_work; 143 - 144 138 /* neigh hash entry can be deleted only when the refcount is zero. 145 139 * refcount is needed to avoid neigh hash entry removal by TC, while 146 140 * it's used by the neigh notification call.
+41 -1
drivers/net/ethernet/mellanox/mlx5/core/eq.c
··· 189 189 return count_eqe; 190 190 } 191 191 192 + static void mlx5_eq_async_int_lock(struct mlx5_eq_async *eq, unsigned long *flags) 193 + __acquires(&eq->lock) 194 + { 195 + if (in_irq()) 196 + spin_lock(&eq->lock); 197 + else 198 + spin_lock_irqsave(&eq->lock, *flags); 199 + } 200 + 201 + static void mlx5_eq_async_int_unlock(struct mlx5_eq_async *eq, unsigned long *flags) 202 + __releases(&eq->lock) 203 + { 204 + if (in_irq()) 205 + spin_unlock(&eq->lock); 206 + else 207 + spin_unlock_irqrestore(&eq->lock, *flags); 208 + } 209 + 210 + enum async_eq_nb_action { 211 + ASYNC_EQ_IRQ_HANDLER = 0, 212 + ASYNC_EQ_RECOVER = 1, 213 + }; 214 + 192 215 static int mlx5_eq_async_int(struct notifier_block *nb, 193 216 unsigned long action, void *data) 194 217 { ··· 221 198 struct mlx5_eq_table *eqt; 222 199 struct mlx5_core_dev *dev; 223 200 struct mlx5_eqe *eqe; 201 + unsigned long flags; 224 202 int num_eqes = 0; 225 203 226 204 dev = eq->dev; 227 205 eqt = dev->priv.eq_table; 206 + 207 + mlx5_eq_async_int_lock(eq_async, &flags); 228 208 229 209 eqe = next_eqe_sw(eq); 230 210 if (!eqe) ··· 249 223 250 224 out: 251 225 eq_update_ci(eq, 1); 226 + mlx5_eq_async_int_unlock(eq_async, &flags); 252 227 253 - return 0; 228 + return unlikely(action == ASYNC_EQ_RECOVER) ? 
num_eqes : 0; 229 + } 230 + 231 + void mlx5_cmd_eq_recover(struct mlx5_core_dev *dev) 232 + { 233 + struct mlx5_eq_async *eq = &dev->priv.eq_table->cmd_eq; 234 + int eqes; 235 + 236 + eqes = mlx5_eq_async_int(&eq->irq_nb, ASYNC_EQ_RECOVER, NULL); 237 + if (eqes) 238 + mlx5_core_warn(dev, "Recovered %d EQEs on cmd_eq\n", eqes); 254 239 } 255 240 256 241 static void init_eq_buf(struct mlx5_eq *eq) ··· 606 569 int err; 607 570 608 571 eq->irq_nb.notifier_call = mlx5_eq_async_int; 572 + spin_lock_init(&eq->lock); 609 573 610 574 err = create_async_eq(dev, &eq->core, param); 611 575 if (err) { ··· 694 656 695 657 cleanup_async_eq(dev, &table->pages_eq, "pages"); 696 658 cleanup_async_eq(dev, &table->async_eq, "async"); 659 + mlx5_cmd_allowed_opcode(dev, MLX5_CMD_OP_DESTROY_EQ); 697 660 mlx5_cmd_use_polling(dev); 698 661 cleanup_async_eq(dev, &table->cmd_eq, "cmd"); 662 + mlx5_cmd_allowed_opcode(dev, CMD_ALLOWED_OPCODE_ALL); 699 663 mlx5_eq_notifier_unregister(dev, &table->cq_err_nb); 700 664 } 701 665
+2
drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
··· 37 37 struct mlx5_eq_async { 38 38 struct mlx5_eq core; 39 39 struct notifier_block irq_nb; 40 + spinlock_t lock; /* To avoid irq EQ handle races with resiliency flows */ 40 41 }; 41 42 42 43 struct mlx5_eq_comp { ··· 82 81 struct cpumask *mlx5_eq_comp_cpumask(struct mlx5_core_dev *dev, int ix); 83 82 84 83 u32 mlx5_eq_poll_irq_disabled(struct mlx5_eq_comp *eq); 84 + void mlx5_cmd_eq_recover(struct mlx5_core_dev *dev); 85 85 void mlx5_eq_synchronize_async_irq(struct mlx5_core_dev *dev); 86 86 void mlx5_eq_synchronize_cmd_irq(struct mlx5_core_dev *dev); 87 87
+1 -1
drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c
··· 432 432 u32 npages; 433 433 u32 i = 0; 434 434 435 - if (dev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR) 435 + if (!mlx5_cmd_is_down(dev)) 436 436 return mlx5_cmd_exec(dev, in, in_size, out, out_size); 437 437 438 438 /* No hard feelings, we want our pages back! */
+1 -1
drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
··· 115 115 return 0; 116 116 117 117 err_request_irq: 118 - for (; i >= 0; i--) { 118 + while (i--) { 119 119 struct mlx5_irq *irq = mlx5_irq_get(dev, i); 120 120 int irqn = pci_irq_vector(dev->pdev, i); 121 121
+3
include/linux/mlx5/driver.h
··· 767 767 u64 ts2; 768 768 u16 op; 769 769 bool polling; 770 + /* Track the max comp handlers */ 771 + refcount_t refcnt; 770 772 }; 771 773 772 774 struct mlx5_pas { ··· 935 933 int mlx5_cmd_exec_polling(struct mlx5_core_dev *dev, void *in, int in_size, 936 934 void *out, int out_size); 937 935 void mlx5_cmd_mbox_status(void *out, u8 *status, u32 *syndrome); 936 + bool mlx5_cmd_is_down(struct mlx5_core_dev *dev); 938 937 939 938 int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type); 940 939 int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn);