Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'qed-Error-recovery-process'

Michal Kalderon says:

====================
qed*: Error recovery process

Parity errors might happen in the device's memories due to momentary bit
flips which are caused by radiation.
Errors that are not correctable initiate a process kill event, which blocks
the device access towards the host and the network, and a recovery process
is started in the management FW and in the driver.

This series adds the support of this process in the qed core module and in
the qede driver (patches 2 & 3).
Patch 1 in the series revises the load sequence, to avoid PCI errors that
might be observed during a recovery process.

Changes in v2:
- Addressed issue found in https://patchwork.ozlabs.org/patch/1030545/
The change was done by removing the enum and passing a boolean to
the related functions.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+725 -204
+4 -1
drivers/net/ethernet/qlogic/qed/qed.h
··· 554 554 u8 dp_level; 555 555 char name[NAME_SIZE]; 556 556 557 - bool first_on_engine; 558 557 bool hw_init_done; 559 558 560 559 u8 num_funcs_on_engine; ··· 804 805 805 806 u32 mcp_nvm_resp; 806 807 808 + /* Recovery */ 809 + bool recov_in_prog; 810 + 807 811 /* Linux specific here */ 808 812 struct qede_dev *edev; 809 813 struct pci_dev *pdev; ··· 946 944 u32 qed_unzip_data(struct qed_hwfn *p_hwfn, 947 945 u32 input_len, u8 *input_buf, 948 946 u32 max_size, u8 *unzip_buf); 947 + void qed_schedule_recovery_handler(struct qed_hwfn *p_hwfn); 949 948 void qed_get_protocol_stats(struct qed_dev *cdev, 950 949 enum qed_mcp_protocol_type type, 951 950 union qed_mcp_protocol_stats *stats);
+96 -64
drivers/net/ethernet/qlogic/qed/qed_dev.c
··· 1959 1959 (p_hwfn->hw_info.personality == QED_PCI_FCOE) ? 1 : 0); 1960 1960 STORE_RT_REG(p_hwfn, PRS_REG_SEARCH_ROCE_RT_OFFSET, 0); 1961 1961 1962 - /* Cleanup chip from previous driver if such remains exist */ 1963 - rc = qed_final_cleanup(p_hwfn, p_ptt, rel_pf_id, false); 1964 - if (rc) 1965 - return rc; 1966 - 1967 1962 /* Sanity check before the PF init sequence that uses DMAE */ 1968 1963 rc = qed_dmae_sanity(p_hwfn, p_ptt, "pf_phase"); 1969 1964 if (rc) ··· 2002 2007 return rc; 2003 2008 } 2004 2009 2005 - static int qed_change_pci_hwfn(struct qed_hwfn *p_hwfn, 2006 - struct qed_ptt *p_ptt, 2007 - u8 enable) 2010 + int qed_pglueb_set_pfid_enable(struct qed_hwfn *p_hwfn, 2011 + struct qed_ptt *p_ptt, bool b_enable) 2008 2012 { 2009 - u32 delay_idx = 0, val, set_val = enable ? 1 : 0; 2013 + u32 delay_idx = 0, val, set_val = b_enable ? 1 : 0; 2010 2014 2011 - /* Change PF in PXP */ 2012 - qed_wr(p_hwfn, p_ptt, 2013 - PGLUE_B_REG_INTERNAL_PFID_ENABLE_MASTER, set_val); 2015 + /* Configure the PF's internal FID_enable for master transactions */ 2016 + qed_wr(p_hwfn, p_ptt, PGLUE_B_REG_INTERNAL_PFID_ENABLE_MASTER, set_val); 2014 2017 2015 - /* wait until value is set - try for 1 second every 50us */ 2018 + /* Wait until value is set - try for 1 second every 50us */ 2016 2019 for (delay_idx = 0; delay_idx < 20000; delay_idx++) { 2017 2020 val = qed_rd(p_hwfn, p_ptt, 2018 2021 PGLUE_B_REG_INTERNAL_PFID_ENABLE_MASTER); ··· 2064 2071 return 0; 2065 2072 } 2066 2073 2074 + static void qed_pglueb_clear_err(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) 2075 + { 2076 + qed_wr(p_hwfn, p_ptt, PGLUE_B_REG_WAS_ERROR_PF_31_0_CLR, 2077 + BIT(p_hwfn->abs_pf_id)); 2078 + } 2079 + 2067 2080 int qed_hw_init(struct qed_dev *cdev, struct qed_hw_init_params *p_params) 2068 2081 { 2069 2082 struct qed_load_req_params load_req_params; 2070 2083 u32 load_code, resp, param, drv_mb_param; 2071 2084 bool b_default_mtu = true; 2072 2085 struct qed_hwfn *p_hwfn; 2073 - int rc = 0, mfw_rc, 
i; 2086 + int rc = 0, i; 2074 2087 u16 ether_type; 2075 2088 2076 2089 if ((p_params->int_mode == QED_INT_MODE_MSI) && (cdev->num_hwfns > 1)) { ··· 2091 2092 } 2092 2093 2093 2094 for_each_hwfn(cdev, i) { 2094 - struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; 2095 + p_hwfn = &cdev->hwfns[i]; 2095 2096 2096 2097 /* If management didn't provide a default, set one of our own */ 2097 2098 if (!p_hwfn->hw_info.mtu) { ··· 2103 2104 qed_vf_start(p_hwfn, p_params); 2104 2105 continue; 2105 2106 } 2106 - 2107 - /* Enable DMAE in PXP */ 2108 - rc = qed_change_pci_hwfn(p_hwfn, p_hwfn->p_main_ptt, true); 2109 2107 2110 2108 rc = qed_calc_hw_mode(p_hwfn); 2111 2109 if (rc) ··· 2140 2144 "Load request was sent. Load code: 0x%x\n", 2141 2145 load_code); 2142 2146 2147 + /* Only relevant for recovery: 2148 + * Clear the indication after LOAD_REQ is responded by the MFW. 2149 + */ 2150 + cdev->recov_in_prog = false; 2151 + 2143 2152 qed_mcp_set_capabilities(p_hwfn, p_hwfn->p_main_ptt); 2144 2153 2145 2154 qed_reset_mb_shadow(p_hwfn, p_hwfn->p_main_ptt); 2146 2155 2147 - p_hwfn->first_on_engine = (load_code == 2148 - FW_MSG_CODE_DRV_LOAD_ENGINE); 2156 + /* Clean up chip from previous driver if such remains exist. 2157 + * This is not needed when the PF is the first one on the 2158 + * engine, since afterwards we are going to init the FW. 2159 + */ 2160 + if (load_code != FW_MSG_CODE_DRV_LOAD_ENGINE) { 2161 + rc = qed_final_cleanup(p_hwfn, p_hwfn->p_main_ptt, 2162 + p_hwfn->rel_pf_id, false); 2163 + if (rc) { 2164 + DP_NOTICE(p_hwfn, "Final cleanup failed\n"); 2165 + goto load_err; 2166 + } 2167 + } 2168 + 2169 + /* Log and clear previous pglue_b errors if such exist */ 2170 + qed_pglueb_rbc_attn_handler(p_hwfn, p_hwfn->p_main_ptt); 2171 + 2172 + /* Enable the PF's internal FID_enable in the PXP */ 2173 + rc = qed_pglueb_set_pfid_enable(p_hwfn, p_hwfn->p_main_ptt, 2174 + true); 2175 + if (rc) 2176 + goto load_err; 2177 + 2178 + /* Clear the pglue_b was_error indication. 
2179 + * In E4 it must be done after the BME and the internal 2180 + * FID_enable for the PF are set, since VDMs may cause the 2181 + * indication to be set again. 2182 + */ 2183 + qed_pglueb_clear_err(p_hwfn, p_hwfn->p_main_ptt); 2149 2184 2150 2185 switch (load_code) { 2151 2186 case FW_MSG_CODE_DRV_LOAD_ENGINE: ··· 2207 2180 break; 2208 2181 } 2209 2182 2210 - if (rc) 2183 + if (rc) { 2211 2184 DP_NOTICE(p_hwfn, 2212 2185 "init phase failed for loadcode 0x%x (rc %d)\n", 2213 - load_code, rc); 2214 - 2215 - /* ACK mfw regardless of success or failure of initialization */ 2216 - mfw_rc = qed_mcp_cmd(p_hwfn, p_hwfn->p_main_ptt, 2217 - DRV_MSG_CODE_LOAD_DONE, 2218 - 0, &load_code, &param); 2219 - if (rc) 2220 - return rc; 2221 - if (mfw_rc) { 2222 - DP_NOTICE(p_hwfn, "Failed sending LOAD_DONE command\n"); 2223 - return mfw_rc; 2186 + load_code, rc); 2187 + goto load_err; 2224 2188 } 2225 2189 2226 - /* Check if there is a DID mismatch between nvm-cfg/efuse */ 2227 - if (param & FW_MB_PARAM_LOAD_DONE_DID_EFUSE_ERROR) 2228 - DP_NOTICE(p_hwfn, 2229 - "warning: device configuration is not supported on this board type. 
The device may not function as expected.\n"); 2190 + rc = qed_mcp_load_done(p_hwfn, p_hwfn->p_main_ptt); 2191 + if (rc) 2192 + return rc; 2230 2193 2231 2194 /* send DCBX attention request command */ 2232 2195 DP_VERBOSE(p_hwfn, 2233 2196 QED_MSG_DCB, 2234 2197 "sending phony dcbx set command to trigger DCBx attention handling\n"); 2235 - mfw_rc = qed_mcp_cmd(p_hwfn, p_hwfn->p_main_ptt, 2236 - DRV_MSG_CODE_SET_DCBX, 2237 - 1 << DRV_MB_PARAM_DCBX_NOTIFY_SHIFT, 2238 - &load_code, &param); 2239 - if (mfw_rc) { 2198 + rc = qed_mcp_cmd(p_hwfn, p_hwfn->p_main_ptt, 2199 + DRV_MSG_CODE_SET_DCBX, 2200 + 1 << DRV_MB_PARAM_DCBX_NOTIFY_SHIFT, 2201 + &resp, &param); 2202 + if (rc) { 2240 2203 DP_NOTICE(p_hwfn, 2241 2204 "Failed to send DCBX attention request\n"); 2242 - return mfw_rc; 2205 + return rc; 2243 2206 } 2244 2207 2245 2208 p_hwfn->hw_init_done = true; ··· 2278 2261 } 2279 2262 2280 2263 return 0; 2264 + 2265 + load_err: 2266 + /* The MFW load lock should be released also when initialization fails. 2267 + */ 2268 + qed_mcp_load_done(p_hwfn, p_hwfn->p_main_ptt); 2269 + return rc; 2281 2270 } 2282 2271 2283 2272 #define QED_HW_STOP_RETRY_LIMIT (10) ··· 2295 2272 /* close timers */ 2296 2273 qed_wr(p_hwfn, p_ptt, TM_REG_PF_ENABLE_CONN, 0x0); 2297 2274 qed_wr(p_hwfn, p_ptt, TM_REG_PF_ENABLE_TASK, 0x0); 2275 + 2276 + if (cdev->recov_in_prog) 2277 + return; 2298 2278 2299 2279 for (i = 0; i < QED_HW_STOP_RETRY_LIMIT; i++) { 2300 2280 if ((!qed_rd(p_hwfn, p_ptt, ··· 2361 2335 p_hwfn->hw_init_done = false; 2362 2336 2363 2337 /* Send unload command to MCP */ 2364 - rc = qed_mcp_unload_req(p_hwfn, p_ptt); 2365 - if (rc) { 2366 - DP_NOTICE(p_hwfn, 2367 - "Failed sending a UNLOAD_REQ command. rc = %d.\n", 2368 - rc); 2369 - rc2 = -EINVAL; 2338 + if (!cdev->recov_in_prog) { 2339 + rc = qed_mcp_unload_req(p_hwfn, p_ptt); 2340 + if (rc) { 2341 + DP_NOTICE(p_hwfn, 2342 + "Failed sending a UNLOAD_REQ command. 
rc = %d.\n", 2343 + rc); 2344 + rc2 = -EINVAL; 2345 + } 2370 2346 } 2371 2347 2372 2348 qed_slowpath_irq_sync(p_hwfn); ··· 2410 2382 qed_wr(p_hwfn, p_ptt, DORQ_REG_PF_DB_ENABLE, 0); 2411 2383 qed_wr(p_hwfn, p_ptt, QM_REG_PF_EN, 0); 2412 2384 2413 - qed_mcp_unload_done(p_hwfn, p_ptt); 2414 - if (rc) { 2415 - DP_NOTICE(p_hwfn, 2416 - "Failed sending a UNLOAD_DONE command. rc = %d.\n", 2417 - rc); 2418 - rc2 = -EINVAL; 2385 + if (!cdev->recov_in_prog) { 2386 + rc = qed_mcp_unload_done(p_hwfn, p_ptt); 2387 + if (rc) { 2388 + DP_NOTICE(p_hwfn, 2389 + "Failed sending a UNLOAD_DONE command. rc = %d.\n", 2390 + rc); 2391 + rc2 = -EINVAL; 2392 + } 2419 2393 } 2420 2394 } 2421 2395 2422 - if (IS_PF(cdev)) { 2396 + if (IS_PF(cdev) && !cdev->recov_in_prog) { 2423 2397 p_hwfn = QED_LEADING_HWFN(cdev); 2424 2398 p_ptt = QED_LEADING_HWFN(cdev)->p_main_ptt; 2425 2399 2426 - /* Disable DMAE in PXP - in CMT, this should only be done for 2427 - * first hw-function, and only after all transactions have 2428 - * stopped for all active hw-functions. 2400 + /* Clear the PF's internal FID_enable in the PXP. 2401 + * In CMT this should only be done for first hw-function, and 2402 + * only after all transactions have stopped for all active 2403 + * hw-functions. 2429 2404 */ 2430 - rc = qed_change_pci_hwfn(p_hwfn, p_ptt, false); 2405 + rc = qed_pglueb_set_pfid_enable(p_hwfn, p_ptt, false); 2431 2406 if (rc) { 2432 2407 DP_NOTICE(p_hwfn, 2433 - "qed_change_pci_hwfn failed. rc = %d.\n", rc); 2408 + "qed_pglueb_set_pfid_enable() failed. 
rc = %d.\n", 2409 + rc); 2434 2410 rc2 = -EINVAL; 2435 2411 } 2436 2412 } ··· 2534 2502 PGLUE_B_REG_PGL_ADDR_94_F0_BB, 0); 2535 2503 } 2536 2504 2537 - /* Clean Previous errors if such exist */ 2538 - qed_wr(p_hwfn, p_hwfn->p_main_ptt, 2539 - PGLUE_B_REG_WAS_ERROR_PF_31_0_CLR, 1 << p_hwfn->abs_pf_id); 2505 + /* Clean previous pglue_b errors if such exist */ 2506 + qed_pglueb_clear_err(p_hwfn, p_hwfn->p_main_ptt); 2540 2507 2541 2508 /* enable internal target-read */ 2542 2509 qed_wr(p_hwfn, p_hwfn->p_main_ptt, ··· 3471 3440 void __iomem *p_doorbells, 3472 3441 enum qed_pci_personality personality) 3473 3442 { 3443 + struct qed_dev *cdev = p_hwfn->cdev; 3474 3444 int rc = 0; 3475 3445 3476 3446 /* Split PCI bars evenly between hwfns */ ··· 3524 3492 /* Sending a mailbox to the MFW should be done after qed_get_hw_info() 3525 3493 * is called as it sets the ports number in an engine. 3526 3494 */ 3527 - if (IS_LEAD_HWFN(p_hwfn)) { 3495 + if (IS_LEAD_HWFN(p_hwfn) && !cdev->recov_in_prog) { 3528 3496 rc = qed_mcp_initiate_pf_flr(p_hwfn, p_hwfn->p_main_ptt); 3529 3497 if (rc) 3530 3498 DP_NOTICE(p_hwfn, "Failed to initiate PF FLR\n");
+12
drivers/net/ethernet/qlogic/qed/qed_dev_api.h
··· 473 473 qed_set_queue_coalesce(u16 rx_coal, u16 tx_coal, void *p_handle); 474 474 475 475 /** 476 + * @brief qed_pglueb_set_pfid_enable - Enable or disable PCI BUS MASTER 477 + * 478 + * @param p_hwfn 479 + * @param p_ptt 480 + * @param b_enable - true/false 481 + * 482 + * @return int 483 + */ 484 + int qed_pglueb_set_pfid_enable(struct qed_hwfn *p_hwfn, 485 + struct qed_ptt *p_ptt, bool b_enable); 486 + 487 + /** 476 488 * @brief db_recovery_add - add doorbell information to the doorbell 477 489 * recovery mechanism. 478 490 *
+1 -1
drivers/net/ethernet/qlogic/qed/qed_hsi.h
··· 12827 12827 MFW_DRV_MSG_LLDP_DATA_UPDATED, 12828 12828 MFW_DRV_MSG_DCBX_REMOTE_MIB_UPDATED, 12829 12829 MFW_DRV_MSG_DCBX_OPERATIONAL_MIB_UPDATED, 12830 - MFW_DRV_MSG_RESERVED4, 12830 + MFW_DRV_MSG_ERROR_RECOVERY, 12831 12831 MFW_DRV_MSG_BW_UPDATE, 12832 12832 MFW_DRV_MSG_S_TAG_UPDATE, 12833 12833 MFW_DRV_MSG_GET_LAN_STATS,
+11
drivers/net/ethernet/qlogic/qed/qed_hw.c
··· 703 703 int qed_status = 0; 704 704 u32 offset = 0; 705 705 706 + if (p_hwfn->cdev->recov_in_prog) { 707 + DP_VERBOSE(p_hwfn, 708 + NETIF_MSG_HW, 709 + "Recovery is in progress. Avoid DMAE transaction [{src: addr 0x%llx, type %d}, {dst: addr 0x%llx, type %d}, size %d].\n", 710 + src_addr, src_type, dst_addr, dst_type, 711 + size_in_dwords); 712 + 713 + /* Let the flow complete w/o any error handling */ 714 + return 0; 715 + } 716 + 706 717 qed_dmae_opcode(p_hwfn, 707 718 (src_type == QED_DMAE_ADDRESS_GRC), 708 719 (dst_type == QED_DMAE_ADDRESS_GRC),
+64 -62
drivers/net/ethernet/qlogic/qed/qed_int.c
··· 255 255 #define PGLUE_ATTENTION_ICPL_VALID (1 << 23) 256 256 #define PGLUE_ATTENTION_ZLR_VALID (1 << 25) 257 257 #define PGLUE_ATTENTION_ILT_VALID (1 << 23) 258 - static int qed_pglub_rbc_attn_cb(struct qed_hwfn *p_hwfn) 258 + 259 + int qed_pglueb_rbc_attn_handler(struct qed_hwfn *p_hwfn, 260 + struct qed_ptt *p_ptt) 259 261 { 260 262 u32 tmp; 261 263 262 - tmp = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, 263 - PGLUE_B_REG_TX_ERR_WR_DETAILS2); 264 + tmp = qed_rd(p_hwfn, p_ptt, PGLUE_B_REG_TX_ERR_WR_DETAILS2); 264 265 if (tmp & PGLUE_ATTENTION_VALID) { 265 266 u32 addr_lo, addr_hi, details; 266 267 267 - addr_lo = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, 268 + addr_lo = qed_rd(p_hwfn, p_ptt, 268 269 PGLUE_B_REG_TX_ERR_WR_ADD_31_0); 269 - addr_hi = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, 270 + addr_hi = qed_rd(p_hwfn, p_ptt, 270 271 PGLUE_B_REG_TX_ERR_WR_ADD_63_32); 271 - details = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, 272 + details = qed_rd(p_hwfn, p_ptt, 272 273 PGLUE_B_REG_TX_ERR_WR_DETAILS); 273 274 274 - DP_INFO(p_hwfn, 275 - "Illegal write by chip to [%08x:%08x] blocked.\n" 276 - "Details: %08x [PFID %02x, VFID %02x, VF_VALID %02x]\n" 277 - "Details2 %08x [Was_error %02x BME deassert %02x FID_enable deassert %02x]\n", 278 - addr_hi, addr_lo, details, 279 - (u8)GET_FIELD(details, PGLUE_ATTENTION_DETAILS_PFID), 280 - (u8)GET_FIELD(details, PGLUE_ATTENTION_DETAILS_VFID), 281 - GET_FIELD(details, 282 - PGLUE_ATTENTION_DETAILS_VF_VALID) ? 1 : 0, 283 - tmp, 284 - GET_FIELD(tmp, 285 - PGLUE_ATTENTION_DETAILS2_WAS_ERR) ? 1 : 0, 286 - GET_FIELD(tmp, 287 - PGLUE_ATTENTION_DETAILS2_BME) ? 1 : 0, 288 - GET_FIELD(tmp, 289 - PGLUE_ATTENTION_DETAILS2_FID_EN) ? 
1 : 0); 275 + DP_NOTICE(p_hwfn, 276 + "Illegal write by chip to [%08x:%08x] blocked.\n" 277 + "Details: %08x [PFID %02x, VFID %02x, VF_VALID %02x]\n" 278 + "Details2 %08x [Was_error %02x BME deassert %02x FID_enable deassert %02x]\n", 279 + addr_hi, addr_lo, details, 280 + (u8)GET_FIELD(details, PGLUE_ATTENTION_DETAILS_PFID), 281 + (u8)GET_FIELD(details, PGLUE_ATTENTION_DETAILS_VFID), 282 + GET_FIELD(details, 283 + PGLUE_ATTENTION_DETAILS_VF_VALID) ? 1 : 0, 284 + tmp, 285 + GET_FIELD(tmp, 286 + PGLUE_ATTENTION_DETAILS2_WAS_ERR) ? 1 : 0, 287 + GET_FIELD(tmp, 288 + PGLUE_ATTENTION_DETAILS2_BME) ? 1 : 0, 289 + GET_FIELD(tmp, 290 + PGLUE_ATTENTION_DETAILS2_FID_EN) ? 1 : 0); 290 291 } 291 292 292 - tmp = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, 293 - PGLUE_B_REG_TX_ERR_RD_DETAILS2); 293 + tmp = qed_rd(p_hwfn, p_ptt, PGLUE_B_REG_TX_ERR_RD_DETAILS2); 294 294 if (tmp & PGLUE_ATTENTION_RD_VALID) { 295 295 u32 addr_lo, addr_hi, details; 296 296 297 - addr_lo = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, 297 + addr_lo = qed_rd(p_hwfn, p_ptt, 298 298 PGLUE_B_REG_TX_ERR_RD_ADD_31_0); 299 - addr_hi = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, 299 + addr_hi = qed_rd(p_hwfn, p_ptt, 300 300 PGLUE_B_REG_TX_ERR_RD_ADD_63_32); 301 - details = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, 301 + details = qed_rd(p_hwfn, p_ptt, 302 302 PGLUE_B_REG_TX_ERR_RD_DETAILS); 303 303 304 - DP_INFO(p_hwfn, 305 - "Illegal read by chip from [%08x:%08x] blocked.\n" 306 - " Details: %08x [PFID %02x, VFID %02x, VF_VALID %02x]\n" 307 - " Details2 %08x [Was_error %02x BME deassert %02x FID_enable deassert %02x]\n", 308 - addr_hi, addr_lo, details, 309 - (u8)GET_FIELD(details, PGLUE_ATTENTION_DETAILS_PFID), 310 - (u8)GET_FIELD(details, PGLUE_ATTENTION_DETAILS_VFID), 311 - GET_FIELD(details, 312 - PGLUE_ATTENTION_DETAILS_VF_VALID) ? 1 : 0, 313 - tmp, 314 - GET_FIELD(tmp, PGLUE_ATTENTION_DETAILS2_WAS_ERR) ? 1 315 - : 0, 316 - GET_FIELD(tmp, PGLUE_ATTENTION_DETAILS2_BME) ? 1 : 0, 317 - GET_FIELD(tmp, PGLUE_ATTENTION_DETAILS2_FID_EN) ? 
1 318 - : 0); 304 + DP_NOTICE(p_hwfn, 305 + "Illegal read by chip from [%08x:%08x] blocked.\n" 306 + "Details: %08x [PFID %02x, VFID %02x, VF_VALID %02x]\n" 307 + "Details2 %08x [Was_error %02x BME deassert %02x FID_enable deassert %02x]\n", 308 + addr_hi, addr_lo, details, 309 + (u8)GET_FIELD(details, PGLUE_ATTENTION_DETAILS_PFID), 310 + (u8)GET_FIELD(details, PGLUE_ATTENTION_DETAILS_VFID), 311 + GET_FIELD(details, 312 + PGLUE_ATTENTION_DETAILS_VF_VALID) ? 1 : 0, 313 + tmp, 314 + GET_FIELD(tmp, 315 + PGLUE_ATTENTION_DETAILS2_WAS_ERR) ? 1 : 0, 316 + GET_FIELD(tmp, 317 + PGLUE_ATTENTION_DETAILS2_BME) ? 1 : 0, 318 + GET_FIELD(tmp, 319 + PGLUE_ATTENTION_DETAILS2_FID_EN) ? 1 : 0); 319 320 } 320 321 321 - tmp = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, 322 - PGLUE_B_REG_TX_ERR_WR_DETAILS_ICPL); 322 + tmp = qed_rd(p_hwfn, p_ptt, PGLUE_B_REG_TX_ERR_WR_DETAILS_ICPL); 323 323 if (tmp & PGLUE_ATTENTION_ICPL_VALID) 324 - DP_INFO(p_hwfn, "ICPL error - %08x\n", tmp); 324 + DP_NOTICE(p_hwfn, "ICPL error - %08x\n", tmp); 325 325 326 - tmp = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, 327 - PGLUE_B_REG_MASTER_ZLR_ERR_DETAILS); 326 + tmp = qed_rd(p_hwfn, p_ptt, PGLUE_B_REG_MASTER_ZLR_ERR_DETAILS); 328 327 if (tmp & PGLUE_ATTENTION_ZLR_VALID) { 329 328 u32 addr_hi, addr_lo; 330 329 331 - addr_lo = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, 330 + addr_lo = qed_rd(p_hwfn, p_ptt, 332 331 PGLUE_B_REG_MASTER_ZLR_ERR_ADD_31_0); 333 - addr_hi = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, 332 + addr_hi = qed_rd(p_hwfn, p_ptt, 334 333 PGLUE_B_REG_MASTER_ZLR_ERR_ADD_63_32); 335 334 336 - DP_INFO(p_hwfn, "ZLR eror - %08x [Address %08x:%08x]\n", 337 - tmp, addr_hi, addr_lo); 335 + DP_NOTICE(p_hwfn, "ZLR error - %08x [Address %08x:%08x]\n", 336 + tmp, addr_hi, addr_lo); 338 337 } 339 338 340 - tmp = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, 341 - PGLUE_B_REG_VF_ILT_ERR_DETAILS2); 339 + tmp = qed_rd(p_hwfn, p_ptt, PGLUE_B_REG_VF_ILT_ERR_DETAILS2); 342 340 if (tmp & PGLUE_ATTENTION_ILT_VALID) { 343 341 u32 addr_hi, addr_lo, details; 344 342 
345 - addr_lo = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, 343 + addr_lo = qed_rd(p_hwfn, p_ptt, 346 344 PGLUE_B_REG_VF_ILT_ERR_ADD_31_0); 347 - addr_hi = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, 345 + addr_hi = qed_rd(p_hwfn, p_ptt, 348 346 PGLUE_B_REG_VF_ILT_ERR_ADD_63_32); 349 - details = qed_rd(p_hwfn, p_hwfn->p_dpc_ptt, 347 + details = qed_rd(p_hwfn, p_ptt, 350 348 PGLUE_B_REG_VF_ILT_ERR_DETAILS); 351 349 352 - DP_INFO(p_hwfn, 353 - "ILT error - Details %08x Details2 %08x [Address %08x:%08x]\n", 354 - details, tmp, addr_hi, addr_lo); 350 + DP_NOTICE(p_hwfn, 351 + "ILT error - Details %08x Details2 %08x [Address %08x:%08x]\n", 352 + details, tmp, addr_hi, addr_lo); 355 353 } 356 354 357 355 /* Clear the indications */ 358 - qed_wr(p_hwfn, p_hwfn->p_dpc_ptt, 359 - PGLUE_B_REG_LATCHED_ERRORS_CLR, (1 << 2)); 356 + qed_wr(p_hwfn, p_ptt, PGLUE_B_REG_LATCHED_ERRORS_CLR, BIT(2)); 360 357 361 358 return 0; 359 + } 360 + 361 + static int qed_pglueb_rbc_attn_cb(struct qed_hwfn *p_hwfn) 362 + { 363 + return qed_pglueb_rbc_attn_handler(p_hwfn, p_hwfn->p_dpc_ptt); 362 364 } 363 365 364 366 #define QED_DORQ_ATTENTION_REASON_MASK (0xfffff) ··· 542 540 {"PGLUE misc_flr", ATTENTION_SINGLE, 543 541 NULL, MAX_BLOCK_ID}, 544 542 {"PGLUE B RBC", ATTENTION_PAR_INT, 545 - qed_pglub_rbc_attn_cb, BLOCK_PGLUE_B}, 543 + qed_pglueb_rbc_attn_cb, BLOCK_PGLUE_B}, 546 544 {"PGLUE misc_mctp", ATTENTION_SINGLE, 547 545 NULL, MAX_BLOCK_ID}, 548 546 {"Flash event", ATTENTION_SINGLE, NULL, MAX_BLOCK_ID},
+3
drivers/net/ethernet/qlogic/qed/qed_int.h
··· 431 431 432 432 #define QED_MAPPING_MEMORY_SIZE(dev) (NUM_OF_SBS(dev)) 433 433 434 + int qed_pglueb_rbc_attn_handler(struct qed_hwfn *p_hwfn, 435 + struct qed_ptt *p_ptt); 436 + 434 437 #endif
+30
drivers/net/ethernet/qlogic/qed/qed_main.c
··· 359 359 360 360 qed_init_dp(cdev, params->dp_module, params->dp_level); 361 361 362 + cdev->recov_in_prog = params->recov_in_prog; 363 + 362 364 rc = qed_init_pci(cdev, pdev); 363 365 if (rc) { 364 366 DP_ERR(cdev, "init pci failed\n"); ··· 2205 2203 return qed_mcp_get_nvm_image(hwfn, type, buf, len); 2206 2204 } 2207 2205 2206 + void qed_schedule_recovery_handler(struct qed_hwfn *p_hwfn) 2207 + { 2208 + struct qed_common_cb_ops *ops = p_hwfn->cdev->protocol_ops.common; 2209 + void *cookie = p_hwfn->cdev->ops_cookie; 2210 + 2211 + if (ops && ops->schedule_recovery_handler) 2212 + ops->schedule_recovery_handler(cookie); 2213 + } 2214 + 2208 2215 static int qed_set_coalesce(struct qed_dev *cdev, u16 rx_coal, u16 tx_coal, 2209 2216 void *handle) 2210 2217 { ··· 2235 2224 qed_ptt_release(hwfn, ptt); 2236 2225 2237 2226 return status; 2227 + } 2228 + 2229 + static int qed_recovery_process(struct qed_dev *cdev) 2230 + { 2231 + struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev); 2232 + struct qed_ptt *p_ptt; 2233 + int rc = 0; 2234 + 2235 + p_ptt = qed_ptt_acquire(p_hwfn); 2236 + if (!p_ptt) 2237 + return -EAGAIN; 2238 + 2239 + rc = qed_start_recovery_process(p_hwfn, p_ptt); 2240 + 2241 + qed_ptt_release(p_hwfn, p_ptt); 2242 + 2243 + return rc; 2238 2244 } 2239 2245 2240 2246 static int qed_update_wol(struct qed_dev *cdev, bool enabled) ··· 2408 2380 .nvm_get_image = &qed_nvm_get_image, 2409 2381 .set_coalesce = &qed_set_coalesce, 2410 2382 .set_led = &qed_set_led, 2383 + .recovery_process = &qed_recovery_process, 2384 + .recovery_prolog = &qed_recovery_prolog, 2411 2385 .update_drv_state = &qed_update_drv_state, 2412 2386 .update_mac = &qed_update_mac, 2413 2387 .update_mtu = &qed_update_mtu,
+115
drivers/net/ethernet/qlogic/qed/qed_mcp.c
··· 1070 1070 return 0; 1071 1071 } 1072 1072 1073 + int qed_mcp_load_done(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) 1074 + { 1075 + u32 resp = 0, param = 0; 1076 + int rc; 1077 + 1078 + rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_LOAD_DONE, 0, &resp, 1079 + &param); 1080 + if (rc) { 1081 + DP_NOTICE(p_hwfn, 1082 + "Failed to send a LOAD_DONE command, rc = %d\n", rc); 1083 + return rc; 1084 + } 1085 + 1086 + /* Check if there is a DID mismatch between nvm-cfg/efuse */ 1087 + if (param & FW_MB_PARAM_LOAD_DONE_DID_EFUSE_ERROR) 1088 + DP_NOTICE(p_hwfn, 1089 + "warning: device configuration is not supported on this board type. The device may not function as expected.\n"); 1090 + 1091 + return 0; 1092 + } 1093 + 1073 1094 int qed_mcp_unload_req(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) 1074 1095 { 1075 1096 struct qed_mcp_mb_params mb_params; ··· 1549 1528 return 0; 1550 1529 } 1551 1530 1531 + u32 qed_get_process_kill_counter(struct qed_hwfn *p_hwfn, 1532 + struct qed_ptt *p_ptt) 1533 + { 1534 + u32 path_offsize_addr, path_offsize, path_addr, proc_kill_cnt; 1535 + 1536 + if (IS_VF(p_hwfn->cdev)) 1537 + return -EINVAL; 1538 + 1539 + path_offsize_addr = SECTION_OFFSIZE_ADDR(p_hwfn->mcp_info->public_base, 1540 + PUBLIC_PATH); 1541 + path_offsize = qed_rd(p_hwfn, p_ptt, path_offsize_addr); 1542 + path_addr = SECTION_ADDR(path_offsize, QED_PATH_ID(p_hwfn)); 1543 + 1544 + proc_kill_cnt = qed_rd(p_hwfn, p_ptt, 1545 + path_addr + 1546 + offsetof(struct public_path, process_kill)) & 1547 + PROCESS_KILL_COUNTER_MASK; 1548 + 1549 + return proc_kill_cnt; 1550 + } 1551 + 1552 + static void qed_mcp_handle_process_kill(struct qed_hwfn *p_hwfn, 1553 + struct qed_ptt *p_ptt) 1554 + { 1555 + struct qed_dev *cdev = p_hwfn->cdev; 1556 + u32 proc_kill_cnt; 1557 + 1558 + /* Prevent possible attentions/interrupts during the recovery handling 1559 + * and till its load phase, during which they will be re-enabled. 
1560 + */ 1561 + qed_int_igu_disable_int(p_hwfn, p_ptt); 1562 + 1563 + DP_NOTICE(p_hwfn, "Received a process kill indication\n"); 1564 + 1565 + /* The following operations should be done once, and thus in CMT mode 1566 + * are carried out by only the first HW function. 1567 + */ 1568 + if (p_hwfn != QED_LEADING_HWFN(cdev)) 1569 + return; 1570 + 1571 + if (cdev->recov_in_prog) { 1572 + DP_NOTICE(p_hwfn, 1573 + "Ignoring the indication since a recovery process is already in progress\n"); 1574 + return; 1575 + } 1576 + 1577 + cdev->recov_in_prog = true; 1578 + 1579 + proc_kill_cnt = qed_get_process_kill_counter(p_hwfn, p_ptt); 1580 + DP_NOTICE(p_hwfn, "Process kill counter: %d\n", proc_kill_cnt); 1581 + 1582 + qed_schedule_recovery_handler(p_hwfn); 1583 + } 1584 + 1552 1585 static void qed_mcp_send_protocol_stats(struct qed_hwfn *p_hwfn, 1553 1586 struct qed_ptt *p_ptt, 1554 1587 enum MFW_DRV_MSG_TYPE type) ··· 1832 1757 break; 1833 1758 case MFW_DRV_MSG_TRANSCEIVER_STATE_CHANGE: 1834 1759 qed_mcp_handle_transceiver_change(p_hwfn, p_ptt); 1760 + break; 1761 + case MFW_DRV_MSG_ERROR_RECOVERY: 1762 + qed_mcp_handle_process_kill(p_hwfn, p_ptt); 1835 1763 break; 1836 1764 case MFW_DRV_MSG_GET_LAN_STATS: 1837 1765 case MFW_DRV_MSG_GET_FCOE_STATS: ··· 2379 2301 *p_flash_size = flash_size; 2380 2302 2381 2303 return 0; 2304 + } 2305 + 2306 + int qed_start_recovery_process(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) 2307 + { 2308 + struct qed_dev *cdev = p_hwfn->cdev; 2309 + 2310 + if (cdev->recov_in_prog) { 2311 + DP_NOTICE(p_hwfn, 2312 + "Avoid triggering a recovery since such a process is already in progress\n"); 2313 + return -EAGAIN; 2314 + } 2315 + 2316 + DP_NOTICE(p_hwfn, "Triggering a recovery process\n"); 2317 + qed_wr(p_hwfn, p_ptt, MISC_REG_AEU_GENERAL_ATTN_35, 0x1); 2318 + 2319 + return 0; 2320 + } 2321 + 2322 + #define QED_RECOVERY_PROLOG_SLEEP_MS 100 2323 + 2324 + int qed_recovery_prolog(struct qed_dev *cdev) 2325 + { 2326 + struct qed_hwfn *p_hwfn = 
QED_LEADING_HWFN(cdev); 2327 + struct qed_ptt *p_ptt = p_hwfn->p_main_ptt; 2328 + int rc; 2329 + 2330 + /* Allow ongoing PCIe transactions to complete */ 2331 + msleep(QED_RECOVERY_PROLOG_SLEEP_MS); 2332 + 2333 + /* Clear the PF's internal FID_enable in the PXP */ 2334 + rc = qed_pglueb_set_pfid_enable(p_hwfn, p_ptt, false); 2335 + if (rc) 2336 + DP_NOTICE(p_hwfn, 2337 + "qed_pglueb_set_pfid_enable() failed. rc = %d.\n", 2338 + rc); 2339 + 2340 + return rc; 2382 2341 } 2383 2342 2384 2343 static int
+42
drivers/net/ethernet/qlogic/qed/qed_mcp.h
··· 441 441 struct qed_mcp_drv_version *p_ver); 442 442 443 443 /** 444 + * @brief Read the MFW process kill counter 445 + * 446 + * @param p_hwfn 447 + * @param p_ptt 448 + * 449 + * @return u32 450 + */ 451 + u32 qed_get_process_kill_counter(struct qed_hwfn *p_hwfn, 452 + struct qed_ptt *p_ptt); 453 + 454 + /** 455 + * @brief Trigger a recovery process 456 + * 457 + * @param p_hwfn 458 + * @param p_ptt 459 + * 460 + * @return int 461 + */ 462 + int qed_start_recovery_process(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); 463 + 464 + /** 465 + * @brief A recovery handler must call this function as its first step. 466 + * It is assumed that the handler is not run from an interrupt context. 467 + * 468 + * @param cdev 469 + * @param p_ptt 470 + * 471 + * @return int 472 + */ 473 + int qed_recovery_prolog(struct qed_dev *cdev); 474 + 475 + /** 444 476 * @brief Notify MFW about the change in base device properties 445 477 * 446 478 * @param p_hwfn ··· 831 799 int qed_mcp_load_req(struct qed_hwfn *p_hwfn, 832 800 struct qed_ptt *p_ptt, 833 801 struct qed_load_req_params *p_params); 802 + 803 + /** 804 + * @brief Sends a LOAD_DONE message to the MFW 805 + * 806 + * @param p_hwfn 807 + * @param p_ptt 808 + * 809 + * @return int - 0 - Operation was successful. 810 + */ 811 + int qed_mcp_load_done(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); 834 812 835 813 /** 836 814 * @brief Sends a UNLOAD_REQ message to the MFW
+2
drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
··· 518 518 0x180824UL 519 519 #define MISC_REG_AEU_GENERAL_ATTN_0 \ 520 520 0x008400UL 521 + #define MISC_REG_AEU_GENERAL_ATTN_35 \ 522 + 0x00848cUL 521 523 #define CAU_REG_SB_ADDR_MEMORY \ 522 524 0x1c8000UL 523 525 #define CAU_REG_SB_VAR_MEMORY \
+22
drivers/net/ethernet/qlogic/qed/qed_spq.c
··· 790 790 SPQ_HIGH_PRI_RESERVE_DEFAULT); 791 791 } 792 792 793 + static void qed_spq_recov_set_ret_code(struct qed_spq_entry *p_ent, 794 + u8 *fw_return_code) 795 + { 796 + if (!fw_return_code) 797 + return; 798 + 799 + if (p_ent->elem.hdr.protocol_id == PROTOCOLID_ROCE || 800 + p_ent->elem.hdr.protocol_id == PROTOCOLID_IWARP) 801 + *fw_return_code = RDMA_RETURN_OK; 802 + } 803 + 793 804 /* Avoid overriding of SPQ entries when getting out-of-order completions, by 794 805 * marking the completions in a bitmap and increasing the chain consumer only 795 806 * for the first successive completed entries. ··· 834 823 if (!p_ent) { 835 824 DP_NOTICE(p_hwfn, "Got a NULL pointer\n"); 836 825 return -EINVAL; 826 + } 827 + 828 + if (p_hwfn->cdev->recov_in_prog) { 829 + DP_VERBOSE(p_hwfn, 830 + QED_MSG_SPQ, 831 + "Recovery is in progress. Skip spq post [cmd %02x protocol %02x]\n", 832 + p_ent->elem.hdr.cmd_id, p_ent->elem.hdr.protocol_id); 833 + 834 + /* Let the flow complete w/o any error handling */ 835 + qed_spq_recov_set_ret_code(p_ent, fw_return_code); 836 + return 0; 837 837 } 838 838 839 839 /* Complete the entry */
+8 -1
drivers/net/ethernet/qlogic/qed/qed_sriov.c
··· 4447 4447 if (cdev->p_iov_info && cdev->p_iov_info->num_vfs && pci_enabled) 4448 4448 pci_disable_sriov(cdev->pdev); 4449 4449 4450 + if (cdev->recov_in_prog) { 4451 + DP_VERBOSE(cdev, 4452 + QED_MSG_IOV, 4453 + "Skip SRIOV disable operations in the device since a recovery is in progress\n"); 4454 + goto out; 4455 + } 4456 + 4450 4457 for_each_hwfn(cdev, i) { 4451 4458 struct qed_hwfn *hwfn = &cdev->hwfns[i]; 4452 4459 struct qed_ptt *ptt = qed_ptt_acquire(hwfn); ··· 4493 4486 4494 4487 qed_ptt_release(hwfn, ptt); 4495 4488 } 4496 - 4489 + out: 4497 4490 qed_iov_set_vfs_to_disable(cdev, false); 4498 4491 4499 4492 return 0;
+3
drivers/net/ethernet/qlogic/qede/qede.h
··· 162 162 struct list_head entry; 163 163 struct list_head rdma_event_list; 164 164 struct workqueue_struct *rdma_wq; 165 + bool exp_recovery; 165 166 }; 166 167 167 168 struct qede_ptp; ··· 265 264 enum QEDE_STATE { 266 265 QEDE_STATE_CLOSED, 267 266 QEDE_STATE_OPEN, 267 + QEDE_STATE_RECOVERY, 268 268 }; 269 269 270 270 #define HILO_U64(hi, lo) ((((u64)(hi)) << 32) + (lo)) ··· 464 462 #define QEDE_CSUM_UNNECESSARY BIT(1) 465 463 #define QEDE_TUNN_CSUM_UNNECESSARY BIT(2) 466 464 465 + #define QEDE_SP_RECOVERY 0 467 466 #define QEDE_SP_RX_MODE 1 468 467 469 468 #ifdef CONFIG_RFS_ACCEL
+243 -51
drivers/net/ethernet/qlogic/qede/qede_main.c
··· 133 133 static void qede_remove(struct pci_dev *pdev); 134 134 static void qede_shutdown(struct pci_dev *pdev); 135 135 static void qede_link_update(void *dev, struct qed_link_output *link); 136 + static void qede_schedule_recovery_handler(void *dev); 137 + static void qede_recovery_handler(struct qede_dev *edev); 136 138 static void qede_get_eth_tlv_data(void *edev, void *data); 137 139 static void qede_get_generic_tlv_data(void *edev, 138 140 struct qed_generic_tlvs *data); 139 - 140 - /* The qede lock is used to protect driver state change and driver flows that 141 - * are not reentrant. 142 - */ 143 - void __qede_lock(struct qede_dev *edev) 144 - { 145 - mutex_lock(&edev->qede_lock); 146 - } 147 - 148 - void __qede_unlock(struct qede_dev *edev) 149 - { 150 - mutex_unlock(&edev->qede_lock); 151 - } 152 141 153 142 #ifdef CONFIG_QED_SRIOV 154 143 static int qede_set_vf_vlan(struct net_device *ndev, int vf, u16 vlan, u8 qos, ··· 220 231 .arfs_filter_op = qede_arfs_filter_op, 221 232 #endif 222 233 .link_update = qede_link_update, 234 + .schedule_recovery_handler = qede_schedule_recovery_handler, 223 235 .get_generic_tlv_data = qede_get_generic_tlv_data, 224 236 .get_protocol_tlv_data = qede_get_eth_tlv_data, 225 237 }, ··· 940 950 return -ENOMEM; 941 951 } 942 952 953 + /* The qede lock is used to protect driver state change and driver flows that 954 + * are not reentrant. 955 + */ 956 + void __qede_lock(struct qede_dev *edev) 957 + { 958 + mutex_lock(&edev->qede_lock); 959 + } 960 + 961 + void __qede_unlock(struct qede_dev *edev) 962 + { 963 + mutex_unlock(&edev->qede_lock); 964 + } 965 + 966 + /* This version of the lock should be used when acquiring the RTNL lock is also 967 + * needed in addition to the internal qede lock. 
968 + */ 969 + void qede_lock(struct qede_dev *edev) 970 + { 971 + rtnl_lock(); 972 + __qede_lock(edev); 973 + } 974 + 975 + void qede_unlock(struct qede_dev *edev) 976 + { 977 + __qede_unlock(edev); 978 + rtnl_unlock(); 979 + } 980 + 943 981 static void qede_sp_task(struct work_struct *work) 944 982 { 945 983 struct qede_dev *edev = container_of(work, struct qede_dev, 946 984 sp_task.work); 985 + 986 + /* The locking scheme depends on the specific flag: 987 + * In case of QEDE_SP_RECOVERY, acquiring the RTNL lock is required to 988 + * ensure that ongoing flows are ended and new ones are not started. 989 + * In other cases - only the internal qede lock should be acquired. 990 + */ 991 + 992 + if (test_and_clear_bit(QEDE_SP_RECOVERY, &edev->sp_flags)) { 993 + #ifdef CONFIG_QED_SRIOV 994 + /* SRIOV must be disabled outside the lock to avoid a deadlock. 995 + * The recovery of the active VFs is currently not supported. 996 + */ 997 + qede_sriov_configure(edev->pdev, 0); 998 + #endif 999 + qede_lock(edev); 1000 + qede_recovery_handler(edev); 1001 + qede_unlock(edev); 1002 + } 947 1003 948 1004 __qede_lock(edev); 949 1005 ··· 1067 1031 1068 1032 enum qede_probe_mode { 1069 1033 QEDE_PROBE_NORMAL, 1034 + QEDE_PROBE_RECOVERY, 1070 1035 }; 1071 1036 1072 1037 static int __qede_probe(struct pci_dev *pdev, u32 dp_module, u8 dp_level, ··· 1088 1051 probe_params.dp_module = dp_module; 1089 1052 probe_params.dp_level = dp_level; 1090 1053 probe_params.is_vf = is_vf; 1054 + probe_params.recov_in_prog = (mode == QEDE_PROBE_RECOVERY); 1091 1055 cdev = qed_ops->common->probe(pdev, &probe_params); 1092 1056 if (!cdev) { 1093 1057 rc = -ENODEV; ··· 1116 1078 if (rc) 1117 1079 goto err2; 1118 1080 1119 - edev = qede_alloc_etherdev(cdev, pdev, &dev_info, dp_module, 1120 - dp_level); 1121 - if (!edev) { 1122 - rc = -ENOMEM; 1123 - goto err2; 1081 + if (mode != QEDE_PROBE_RECOVERY) { 1082 + edev = qede_alloc_etherdev(cdev, pdev, &dev_info, dp_module, 1083 + dp_level); 1084 + if (!edev) 
{ 1085 + rc = -ENOMEM; 1086 + goto err2; 1087 + } 1088 + } else { 1089 + struct net_device *ndev = pci_get_drvdata(pdev); 1090 + 1091 + edev = netdev_priv(ndev); 1092 + edev->cdev = cdev; 1093 + memset(&edev->stats, 0, sizeof(edev->stats)); 1094 + memcpy(&edev->dev_info, &dev_info, sizeof(dev_info)); 1124 1095 } 1125 1096 1126 1097 if (is_vf) ··· 1137 1090 1138 1091 qede_init_ndev(edev); 1139 1092 1140 - rc = qede_rdma_dev_add(edev); 1093 + rc = qede_rdma_dev_add(edev, (mode == QEDE_PROBE_RECOVERY)); 1141 1094 if (rc) 1142 1095 goto err3; 1143 1096 1144 - /* Prepare the lock prior to the registration of the netdev, 1145 - * as once it's registered we might reach flows requiring it 1146 - * [it's even possible to reach a flow needing it directly 1147 - * from there, although it's unlikely]. 1148 - */ 1149 - INIT_DELAYED_WORK(&edev->sp_task, qede_sp_task); 1150 - mutex_init(&edev->qede_lock); 1151 - rc = register_netdev(edev->ndev); 1152 - if (rc) { 1153 - DP_NOTICE(edev, "Cannot register net-device\n"); 1154 - goto err4; 1097 + if (mode != QEDE_PROBE_RECOVERY) { 1098 + /* Prepare the lock prior to the registration of the netdev, 1099 + * as once it's registered we might reach flows requiring it 1100 + * [it's even possible to reach a flow needing it directly 1101 + * from there, although it's unlikely]. 
1102 + */ 1103 + INIT_DELAYED_WORK(&edev->sp_task, qede_sp_task); 1104 + mutex_init(&edev->qede_lock); 1105 + 1106 + rc = register_netdev(edev->ndev); 1107 + if (rc) { 1108 + DP_NOTICE(edev, "Cannot register net-device\n"); 1109 + goto err4; 1110 + } 1155 1111 } 1156 1112 1157 1113 edev->ops->common->set_name(cdev, edev->ndev->name); 1158 1114 1159 1115 /* PTP not supported on VFs */ 1160 1116 if (!is_vf) 1161 - qede_ptp_enable(edev, true); 1117 + qede_ptp_enable(edev, (mode == QEDE_PROBE_NORMAL)); 1162 1118 1163 1119 edev->ops->register_ops(cdev, &qede_ll_ops, edev); 1164 1120 ··· 1176 1126 return 0; 1177 1127 1178 1128 err4: 1179 - qede_rdma_dev_remove(edev); 1129 + qede_rdma_dev_remove(edev, (mode == QEDE_PROBE_RECOVERY)); 1180 1130 err3: 1181 1131 free_netdev(edev->ndev); 1182 1132 err2: ··· 1212 1162 1213 1163 enum qede_remove_mode { 1214 1164 QEDE_REMOVE_NORMAL, 1165 + QEDE_REMOVE_RECOVERY, 1215 1166 }; 1216 1167 1217 1168 static void __qede_remove(struct pci_dev *pdev, enum qede_remove_mode mode) ··· 1223 1172 1224 1173 DP_INFO(edev, "Starting qede_remove\n"); 1225 1174 1226 - qede_rdma_dev_remove(edev); 1227 - unregister_netdev(ndev); 1228 - cancel_delayed_work_sync(&edev->sp_task); 1175 + qede_rdma_dev_remove(edev, (mode == QEDE_REMOVE_RECOVERY)); 1176 + 1177 + if (mode != QEDE_REMOVE_RECOVERY) { 1178 + unregister_netdev(ndev); 1179 + 1180 + cancel_delayed_work_sync(&edev->sp_task); 1181 + 1182 + edev->ops->common->set_power_state(cdev, PCI_D0); 1183 + 1184 + pci_set_drvdata(pdev, NULL); 1185 + } 1229 1186 1230 1187 qede_ptp_disable(edev); 1231 - 1232 - edev->ops->common->set_power_state(cdev, PCI_D0); 1233 - 1234 - pci_set_drvdata(pdev, NULL); 1235 1188 1236 1189 /* Use global ops since we've freed edev */ 1237 1190 qed_ops->common->slowpath_stop(cdev); ··· 1249 1194 * [e.g., QED register callbacks] won't break anything when 1250 1195 * accessing the netdevice. 
1251 1196 */ 1252 - free_netdev(ndev); 1197 + if (mode != QEDE_REMOVE_RECOVERY) 1198 + free_netdev(ndev); 1253 1199 1254 1200 dev_info(&pdev->dev, "Ending qede_remove successfully\n"); 1255 1201 } ··· 1593 1537 } 1594 1538 1595 1539 return 0; 1540 + } 1541 + 1542 + static void qede_empty_tx_queue(struct qede_dev *edev, 1543 + struct qede_tx_queue *txq) 1544 + { 1545 + unsigned int pkts_compl = 0, bytes_compl = 0; 1546 + struct netdev_queue *netdev_txq; 1547 + int rc, len = 0; 1548 + 1549 + netdev_txq = netdev_get_tx_queue(edev->ndev, txq->ndev_txq_id); 1550 + 1551 + while (qed_chain_get_cons_idx(&txq->tx_pbl) != 1552 + qed_chain_get_prod_idx(&txq->tx_pbl)) { 1553 + DP_VERBOSE(edev, NETIF_MSG_IFDOWN, 1554 + "Freeing a packet on tx queue[%d]: chain_cons 0x%x, chain_prod 0x%x\n", 1555 + txq->index, qed_chain_get_cons_idx(&txq->tx_pbl), 1556 + qed_chain_get_prod_idx(&txq->tx_pbl)); 1557 + 1558 + rc = qede_free_tx_pkt(edev, txq, &len); 1559 + if (rc) { 1560 + DP_NOTICE(edev, 1561 + "Failed to free a packet on tx queue[%d]: chain_cons 0x%x, chain_prod 0x%x\n", 1562 + txq->index, 1563 + qed_chain_get_cons_idx(&txq->tx_pbl), 1564 + qed_chain_get_prod_idx(&txq->tx_pbl)); 1565 + break; 1566 + } 1567 + 1568 + bytes_compl += len; 1569 + pkts_compl++; 1570 + txq->sw_tx_cons++; 1571 + } 1572 + 1573 + netdev_tx_completed_queue(netdev_txq, pkts_compl, bytes_compl); 1574 + } 1575 + 1576 + static void qede_empty_tx_queues(struct qede_dev *edev) 1577 + { 1578 + int i; 1579 + 1580 + for_each_queue(i) 1581 + if (edev->fp_array[i].type & QEDE_FASTPATH_TX) { 1582 + int cos; 1583 + 1584 + for_each_cos_in_txq(edev, cos) { 1585 + struct qede_fastpath *fp; 1586 + 1587 + fp = &edev->fp_array[i]; 1588 + qede_empty_tx_queue(edev, 1589 + &fp->txq[cos]); 1590 + } 1591 + } 1596 1592 } 1597 1593 1598 1594 /* This function inits fp content and resets the SB, RXQ and TXQ structures */ ··· 2161 2053 2162 2054 enum qede_unload_mode { 2163 2055 QEDE_UNLOAD_NORMAL, 2056 + QEDE_UNLOAD_RECOVERY, 2164 2057 
}; 2165 2058 2166 2059 static void qede_unload(struct qede_dev *edev, enum qede_unload_mode mode, ··· 2177 2068 2178 2069 clear_bit(QEDE_FLAGS_LINK_REQUESTED, &edev->flags); 2179 2070 2180 - edev->state = QEDE_STATE_CLOSED; 2071 + if (mode != QEDE_UNLOAD_RECOVERY) 2072 + edev->state = QEDE_STATE_CLOSED; 2181 2073 2182 2074 qede_rdma_dev_event_close(edev); 2183 2075 ··· 2186 2076 netif_tx_disable(edev->ndev); 2187 2077 netif_carrier_off(edev->ndev); 2188 2078 2189 - /* Reset the link */ 2190 - memset(&link_params, 0, sizeof(link_params)); 2191 - link_params.link_up = false; 2192 - edev->ops->common->set_link(edev->cdev, &link_params); 2193 - rc = qede_stop_queues(edev); 2194 - if (rc) { 2195 - qede_sync_free_irqs(edev); 2196 - goto out; 2197 - } 2079 + if (mode != QEDE_UNLOAD_RECOVERY) { 2080 + /* Reset the link */ 2081 + memset(&link_params, 0, sizeof(link_params)); 2082 + link_params.link_up = false; 2083 + edev->ops->common->set_link(edev->cdev, &link_params); 2198 2084 2199 - DP_INFO(edev, "Stopped Queues\n"); 2085 + rc = qede_stop_queues(edev); 2086 + if (rc) { 2087 + qede_sync_free_irqs(edev); 2088 + goto out; 2089 + } 2090 + 2091 + DP_INFO(edev, "Stopped Queues\n"); 2092 + } 2200 2093 2201 2094 qede_vlan_mark_nonconfigured(edev); 2202 2095 edev->ops->fastpath_stop(edev->cdev); ··· 2215 2102 2216 2103 qede_napi_disable_remove(edev); 2217 2104 2105 + if (mode == QEDE_UNLOAD_RECOVERY) 2106 + qede_empty_tx_queues(edev); 2107 + 2218 2108 qede_free_mem_load(edev); 2219 2109 qede_free_fp_array(edev); 2220 2110 2221 2111 out: 2222 2112 if (!is_locked) 2223 2113 __qede_unlock(edev); 2114 + 2115 + if (mode != QEDE_UNLOAD_RECOVERY) 2116 + DP_NOTICE(edev, "Link is down\n"); 2117 + 2224 2118 DP_INFO(edev, "Ending qede unload\n"); 2225 2119 } 2226 2120 2227 2121 enum qede_load_mode { 2228 2122 QEDE_LOAD_NORMAL, 2229 2123 QEDE_LOAD_RELOAD, 2124 + QEDE_LOAD_RECOVERY, 2230 2125 }; 2231 2126 2232 2127 static int qede_load(struct qede_dev *edev, enum qede_load_mode mode, ··· 
2412 2291 qede_rdma_dev_event_close(edev); 2413 2292 } 2414 2293 } 2294 + } 2295 + 2296 + static void qede_schedule_recovery_handler(void *dev) 2297 + { 2298 + struct qede_dev *edev = dev; 2299 + 2300 + if (edev->state == QEDE_STATE_RECOVERY) { 2301 + DP_NOTICE(edev, 2302 + "Avoid scheduling a recovery handling since already in recovery state\n"); 2303 + return; 2304 + } 2305 + 2306 + set_bit(QEDE_SP_RECOVERY, &edev->sp_flags); 2307 + schedule_delayed_work(&edev->sp_task, 0); 2308 + 2309 + DP_INFO(edev, "Scheduled a recovery handler\n"); 2310 + } 2311 + 2312 + static void qede_recovery_failed(struct qede_dev *edev) 2313 + { 2314 + netdev_err(edev->ndev, "Recovery handling has failed. Power cycle is needed.\n"); 2315 + 2316 + netif_device_detach(edev->ndev); 2317 + 2318 + if (edev->cdev) 2319 + edev->ops->common->set_power_state(edev->cdev, PCI_D3hot); 2320 + } 2321 + 2322 + static void qede_recovery_handler(struct qede_dev *edev) 2323 + { 2324 + u32 curr_state = edev->state; 2325 + int rc; 2326 + 2327 + DP_NOTICE(edev, "Starting a recovery process\n"); 2328 + 2329 + /* No need to acquire first the qede_lock since is done by qede_sp_task 2330 + * before calling this function. 
2331 + */ 2332 + edev->state = QEDE_STATE_RECOVERY; 2333 + 2334 + edev->ops->common->recovery_prolog(edev->cdev); 2335 + 2336 + if (curr_state == QEDE_STATE_OPEN) 2337 + qede_unload(edev, QEDE_UNLOAD_RECOVERY, true); 2338 + 2339 + __qede_remove(edev->pdev, QEDE_REMOVE_RECOVERY); 2340 + 2341 + rc = __qede_probe(edev->pdev, edev->dp_module, edev->dp_level, 2342 + IS_VF(edev), QEDE_PROBE_RECOVERY); 2343 + if (rc) { 2344 + edev->cdev = NULL; 2345 + goto err; 2346 + } 2347 + 2348 + if (curr_state == QEDE_STATE_OPEN) { 2349 + rc = qede_load(edev, QEDE_LOAD_RECOVERY, true); 2350 + if (rc) 2351 + goto err; 2352 + 2353 + qede_config_rx_mode(edev->ndev); 2354 + udp_tunnel_get_rx_info(edev->ndev); 2355 + } 2356 + 2357 + edev->state = curr_state; 2358 + 2359 + DP_NOTICE(edev, "Recovery handling is done\n"); 2360 + 2361 + return; 2362 + 2363 + err: 2364 + qede_recovery_failed(edev); 2415 2365 } 2416 2366 2417 2367 static bool qede_is_txq_full(struct qede_dev *edev, struct qede_tx_queue *txq)
+43 -20
drivers/net/ethernet/qlogic/qede/qede_rdma.c
··· 50 50 if (!qedr_drv) 51 51 return; 52 52 53 + /* Leftovers from previous error recovery */ 54 + edev->rdma_info.exp_recovery = false; 53 55 edev->rdma_info.qedr_dev = qedr_drv->add(edev->cdev, edev->pdev, 54 56 edev->ndev); 55 57 } ··· 89 87 destroy_workqueue(edev->rdma_info.rdma_wq); 90 88 } 91 89 92 - int qede_rdma_dev_add(struct qede_dev *edev) 90 + int qede_rdma_dev_add(struct qede_dev *edev, bool recovery) 93 91 { 94 - int rc = 0; 92 + int rc; 95 93 96 - if (qede_rdma_supported(edev)) { 97 - rc = qede_rdma_create_wq(edev); 98 - if (rc) 99 - return rc; 94 + if (!qede_rdma_supported(edev)) 95 + return 0; 100 96 101 - INIT_LIST_HEAD(&edev->rdma_info.entry); 102 - mutex_lock(&qedr_dev_list_lock); 103 - list_add_tail(&edev->rdma_info.entry, &qedr_dev_list); 104 - _qede_rdma_dev_add(edev); 105 - mutex_unlock(&qedr_dev_list_lock); 106 - } 97 + /* Cannot start qedr while recovering since it wasn't fully stopped */ 98 + if (recovery) 99 + return 0; 100 + 101 + rc = qede_rdma_create_wq(edev); 102 + if (rc) 103 + return rc; 104 + 105 + INIT_LIST_HEAD(&edev->rdma_info.entry); 106 + mutex_lock(&qedr_dev_list_lock); 107 + list_add_tail(&edev->rdma_info.entry, &qedr_dev_list); 108 + _qede_rdma_dev_add(edev); 109 + mutex_unlock(&qedr_dev_list_lock); 107 110 108 111 return rc; 109 112 } ··· 117 110 { 118 111 if (qedr_drv && qedr_drv->remove && edev->rdma_info.qedr_dev) 119 112 qedr_drv->remove(edev->rdma_info.qedr_dev); 120 - edev->rdma_info.qedr_dev = NULL; 121 113 } 122 114 123 - void qede_rdma_dev_remove(struct qede_dev *edev) 115 + void qede_rdma_dev_remove(struct qede_dev *edev, bool recovery) 124 116 { 125 117 if (!qede_rdma_supported(edev)) 126 118 return; 127 119 128 - qede_rdma_destroy_wq(edev); 129 - mutex_lock(&qedr_dev_list_lock); 130 - _qede_rdma_dev_remove(edev); 131 - list_del(&edev->rdma_info.entry); 132 - mutex_unlock(&qedr_dev_list_lock); 120 + /* Cannot remove qedr while recovering since it wasn't fully stopped */ 121 + if (!recovery) { 122 + 
qede_rdma_destroy_wq(edev); 123 + mutex_lock(&qedr_dev_list_lock); 124 + if (!edev->rdma_info.exp_recovery) 125 + _qede_rdma_dev_remove(edev); 126 + edev->rdma_info.qedr_dev = NULL; 127 + list_del(&edev->rdma_info.entry); 128 + mutex_unlock(&qedr_dev_list_lock); 129 + } else { 130 + if (!edev->rdma_info.exp_recovery) { 131 + mutex_lock(&qedr_dev_list_lock); 132 + _qede_rdma_dev_remove(edev); 133 + mutex_unlock(&qedr_dev_list_lock); 134 + } 135 + edev->rdma_info.exp_recovery = true; 136 + } 133 137 } 134 138 135 139 static void _qede_rdma_dev_open(struct qede_dev *edev) ··· 222 204 223 205 mutex_lock(&qedr_dev_list_lock); 224 206 list_for_each_entry(edev, &qedr_dev_list, rdma_info.entry) { 225 - if (edev->rdma_info.qedr_dev) 207 + /* If device has experienced recovery it was already removed */ 208 + if (edev->rdma_info.qedr_dev && !edev->rdma_info.exp_recovery) 226 209 _qede_rdma_dev_remove(edev); 227 210 } 228 211 qedr_drv = NULL; ··· 302 283 enum qede_rdma_event event) 303 284 { 304 285 struct qede_rdma_event_work *event_node; 286 + 287 + /* If a recovery was experienced avoid adding the event */ 288 + if (edev->rdma_info.exp_recovery) 289 + return; 305 290 306 291 if (!edev->rdma_info.qedr_dev) 307 292 return;
+20
include/linux/qed/qed_if.h
··· 763 763 u32 dp_module; 764 764 u8 dp_level; 765 765 bool is_vf; 766 + bool recov_in_prog; 766 767 }; 767 768 768 769 #define QED_DRV_VER_STR_SIZE 12 ··· 810 809 void (*arfs_filter_op)(void *dev, void *fltr, u8 fw_rc); 811 810 void (*link_update)(void *dev, 812 811 struct qed_link_output *link); 812 + void (*schedule_recovery_handler)(void *dev); 813 813 void (*dcbx_aen)(void *dev, struct qed_dcbx_get *get, u32 mib_type); 814 814 void (*get_generic_tlv_data)(void *dev, struct qed_generic_tlvs *data); 815 815 void (*get_protocol_tlv_data)(void *dev, void *data); ··· 1057 1055 */ 1058 1056 int (*db_recovery_del)(struct qed_dev *cdev, 1059 1057 void __iomem *db_addr, void *db_data); 1058 + 1059 + /** 1060 + * @brief recovery_process - Trigger a recovery process 1061 + * 1062 + * @param cdev 1063 + * 1064 + * @return 0 on success, error otherwise. 1065 + */ 1066 + int (*recovery_process)(struct qed_dev *cdev); 1067 + 1068 + /** 1069 + * @brief recovery_prolog - Execute the prolog operations of a recovery process 1070 + * 1071 + * @param cdev 1072 + * 1073 + * @return 0 on success, error otherwise. 1074 + */ 1075 + int (*recovery_prolog)(struct qed_dev *cdev); 1060 1076 1061 1077 /** 1062 1078 * @brief update_drv_state - API to inform the change in the driver state.
+6 -4
include/linux/qed/qede_rdma.h
··· 74 74 bool qede_rdma_supported(struct qede_dev *dev); 75 75 76 76 #if IS_ENABLED(CONFIG_QED_RDMA) 77 - int qede_rdma_dev_add(struct qede_dev *dev); 77 + int qede_rdma_dev_add(struct qede_dev *dev, bool recovery); 78 78 void qede_rdma_dev_event_open(struct qede_dev *dev); 79 79 void qede_rdma_dev_event_close(struct qede_dev *dev); 80 - void qede_rdma_dev_remove(struct qede_dev *dev); 80 + void qede_rdma_dev_remove(struct qede_dev *dev, bool recovery); 81 81 void qede_rdma_event_changeaddr(struct qede_dev *edr); 82 82 83 83 #else 84 - static inline int qede_rdma_dev_add(struct qede_dev *dev) 84 + static inline int qede_rdma_dev_add(struct qede_dev *dev, 85 + bool recovery) 85 86 { 86 87 return 0; 87 88 } 88 89 89 90 static inline void qede_rdma_dev_event_open(struct qede_dev *dev) {} 90 91 static inline void qede_rdma_dev_event_close(struct qede_dev *dev) {} 91 - static inline void qede_rdma_dev_remove(struct qede_dev *dev) {} 92 + static inline void qede_rdma_dev_remove(struct qede_dev *dev, 93 + bool recovery) {} 92 94 static inline void qede_rdma_event_changeaddr(struct qede_dev *edr) {} 93 95 #endif 94 96 #endif