Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

sfc: Improve NIC internal error recovery

Make the error count a per-NIC variable.
Reset the count after an hour if it has not reached the critical value.
Set the critical value back to 5.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Ben Hutchings and committed by
David S. Miller
2c3c3d02 4720bc6c

+19 -4
+19 -4
drivers/net/sfc/falcon.c
··· 39 39 * @next_buffer_table: First available buffer table id 40 40 * @pci_dev2: The secondary PCI device if present 41 41 * @i2c_data: Operations and state for I2C bit-bashing algorithm 42 + * @int_error_count: Number of internal errors seen recently 43 + * @int_error_expire: Time at which error count will be expired 42 44 */ 43 45 struct falcon_nic_data { 44 46 unsigned next_buffer_table; 45 47 struct pci_dev *pci_dev2; 46 48 struct i2c_algo_bit_data i2c_data; 49 + 50 + unsigned int_error_count; 51 + unsigned long int_error_expire; 47 52 }; 48 53 49 54 /************************************************************************** ··· 124 119 #define FALCON_EVQ_SIZE 4096 125 120 #define FALCON_EVQ_MASK (FALCON_EVQ_SIZE - 1) 126 121 127 - /* Max number of internal errors. After this resets will not be performed */ 128 - #define FALCON_MAX_INT_ERRORS 4 122 + /* If FALCON_MAX_INT_ERRORS internal errors occur within 123 + * FALCON_INT_ERROR_EXPIRE seconds, we consider the NIC broken and 124 + * disable it. 
125 + */ 126 + #define FALCON_INT_ERROR_EXPIRE 3600 127 + #define FALCON_MAX_INT_ERRORS 5 129 128 130 129 /* We poll for events every FLUSH_INTERVAL ms, and check FLUSH_POLL_COUNT times 131 130 */ ··· 1383 1374 efx_oword_t *int_ker = efx->irq_status.addr; 1384 1375 efx_oword_t fatal_intr; 1385 1376 int error, mem_perr; 1386 - static int n_int_errors; 1387 1377 1388 1378 falcon_read(efx, &fatal_intr, FATAL_INTR_REG_KER); 1389 1379 error = EFX_OWORD_FIELD(fatal_intr, INT_KER_ERROR); ··· 1409 1401 pci_clear_master(nic_data->pci_dev2); 1410 1402 falcon_disable_interrupts(efx); 1411 1403 1412 - if (++n_int_errors < FALCON_MAX_INT_ERRORS) { 1404 + /* Count errors and reset or disable the NIC accordingly */ 1405 + if (nic_data->int_error_count == 0 || 1406 + time_after(jiffies, nic_data->int_error_expire)) { 1407 + nic_data->int_error_count = 0; 1408 + nic_data->int_error_expire = 1409 + jiffies + FALCON_INT_ERROR_EXPIRE * HZ; 1410 + } 1411 + if (++nic_data->int_error_count < FALCON_MAX_INT_ERRORS) { 1413 1412 EFX_ERR(efx, "SYSTEM ERROR - reset scheduled\n"); 1414 1413 efx_schedule_reset(efx, RESET_TYPE_INT_ERROR); 1415 1414 } else {