Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

e1000e: locking bug introduced by commit 67fd4fcb

Commit 67fd4fcb (e1000e: convert to stats64) added the ability to update
statistics more accurately and on-demand through the net_device_ops
.ndo_get_stats64 hook, but introduced a locking bug on 82577/8/9 when
linked at half-duplex (seen on kernels with CONFIG_DEBUG_ATOMIC_SLEEP=y and
CONFIG_PROVE_LOCKING=y). The commit introduced code paths that caused a
mutex to be locked in atomic contexts, e.g. an rcu_read_lock is held when
irqbalance reads the stats from /sys/class/net/ethX/statistics causing the
mutex to be locked to read the Phy half-duplex statistics registers.

The mutex was originally introduced to prevent concurrent accesses of
resources (the NVM and Phy) shared by the driver, firmware and hardware
a few years back when there was an issue with the NVM getting corrupted.
It was later split into two mutexes - one for the NVM and one for the Phy
when it was determined the NVM, unlike the Phy, should not be protected by
the software/firmware/hardware semaphore (arbitration of which is done in
part with the SWFLAG bit in the EXTCNF_CTRL register). This latter
semaphore should be sufficient to prevent resource contention of the Phy in
the driver (i.e. the mutex for Phy accesses is not needed), but to be sure
the mutex is replaced with an atomic bit flag which will warn if any
contention is possible.

Also add additional debug output to help determine when the sw/fw/hw
semaphore is owned by the firmware or hardware.

Signed-off-by: Bruce Allan <bruce.w.allan@intel.com>
Reported-by: Francois Romieu <romieu@fr.zoreil.com>
Tested-by: Jeff Pieper <jeffrey.e.pieper@intel.com>

authored by

Bruce Allan and committed by
Jeff Kirsher
a90b412c 96cd8951

+14 -8
+1
drivers/net/ethernet/intel/e1000e/e1000.h
··· 469 469 enum e1000_state_t { 470 470 __E1000_TESTING, 471 471 __E1000_RESETTING, 472 + __E1000_ACCESS_SHARED_RESOURCE, 472 473 __E1000_DOWN 473 474 }; 474 475
+13 -8
drivers/net/ethernet/intel/e1000e/ich8lan.c
··· 852 852 mutex_unlock(&nvm_mutex); 853 853 } 854 854 855 - static DEFINE_MUTEX(swflag_mutex); 856 - 857 855 /** 858 856 * e1000_acquire_swflag_ich8lan - Acquire software control flag 859 857 * @hw: pointer to the HW structure ··· 864 866 u32 extcnf_ctrl, timeout = PHY_CFG_TIMEOUT; 865 867 s32 ret_val = 0; 866 868 867 - mutex_lock(&swflag_mutex); 869 + if (test_and_set_bit(__E1000_ACCESS_SHARED_RESOURCE, 870 + &hw->adapter->state)) { 871 + WARN(1, "e1000e: %s: contention for Phy access\n", 872 + hw->adapter->netdev->name); 873 + return -E1000_ERR_PHY; 874 + } 868 875 869 876 while (timeout) { 870 877 extcnf_ctrl = er32(EXTCNF_CTRL); ··· 881 878 } 882 879 883 880 if (!timeout) { 884 - e_dbg("SW/FW/HW has locked the resource for too long.\n"); 881 + e_dbg("SW has already locked the resource.\n"); 885 882 ret_val = -E1000_ERR_CONFIG; 886 883 goto out; 887 884 } ··· 901 898 } 902 899 903 900 if (!timeout) { 904 - e_dbg("Failed to acquire the semaphore.\n"); 901 + e_dbg("Failed to acquire the semaphore, FW or HW has it: " 902 + "FWSM=0x%8.8x EXTCNF_CTRL=0x%8.8x)\n", 903 + er32(FWSM), extcnf_ctrl); 905 904 extcnf_ctrl &= ~E1000_EXTCNF_CTRL_SWFLAG; 906 905 ew32(EXTCNF_CTRL, extcnf_ctrl); 907 906 ret_val = -E1000_ERR_CONFIG; ··· 912 907 913 908 out: 914 909 if (ret_val) 915 - mutex_unlock(&swflag_mutex); 910 + clear_bit(__E1000_ACCESS_SHARED_RESOURCE, &hw->adapter->state); 916 911 917 912 return ret_val; 918 913 } ··· 937 932 e_dbg("Semaphore unexpectedly released by sw/fw/hw\n"); 938 933 } 939 934 940 - mutex_unlock(&swflag_mutex); 935 + clear_bit(__E1000_ACCESS_SHARED_RESOURCE, &hw->adapter->state); 941 936 } 942 937 943 938 /** ··· 3144 3139 msleep(20); 3145 3140 3146 3141 if (!ret_val) 3147 - mutex_unlock(&swflag_mutex); 3142 + clear_bit(__E1000_ACCESS_SHARED_RESOURCE, &hw->adapter->state); 3148 3143 3149 3144 if (ctrl & E1000_CTRL_PHY_RST) { 3150 3145 ret_val = hw->phy.ops.get_cfg_done(hw);