Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

sfc: use budget for TX completions

When running workloads heavily unbalanced towards TX (high TX, low RX
traffic), the sfc driver can hold the CPU for too long. Although in many
cases this is not enough to be noticeable, it can affect performance and
system responsiveness.

One way to reproduce it is to use a debug kernel and run some parallel
netperf TX tests. On some systems, this will lead to this message being
logged:
kernel:watchdog: BUG: soft lockup - CPU#12 stuck for 22s!

The reason is that the sfc driver doesn't charge any NAPI budget for the
work done on TX completion events. With high-TX/low-RX traffic, this
causes the CPU to be held for a long time in NAPI poll.

The documentation says "drivers can process completions for any number of Tx
packets but should only process up to budget number of Rx packets".
However, many drivers do limit the number of TX completions that they
process in a single NAPI poll.

In the same way, this patch adds a limit for the TX work in sfc. With
the patch applied, the watchdog warning never appears.

Tested with netperf in different combinations: single process / parallel
processes, TCP / UDP and different sizes of UDP messages. Repeated the
tests before and after the patch, without any noticeable difference in
network or CPU performance.

Test hardware:
Intel(R) Xeon(R) CPU E5-1620 v4 @ 3.50GHz (4 cores, 2 threads/core)
Solarflare Communications XtremeScale X2522-25G Network Adapter

Fixes: 5227ecccea2d ("sfc: remove tx and MCDI handling from NAPI budget consideration")
Fixes: d19a53721863 ("sfc_ef100: TX path for EF100 NICs")
Reported-by: Fei Liu <feliu@redhat.com>
Signed-off-by: Íñigo Huguet <ihuguet@redhat.com>
Acked-by: Martin Habets <habetsm.xilinx@gmail.com>
Link: https://lore.kernel.org/r/20230615084929.10506-1-ihuguet@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

authored by

Íñigo Huguet and committed by
Jakub Kicinski
4aaf2c52 d4e06728

+31 -13
+18 -7
drivers/net/ethernet/sfc/ef10.c
··· 2950 2950 return tstamp; 2951 2951 } 2952 2952 2953 - static void 2953 + static int 2954 2954 efx_ef10_handle_tx_event(struct efx_channel *channel, efx_qword_t *event) 2955 2955 { 2956 2956 struct efx_nic *efx = channel->efx; ··· 2958 2958 unsigned int tx_ev_desc_ptr; 2959 2959 unsigned int tx_ev_q_label; 2960 2960 unsigned int tx_ev_type; 2961 + int work_done; 2961 2962 u64 ts_part; 2962 2963 2963 2964 if (unlikely(READ_ONCE(efx->reset_pending))) 2964 - return; 2965 + return 0; 2965 2966 2966 2967 if (unlikely(EFX_QWORD_FIELD(*event, ESF_DZ_TX_DROP_EVENT))) 2967 - return; 2968 + return 0; 2968 2969 2969 2970 /* Get the transmit queue */ 2970 2971 tx_ev_q_label = EFX_QWORD_FIELD(*event, ESF_DZ_TX_QLABEL); ··· 2974 2973 if (!tx_queue->timestamping) { 2975 2974 /* Transmit completion */ 2976 2975 tx_ev_desc_ptr = EFX_QWORD_FIELD(*event, ESF_DZ_TX_DESCR_INDX); 2977 - efx_xmit_done(tx_queue, tx_ev_desc_ptr & tx_queue->ptr_mask); 2978 - return; 2976 + return efx_xmit_done(tx_queue, tx_ev_desc_ptr & tx_queue->ptr_mask); 2979 2977 } 2980 2978 2981 2979 /* Transmit timestamps are only available for 8XXX series. They result ··· 3000 3000 * fields in the event. 
3001 3001 */ 3002 3002 tx_ev_type = EFX_QWORD_FIELD(*event, ESF_EZ_TX_SOFT1); 3003 + work_done = 0; 3003 3004 3004 3005 switch (tx_ev_type) { 3005 3006 case TX_TIMESTAMP_EVENT_TX_EV_COMPLETION: ··· 3017 3016 tx_queue->completed_timestamp_major = ts_part; 3018 3017 3019 3018 efx_xmit_done_single(tx_queue); 3019 + work_done = 1; 3020 3020 break; 3021 3021 3022 3022 default: ··· 3028 3026 EFX_QWORD_VAL(*event)); 3029 3027 break; 3030 3028 } 3029 + 3030 + return work_done; 3031 3031 } 3032 3032 3033 3033 static void ··· 3085 3081 } 3086 3082 } 3087 3083 3084 + #define EFX_NAPI_MAX_TX 512 3085 + 3088 3086 static int efx_ef10_ev_process(struct efx_channel *channel, int quota) 3089 3087 { 3090 3088 struct efx_nic *efx = channel->efx; 3091 3089 efx_qword_t event, *p_event; 3092 3090 unsigned int read_ptr; 3093 - int ev_code; 3091 + int spent_tx = 0; 3094 3092 int spent = 0; 3093 + int ev_code; 3095 3094 3096 3095 if (quota <= 0) 3097 3096 return spent; ··· 3133 3126 } 3134 3127 break; 3135 3128 case ESE_DZ_EV_CODE_TX_EV: 3136 - efx_ef10_handle_tx_event(channel, &event); 3129 + spent_tx += efx_ef10_handle_tx_event(channel, &event); 3130 + if (spent_tx >= EFX_NAPI_MAX_TX) { 3131 + spent = quota; 3132 + goto out; 3133 + } 3137 3134 break; 3138 3135 case ESE_DZ_EV_CODE_DRIVER_EV: 3139 3136 efx_ef10_handle_driver_event(channel, &event);
+6 -1
drivers/net/ethernet/sfc/ef100_nic.c
··· 253 253 efx_reg(channel->efx, ER_GZ_EVQ_INT_PRIME)); 254 254 } 255 255 256 + #define EFX_NAPI_MAX_TX 512 257 + 256 258 static int ef100_ev_process(struct efx_channel *channel, int quota) 257 259 { 258 260 struct efx_nic *efx = channel->efx; ··· 262 260 bool evq_phase, old_evq_phase; 263 261 unsigned int read_ptr; 264 262 efx_qword_t *p_event; 263 + int spent_tx = 0; 265 264 int spent = 0; 266 265 bool ev_phase; 267 266 int ev_type; ··· 298 295 efx_mcdi_process_event(channel, p_event); 299 296 break; 300 297 case ESE_GZ_EF100_EV_TX_COMPLETION: 301 - ef100_ev_tx(channel, p_event); 298 + spent_tx += ef100_ev_tx(channel, p_event); 299 + if (spent_tx >= EFX_NAPI_MAX_TX) 300 + spent = quota; 302 301 break; 303 302 case ESE_GZ_EF100_EV_DRIVER: 304 303 netif_info(efx, drv, efx->net_dev,
+2 -2
drivers/net/ethernet/sfc/ef100_tx.c
··· 346 346 ef100_tx_push_buffers(tx_queue); 347 347 } 348 348 349 - void ef100_ev_tx(struct efx_channel *channel, const efx_qword_t *p_event) 349 + int ef100_ev_tx(struct efx_channel *channel, const efx_qword_t *p_event) 350 350 { 351 351 unsigned int tx_done = 352 352 EFX_QWORD_FIELD(*p_event, ESF_GZ_EV_TXCMPL_NUM_DESC); ··· 357 357 unsigned int tx_index = (tx_queue->read_count + tx_done - 1) & 358 358 tx_queue->ptr_mask; 359 359 360 - efx_xmit_done(tx_queue, tx_index); 360 + return efx_xmit_done(tx_queue, tx_index); 361 361 } 362 362 363 363 /* Add a socket buffer to a TX queue
+1 -1
drivers/net/ethernet/sfc/ef100_tx.h
··· 20 20 void ef100_tx_write(struct efx_tx_queue *tx_queue); 21 21 unsigned int ef100_tx_max_skb_descs(struct efx_nic *efx); 22 22 23 - void ef100_ev_tx(struct efx_channel *channel, const efx_qword_t *p_event); 23 + int ef100_ev_tx(struct efx_channel *channel, const efx_qword_t *p_event); 24 24 25 25 netdev_tx_t ef100_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb); 26 26 int __ef100_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb,
+3 -1
drivers/net/ethernet/sfc/tx_common.c
··· 249 249 } 250 250 } 251 251 252 - void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index) 252 + int efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index) 253 253 { 254 254 unsigned int fill_level, pkts_compl = 0, bytes_compl = 0; 255 255 unsigned int efv_pkts_compl = 0; ··· 279 279 } 280 280 281 281 efx_xmit_done_check_empty(tx_queue); 282 + 283 + return pkts_compl + efv_pkts_compl; 282 284 } 283 285 284 286 /* Remove buffers put into a tx_queue for the current packet.
+1 -1
drivers/net/ethernet/sfc/tx_common.h
··· 28 28 } 29 29 30 30 void efx_xmit_done_check_empty(struct efx_tx_queue *tx_queue); 31 - void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index); 31 + int efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index); 32 32 33 33 void efx_enqueue_unwind(struct efx_tx_queue *tx_queue, 34 34 unsigned int insert_count);