Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'net-stmmac-improve-tx-timer-logic'

Christian Marangi says:

====================
net: stmmac: improve tx timer logic

This series comes with the intention of restoring original performance
of stmmac on some router/device that used the stmmac driver to handle
gigabit traffic.

More info is present in patch 3. This cover letter is intended to show the
results and improvements of the following changes.

The move to hr_timer for tx timer and commit 8fce33317023 ("net: stmmac:
Rework coalesce timer and fix multi-queue races") caused big performance
regression on these kind of device.

This was observed on ipq806x, which after kernel 4.19 couldn't handle
gigabit speeds anymore.

The following series is currently applied and tested in OpenWrt SNAPSHOT
and has shown a great performance increase. (The scenario is a qca8k switch +
stmmac dwmac1000.) Some good comparisons can be found here [1].

The comparison is between a swconfig scenario (where DSA tagging is not
used, so there is very low CPU impact in handling traffic) and a DSA scenario
where tagging is used and there is a minimal impact on the CPU. As can be
noticed, even with DSA in place we have better performance.

It was observed by other users that the SQM scenario with the cake scheduler
was also improved, in the order of 100 Mbps (this scenario is CPU limited and
any increase in performance is caused by reduced load on the CPU).

It has been at least 15 days that this has been in use without any complaint
or bug report about queue timeouts. (That was the case with v1 before the
additional patch was added; the issue only appeared in real-world tests and
not in iperf tests.)

[1] https://forum.openwrt.org/t/netgear-r7800-exploration-ipq8065-qca9984/285/3427?u=ansuel
====================

Link: https://lore.kernel.org/r/20231018123550.27110-1-ansuelsmth@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

+59 -18
-8
drivers/net/ethernet/chelsio/cxgb3/sge.c
··· 2501 2501 return work_done; 2502 2502 } 2503 2503 2504 - /* 2505 - * Returns true if the device is already scheduled for polling. 2506 - */ 2507 - static inline int napi_is_scheduled(struct napi_struct *napi) 2508 - { 2509 - return test_bit(NAPI_STATE_SCHED, &napi->state); 2510 - } 2511 - 2512 2504 /** 2513 2505 * process_pure_responses - process pure responses from a response queue 2514 2506 * @adap: the adapter
+1 -1
drivers/net/ethernet/stmicro/stmmac/common.h
··· 293 293 #define MIN_DMA_RIWT 0x10 294 294 #define DEF_DMA_RIWT 0xa0 295 295 /* Tx coalesce parameters */ 296 - #define STMMAC_COAL_TX_TIMER 1000 296 + #define STMMAC_COAL_TX_TIMER 5000 297 297 #define STMMAC_MAX_COAL_TX_TICK 100000 298 298 #define STMMAC_TX_MAX_FRAMES 256 299 299 #define STMMAC_TX_FRAMES 25
+33 -7
drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
··· 2543 2543 * @priv: driver private structure 2544 2544 * @budget: napi budget limiting this functions packet handling 2545 2545 * @queue: TX queue index 2546 + * @pending_packets: signal to arm the TX coal timer 2546 2547 * Description: it reclaims the transmit resources after transmission completes. 2548 + * If some packets still needs to be handled, due to TX coalesce, set 2549 + * pending_packets to true to make NAPI arm the TX coal timer. 2547 2550 */ 2548 - static int stmmac_tx_clean(struct stmmac_priv *priv, int budget, u32 queue) 2551 + static int stmmac_tx_clean(struct stmmac_priv *priv, int budget, u32 queue, 2552 + bool *pending_packets) 2549 2553 { 2550 2554 struct stmmac_tx_queue *tx_q = &priv->dma_conf.tx_queue[queue]; 2551 2555 struct stmmac_txq_stats *txq_stats = &priv->xstats.txq_stats[queue]; ··· 2710 2706 2711 2707 /* We still have pending packets, let's call for a new scheduling */ 2712 2708 if (tx_q->dirty_tx != tx_q->cur_tx) 2713 - stmmac_tx_timer_arm(priv, queue); 2709 + *pending_packets = true; 2714 2710 2715 2711 flags = u64_stats_update_begin_irqsave(&txq_stats->syncp); 2716 2712 txq_stats->tx_packets += tx_packets; ··· 3000 2996 { 3001 2997 struct stmmac_tx_queue *tx_q = &priv->dma_conf.tx_queue[queue]; 3002 2998 u32 tx_coal_timer = priv->tx_coal_timer[queue]; 2999 + struct stmmac_channel *ch; 3000 + struct napi_struct *napi; 3003 3001 3004 3002 if (!tx_coal_timer) 3005 3003 return; 3006 3004 3007 - hrtimer_start(&tx_q->txtimer, 3008 - STMMAC_COAL_TIMER(tx_coal_timer), 3009 - HRTIMER_MODE_REL); 3005 + ch = &priv->channel[tx_q->queue_index]; 3006 + napi = tx_q->xsk_pool ? &ch->rxtx_napi : &ch->tx_napi; 3007 + 3008 + /* Arm timer only if napi is not already scheduled. 3009 + * Try to cancel any timer if napi is scheduled, timer will be armed 3010 + * again in the next scheduled napi. 
3011 + */ 3012 + if (unlikely(!napi_is_scheduled(napi))) 3013 + hrtimer_start(&tx_q->txtimer, 3014 + STMMAC_COAL_TIMER(tx_coal_timer), 3015 + HRTIMER_MODE_REL); 3016 + else 3017 + hrtimer_try_to_cancel(&tx_q->txtimer); 3010 3018 } 3011 3019 3012 3020 /** ··· 5576 5560 container_of(napi, struct stmmac_channel, tx_napi); 5577 5561 struct stmmac_priv *priv = ch->priv_data; 5578 5562 struct stmmac_txq_stats *txq_stats; 5563 + bool pending_packets = false; 5579 5564 u32 chan = ch->index; 5580 5565 unsigned long flags; 5581 5566 int work_done; ··· 5586 5569 txq_stats->napi_poll++; 5587 5570 u64_stats_update_end_irqrestore(&txq_stats->syncp, flags); 5588 5571 5589 - work_done = stmmac_tx_clean(priv, budget, chan); 5572 + work_done = stmmac_tx_clean(priv, budget, chan, &pending_packets); 5590 5573 work_done = min(work_done, budget); 5591 5574 5592 5575 if (work_done < budget && napi_complete_done(napi, work_done)) { ··· 5597 5580 spin_unlock_irqrestore(&ch->lock, flags); 5598 5581 } 5599 5582 5583 + /* TX still have packet to handle, check if we need to arm tx timer */ 5584 + if (pending_packets) 5585 + stmmac_tx_timer_arm(priv, chan); 5586 + 5600 5587 return work_done; 5601 5588 } 5602 5589 ··· 5609 5588 struct stmmac_channel *ch = 5610 5589 container_of(napi, struct stmmac_channel, rxtx_napi); 5611 5590 struct stmmac_priv *priv = ch->priv_data; 5591 + bool tx_pending_packets = false; 5612 5592 int rx_done, tx_done, rxtx_done; 5613 5593 struct stmmac_rxq_stats *rxq_stats; 5614 5594 struct stmmac_txq_stats *txq_stats; ··· 5626 5604 txq_stats->napi_poll++; 5627 5605 u64_stats_update_end_irqrestore(&txq_stats->syncp, flags); 5628 5606 5629 - tx_done = stmmac_tx_clean(priv, budget, chan); 5607 + tx_done = stmmac_tx_clean(priv, budget, chan, &tx_pending_packets); 5630 5608 tx_done = min(tx_done, budget); 5631 5609 5632 5610 rx_done = stmmac_rx_zc(priv, budget, chan); ··· 5650 5628 stmmac_enable_dma_irq(priv, priv->ioaddr, chan, 1, 1); 5651 5629 
spin_unlock_irqrestore(&ch->lock, flags); 5652 5630 } 5631 + 5632 + /* TX still have packet to handle, check if we need to arm tx timer */ 5633 + if (tx_pending_packets) 5634 + stmmac_tx_timer_arm(priv, chan); 5653 5635 5654 5636 return min(rxtx_done, budget - 1); 5655 5637 }
+1 -1
drivers/net/wireless/realtek/rtw89/core.c
··· 2005 2005 struct napi_struct *napi = &rtwdev->napi; 2006 2006 2007 2007 /* In low power mode, napi isn't scheduled. Receive it to netif. */ 2008 - if (unlikely(!test_bit(NAPI_STATE_SCHED, &napi->state))) 2008 + if (unlikely(!napi_is_scheduled(napi))) 2009 2009 napi = NULL; 2010 2010 2011 2011 rtw89_core_hw_to_sband_rate(rx_status);
+23
include/linux/netdevice.h
··· 482 482 return test_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state); 483 483 } 484 484 485 + /** 486 + * napi_is_scheduled - test if NAPI is scheduled 487 + * @n: NAPI context 488 + * 489 + * This check is "best-effort". With no locking implemented, 490 + * a NAPI can be scheduled or terminate right after this check 491 + * and produce not precise results. 492 + * 493 + * NAPI_STATE_SCHED is an internal state, napi_is_scheduled 494 + * should not be used normally and napi_schedule should be 495 + * used instead. 496 + * 497 + * Use only if the driver really needs to check if a NAPI 498 + * is scheduled for example in the context of delayed timer 499 + * that can be skipped if a NAPI is already scheduled. 500 + * 501 + * Return True if NAPI is scheduled, False otherwise. 502 + */ 503 + static inline bool napi_is_scheduled(struct napi_struct *n) 504 + { 505 + return test_bit(NAPI_STATE_SCHED, &n->state); 506 + } 507 + 485 508 bool napi_schedule_prep(struct napi_struct *n); 486 509 487 510 /**
+1 -1
net/core/dev.c
··· 6532 6532 * accidentally calling ->poll() when NAPI is not scheduled. 6533 6533 */ 6534 6534 work = 0; 6535 - if (test_bit(NAPI_STATE_SCHED, &n->state)) { 6535 + if (napi_is_scheduled(n)) { 6536 6536 work = n->poll(n, weight); 6537 6537 trace_napi_poll(n, work, weight); 6538 6538 }