Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

IB/hfi1: Add atomic triggered sleep/wakeup

When running iperf in a two host configuration the following trace can
occur:

[ 319.728730] NETDEV WATCHDOG: ib0 (hfi1): transmit queue 0 timed out

The issue happens because the current implementation relies on the netif
txq being stopped to control the flushing of the tx list.

There are two resources that the transmit logic can wait on and stop the
txq:
- SDMA descriptors
- Ring space to hold completions

The ring space is tested on the sending side and relieved when the ring is
consumed in the napi tx reaping.

Unfortunately, that reaping can run concurrently with the workqueue
flushing of the txlist. If the txq is started just before the workitem
executes, the txlist will never be flushed, leading to the txq being
stuck.

Fix by:
- Adding sleep/wakeup wrappers
* Use an atomic to control the call to the netif routines inside the
wrappers

- Use another atomic to record ring space exhaustion
* Only wake up when a ring space exhaustion has happened and it has been
relieved

Add additional wrappers to clarify the ring space resource handling.

Fixes: d99dc602e2a5 ("IB/hfi1: Add functions to transmit datagram ipoib packets")
Link: https://lore.kernel.org/r/20200623204327.108092.4024.stgit@awfm-01.aw.intel.com
Reviewed-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>

authored by

Mike Marciniszyn and committed by
Jason Gunthorpe
38fd98af 82172b76

+53 -20
+6
drivers/infiniband/hw/hfi1/ipoib.h
··· 67 67 * @sde: sdma engine 68 68 * @tx_list: tx request list 69 69 * @sent_txreqs: count of txreqs posted to sdma 70 + * @stops: count of stops of queue 71 + * @ring_full: ring has been filled 72 + * @no_desc: descriptor shortage seen 70 73 * @flow: tracks when list needs to be flushed for a flow change 71 74 * @q_idx: ipoib Tx queue index 72 75 * @pkts_sent: indicator packets have been sent from this queue ··· 83 80 struct sdma_engine *sde; 84 81 struct list_head tx_list; 85 82 u64 sent_txreqs; 83 + atomic_t stops; 84 + atomic_t ring_full; 85 + atomic_t no_desc; 86 86 union hfi1_ipoib_flow flow; 87 87 u8 q_idx; 88 88 bool pkts_sent;
+47 -20
drivers/infiniband/hw/hfi1/ipoib_tx.c
··· 55 55 return sent - completed; 56 56 } 57 57 58 + static u64 hfi1_ipoib_used(struct hfi1_ipoib_txq *txq) 59 + { 60 + return hfi1_ipoib_txreqs(txq->sent_txreqs, 61 + atomic64_read(&txq->complete_txreqs)); 62 + } 63 + 64 + static void hfi1_ipoib_stop_txq(struct hfi1_ipoib_txq *txq) 65 + { 66 + if (atomic_inc_return(&txq->stops) == 1) 67 + netif_stop_subqueue(txq->priv->netdev, txq->q_idx); 68 + } 69 + 70 + static void hfi1_ipoib_wake_txq(struct hfi1_ipoib_txq *txq) 71 + { 72 + if (atomic_dec_and_test(&txq->stops)) 73 + netif_wake_subqueue(txq->priv->netdev, txq->q_idx); 74 + } 75 + 76 + static uint hfi1_ipoib_ring_hwat(struct hfi1_ipoib_txq *txq) 77 + { 78 + return min_t(uint, txq->priv->netdev->tx_queue_len, 79 + txq->tx_ring.max_items - 1); 80 + } 81 + 82 + static uint hfi1_ipoib_ring_lwat(struct hfi1_ipoib_txq *txq) 83 + { 84 + return min_t(uint, txq->priv->netdev->tx_queue_len, 85 + txq->tx_ring.max_items) >> 1; 86 + } 87 + 58 88 static void hfi1_ipoib_check_queue_depth(struct hfi1_ipoib_txq *txq) 59 89 { 60 - if (unlikely(hfi1_ipoib_txreqs(++txq->sent_txreqs, 61 - atomic64_read(&txq->complete_txreqs)) >= 62 - min_t(unsigned int, txq->priv->netdev->tx_queue_len, 63 - txq->tx_ring.max_items - 1))) 64 - netif_stop_subqueue(txq->priv->netdev, txq->q_idx); 90 + ++txq->sent_txreqs; 91 + if (hfi1_ipoib_used(txq) >= hfi1_ipoib_ring_hwat(txq) && 92 + !atomic_xchg(&txq->ring_full, 1)) 93 + hfi1_ipoib_stop_txq(txq); 65 94 } 66 95 67 96 static void hfi1_ipoib_check_queue_stopped(struct hfi1_ipoib_txq *txq) 68 97 { 69 98 struct net_device *dev = txq->priv->netdev; 70 - 71 - /* If the queue is already running just return */ 72 - if (likely(!__netif_subqueue_stopped(dev, txq->q_idx))) 73 - return; 74 99 75 100 /* If shutting down just return as queue state is irrelevant */ 76 101 if (unlikely(dev->reg_state != NETREG_REGISTERED)) ··· 111 86 * Use the minimum of the current tx_queue_len or the rings max txreqs 112 87 * to protect against ring overflow. 
113 88 */ 114 - if (hfi1_ipoib_txreqs(txq->sent_txreqs, 115 - atomic64_read(&txq->complete_txreqs)) 116 - < min_t(unsigned int, dev->tx_queue_len, 117 - txq->tx_ring.max_items) >> 1) 118 - netif_wake_subqueue(dev, txq->q_idx); 89 + if (hfi1_ipoib_used(txq) < hfi1_ipoib_ring_lwat(txq) && 90 + atomic_xchg(&txq->ring_full, 0)) 91 + hfi1_ipoib_wake_txq(txq); 119 92 } 120 93 121 94 static void hfi1_ipoib_free_tx(struct ipoib_txreq *tx, int budget) ··· 631 608 return -EAGAIN; 632 609 } 633 610 634 - netif_stop_subqueue(txq->priv->netdev, txq->q_idx); 635 - 636 611 if (list_empty(&txreq->list)) 637 612 /* came from non-list submit */ 638 613 list_add_tail(&txreq->list, &txq->tx_list); 639 - if (list_empty(&txq->wait.list)) 614 + if (list_empty(&txq->wait.list)) { 615 + if (!atomic_xchg(&txq->no_desc, 1)) 616 + hfi1_ipoib_stop_txq(txq); 640 617 iowait_queue(pkts_sent, wait->iow, &sde->dmawait); 618 + } 641 619 642 620 write_sequnlock(&sde->waitlock); 643 621 return -EBUSY; ··· 673 649 struct net_device *dev = txq->priv->netdev; 674 650 675 651 if (likely(dev->reg_state == NETREG_REGISTERED) && 676 - likely(__netif_subqueue_stopped(dev, txq->q_idx)) && 677 652 likely(!hfi1_ipoib_flush_tx_list(dev, txq))) 678 - netif_wake_subqueue(dev, txq->q_idx); 653 + if (atomic_xchg(&txq->no_desc, 0)) 654 + hfi1_ipoib_wake_txq(txq); 679 655 } 680 656 681 657 int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv) ··· 729 705 txq->sde = NULL; 730 706 INIT_LIST_HEAD(&txq->tx_list); 731 707 atomic64_set(&txq->complete_txreqs, 0); 708 + atomic_set(&txq->stops, 0); 709 + atomic_set(&txq->ring_full, 0); 710 + atomic_set(&txq->no_desc, 0); 732 711 txq->q_idx = i; 733 712 txq->flow.tx_queue = 0xff; 734 713 txq->flow.sc5 = 0xff; ··· 797 770 atomic64_inc(complete_txreqs); 798 771 } 799 772 800 - if (hfi1_ipoib_txreqs(txq->sent_txreqs, atomic64_read(complete_txreqs))) 773 + if (hfi1_ipoib_used(txq)) 801 774 dd_dev_warn(txq->priv->dd, 802 775 "txq %d not empty found %llu requests\n", 803 776 
txq->q_idx,