Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

netdevsim: implement peer queue flow control

Add a flow control mechanism between paired netdevsim devices to stop the
TX queue during high traffic scenarios. When a receive queue becomes
congested (approaching NSIM_RING_SIZE limit), the corresponding transmit
queue on the peer device is stopped using netif_subqueue_try_stop().

Once the receive queue has sufficient capacity again, the peer's
transmit queue is resumed with netif_tx_wake_queue().

Key changes:
* Add nsim_stop_peer_tx_queue() to pause peer TX when RX queue is full
* Add nsim_start_peer_tx_queue() to resume peer TX when RX queue drains
* Implement queue mapping validation to ensure TX/RX queue counts match
* Wake all queues during device unlinking to prevent stuck queues
* Use RCU protection when accessing peer device references
* Wake the queues when changing the queue numbers
* Remove IFF_NO_QUEUE, since the device will enqueue packets now

The flow control only activates when devices have matching TX/RX queue
counts to ensure proper queue mapping.

Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Breno Leitao <leitao@debian.org>
Link: https://patch.msgid.link/20250711-netdev_flow_control-v3-1-aa1d5a155762@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

authored by

Breno Leitao and committed by
Jakub Kicinski
ff2ac4df 5777d187

+85 -7
+3
drivers/net/netdevsim/bus.c
··· 384 384 err = 0; 385 385 RCU_INIT_POINTER(nsim->peer, NULL); 386 386 RCU_INIT_POINTER(peer->peer, NULL); 387 + synchronize_net(); 388 + netif_tx_wake_all_queues(dev); 389 + netif_tx_wake_all_queues(peer->netdev); 387 390 388 391 out_put_netns: 389 392 put_net(ns);
+21
drivers/net/netdevsim/ethtool.c
··· 101 101 ch->combined_count = ns->ethtool.channels; 102 102 } 103 103 104 + static void 105 + nsim_wake_queues(struct net_device *dev) 106 + { 107 + struct netdevsim *ns = netdev_priv(dev); 108 + struct netdevsim *peer; 109 + 110 + synchronize_net(); 111 + netif_tx_wake_all_queues(dev); 112 + 113 + rcu_read_lock(); 114 + peer = rcu_dereference(ns->peer); 115 + if (peer) 116 + netif_tx_wake_all_queues(peer->netdev); 117 + rcu_read_unlock(); 118 + } 119 + 104 120 static int 105 121 nsim_set_channels(struct net_device *dev, struct ethtool_channels *ch) 106 122 { ··· 129 113 return err; 130 114 131 115 ns->ethtool.channels = ch->combined_count; 116 + 117 + /* Only wake up queues if devices are linked */ 118 + if (rcu_access_pointer(ns->peer)) 119 + nsim_wake_queues(dev); 120 + 132 121 return 0; 133 122 } 134 123
+61 -7
drivers/net/netdevsim/netdev.c
··· 37 37 38 38 #define NSIM_RING_SIZE 256 39 39 40 - static int nsim_napi_rx(struct nsim_rq *rq, struct sk_buff *skb) 40 + static void nsim_start_peer_tx_queue(struct net_device *dev, struct nsim_rq *rq) 41 + { 42 + struct netdevsim *ns = netdev_priv(dev); 43 + struct net_device *peer_dev; 44 + struct netdevsim *peer_ns; 45 + struct netdev_queue *txq; 46 + u16 idx; 47 + 48 + idx = rq->napi.index; 49 + rcu_read_lock(); 50 + peer_ns = rcu_dereference(ns->peer); 51 + if (!peer_ns) 52 + goto out; 53 + 54 + /* TX device */ 55 + peer_dev = peer_ns->netdev; 56 + if (dev->real_num_tx_queues != peer_dev->num_rx_queues) 57 + goto out; 58 + 59 + txq = netdev_get_tx_queue(peer_dev, idx); 60 + if (!netif_tx_queue_stopped(txq)) 61 + goto out; 62 + 63 + netif_tx_wake_queue(txq); 64 + out: 65 + rcu_read_unlock(); 66 + } 67 + 68 + static void nsim_stop_tx_queue(struct net_device *tx_dev, 69 + struct net_device *rx_dev, 70 + struct nsim_rq *rq, 71 + u16 idx) 72 + { 73 + /* If different queues size, do not stop, since it is not 74 + * easy to find which TX queue is mapped here 75 + */ 76 + if (rx_dev->real_num_tx_queues != tx_dev->num_rx_queues) 77 + return; 78 + 79 + /* rq is the queue on the receive side */ 80 + netif_subqueue_try_stop(tx_dev, idx, 81 + NSIM_RING_SIZE - skb_queue_len(&rq->skb_queue), 82 + NSIM_RING_SIZE / 2); 83 + } 84 + 85 + static int nsim_napi_rx(struct net_device *tx_dev, struct net_device *rx_dev, 86 + struct nsim_rq *rq, struct sk_buff *skb) 41 87 { 42 88 if (skb_queue_len(&rq->skb_queue) > NSIM_RING_SIZE) { 43 89 dev_kfree_skb_any(skb); ··· 91 45 } 92 46 93 47 skb_queue_tail(&rq->skb_queue, skb); 48 + 49 + /* Stop the peer TX queue avoiding dropping packets later */ 50 + if (skb_queue_len(&rq->skb_queue) >= NSIM_RING_SIZE) 51 + nsim_stop_tx_queue(tx_dev, rx_dev, rq, 52 + skb_get_queue_mapping(skb)); 53 + 94 54 return NET_RX_SUCCESS; 95 55 } 96 56 97 - static int nsim_forward_skb(struct net_device *dev, struct sk_buff *skb, 57 + static int 
nsim_forward_skb(struct net_device *tx_dev, 58 + struct net_device *rx_dev, 59 + struct sk_buff *skb, 98 60 struct nsim_rq *rq) 99 61 { 100 - return __dev_forward_skb(dev, skb) ?: nsim_napi_rx(rq, skb); 62 + return __dev_forward_skb(rx_dev, skb) ?: 63 + nsim_napi_rx(tx_dev, rx_dev, rq, skb); 101 64 } 102 65 103 66 static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev) ··· 141 86 skb_linearize(skb); 142 87 143 88 skb_tx_timestamp(skb); 144 - if (unlikely(nsim_forward_skb(peer_dev, skb, rq) == NET_RX_DROP)) 89 + if (unlikely(nsim_forward_skb(dev, peer_dev, skb, rq) == NET_RX_DROP)) 145 90 goto out_drop_cnt; 146 91 147 92 if (!hrtimer_active(&rq->napi_timer)) ··· 406 351 dev_dstats_rx_dropped(dev); 407 352 } 408 353 354 + nsim_start_peer_tx_queue(dev, rq); 409 355 return i; 410 356 } 411 357 ··· 920 864 ether_setup(dev); 921 865 eth_hw_addr_random(dev); 922 866 923 - dev->tx_queue_len = 0; 924 867 dev->flags &= ~IFF_MULTICAST; 925 - dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | 926 - IFF_NO_QUEUE; 868 + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; 927 869 dev->features |= NETIF_F_HIGHDMA | 928 870 NETIF_F_SG | 929 871 NETIF_F_FRAGLIST |