Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'net-optimize-tx-throughput-and-efficiency'

Eric Dumazet says:

====================
net: optimize TX throughput and efficiency

In this series, I replace the busylock spinlock we have in
__dev_queue_xmit() with a lockless list (llist), reducing
spinlock contention to the minimum.

The idea is that only one CPU might spin on the qdisc spinlock,
while the others simply add their skb to the llist.

After this series, we get a 300% (4x) efficiency improvement on heavy
TX workloads: twice the number of packets per second, for half the CPU cycles.
====================

Link: https://patch.msgid.link/20251014171907.3554413-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+111 -104
+8 -1
include/linux/netdevice_xmit.h
··· 2 2 #ifndef _LINUX_NETDEVICE_XMIT_H 3 3 #define _LINUX_NETDEVICE_XMIT_H 4 4 5 + #if IS_ENABLED(CONFIG_NET_ACT_MIRRED) 6 + #define MIRRED_NEST_LIMIT 4 7 + #endif 8 + 9 + struct net_device; 10 + 5 11 struct netdev_xmit { 6 12 u16 recursion; 7 13 u8 more; ··· 15 9 u8 skip_txqueue; 16 10 #endif 17 11 #if IS_ENABLED(CONFIG_NET_ACT_MIRRED) 18 - u8 sched_mirred_nest; 12 + u8 sched_mirred_nest; 13 + struct net_device *sched_mirred_dev[MIRRED_NEST_LIMIT]; 19 14 #endif 20 15 #if IS_ENABLED(CONFIG_NF_DUP_NETDEV) 21 16 u8 nf_dup_skb_recursion;
+10 -13
include/net/sch_generic.h
··· 41 41 __QDISC_STATE_DRAINING, 42 42 }; 43 43 44 - enum qdisc_state2_t { 45 - /* Only for !TCQ_F_NOLOCK qdisc. Never access it directly. 46 - * Use qdisc_run_begin/end() or qdisc_is_running() instead. 47 - */ 48 - __QDISC_STATE2_RUNNING, 49 - }; 50 - 51 44 #define QDISC_STATE_MISSED BIT(__QDISC_STATE_MISSED) 52 45 #define QDISC_STATE_DRAINING BIT(__QDISC_STATE_DRAINING) 53 46 ··· 110 117 struct qdisc_skb_head q; 111 118 struct gnet_stats_basic_sync bstats; 112 119 struct gnet_stats_queue qstats; 113 - int owner; 120 + bool running; /* must be written under qdisc spinlock */ 114 121 unsigned long state; 115 - unsigned long state2; /* must be written under qdisc spinlock */ 116 122 struct Qdisc *next_sched; 117 123 struct sk_buff_head skb_bad_txq; 118 124 119 - spinlock_t busylock ____cacheline_aligned_in_smp; 125 + atomic_long_t defer_count ____cacheline_aligned_in_smp; 126 + struct llist_head defer_list; 127 + 120 128 spinlock_t seqlock; 121 129 122 130 struct rcu_head rcu; ··· 162 168 { 163 169 if (qdisc->flags & TCQ_F_NOLOCK) 164 170 return spin_is_locked(&qdisc->seqlock); 165 - return test_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); 171 + return READ_ONCE(qdisc->running); 166 172 } 167 173 168 174 static inline bool nolock_qdisc_is_empty(const struct Qdisc *qdisc) ··· 205 211 */ 206 212 return spin_trylock(&qdisc->seqlock); 207 213 } 208 - return !__test_and_set_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); 214 + if (READ_ONCE(qdisc->running)) 215 + return false; 216 + WRITE_ONCE(qdisc->running, true); 217 + return true; 209 218 } 210 219 211 220 static inline void qdisc_run_end(struct Qdisc *qdisc) ··· 226 229 &qdisc->state))) 227 230 __netif_schedule(qdisc); 228 231 } else { 229 - __clear_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); 232 + WRITE_ONCE(qdisc->running, false); 230 233 } 231 234 } 232 235
+56 -41
net/core/dev.c
··· 4125 4125 struct net_device *dev, 4126 4126 struct netdev_queue *txq) 4127 4127 { 4128 + struct sk_buff *next, *to_free = NULL; 4128 4129 spinlock_t *root_lock = qdisc_lock(q); 4129 - struct sk_buff *to_free = NULL; 4130 - bool contended; 4130 + struct llist_node *ll_list, *first_n; 4131 + unsigned long defer_count = 0; 4131 4132 int rc; 4132 4133 4133 4134 qdisc_calculate_pkt_len(skb, q); ··· 4168 4167 return rc; 4169 4168 } 4170 4169 4171 - if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) { 4172 - kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP); 4173 - return NET_XMIT_DROP; 4174 - } 4175 - /* 4176 - * Heuristic to force contended enqueues to serialize on a 4177 - * separate lock before trying to get qdisc main lock. 4178 - * This permits qdisc->running owner to get the lock more 4179 - * often and dequeue packets faster. 4180 - * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit 4181 - * and then other tasks will only enqueue packets. The packets will be 4182 - * sent after the qdisc owner is scheduled again. To prevent this 4183 - * scenario the task always serialize on the lock. 4170 + /* Open code llist_add(&skb->ll_node, &q->defer_list) + queue limit. 4171 + * In the try_cmpxchg() loop, we want to increment q->defer_count 4172 + * at most once to limit the number of skbs in defer_list. 4173 + * We perform the defer_count increment only if the list is not empty, 4174 + * because some arches have slow atomic_long_inc_return(). 
4184 4175 */ 4185 - contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT); 4186 - if (unlikely(contended)) 4187 - spin_lock(&q->busylock); 4176 + first_n = READ_ONCE(q->defer_list.first); 4177 + do { 4178 + if (first_n && !defer_count) { 4179 + defer_count = atomic_long_inc_return(&q->defer_count); 4180 + if (unlikely(defer_count > q->limit)) { 4181 + kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP); 4182 + return NET_XMIT_DROP; 4183 + } 4184 + } 4185 + skb->ll_node.next = first_n; 4186 + } while (!try_cmpxchg(&q->defer_list.first, &first_n, &skb->ll_node)); 4187 + 4188 + /* If defer_list was not empty, we know the cpu which queued 4189 + * the first skb will process the whole list for us. 4190 + */ 4191 + if (first_n) 4192 + return NET_XMIT_SUCCESS; 4188 4193 4189 4194 spin_lock(root_lock); 4195 + 4196 + ll_list = llist_del_all(&q->defer_list); 4197 + /* There is a small race because we clear defer_count not atomically 4198 + * with the prior llist_del_all(). This means defer_list could grow 4199 + * over q->limit. 4200 + */ 4201 + atomic_long_set(&q->defer_count, 0); 4202 + 4203 + ll_list = llist_reverse_order(ll_list); 4204 + 4190 4205 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { 4191 - __qdisc_drop(skb, &to_free); 4206 + llist_for_each_entry_safe(skb, next, ll_list, ll_node) 4207 + __qdisc_drop(skb, &to_free); 4192 4208 rc = NET_XMIT_DROP; 4193 - } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && 4194 - qdisc_run_begin(q)) { 4209 + goto unlock; 4210 + } 4211 + if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && 4212 + !llist_next(ll_list) && qdisc_run_begin(q)) { 4195 4213 /* 4196 4214 * This is a work-conserving queue; there are no old skbs 4197 4215 * waiting to be sent out; and the qdisc is not running - 4198 4216 * xmit the skb directly. 
4199 4217 */ 4200 4218 4219 + DEBUG_NET_WARN_ON_ONCE(skb != llist_entry(ll_list, 4220 + struct sk_buff, 4221 + ll_node)); 4201 4222 qdisc_bstats_update(q, skb); 4202 - 4203 - if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { 4204 - if (unlikely(contended)) { 4205 - spin_unlock(&q->busylock); 4206 - contended = false; 4207 - } 4223 + if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) 4208 4224 __qdisc_run(q); 4209 - } 4210 - 4211 4225 qdisc_run_end(q); 4212 4226 rc = NET_XMIT_SUCCESS; 4213 4227 } else { 4214 - WRITE_ONCE(q->owner, smp_processor_id()); 4215 - rc = dev_qdisc_enqueue(skb, q, &to_free, txq); 4216 - WRITE_ONCE(q->owner, -1); 4217 - if (qdisc_run_begin(q)) { 4218 - if (unlikely(contended)) { 4219 - spin_unlock(&q->busylock); 4220 - contended = false; 4221 - } 4222 - __qdisc_run(q); 4223 - qdisc_run_end(q); 4228 + int count = 0; 4229 + 4230 + llist_for_each_entry_safe(skb, next, ll_list, ll_node) { 4231 + prefetch(next); 4232 + skb_mark_not_on_list(skb); 4233 + rc = dev_qdisc_enqueue(skb, q, &to_free, txq); 4234 + count++; 4224 4235 } 4236 + qdisc_run(q); 4237 + if (count != 1) 4238 + rc = NET_XMIT_SUCCESS; 4225 4239 } 4240 + unlock: 4226 4241 spin_unlock(root_lock); 4227 4242 if (unlikely(to_free)) 4228 4243 kfree_skb_list_reason(to_free, 4229 4244 tcf_get_drop_reason(to_free)); 4230 - if (unlikely(contended)) 4231 - spin_unlock(&q->busylock); 4232 4245 return rc; 4233 4246 } 4234 4247
+10 -1
net/core/skbuff.c
··· 1136 1136 skb_dst_drop(skb); 1137 1137 if (skb->destructor) { 1138 1138 DEBUG_NET_WARN_ON_ONCE(in_hardirq()); 1139 - skb->destructor(skb); 1139 + #ifdef CONFIG_INET 1140 + INDIRECT_CALL_3(skb->destructor, 1141 + tcp_wfree, __sock_wfree, sock_wfree, 1142 + skb); 1143 + #else 1144 + INDIRECT_CALL_1(skb->destructor, 1145 + sock_wfree, 1146 + skb); 1147 + 1148 + #endif 1140 1149 } 1141 1150 #if IS_ENABLED(CONFIG_NF_CONNTRACK) 1142 1151 nf_conntrack_put(skb_nfct(skb));
+23 -39
net/sched/act_mirred.c
··· 29 29 static LIST_HEAD(mirred_list); 30 30 static DEFINE_SPINLOCK(mirred_list_lock); 31 31 32 - #define MIRRED_NEST_LIMIT 4 33 - 34 - #ifndef CONFIG_PREEMPT_RT 35 - static u8 tcf_mirred_nest_level_inc_return(void) 36 - { 37 - return __this_cpu_inc_return(softnet_data.xmit.sched_mirred_nest); 38 - } 39 - 40 - static void tcf_mirred_nest_level_dec(void) 41 - { 42 - __this_cpu_dec(softnet_data.xmit.sched_mirred_nest); 43 - } 44 - 45 - #else 46 - static u8 tcf_mirred_nest_level_inc_return(void) 47 - { 48 - return current->net_xmit.sched_mirred_nest++; 49 - } 50 - 51 - static void tcf_mirred_nest_level_dec(void) 52 - { 53 - current->net_xmit.sched_mirred_nest--; 54 - } 55 - #endif 56 - 57 32 static bool tcf_mirred_is_act_redirect(int action) 58 33 { 59 34 return action == TCA_EGRESS_REDIR || action == TCA_INGRESS_REDIR; ··· 414 439 { 415 440 struct tcf_mirred *m = to_mirred(a); 416 441 int retval = READ_ONCE(m->tcf_action); 417 - unsigned int nest_level; 442 + struct netdev_xmit *xmit; 418 443 bool m_mac_header_xmit; 419 444 struct net_device *dev; 420 - int m_eaction; 445 + int i, m_eaction; 421 446 u32 blockid; 422 447 423 - nest_level = tcf_mirred_nest_level_inc_return(); 424 - if (unlikely(nest_level > MIRRED_NEST_LIMIT)) { 448 + #ifdef CONFIG_PREEMPT_RT 449 + xmit = &current->net_xmit; 450 + #else 451 + xmit = this_cpu_ptr(&softnet_data.xmit); 452 + #endif 453 + if (unlikely(xmit->sched_mirred_nest >= MIRRED_NEST_LIMIT)) { 425 454 net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n", 426 455 netdev_name(skb->dev)); 427 - retval = TC_ACT_SHOT; 428 - goto dec_nest_level; 456 + return TC_ACT_SHOT; 429 457 } 430 458 431 459 tcf_lastuse_update(&m->tcf_tm); 432 460 tcf_action_update_bstats(&m->common, skb); 433 461 434 462 blockid = READ_ONCE(m->tcfm_blockid); 435 - if (blockid) { 436 - retval = tcf_blockcast(skb, m, blockid, res, retval); 437 - goto dec_nest_level; 438 - } 463 + if (blockid) 464 + return tcf_blockcast(skb, m, blockid, res, 
retval); 439 465 440 466 dev = rcu_dereference_bh(m->tcfm_dev); 441 467 if (unlikely(!dev)) { 442 468 pr_notice_once("tc mirred: target device is gone\n"); 443 469 tcf_action_inc_overlimit_qstats(&m->common); 444 - goto dec_nest_level; 470 + return retval; 445 471 } 472 + for (i = 0; i < xmit->sched_mirred_nest; i++) { 473 + if (xmit->sched_mirred_dev[i] != dev) 474 + continue; 475 + pr_notice_once("tc mirred: loop on device %s\n", 476 + netdev_name(dev)); 477 + tcf_action_inc_overlimit_qstats(&m->common); 478 + return retval; 479 + } 480 + 481 + xmit->sched_mirred_dev[xmit->sched_mirred_nest++] = dev; 446 482 447 483 m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit); 448 484 m_eaction = READ_ONCE(m->tcfm_eaction); 449 485 450 486 retval = tcf_mirred_to_dev(skb, m, dev, m_mac_header_xmit, m_eaction, 451 487 retval); 452 - 453 - dec_nest_level: 454 - tcf_mirred_nest_level_dec(); 488 + xmit->sched_mirred_nest--; 455 489 456 490 return retval; 457 491 }
-7
net/sched/sch_generic.c
··· 666 666 .ops = &noop_qdisc_ops, 667 667 .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), 668 668 .dev_queue = &noop_netdev_queue, 669 - .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock), 670 669 .gso_skb = { 671 670 .next = (struct sk_buff *)&noop_qdisc.gso_skb, 672 671 .prev = (struct sk_buff *)&noop_qdisc.gso_skb, ··· 678 679 .qlen = 0, 679 680 .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock), 680 681 }, 681 - .owner = -1, 682 682 }; 683 683 EXPORT_SYMBOL(noop_qdisc); 684 684 ··· 969 971 } 970 972 } 971 973 972 - spin_lock_init(&sch->busylock); 973 - lockdep_set_class(&sch->busylock, 974 - dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); 975 - 976 974 /* seqlock has the same scope of busylock, for NOLOCK qdisc */ 977 975 spin_lock_init(&sch->seqlock); 978 976 lockdep_set_class(&sch->seqlock, ··· 979 985 sch->enqueue = ops->enqueue; 980 986 sch->dequeue = ops->dequeue; 981 987 sch->dev_queue = dev_queue; 982 - sch->owner = -1; 983 988 netdev_hold(dev, &sch->dev_tracker, GFP_KERNEL); 984 989 refcount_set(&sch->refcnt, 1); 985 990
+4 -2
tools/testing/selftests/net/packetdrill/tcp_user_timeout_user-timeout-probe.pkt
··· 23 23 24 24 // install a qdisc dropping all packets 25 25 +0 `tc qdisc delete dev tun0 root 2>/dev/null ; tc qdisc add dev tun0 root pfifo limit 0` 26 + 26 27 +0 write(4, ..., 24) = 24 27 28 // When qdisc is congested we retry every 500ms 28 29 // (TCP_RESOURCE_PROBE_INTERVAL) and therefore 29 30 // we retry 6 times before hitting 3s timeout. 30 31 // First verify that the connection is alive: 31 - +3.250 write(4, ..., 24) = 24 32 + +3 write(4, ..., 24) = 24 33 + 32 34 // Now verify that shortly after that the socket is dead: 33 - +.100 write(4, ..., 24) = -1 ETIMEDOUT (Connection timed out) 35 + +1 write(4, ..., 24) = -1 ETIMEDOUT (Connection timed out) 34 36 35 37 +0 %{ assert tcpi_probes == 6, tcpi_probes; \ 36 38 assert tcpi_backoff == 0, tcpi_backoff }%