Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net_sched: add qdisc_dequeue_drop() helper

Some qdiscs, like cake, codel and fq_codel, might drop packets
in their dequeue() method.

This is currently problematic because dequeue() runs with
the qdisc spinlock held. Freeing skbs can be extremely expensive.

Add qdisc_dequeue_drop() method and a new TCQ_F_DEQUEUE_DROPS
so that these qdiscs can opt-in to defer the skb frees
after the qdisc spinlock is released.

TCQ_F_DEQUEUE_DROPS is an attempt to not penalize other qdiscs
with an extra cache line miss.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251121083256.674562-14-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

authored by

Eric Dumazet and committed by
Paolo Abeni
191ff13e 0170d7f4

+43 -14
+3 -2
include/net/pkt_sched.h
··· 114 114 115 115 void __qdisc_run(struct Qdisc *q); 116 116 117 - static inline void qdisc_run(struct Qdisc *q) 117 + static inline struct sk_buff *qdisc_run(struct Qdisc *q) 118 118 { 119 119 if (qdisc_run_begin(q)) { 120 120 __qdisc_run(q); 121 - qdisc_run_end(q); 121 + return qdisc_run_end(q); 122 122 } 123 + return NULL; 123 124 } 124 125 125 126 extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1];
+27 -3
include/net/sch_generic.h
··· 88 88 #define TCQ_F_INVISIBLE 0x80 /* invisible by default in dump */ 89 89 #define TCQ_F_NOLOCK 0x100 /* qdisc does not require locking */ 90 90 #define TCQ_F_OFFLOADED 0x200 /* qdisc is offloaded to HW */ 91 + #define TCQ_F_DEQUEUE_DROPS 0x400 /* ->dequeue() can drop packets in q->to_free */ 92 + 91 93 u32 limit; 92 94 const struct Qdisc_ops *ops; 93 95 struct qdisc_size_table __rcu *stab; ··· 121 119 122 120 /* Note : we only change qstats.backlog in fast path. */ 123 121 struct gnet_stats_queue qstats; 122 + 123 + struct sk_buff *to_free; 124 124 __cacheline_group_end(Qdisc_write); 125 125 126 126 ··· 222 218 return true; 223 219 } 224 220 225 - static inline void qdisc_run_end(struct Qdisc *qdisc) 221 + static inline struct sk_buff *qdisc_run_end(struct Qdisc *qdisc) 226 222 { 223 + struct sk_buff *to_free = NULL; 224 + 227 225 if (qdisc->flags & TCQ_F_NOLOCK) { 228 226 spin_unlock(&qdisc->seqlock); 229 227 ··· 238 232 if (unlikely(test_bit(__QDISC_STATE_MISSED, 239 233 &qdisc->state))) 240 234 __netif_schedule(qdisc); 241 - } else { 242 - WRITE_ONCE(qdisc->running, false); 235 + return NULL; 243 236 } 237 + 238 + if (qdisc->flags & TCQ_F_DEQUEUE_DROPS) { 239 + to_free = qdisc->to_free; 240 + if (to_free) 241 + qdisc->to_free = NULL; 242 + } 243 + WRITE_ONCE(qdisc->running, false); 244 + return to_free; 244 245 } 245 246 246 247 static inline bool qdisc_may_bulk(const struct Qdisc *qdisc) ··· 1127 1114 kfree_skb_reason(skb, tcf_get_drop_reason(skb)); 1128 1115 skb = next; 1129 1116 } 1117 + } 1118 + 1119 + static inline void qdisc_dequeue_drop(struct Qdisc *q, struct sk_buff *skb, 1120 + enum skb_drop_reason reason) 1121 + { 1122 + DEBUG_NET_WARN_ON_ONCE(!(q->flags & TCQ_F_DEQUEUE_DROPS)); 1123 + DEBUG_NET_WARN_ON_ONCE(q->flags & TCQ_F_NOLOCK); 1124 + 1125 + tcf_set_drop_reason(skb, reason); 1126 + skb->next = q->to_free; 1127 + q->to_free = skb; 1130 1128 } 1131 1129 1132 1130 /* Instead of calling kfree_skb() while root qdisc lock is held,
+13 -9
net/core/dev.c
··· 4141 4141 struct net_device *dev, 4142 4142 struct netdev_queue *txq) 4143 4143 { 4144 - struct sk_buff *next, *to_free = NULL; 4144 + struct sk_buff *next, *to_free = NULL, *to_free2 = NULL; 4145 4145 spinlock_t *root_lock = qdisc_lock(q); 4146 4146 struct llist_node *ll_list, *first_n; 4147 4147 unsigned long defer_count = 0; ··· 4160 4160 if (unlikely(!nolock_qdisc_is_empty(q))) { 4161 4161 rc = dev_qdisc_enqueue(skb, q, &to_free, txq); 4162 4162 __qdisc_run(q); 4163 - qdisc_run_end(q); 4163 + to_free2 = qdisc_run_end(q); 4164 4164 4165 4165 goto free_skbs; 4166 4166 } ··· 4170 4170 !nolock_qdisc_is_empty(q)) 4171 4171 __qdisc_run(q); 4172 4172 4173 - qdisc_run_end(q); 4174 - return NET_XMIT_SUCCESS; 4173 + to_free2 = qdisc_run_end(q); 4174 + rc = NET_XMIT_SUCCESS; 4175 + goto free_skbs; 4175 4176 } 4176 4177 4177 4178 rc = dev_qdisc_enqueue(skb, q, &to_free, txq); 4178 - qdisc_run(q); 4179 + to_free2 = qdisc_run(q); 4179 4180 goto free_skbs; 4180 4181 } 4181 4182 ··· 4235 4234 qdisc_bstats_update(q, skb); 4236 4235 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) 4237 4236 __qdisc_run(q); 4238 - qdisc_run_end(q); 4237 + to_free2 = qdisc_run_end(q); 4239 4238 rc = NET_XMIT_SUCCESS; 4240 4239 } else { 4241 4240 int count = 0; ··· 4247 4246 rc = dev_qdisc_enqueue(skb, q, &to_free, txq); 4248 4247 count++; 4249 4248 } 4250 - qdisc_run(q); 4249 + to_free2 = qdisc_run(q); 4251 4250 if (count != 1) 4252 4251 rc = NET_XMIT_SUCCESS; 4253 4252 } ··· 4256 4255 4257 4256 free_skbs: 4258 4257 tcf_kfree_skb_list(to_free); 4258 + tcf_kfree_skb_list(to_free2); 4259 4259 return rc; 4260 4260 } 4261 4261 ··· 5749 5747 rcu_read_lock(); 5750 5748 5751 5749 while (head) { 5752 - struct Qdisc *q = head; 5753 5750 spinlock_t *root_lock = NULL; 5751 + struct sk_buff *to_free; 5752 + struct Qdisc *q = head; 5754 5753 5755 5754 head = head->next_sched; ··· 5778 5775 } 5779 5776 5780 5777 clear_bit(__QDISC_STATE_SCHED, &q->state); 5781 - qdisc_run(q); 5778 + to_free = qdisc_run(q); 5782 5779 if (root_lock) 5783 5780 spin_unlock(root_lock); 5781 + tcf_kfree_skb_list(to_free); 5784 5782 }