Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net/sched: Fix backlog accounting in qdisc_dequeue_internal

This issue applies to the following qdiscs: hhf, fq, fq_codel, and
fq_pie, and occurs in their change handlers when adjusting to the new
limit. The problem lies in the values passed to the subsequent
qdisc_tree_reduce_backlog call when the parent is a tbf qdisc:

When the tbf parent runs out of tokens, skbs of these qdiscs will
be placed in gso_skb. Their peek handlers are qdisc_peek_dequeued,
which accounts for both qlen and backlog. However, in the case of
qdisc_dequeue_internal, ONLY qlen is accounted for when pulling
from gso_skb. This means that these qdiscs are missing a
qdisc_qstats_backlog_dec when dropping packets to satisfy the
new limit in their change handlers.

One can observe this issue with the following (with tc patched to
support a limit of 0):

export TARGET=fq
tc qdisc del dev lo root
tc qdisc add dev lo root handle 1: tbf rate 8bit burst 100b latency 1ms
tc qdisc replace dev lo handle 3: parent 1:1 $TARGET limit 1000
echo ''; echo 'add child'; tc -s -d qdisc show dev lo
ping -I lo -f -c2 -s32 -W0.001 127.0.0.1 2>&1 >/dev/null
echo ''; echo 'after ping'; tc -s -d qdisc show dev lo
tc qdisc change dev lo handle 3: parent 1:1 $TARGET limit 0
echo ''; echo 'after limit drop'; tc -s -d qdisc show dev lo
tc qdisc replace dev lo handle 2: parent 1:1 sfq
echo ''; echo 'post graft'; tc -s -d qdisc show dev lo

The second-to-last show command shows 0 packets but a positive
number (74) of backlog bytes. The problem becomes clearer in the
last show command, where qdisc_purge_queue triggers
qdisc_tree_reduce_backlog with the positive backlog and causes an
underflow in the tbf parent's backlog (4096 Mb instead of 0).

To fix this issue, the codepath for all clients of qdisc_dequeue_internal
has been simplified: codel, pie, hhf, fq, fq_pie, and fq_codel.
qdisc_dequeue_internal handles the backlog adjustments for all cases that
do not directly use the dequeue handler.

The old fq_codel_change limit adjustment loop accumulated the arguments to
the subsequent qdisc_tree_reduce_backlog call through the cstats field.
However, this is confusing and error-prone, as fq_codel_dequeue (which
qdisc_dequeue_internal calls in the non-gso_skb case) could also
potentially mutate this field, so we have unified the code here with the
other qdiscs.

Fixes: 2d3cbfd6d54a ("net_sched: Flush gso_skb list too during ->change()")
Fixes: 4b549a2ef4be ("fq_codel: Fair Queue Codel AQM")
Fixes: 10239edf86f1 ("net-qdisc-hhf: Heavy-Hitter Filter (HHF) qdisc")
Signed-off-by: William Liu <will@willsroot.io>
Reviewed-by: Savino Dicanosa <savy@syst3mfailure.io>
Link: https://patch.msgid.link/20250812235725.45243-1-will@willsroot.io
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

authored by

William Liu and committed by
Jakub Kicinski
52bf2726 d1547bf4

+50 -33
+8 -3
include/net/sch_generic.h
··· 1038 1038 skb = __skb_dequeue(&sch->gso_skb); 1039 1039 if (skb) { 1040 1040 sch->q.qlen--; 1041 + qdisc_qstats_backlog_dec(sch, skb); 1041 1042 return skb; 1042 1043 } 1043 - if (direct) 1044 - return __qdisc_dequeue_head(&sch->q); 1045 - else 1044 + if (direct) { 1045 + skb = __qdisc_dequeue_head(&sch->q); 1046 + if (skb) 1047 + qdisc_qstats_backlog_dec(sch, skb); 1048 + return skb; 1049 + } else { 1046 1050 return sch->dequeue(sch); 1051 + } 1047 1052 } 1048 1053 1049 1054 static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch)
+7 -5
net/sched/sch_codel.c
··· 101 101 static int codel_change(struct Qdisc *sch, struct nlattr *opt, 102 102 struct netlink_ext_ack *extack) 103 103 { 104 + unsigned int dropped_pkts = 0, dropped_bytes = 0; 104 105 struct codel_sched_data *q = qdisc_priv(sch); 105 106 struct nlattr *tb[TCA_CODEL_MAX + 1]; 106 - unsigned int qlen, dropped = 0; 107 107 int err; 108 108 109 109 err = nla_parse_nested_deprecated(tb, TCA_CODEL_MAX, opt, ··· 142 142 WRITE_ONCE(q->params.ecn, 143 143 !!nla_get_u32(tb[TCA_CODEL_ECN])); 144 144 145 - qlen = sch->q.qlen; 146 145 while (sch->q.qlen > sch->limit) { 147 146 struct sk_buff *skb = qdisc_dequeue_internal(sch, true); 148 147 149 - dropped += qdisc_pkt_len(skb); 150 - qdisc_qstats_backlog_dec(sch, skb); 148 + if (!skb) 149 + break; 150 + 151 + dropped_pkts++; 152 + dropped_bytes += qdisc_pkt_len(skb); 151 153 rtnl_qdisc_drop(skb, sch); 152 154 } 153 - qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); 155 + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); 154 156 155 157 sch_tree_unlock(sch); 156 158 return 0;
+7 -5
net/sched/sch_fq.c
··· 1013 1013 static int fq_change(struct Qdisc *sch, struct nlattr *opt, 1014 1014 struct netlink_ext_ack *extack) 1015 1015 { 1016 + unsigned int dropped_pkts = 0, dropped_bytes = 0; 1016 1017 struct fq_sched_data *q = qdisc_priv(sch); 1017 1018 struct nlattr *tb[TCA_FQ_MAX + 1]; 1018 - int err, drop_count = 0; 1019 - unsigned drop_len = 0; 1020 1019 u32 fq_log; 1020 + int err; 1021 1021 1022 1022 err = nla_parse_nested_deprecated(tb, TCA_FQ_MAX, opt, fq_policy, 1023 1023 NULL); ··· 1135 1135 err = fq_resize(sch, fq_log); 1136 1136 sch_tree_lock(sch); 1137 1137 } 1138 + 1138 1139 while (sch->q.qlen > sch->limit) { 1139 1140 struct sk_buff *skb = qdisc_dequeue_internal(sch, false); 1140 1141 1141 1142 if (!skb) 1142 1143 break; 1143 - drop_len += qdisc_pkt_len(skb); 1144 + 1145 + dropped_pkts++; 1146 + dropped_bytes += qdisc_pkt_len(skb); 1144 1147 rtnl_kfree_skbs(skb, skb); 1145 - drop_count++; 1146 1148 } 1147 - qdisc_tree_reduce_backlog(sch, drop_count, drop_len); 1149 + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); 1148 1150 1149 1151 sch_tree_unlock(sch); 1150 1152 return err;
+7 -5
net/sched/sch_fq_codel.c
··· 366 366 static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, 367 367 struct netlink_ext_ack *extack) 368 368 { 369 + unsigned int dropped_pkts = 0, dropped_bytes = 0; 369 370 struct fq_codel_sched_data *q = qdisc_priv(sch); 370 371 struct nlattr *tb[TCA_FQ_CODEL_MAX + 1]; 371 372 u32 quantum = 0; ··· 444 443 q->memory_usage > q->memory_limit) { 445 444 struct sk_buff *skb = qdisc_dequeue_internal(sch, false); 446 445 447 - q->cstats.drop_len += qdisc_pkt_len(skb); 446 + if (!skb) 447 + break; 448 + 449 + dropped_pkts++; 450 + dropped_bytes += qdisc_pkt_len(skb); 448 451 rtnl_kfree_skbs(skb, skb); 449 - q->cstats.drop_count++; 450 452 } 451 - qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, q->cstats.drop_len); 452 - q->cstats.drop_count = 0; 453 - q->cstats.drop_len = 0; 453 + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); 454 454 455 455 sch_tree_unlock(sch); 456 456 return 0;
+7 -5
net/sched/sch_fq_pie.c
··· 287 287 static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt, 288 288 struct netlink_ext_ack *extack) 289 289 { 290 + unsigned int dropped_pkts = 0, dropped_bytes = 0; 290 291 struct fq_pie_sched_data *q = qdisc_priv(sch); 291 292 struct nlattr *tb[TCA_FQ_PIE_MAX + 1]; 292 - unsigned int len_dropped = 0; 293 - unsigned int num_dropped = 0; 294 293 int err; 295 294 296 295 err = nla_parse_nested(tb, TCA_FQ_PIE_MAX, opt, fq_pie_policy, extack); ··· 367 368 while (sch->q.qlen > sch->limit) { 368 369 struct sk_buff *skb = qdisc_dequeue_internal(sch, false); 369 370 370 - len_dropped += qdisc_pkt_len(skb); 371 - num_dropped += 1; 371 + if (!skb) 372 + break; 373 + 374 + dropped_pkts++; 375 + dropped_bytes += qdisc_pkt_len(skb); 372 376 rtnl_kfree_skbs(skb, skb); 373 377 } 374 - qdisc_tree_reduce_backlog(sch, num_dropped, len_dropped); 378 + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); 375 379 376 380 sch_tree_unlock(sch); 377 381 return 0;
+7 -5
net/sched/sch_hhf.c
··· 508 508 static int hhf_change(struct Qdisc *sch, struct nlattr *opt, 509 509 struct netlink_ext_ack *extack) 510 510 { 511 + unsigned int dropped_pkts = 0, dropped_bytes = 0; 511 512 struct hhf_sched_data *q = qdisc_priv(sch); 512 513 struct nlattr *tb[TCA_HHF_MAX + 1]; 513 - unsigned int qlen, prev_backlog; 514 514 int err; 515 515 u64 non_hh_quantum; 516 516 u32 new_quantum = q->quantum; ··· 561 561 usecs_to_jiffies(us)); 562 562 } 563 563 564 - qlen = sch->q.qlen; 565 - prev_backlog = sch->qstats.backlog; 566 564 while (sch->q.qlen > sch->limit) { 567 565 struct sk_buff *skb = qdisc_dequeue_internal(sch, false); 568 566 567 + if (!skb) 568 + break; 569 + 570 + dropped_pkts++; 571 + dropped_bytes += qdisc_pkt_len(skb); 569 572 rtnl_kfree_skbs(skb, skb); 570 573 } 571 - qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, 572 - prev_backlog - sch->qstats.backlog); 574 + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); 573 575 574 576 sch_tree_unlock(sch); 575 577 return 0;
+7 -5
net/sched/sch_pie.c
··· 141 141 static int pie_change(struct Qdisc *sch, struct nlattr *opt, 142 142 struct netlink_ext_ack *extack) 143 143 { 144 + unsigned int dropped_pkts = 0, dropped_bytes = 0; 144 145 struct pie_sched_data *q = qdisc_priv(sch); 145 146 struct nlattr *tb[TCA_PIE_MAX + 1]; 146 - unsigned int qlen, dropped = 0; 147 147 int err; 148 148 149 149 err = nla_parse_nested_deprecated(tb, TCA_PIE_MAX, opt, pie_policy, ··· 193 193 nla_get_u32(tb[TCA_PIE_DQ_RATE_ESTIMATOR])); 194 194 195 195 /* Drop excess packets if new limit is lower */ 196 - qlen = sch->q.qlen; 197 196 while (sch->q.qlen > sch->limit) { 198 197 struct sk_buff *skb = qdisc_dequeue_internal(sch, true); 199 198 200 - dropped += qdisc_pkt_len(skb); 201 - qdisc_qstats_backlog_dec(sch, skb); 199 + if (!skb) 200 + break; 201 + 202 + dropped_pkts++; 203 + dropped_bytes += qdisc_pkt_len(skb); 202 204 rtnl_qdisc_drop(skb, sch); 203 205 } 204 - qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); 206 + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); 205 207 206 208 sch_tree_unlock(sch); 207 209 return 0;