Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_tbf.c  Token Bucket Filter queue.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
 *              original idea by Martin Devera
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/gso.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
#include <net/pkt_cls.h>
#include <net/pkt_sched.h>

/* Simple Token Bucket Filter.
   =======================================

   SOURCE.
   -------

   None.

   Description.
   ------------

   A data flow obeys TBF with rate R and depth B, if for any
   time interval t_i...t_f the number of transmitted bits
   does not exceed B + R*(t_f - t_i).

   Packetized version of this definition:
   The sequence of packets of sizes s_i served at moments t_i
   obeys TBF, if for any i <= k:

   s_i + ... + s_k <= B + R*(t_k - t_i)

   Algorithm.
   ----------

   Let N(t_i) be B/R initially and N(t) grow continuously with time as:

   N(t+delta) = min{B/R, N(t) + delta}

   If the first packet in the queue has length S, it may be
   transmitted only at the time t_* when S/R <= N(t_*),
   and in this case N(t) jumps:

   N(t_* + 0) = N(t_* - 0) - S/R.


   In practice, QoS requires two TBFs to be applied to a data stream.
   One of them controls the steady-state burst size, while the other,
   with rate P (peak rate) and depth M (equal to the link MTU),
   limits bursts on a smaller time scale.

   It is easy to see that P > R and B > M.  If P is infinite, this double
   TBF is equivalent to a single one.

   When TBF works in reshaping mode, latency is estimated as:

   lat = max((L-B)/R, (L-M)/P)

   where L is the configured backlog limit.

   NOTES.
   ------

   If TBF throttles, it starts a watchdog timer, which will wake it up
   when it is ready to transmit.
   Note that the minimal timer resolution is 1/HZ.
   If no new packets arrive during this period,
   or if the device is not awakened by an EOI for some previous packet,
   TBF can stop its activity for 1/HZ.


   This means that with depth B, the maximal rate is

   R_crit = B*HZ

   E.g. for 10 Mbit ethernet and HZ=100 the minimal allowed B is ~10 Kbytes.

   Note that the peak rate TBF is much stricter: with MTU 1500,
   P_crit = 150 Kbytes/sec.  So, if you need greater peak
   rates, use Alpha with HZ=1000 :-)

   With classful TBF, limit is kept only for backwards compatibility.
   It is passed to the default bfifo qdisc; if the inner qdisc is
   changed, the limit is no longer effective.

   A standalone userspace sketch of the dual-bucket check follows this
   comment.
*/
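
/*
 * Illustrative sketch (fenced off from the kernel build with #if 0): a
 * minimal userspace model of the dual token bucket described above,
 * keeping both buckets in nanoseconds of accumulated idle time, just as
 * tbf_dequeue() below does.  The names used here (tb_state, tb_would_send,
 * l2t_ns) are hypothetical and exist only for this example.  A typical
 * configuration of the real qdisc looks like
 * "tc qdisc add dev eth0 root tbf rate 1mbit burst 10kb latency 70ms".
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

struct tb_state {
        int64_t  tokens;        /* burst bucket credit, in ns */
        int64_t  ptokens;       /* peak bucket credit, in ns */
        int64_t  buffer;        /* max burst credit, in ns (B/R) */
        int64_t  mtu;           /* max peak credit, in ns (M/P) */
        uint64_t rate;          /* R, bytes per second */
        uint64_t peak;          /* P, bytes per second, 0 if unused */
        int64_t  t_c;           /* last checkpoint, in ns */
};

/* Time needed to send len bytes at rate bytes/s, in ns. */
static int64_t l2t_ns(uint64_t rate, unsigned int len)
{
        return (int64_t)(len * 1000000000ull / rate);
}

/*
 * Return true if a packet of len bytes may be sent at time now (ns),
 * charging both buckets; otherwise leave the state untouched so the
 * caller can retry after max(-toks, -ptoks) nanoseconds.
 */
static bool tb_would_send(struct tb_state *q, unsigned int len, int64_t now)
{
        int64_t toks = now - q->t_c;
        int64_t ptoks = 0;

        if (toks > q->buffer)
                toks = q->buffer;
        if (q->peak) {
                ptoks = toks + q->ptokens;
                if (ptoks > q->mtu)
                        ptoks = q->mtu;
                ptoks -= l2t_ns(q->peak, len);
        }
        toks += q->tokens;
        if (toks > q->buffer)
                toks = q->buffer;
        toks -= l2t_ns(q->rate, len);

        if ((toks | ptoks) < 0)
                return false;

        q->t_c = now;
        q->tokens = toks;
        q->ptokens = ptoks;
        return true;
}
#endif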

struct tbf_sched_data {
/* Parameters */
        u32             limit;          /* Maximal length of backlog: bytes */
        u32             max_size;
        s64             buffer;         /* Token bucket depth/rate: MUST BE >= MTU/B */
        s64             mtu;
        struct psched_ratecfg rate;
        struct psched_ratecfg peak;

/* Variables */
        s64             tokens;         /* Current number of B tokens */
        s64             ptokens;        /* Current number of P tokens */
        s64             t_c;            /* Time check-point */
        struct Qdisc    *qdisc;         /* Inner qdisc, default - bfifo queue */
        struct qdisc_watchdog watchdog; /* Watchdog timer */
};


/* Time to Length, convert time in ns to length in bytes
 * to determine how many bytes can be sent in the given time.
 */
static u64 psched_ns_t2l(const struct psched_ratecfg *r,
                         u64 time_in_ns)
{
        /* The formula is:
         * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
         */
        u64 len = time_in_ns * r->rate_bytes_ps;

        do_div(len, NSEC_PER_SEC);

        if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
                do_div(len, 53);
                len = len * 48;
        }

        if (len > r->overhead)
                len -= r->overhead;
        else
                len = 0;

        return len;
}
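
/*
 * For example, at r->rate_bytes_ps = 125000 (1 Mbit/s) and
 * time_in_ns = 10000000 (10 ms), psched_ns_t2l() above yields
 * 125000 * 10000000 / NSEC_PER_SEC = 1250 bytes.  On an ATM link layer
 * the 48/53 cell payload ratio is then applied with integer division,
 * 1250 / 53 * 48 = 1104 bytes, before the configured overhead is
 * subtracted.
 */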

static void tbf_offload_change(struct Qdisc *sch)
{
        struct tbf_sched_data *q = qdisc_priv(sch);
        struct net_device *dev = qdisc_dev(sch);
        struct tc_tbf_qopt_offload qopt;

        if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
                return;

        qopt.command = TC_TBF_REPLACE;
        qopt.handle = sch->handle;
        qopt.parent = sch->parent;
        qopt.replace_params.rate = q->rate;
        qopt.replace_params.max_size = q->max_size;
        qopt.replace_params.qstats = &sch->qstats;

        dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
}

static void tbf_offload_destroy(struct Qdisc *sch)
{
        struct net_device *dev = qdisc_dev(sch);
        struct tc_tbf_qopt_offload qopt;

        if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
                return;

        qopt.command = TC_TBF_DESTROY;
        qopt.handle = sch->handle;
        qopt.parent = sch->parent;
        dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
}

static int tbf_offload_dump(struct Qdisc *sch)
{
        struct tc_tbf_qopt_offload qopt;

        qopt.command = TC_TBF_STATS;
        qopt.handle = sch->handle;
        qopt.parent = sch->parent;
        qopt.stats.bstats = &sch->bstats;
        qopt.stats.qstats = &sch->qstats;

        return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_TBF, &qopt);
}

static void tbf_offload_graft(struct Qdisc *sch, struct Qdisc *new,
                              struct Qdisc *old, struct netlink_ext_ack *extack)
{
        struct tc_tbf_qopt_offload graft_offload = {
                .handle = sch->handle,
                .parent = sch->parent,
                .child_handle = new->handle,
                .command = TC_TBF_GRAFT,
        };

        qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, old,
                                   TC_SETUP_QDISC_TBF, &graft_offload, extack);
}

/* GSO packet is too big, segment it so that tbf can transmit
 * each segment in time
 */
static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
                       struct sk_buff **to_free)
{
        struct tbf_sched_data *q = qdisc_priv(sch);
        struct sk_buff *segs, *nskb;
        netdev_features_t features = netif_skb_features(skb);
        unsigned int len = 0, prev_len = qdisc_pkt_len(skb), seg_len;
        int ret, nb;

        segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);

        if (IS_ERR_OR_NULL(segs))
                return qdisc_drop(skb, sch, to_free);

        nb = 0;
        skb_list_walk_safe(segs, segs, nskb) {
                skb_mark_not_on_list(segs);
                seg_len = segs->len;
                qdisc_skb_cb(segs)->pkt_len = seg_len;
                qdisc_skb_cb(segs)->pkt_segs = 1;
                ret = qdisc_enqueue(segs, q->qdisc, to_free);
                if (ret != NET_XMIT_SUCCESS) {
                        if (net_xmit_drop_count(ret))
                                qdisc_qstats_drop(sch);
                } else {
                        nb++;
                        len += seg_len;
                }
        }
        sch->q.qlen += nb;
        sch->qstats.backlog += len;
        if (nb > 0) {
                qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
                consume_skb(skb);
                return NET_XMIT_SUCCESS;
        }

        kfree_skb(skb);
        return NET_XMIT_DROP;
}
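
/*
 * For example, if a GSO skb with qdisc_pkt_len() of 3000 bytes is split by
 * tbf_segment() above into two 1500 byte segments that both enqueue
 * successfully, nb = 2 and len = 3000, so the parent hierarchy is told
 * about 1 - 2 = -1 packets and 3000 - 3000 = 0 bytes via
 * qdisc_tree_reduce_backlog(), i.e. one additional packet and no change in
 * byte backlog.  (Exact byte counts may differ slightly once per-segment
 * headers are accounted for.)
 */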

static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
                       struct sk_buff **to_free)
{
        struct tbf_sched_data *q = qdisc_priv(sch);
        unsigned int len = qdisc_pkt_len(skb);
        int ret;

        if (qdisc_pkt_len(skb) > q->max_size) {
                if (skb_is_gso(skb) &&
                    skb_gso_validate_mac_len(skb, q->max_size))
                        return tbf_segment(skb, sch, to_free);
                return qdisc_drop(skb, sch, to_free);
        }
        ret = qdisc_enqueue(skb, q->qdisc, to_free);
        if (ret != NET_XMIT_SUCCESS) {
                if (net_xmit_drop_count(ret))
                        qdisc_qstats_drop(sch);
                return ret;
        }

        sch->qstats.backlog += len;
        sch->q.qlen++;
        return NET_XMIT_SUCCESS;
}

static bool tbf_peak_present(const struct tbf_sched_data *q)
{
        return q->peak.rate_bytes_ps;
}

static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
{
        struct tbf_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb;

        skb = q->qdisc->ops->peek(q->qdisc);

        if (skb) {
                s64 now;
                s64 toks;
                s64 ptoks = 0;
                unsigned int len = qdisc_pkt_len(skb);

                now = ktime_get_ns();
                toks = min_t(s64, now - q->t_c, q->buffer);

                if (tbf_peak_present(q)) {
                        ptoks = toks + q->ptokens;
                        if (ptoks > q->mtu)
                                ptoks = q->mtu;
                        ptoks -= (s64) psched_l2t_ns(&q->peak, len);
                }
                toks += q->tokens;
                if (toks > q->buffer)
                        toks = q->buffer;
                toks -= (s64) psched_l2t_ns(&q->rate, len);

                if ((toks|ptoks) >= 0) {
                        skb = qdisc_dequeue_peeked(q->qdisc);
                        if (unlikely(!skb))
                                return NULL;

                        q->t_c = now;
                        q->tokens = toks;
                        q->ptokens = ptoks;
                        qdisc_qstats_backlog_dec(sch, skb);
                        sch->q.qlen--;
                        qdisc_bstats_update(sch, skb);
                        return skb;
                }

                qdisc_watchdog_schedule_ns(&q->watchdog,
                                           now + max_t(long, -toks, -ptoks));

                /* Maybe we have a shorter packet in the queue,
                   which can be sent now.  It sounds tempting,
                   but it is wrong in principle: we MUST NOT
                   reorder packets under these circumstances.

                   Really, if we split the flow into independent
                   subflows, it would be a very good solution.
                   That is the main idea of all FQ algorithms
                   (cf. CSZ, HPFQ, HFSC).
                 */

                qdisc_qstats_overlimit(sch);
        }
        return NULL;
}
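
/*
 * For example, with rate = 125000 bytes/s (1 Mbit/s), a 1500 byte packet
 * costs 12 ms worth of tokens.  If only 4 ms of credit has accumulated,
 * toks ends up at -8000000 ns and the watchdog above is armed 8 ms in the
 * future, when the deficit will have been replenished.
 */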

static void tbf_reset(struct Qdisc *sch)
{
        struct tbf_sched_data *q = qdisc_priv(sch);

        qdisc_reset(q->qdisc);
        q->t_c = ktime_get_ns();
        q->tokens = q->buffer;
        q->ptokens = q->mtu;
        qdisc_watchdog_cancel(&q->watchdog);
}

static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
        [TCA_TBF_PARMS]   = { .len = sizeof(struct tc_tbf_qopt) },
        [TCA_TBF_RTAB]    = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
        [TCA_TBF_PTAB]    = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
        [TCA_TBF_RATE64]  = { .type = NLA_U64 },
        [TCA_TBF_PRATE64] = { .type = NLA_U64 },
        [TCA_TBF_BURST]   = { .type = NLA_U32 },
        [TCA_TBF_PBURST]  = { .type = NLA_U32 },
};

static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
                      struct netlink_ext_ack *extack)
{
        int err;
        struct tbf_sched_data *q = qdisc_priv(sch);
        struct nlattr *tb[TCA_TBF_MAX + 1];
        struct tc_tbf_qopt *qopt;
        struct Qdisc *child = NULL;
        struct Qdisc *old = NULL;
        struct psched_ratecfg rate;
        struct psched_ratecfg peak;
        u64 max_size;
        s64 buffer, mtu;
        u64 rate64 = 0, prate64 = 0;

        err = nla_parse_nested_deprecated(tb, TCA_TBF_MAX, opt, tbf_policy,
                                          NULL);
        if (err < 0)
                return err;

        err = -EINVAL;
        if (tb[TCA_TBF_PARMS] == NULL)
                goto done;

        qopt = nla_data(tb[TCA_TBF_PARMS]);
        if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
                qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
                                              tb[TCA_TBF_RTAB],
                                              NULL));

        if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
                qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
                                              tb[TCA_TBF_PTAB],
                                              NULL));

        buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
        mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);

        if (tb[TCA_TBF_RATE64])
                rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
        psched_ratecfg_precompute(&rate, &qopt->rate, rate64);

        if (tb[TCA_TBF_BURST]) {
                max_size = nla_get_u32(tb[TCA_TBF_BURST]);
                buffer = psched_l2t_ns(&rate, max_size);
        } else {
                max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
        }

        if (qopt->peakrate.rate) {
                if (tb[TCA_TBF_PRATE64])
                        prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
                psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
                if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
                        pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equal to rate %llu!\n",
                                            peak.rate_bytes_ps, rate.rate_bytes_ps);
                        err = -EINVAL;
                        goto done;
                }

                if (tb[TCA_TBF_PBURST]) {
                        u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);

                        max_size = min_t(u32, max_size, pburst);
                        mtu = psched_l2t_ns(&peak, pburst);
                } else {
                        max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
                }
        } else {
                memset(&peak, 0, sizeof(peak));
        }

        if (max_size < psched_mtu(qdisc_dev(sch)))
                pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u)!\n",
                                    max_size, qdisc_dev(sch)->name,
                                    psched_mtu(qdisc_dev(sch)));

        if (!max_size) {
                err = -EINVAL;
                goto done;
        }

        if (q->qdisc != &noop_qdisc) {
                err = fifo_set_limit(q->qdisc, qopt->limit);
                if (err)
                        goto done;
        } else if (qopt->limit > 0) {
                child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit,
                                         extack);
                if (IS_ERR(child)) {
                        err = PTR_ERR(child);
                        goto done;
                }

                /* child is fifo, no need to check for noop_qdisc */
                qdisc_hash_add(child, true);
        }

        sch_tree_lock(sch);
        if (child) {
                qdisc_purge_queue(q->qdisc);
                old = q->qdisc;
                q->qdisc = child;
        }
        q->limit = qopt->limit;
        if (tb[TCA_TBF_PBURST])
                q->mtu = mtu;
        else
                q->mtu = PSCHED_TICKS2NS(qopt->mtu);
        q->max_size = max_size;
        if (tb[TCA_TBF_BURST])
                q->buffer = buffer;
        else
                q->buffer = PSCHED_TICKS2NS(qopt->buffer);
        q->tokens = q->buffer;
        q->ptokens = q->mtu;

        memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
        memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));

        sch_tree_unlock(sch);
        qdisc_put(old);
        err = 0;

        tbf_offload_change(sch);
done:
        return err;
}

static int tbf_init(struct Qdisc *sch, struct nlattr *opt,
                    struct netlink_ext_ack *extack)
{
        struct tbf_sched_data *q = qdisc_priv(sch);

        qdisc_watchdog_init(&q->watchdog, sch);
        q->qdisc = &noop_qdisc;

        if (!opt)
                return -EINVAL;

        q->t_c = ktime_get_ns();

        return tbf_change(sch, opt, extack);
}

static void tbf_destroy(struct Qdisc *sch)
{
        struct tbf_sched_data *q = qdisc_priv(sch);

        qdisc_watchdog_cancel(&q->watchdog);
        tbf_offload_destroy(sch);
        qdisc_put(q->qdisc);
}

static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
        struct tbf_sched_data *q = qdisc_priv(sch);
        struct nlattr *nest;
        struct tc_tbf_qopt opt;
        int err;

        err = tbf_offload_dump(sch);
        if (err)
                return err;

        nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
        if (nest == NULL)
                goto nla_put_failure;

        opt.limit = q->limit;
        psched_ratecfg_getrate(&opt.rate, &q->rate);
        if (tbf_peak_present(q))
                psched_ratecfg_getrate(&opt.peakrate, &q->peak);
        else
                memset(&opt.peakrate, 0, sizeof(opt.peakrate));
        opt.mtu = PSCHED_NS2TICKS(q->mtu);
        opt.buffer = PSCHED_NS2TICKS(q->buffer);
        if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
                goto nla_put_failure;
        if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
            nla_put_u64_64bit(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps,
                              TCA_TBF_PAD))
                goto nla_put_failure;
        if (tbf_peak_present(q) &&
            q->peak.rate_bytes_ps >= (1ULL << 32) &&
            nla_put_u64_64bit(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps,
                              TCA_TBF_PAD))
                goto nla_put_failure;

        return nla_nest_end(skb, nest);

nla_put_failure:
        nla_nest_cancel(skb, nest);
        return -1;
}

static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
                          struct sk_buff *skb, struct tcmsg *tcm)
{
        struct tbf_sched_data *q = qdisc_priv(sch);

        tcm->tcm_handle |= TC_H_MIN(1);
        tcm->tcm_info = q->qdisc->handle;

        return 0;
}

static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
                     struct Qdisc **old, struct netlink_ext_ack *extack)
{
        struct tbf_sched_data *q = qdisc_priv(sch);

        if (new == NULL)
                new = &noop_qdisc;

        *old = qdisc_replace(sch, new, &q->qdisc);

        tbf_offload_graft(sch, new, *old, extack);
        return 0;
}

static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
        struct tbf_sched_data *q = qdisc_priv(sch);

        return q->qdisc;
}

static unsigned long tbf_find(struct Qdisc *sch, u32 classid)
{
        return 1;
}

static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
        if (!walker->stop)
                tc_qdisc_stats_dump(sch, 1, walker);
}

static const struct Qdisc_class_ops tbf_class_ops = {
        .graft          = tbf_graft,
        .leaf           = tbf_leaf,
        .find           = tbf_find,
        .walk           = tbf_walk,
        .dump           = tbf_dump_class,
};

static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
        .next           = NULL,
        .cl_ops         = &tbf_class_ops,
        .id             = "tbf",
        .priv_size      = sizeof(struct tbf_sched_data),
        .enqueue        = tbf_enqueue,
        .dequeue        = tbf_dequeue,
        .peek           = qdisc_peek_dequeued,
        .init           = tbf_init,
        .reset          = tbf_reset,
        .destroy        = tbf_destroy,
        .change         = tbf_change,
        .dump           = tbf_dump,
        .owner          = THIS_MODULE,
};
MODULE_ALIAS_NET_SCH("tbf");

static int __init tbf_module_init(void)
{
        return register_qdisc(&tbf_qdisc_ops);
}

static void __exit tbf_module_exit(void)
{
        unregister_qdisc(&tbf_qdisc_ops);
}
module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Token Bucket Filter qdisc");