Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
net/sched/sch_cbq.c at v2.6.28-rc5 (2090 lines, 49 kB)
/*
 * net/sched/sch_cbq.c	Class-Based Queueing discipline.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>


/*	Class-Based Queueing (CBQ) algorithm.
	=======================================

	Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
		 Management Models for Packet Networks",
		 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995

		 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995

		 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
		 Parameters", 1996

		 [4] Sally Floyd and Michael Speer, "Experimental Results
		 for Class-Based Queueing", 1998, not published.

	-----------------------------------------------------------------------

	The algorithm skeleton was taken from the NS simulator cbq.cc.
	Anyone wanting to check this code against the LBL version should
	take into account that ONLY the skeleton was borrowed; the
	implementation is different. Particularly:

	--- The WRR algorithm is different. Our version looks more
	reasonable (I hope) and works when quanta are allowed to be
	less than MTU, which is always the case when real-time classes
	have small rates. Note that the statement of [3] is
	incomplete: delay may actually be estimated even if the class
	per-round allotment is less than MTU. Namely, if the per-round
	allotment is W*r_i, and r_1+...+r_k = r < 1

	delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B

	In the worst case we have an IntServ estimate with D = W*r+k*MTU
	and C = MTU*r. The proof (if correct at all) is trivial.


	--- It seems that cbq-2.0 is not very accurate. At least, I cannot
	interpret some places, which look like wrong translations
	from NS. Anyone is advised to find these differences
	and explain to me why I am wrong 8).

	--- Linux has no EOI event, so we cannot estimate true class
	idle time. The workaround is to consider the next dequeue event
	as a sign that the previous packet is finished. This is wrong
	because of internal device queueing, but on a permanently loaded
	link it is true. Moreover, combined with the clock integrator,
	this scheme looks very close to an ideal solution.  */
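/*
 * Illustrative sketch (added for exposition, not part of the original
 * file or of any kernel build): evaluating the WRR delay bound quoted
 * above for made-up numbers. All names and values here (mtu, W, r_i,
 * r, k, B) are hypothetical stand-ins, with W the per-round budget in
 * bytes and B the link bandwidth in bytes/sec.
 */
#if 0
#include <math.h>
#include <stdio.h>

static double cbq_delay_bound(double mtu, double W, double r_i,
			      double r, int k, double B)
{
	/* delay_i <= (ceil(MTU/(W*r_i))*W*r + W*r + k*MTU)/B */
	return (ceil(mtu / (W * r_i)) * W * r + W * r + k * mtu) / B;
}

int main(void)
{
	/* 1500-byte MTU, 12500-byte round, 10% class share of r = 0.6,
	   three classes, 10 Mbit/s link: bound is 0.0216 s */
	printf("bound: %g s\n",
	       cbq_delay_bound(1500.0, 12500.0, 0.1, 0.6, 3, 1.25e6));
	return 0;
}
#endif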
struct cbq_sched_data;


struct cbq_class
{
	struct Qdisc_class_common common;
	struct cbq_class	*next_alive;	/* next class with backlog in this priority band */

/* Parameters */
	unsigned char		priority;	/* class priority */
	unsigned char		priority2;	/* priority to be used after overlimit */
	unsigned char		ewma_log;	/* time constant for idle time calculation */
	unsigned char		ovl_strategy;
#ifdef CONFIG_NET_CLS_ACT
	unsigned char		police;
#endif

	u32			defmap;

	/* Link-sharing scheduler parameters */
	long			maxidle;	/* Class parameters: see below. */
	long			offtime;
	long			minidle;
	u32			avpkt;
	struct qdisc_rate_table	*R_tab;

	/* Overlimit strategy parameters */
	void			(*overlimit)(struct cbq_class *cl);
	psched_tdiff_t		penalty;

	/* General scheduler (WRR) parameters */
	long			allot;
	long			quantum;	/* Allotment per WRR round */
	long			weight;		/* Relative allotment: see below */

	struct Qdisc		*qdisc;		/* Ptr to CBQ discipline */
	struct cbq_class	*split;		/* Ptr to split node */
	struct cbq_class	*share;		/* Ptr to LS parent in the class tree */
	struct cbq_class	*tparent;	/* Ptr to tree parent in the class tree */
	struct cbq_class	*borrow;	/* NULL if class is bandwidth limited;
						   parent otherwise */
	struct cbq_class	*sibling;	/* Sibling chain */
	struct cbq_class	*children;	/* Pointer to children chain */

	struct Qdisc		*q;		/* Elementary queueing discipline */


/* Variables */
	unsigned char		cpriority;	/* Effective priority */
	unsigned char		delayed;
	unsigned char		level;		/* level of the class in hierarchy:
						   0 for leaf classes, and maximal
						   level of children + 1 for nodes.
						 */

	psched_time_t		last;		/* Last end of service */
	psched_time_t		undertime;
	long			avgidle;
	long			deficit;	/* Saved deficit for WRR */
	psched_time_t		penalized;
	struct gnet_stats_basic	bstats;
	struct gnet_stats_queue	qstats;
	struct gnet_stats_rate_est rate_est;
	struct tc_cbq_xstats	xstats;

	struct tcf_proto	*filter_list;

	int			refcnt;
	int			filters;

	struct cbq_class	*defaults[TC_PRIO_MAX+1];
};

struct cbq_sched_data
{
	struct Qdisc_class_hash	clhash;			/* Hash table of all classes */
	int			nclasses[TC_CBQ_MAXPRIO+1];
	unsigned		quanta[TC_CBQ_MAXPRIO+1];

	struct cbq_class	link;

	unsigned		activemask;
	struct cbq_class	*active[TC_CBQ_MAXPRIO+1];	/* List of all classes
								   with backlog */

#ifdef CONFIG_NET_CLS_ACT
	struct cbq_class	*rx_class;
#endif
	struct cbq_class	*tx_class;
	struct cbq_class	*tx_borrowed;
	int			tx_len;
	psched_time_t		now;		/* Cached timestamp */
	psched_time_t		now_rt;		/* Cached real time */
	unsigned		pmask;

	struct hrtimer		delay_timer;
	struct qdisc_watchdog	watchdog;	/* Watchdog timer,
						   started when CBQ has
						   backlog, but cannot
						   transmit just now */
	psched_tdiff_t		wd_expires;
	int			toplevel;
	u32			hgenerator;
};


#define L2T(cl,len)	qdisc_l2t((cl)->R_tab,len)

static __inline__ struct cbq_class *
cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
{
	struct Qdisc_class_common *clc;

	clc = qdisc_class_find(&q->clhash, classid);
	if (clc == NULL)
		return NULL;
	return container_of(clc, struct cbq_class, common);
}

#ifdef CONFIG_NET_CLS_ACT

static struct cbq_class *
cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
{
	struct cbq_class *cl, *new;

	for (cl = this->tparent; cl; cl = cl->tparent)
		if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this)
			return new;

	return NULL;
}

#endif
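/*
 * Illustrative sketch (added for exposition, not part of the original
 * file): how a split node's defaults[] table resolves a priority,
 * mirroring the fallback done in cbq_classify() below: try the exact
 * priority slot first, then the best-effort slot. The ex_* names and
 * constants are stand-ins for the kernel's types.
 */
#if 0
#include <stddef.h>

#define EX_PRIO_MAX	15	/* stand-in for TC_PRIO_MAX */
#define EX_BESTEFFORT	0	/* stand-in for TC_PRIO_BESTEFFORT */

struct ex_class { int level; };

static struct ex_class *
ex_resolve_default(struct ex_class *defaults[EX_PRIO_MAX + 1],
		   unsigned int prio)
{
	struct ex_class *cl = defaults[prio & EX_PRIO_MAX];

	if (cl == NULL)				/* no default for this band */
		cl = defaults[EX_BESTEFFORT];	/* fall back to best effort */
	return cl;				/* may still be NULL */
}
#endif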
/* Classify packet. The procedure is pretty complicated, but
   it allows us to combine link sharing and priority scheduling
   transparently.

   Namely, you can put link-sharing rules (e.g. route based) at the
   root of CBQ, so that it resolves to split nodes. Then packets are
   classified by logical priority, or a more specific classifier may
   be attached to the split node.
 */

static struct cbq_class *
cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *head = &q->link;
	struct cbq_class **defmap;
	struct cbq_class *cl = NULL;
	u32 prio = skb->priority;
	struct tcf_result res;

	/*
	 *  Step 1. If skb->priority points to one of our classes, use it.
	 */
	if (TC_H_MAJ(prio^sch->handle) == 0 &&
	    (cl = cbq_class_lookup(q, prio)) != NULL)
		return cl;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	for (;;) {
		int result = 0;
		defmap = head->defaults;

		/*
		 * Step 2+n. Apply classifier.
		 */
		if (!head->filter_list ||
		    (result = tc_classify_compat(skb, head->filter_list, &res)) < 0)
			goto fallback;

		if ((cl = (void*)res.class) == NULL) {
			if (TC_H_MAJ(res.classid))
				cl = cbq_class_lookup(q, res.classid);
			else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
				cl = defmap[TC_PRIO_BESTEFFORT];

			if (cl == NULL || cl->level >= head->level)
				goto fallback;
		}

#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_QUEUED:
		case TC_ACT_STOLEN:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
		case TC_ACT_SHOT:
			return NULL;
		case TC_ACT_RECLASSIFY:
			return cbq_reclassify(skb, cl);
		}
#endif
		if (cl->level == 0)
			return cl;

		/*
		 * Step 3+n. If classifier selected a link sharing class,
		 *	     apply agency specific classifier.
		 *	     Repeat this procedure until we hit a leaf node.
		 */
		head = cl;
	}

fallback:
	cl = head;

	/*
	 * Step 4. No success...
	 */
	if (TC_H_MAJ(prio) == 0 &&
	    !(cl = head->defaults[prio&TC_PRIO_MAX]) &&
	    !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
		return head;

	return cl;
}

/*
   A packet has just been enqueued on an empty class.
   cbq_activate_class adds it to the tail of the active class list
   of its priority band.
 */

static __inline__ void cbq_activate_class(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	int prio = cl->cpriority;
	struct cbq_class *cl_tail;

	cl_tail = q->active[prio];
	q->active[prio] = cl;

	if (cl_tail != NULL) {
		cl->next_alive = cl_tail->next_alive;
		cl_tail->next_alive = cl;
	} else {
		cl->next_alive = cl;
		q->activemask |= (1<<prio);
	}
}
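/*
 * Illustrative sketch (added for exposition, not part of the original
 * file): the active-class ring maintained by cbq_activate_class()
 * above. q->active[prio] points at the TAIL of a circular singly
 * linked list threaded through ->next_alive, so the head is
 * tail->next_alive and appending is O(1). ex_node is a stand-in type.
 */
#if 0
struct ex_node { struct ex_node *next_alive; };

static void ex_ring_append(struct ex_node **tail, struct ex_node *n)
{
	if (*tail) {
		n->next_alive = (*tail)->next_alive;	/* old head */
		(*tail)->next_alive = n;
	} else {
		n->next_alive = n;			/* singleton ring */
	}
	*tail = n;	/* new node becomes the tail, as in
			   cbq_activate_class() */
}
#endif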
/*
   Unlink class from the active chain.
   Note that this same procedure is done directly in cbq_dequeue*
   during the round-robin procedure.
 */

static void cbq_deactivate_class(struct cbq_class *this)
{
	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
	int prio = this->cpriority;
	struct cbq_class *cl;
	struct cbq_class *cl_prev = q->active[prio];

	do {
		cl = cl_prev->next_alive;
		if (cl == this) {
			cl_prev->next_alive = cl->next_alive;
			cl->next_alive = NULL;

			if (cl == q->active[prio]) {
				q->active[prio] = cl_prev;
				if (cl == q->active[prio]) {
					q->active[prio] = NULL;
					q->activemask &= ~(1<<prio);
					return;
				}
			}
			return;
		}
	} while ((cl_prev = cl) != q->active[prio]);
}

static void
cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
{
	int toplevel = q->toplevel;

	if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
		psched_time_t now;
		psched_tdiff_t incr;

		now = psched_get_time();
		incr = now - q->now_rt;
		now = q->now + incr;

		do {
			if (cl->undertime < now) {
				q->toplevel = cl->level;
				return;
			}
		} while ((cl=cl->borrow) != NULL && toplevel > cl->level);
	}
}

static int
cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	int uninitialized_var(ret);
	struct cbq_class *cl = cbq_classify(skb, sch, &ret);

#ifdef CONFIG_NET_CLS_ACT
	q->rx_class = cl;
#endif
	if (cl == NULL) {
		if (ret & __NET_XMIT_BYPASS)
			sch->qstats.drops++;
		kfree_skb(skb);
		return ret;
	}

#ifdef CONFIG_NET_CLS_ACT
	cl->q->__parent = sch;
#endif
	ret = qdisc_enqueue(skb, cl->q);
	if (ret == NET_XMIT_SUCCESS) {
		sch->q.qlen++;
		sch->bstats.packets++;
		sch->bstats.bytes += qdisc_pkt_len(skb);
		cbq_mark_toplevel(q, cl);
		if (!cl->next_alive)
			cbq_activate_class(cl);
		return ret;
	}

	if (net_xmit_drop_count(ret)) {
		sch->qstats.drops++;
		cbq_mark_toplevel(q, cl);
		cl->qstats.drops++;
	}
	return ret;
}

static int
cbq_requeue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl;
	int ret;

	if ((cl = q->tx_class) == NULL) {
		kfree_skb(skb);
		sch->qstats.drops++;
		return NET_XMIT_CN;
	}
	q->tx_class = NULL;

	cbq_mark_toplevel(q, cl);

#ifdef CONFIG_NET_CLS_ACT
	q->rx_class = cl;
	cl->q->__parent = sch;
#endif
	if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) {
		sch->q.qlen++;
		sch->qstats.requeues++;
		if (!cl->next_alive)
			cbq_activate_class(cl);
		return 0;
	}
	if (net_xmit_drop_count(ret)) {
		sch->qstats.drops++;
		cl->qstats.drops++;
	}
	return ret;
}
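/*
 * Illustrative sketch (added for exposition, not part of the original
 * file): the off-time computation used by cbq_ovl_classic() below.
 * With W = 2^-ewma_log and cl->avgidle scaled by 1/W, the term
 * subtracted from the delay is (1 - W) * (-avgidle):
 *
 *	delay = (undertime - now) + offtime - (1 - W)*(-avgidle)
 *
 * The parameters are stand-ins for the corresponding class fields.
 */
#if 0
static long ex_ovl_delay(long undertime_minus_now, long offtime,
			 long avgidle, unsigned char ewma_log)
{
	long delay = undertime_minus_now + offtime;

	if (avgidle < 0)
		delay -= (-avgidle) - ((-avgidle) >> ewma_log);
	return delay > 0 ? delay : 1;	/* never sleep for <= 0 ticks */
}
#endif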
/* Overlimit actions */

/* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */

static void cbq_ovl_classic(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	psched_tdiff_t delay = cl->undertime - q->now;

	if (!cl->delayed) {
		delay += cl->offtime;

		/*
		   The class goes to sleep, so that it will have no
		   chance to work avgidle. Let's forgive it 8)

		   BTW cbq-2.0 has a bug in this place; apparently they
		   forgot to shift the value by cl->ewma_log.
		 */
		if (cl->avgidle < 0)
			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
		if (cl->avgidle < cl->minidle)
			cl->avgidle = cl->minidle;
		if (delay <= 0)
			delay = 1;
		cl->undertime = q->now + delay;

		cl->xstats.overactions++;
		cl->delayed = 1;
	}
	if (q->wd_expires == 0 || q->wd_expires > delay)
		q->wd_expires = delay;

	/* Dirty work! We must schedule wakeups based on the
	   real available rate, rather than the leaf rate,
	   which may be tiny (even zero).
	 */
	if (q->toplevel == TC_CBQ_MAXLEVEL) {
		struct cbq_class *b;
		psched_tdiff_t base_delay = q->wd_expires;

		for (b = cl->borrow; b; b = b->borrow) {
			delay = b->undertime - q->now;
			if (delay < base_delay) {
				if (delay <= 0)
					delay = 1;
				base_delay = delay;
			}
		}

		q->wd_expires = base_delay;
	}
}

/* TC_CBQ_OVL_RCLASSIC: penalize classes in the hierarchy by offtime
   when they go overlimit
 */

static void cbq_ovl_rclassic(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	struct cbq_class *this = cl;

	do {
		if (cl->level > q->toplevel) {
			cl = NULL;
			break;
		}
	} while ((cl = cl->borrow) != NULL);

	if (cl == NULL)
		cl = this;
	cbq_ovl_classic(cl);
}

/* TC_CBQ_OVL_DELAY: delay until the class goes underlimit */

static void cbq_ovl_delay(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	psched_tdiff_t delay = cl->undertime - q->now;

	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(cl->qdisc)->state))
		return;

	if (!cl->delayed) {
		psched_time_t sched = q->now;
		ktime_t expires;

		delay += cl->offtime;
		if (cl->avgidle < 0)
			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
		if (cl->avgidle < cl->minidle)
			cl->avgidle = cl->minidle;
		cl->undertime = q->now + delay;

		if (delay > 0) {
			sched += delay + cl->penalty;
			cl->penalized = sched;
			cl->cpriority = TC_CBQ_MAXPRIO;
			q->pmask |= (1<<TC_CBQ_MAXPRIO);

			expires = ktime_set(0, 0);
			expires = ktime_add_ns(expires, PSCHED_US2NS(sched));
			if (hrtimer_try_to_cancel(&q->delay_timer) &&
			    ktime_to_ns(ktime_sub(
					hrtimer_get_expires(&q->delay_timer),
					expires)) > 0)
				hrtimer_set_expires(&q->delay_timer, expires);
			hrtimer_restart(&q->delay_timer);
			cl->delayed = 1;
			cl->xstats.overactions++;
			return;
		}
		delay = 1;
	}
	if (q->wd_expires == 0 || q->wd_expires > delay)
		q->wd_expires = delay;
}

/* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */

static void cbq_ovl_lowprio(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);

	cl->penalized = q->now + cl->penalty;

	if (cl->cpriority != cl->priority2) {
		cl->cpriority = cl->priority2;
		q->pmask |= (1<<cl->cpriority);
		cl->xstats.overactions++;
	}
	cbq_ovl_classic(cl);
}

/* TC_CBQ_OVL_DROP: penalize class by dropping */

static void cbq_ovl_drop(struct cbq_class *cl)
{
	if (cl->q->ops->drop)
		if (cl->q->ops->drop(cl->q))
			cl->qdisc->q.qlen--;
	cl->xstats.overactions++;
	cbq_ovl_classic(cl);
}
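/*
 * Illustrative sketch (added for exposition, not part of the original
 * file): the bitmask walk used by cbq_undelay() and cbq_dequeue_1()
 * below. ffz(~mask) yields the index of the lowest set bit of mask;
 * __builtin_ctz is a userspace stand-in for it here.
 */
#if 0
static void ex_for_each_prio(unsigned int mask,
			     void (*fn)(int prio, void *arg), void *arg)
{
	while (mask) {
		int prio = __builtin_ctz(mask);	/* == ffz(~mask) here */

		mask &= ~(1U << prio);		/* clear it and continue */
		fn(prio, arg);
	}
}
#endif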
static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
				       psched_time_t now)
{
	struct cbq_class *cl;
	struct cbq_class *cl_prev = q->active[prio];
	psched_time_t sched = now;

	if (cl_prev == NULL)
		return 0;

	do {
		cl = cl_prev->next_alive;
		if (now - cl->penalized > 0) {
			cl_prev->next_alive = cl->next_alive;
			cl->next_alive = NULL;
			cl->cpriority = cl->priority;
			cl->delayed = 0;
			cbq_activate_class(cl);

			if (cl == q->active[prio]) {
				q->active[prio] = cl_prev;
				if (cl == q->active[prio]) {
					q->active[prio] = NULL;
					return 0;
				}
			}

			cl = cl_prev->next_alive;
		} else if (sched - cl->penalized > 0)
			sched = cl->penalized;
	} while ((cl_prev = cl) != q->active[prio]);

	return sched - now;
}

static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
{
	struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
						delay_timer);
	struct Qdisc *sch = q->watchdog.qdisc;
	psched_time_t now;
	psched_tdiff_t delay = 0;
	unsigned pmask;

	now = psched_get_time();

	pmask = q->pmask;
	q->pmask = 0;

	while (pmask) {
		int prio = ffz(~pmask);
		psched_tdiff_t tmp;

		pmask &= ~(1<<prio);

		tmp = cbq_undelay_prio(q, prio, now);
		if (tmp > 0) {
			q->pmask |= 1<<prio;
			if (tmp < delay || delay == 0)
				delay = tmp;
		}
	}

	if (delay) {
		ktime_t time;

		time = ktime_set(0, 0);
		time = ktime_add_ns(time, PSCHED_US2NS(now + delay));
		hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS);
	}

	sch->flags &= ~TCQ_F_THROTTLED;
	__netif_schedule(qdisc_root(sch));
	return HRTIMER_NORESTART;
}

#ifdef CONFIG_NET_CLS_ACT
static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
{
	struct Qdisc *sch = child->__parent;
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = q->rx_class;

	q->rx_class = NULL;

	if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
		int ret;

		cbq_mark_toplevel(q, cl);

		q->rx_class = cl;
		cl->q->__parent = sch;

		ret = qdisc_enqueue(skb, cl->q);
		if (ret == NET_XMIT_SUCCESS) {
			sch->q.qlen++;
			sch->bstats.packets++;
			sch->bstats.bytes += qdisc_pkt_len(skb);
			if (!cl->next_alive)
				cbq_activate_class(cl);
			return 0;
		}
		if (net_xmit_drop_count(ret))
			sch->qstats.drops++;
		return 0;
	}

	sch->qstats.drops++;
	return -1;
}
#endif

/*
   This is a mission-critical procedure.

   We "regenerate" the toplevel cutoff if the transmitting class
   has backlog and is not regulated. This is not part of the
   original CBQ description, but looks more reasonable.
   Probably, it is wrong. This question needs further investigation.
*/

static __inline__ void
cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
		    struct cbq_class *borrowed)
{
	if (cl && q->toplevel >= borrowed->level) {
		if (cl->q->q.qlen > 1) {
			do {
				if (borrowed->undertime == PSCHED_PASTPERFECT) {
					q->toplevel = borrowed->level;
					return;
				}
			} while ((borrowed=borrowed->borrow) != NULL);
		}
#if 0
	/* It is not necessary now. Uncommenting it
	   will save CPU cycles, but decrease fairness.
	 */
		q->toplevel = TC_CBQ_MAXLEVEL;
#endif
	}
}
static void
cbq_update(struct cbq_sched_data *q)
{
	struct cbq_class *this = q->tx_class;
	struct cbq_class *cl = this;
	int len = q->tx_len;

	q->tx_class = NULL;

	for ( ; cl; cl = cl->share) {
		long avgidle = cl->avgidle;
		long idle;

		cl->bstats.packets++;
		cl->bstats.bytes += len;

		/*
		   (now - last) is the total time between the right edges
		   of two packets. (last_pktlen/rate) is the "virtual"
		   busy time, so that

		   idle = (now - last) - last_pktlen/rate
		 */

		idle = q->now - cl->last;
		if ((unsigned long)idle > 128*1024*1024) {
			avgidle = cl->maxidle;
		} else {
			idle -= L2T(cl, len);

		/* true_avgidle := (1-W)*true_avgidle + W*idle,
		   where W=2^{-ewma_log}. But cl->avgidle is scaled:
		   cl->avgidle == true_avgidle/W,
		   hence:
		 */
			avgidle += idle - (avgidle>>cl->ewma_log);
		}

		if (avgidle <= 0) {
			/* Overlimit or at-limit */

			if (avgidle < cl->minidle)
				avgidle = cl->minidle;

			cl->avgidle = avgidle;

			/* Calculate the expected time when this class
			   will be allowed to send.
			   It will occur when:
			   (1-W)*true_avgidle + W*delay = 0, i.e.
			   idle = (1/W - 1)*(-true_avgidle)
			   or
			   idle = (1 - W)*(-cl->avgidle);
			 */
			idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);

			/*
			   That is not all.
			   To maintain the rate allocated to the class,
			   we add to undertime the virtual clock time
			   necessary to complete the transmitted packet.
			   (len/phys_bandwidth has already elapsed by
			   the moment of cbq_update.)
			 */

			idle -= L2T(&q->link, len);
			idle += L2T(cl, len);

			cl->undertime = q->now + idle;
		} else {
			/* Underlimit */

			cl->undertime = PSCHED_PASTPERFECT;
			if (avgidle > cl->maxidle)
				cl->avgidle = cl->maxidle;
			else
				cl->avgidle = avgidle;
		}
		cl->last = q->now;
	}

	cbq_update_toplevel(q, this, q->tx_borrowed);
}
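/*
 * Worked example for cbq_update() above (added for exposition; the
 * numbers are made up): take ewma_log = 5, so W = 1/32. Suppose
 * idle = (now - last) - L2T(cl, len) comes out to -64 ticks (the class
 * sent faster than its rate) and the scaled avgidle was 32. Then
 *
 *	avgidle += idle - (avgidle >> ewma_log)
 *	        =  32 + (-64) - 1 = -33
 *
 * avgidle <= 0, so the class is now overlimit, and its next undertime
 * becomes
 *
 *	idle = (-avgidle) - ((-avgidle) >> ewma_log) = 33 - 1 = 32
 *	undertime = q->now + 32 - L2T(&q->link, len) + L2T(cl, len)
 */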
static __inline__ struct cbq_class *
cbq_under_limit(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	struct cbq_class *this_cl = cl;

	if (cl->tparent == NULL)
		return cl;

	if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
		cl->delayed = 0;
		return cl;
	}

	do {
		/* This is a very suspicious place. Currently the overlimit
		   action is generated for non-bounded classes only if the
		   link is completely congested. Though this agrees with
		   the ancestor-only paradigm, it looks very stupid.
		   In particular, it means that this chunk of code will
		   either never be called or will result in a strong
		   amplification of burstiness. Dangerous, silly, and yet
		   no other solution exists.
		 */
		if ((cl = cl->borrow) == NULL) {
			this_cl->qstats.overlimits++;
			this_cl->overlimit(this_cl);
			return NULL;
		}
		if (cl->level > q->toplevel)
			return NULL;
	} while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);

	cl->delayed = 0;
	return cl;
}

static __inline__ struct sk_buff *
cbq_dequeue_prio(struct Qdisc *sch, int prio)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl_tail, *cl_prev, *cl;
	struct sk_buff *skb;
	int deficit;

	cl_tail = cl_prev = q->active[prio];
	cl = cl_prev->next_alive;

	do {
		deficit = 0;

		/* Start round */
		do {
			struct cbq_class *borrow = cl;

			if (cl->q->q.qlen &&
			    (borrow = cbq_under_limit(cl)) == NULL)
				goto skip_class;

			if (cl->deficit <= 0) {
				/* Class exhausted its allotment for
				   this round. Switch to the next one.
				 */
				deficit = 1;
				cl->deficit += cl->quantum;
				goto next_class;
			}

			skb = cl->q->dequeue(cl->q);

			/* Class did not give us any skb :-(
			   It could occur even if cl->q->q.qlen != 0,
			   e.g. if cl->q == "tbf"
			 */
			if (skb == NULL)
				goto skip_class;

			cl->deficit -= qdisc_pkt_len(skb);
			q->tx_class = cl;
			q->tx_borrowed = borrow;
			if (borrow != cl) {
#ifndef CBQ_XSTATS_BORROWS_BYTES
				borrow->xstats.borrows++;
				cl->xstats.borrows++;
#else
				borrow->xstats.borrows += qdisc_pkt_len(skb);
				cl->xstats.borrows += qdisc_pkt_len(skb);
#endif
			}
			q->tx_len = qdisc_pkt_len(skb);

			if (cl->deficit <= 0) {
				q->active[prio] = cl;
				cl = cl->next_alive;
				cl->deficit += cl->quantum;
			}
			return skb;

skip_class:
			if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
				/* Class is empty or penalized.
				   Unlink it from the active chain.
				 */
				cl_prev->next_alive = cl->next_alive;
				cl->next_alive = NULL;

				/* Did cl_tail point to it? */
				if (cl == cl_tail) {
					/* Repair it! */
					cl_tail = cl_prev;

					/* Was it the last class in this band? */
					if (cl == cl_tail) {
						/* Kill the band! */
						q->active[prio] = NULL;
						q->activemask &= ~(1<<prio);
						if (cl->q->q.qlen)
							cbq_activate_class(cl);
						return NULL;
					}

					q->active[prio] = cl_tail;
				}
				if (cl->q->q.qlen)
					cbq_activate_class(cl);

				cl = cl_prev;
			}

next_class:
			cl_prev = cl;
			cl = cl->next_alive;
		} while (cl_prev != cl_tail);
	} while (deficit);

	q->active[prio] = cl_prev;

	return NULL;
}

static __inline__ struct sk_buff *
cbq_dequeue_1(struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	unsigned activemask;

	activemask = q->activemask&0xFF;
	while (activemask) {
		int prio = ffz(~activemask);
		activemask &= ~(1<<prio);
		skb = cbq_dequeue_prio(sch, prio);
		if (skb)
			return skb;
	}
	return NULL;
}
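/*
 * Illustrative sketch (added for exposition, not part of the original
 * file): the "time integrator" used at the top of cbq_dequeue() below.
 * The CBQ clock advances by the virtual transmission time of the last
 * packet, then catches up with whatever real time is left over, so in
 * effect cbq_time = max(real_time, work done). In the kernel code the
 * tx_time step only happens when q->tx_class is set; the parameter
 * names here are stand-ins.
 */
#if 0
static void ex_integrate_clock(long *cbq_now, long *last_rt,
			       long real_now, long tx_time)
{
	long incr = real_now - *last_rt;

	*cbq_now += tx_time;	/* virtual end-of-service of last packet */
	incr -= tx_time;
	if (incr < 0)		/* the real clock lagged behind the work */
		incr = 0;
	*cbq_now += incr;	/* catch up with real time */
	*last_rt = real_now;
}
#endif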
static struct sk_buff *
cbq_dequeue(struct Qdisc *sch)
{
	struct sk_buff *skb;
	struct cbq_sched_data *q = qdisc_priv(sch);
	psched_time_t now;
	psched_tdiff_t incr;

	now = psched_get_time();
	incr = now - q->now_rt;

	if (q->tx_class) {
		psched_tdiff_t incr2;
		/* Time integrator. We calculate EOS time
		   by adding the expected packet transmission time.
		   If the real time is greater, we warp the artificial
		   clock, so that:

		   cbq_time = max(real_time, work);
		 */
		incr2 = L2T(&q->link, q->tx_len);
		q->now += incr2;
		cbq_update(q);
		if ((incr -= incr2) < 0)
			incr = 0;
	}
	q->now += incr;
	q->now_rt = now;

	for (;;) {
		q->wd_expires = 0;

		skb = cbq_dequeue_1(sch);
		if (skb) {
			sch->q.qlen--;
			sch->flags &= ~TCQ_F_THROTTLED;
			return skb;
		}

		/* All the classes are overlimit.

		   This is possible if:

		   1. The scheduler is empty.
		   2. The toplevel cutoff inhibited borrowing.
		   3. The root class is overlimit.

		   Reset the 2nd and 3rd conditions and retry.

		   Note that NS and cbq-2.0 are buggy here: peeking at
		   an arbitrary class is appropriate for ancestor-only
		   sharing, but not for the toplevel algorithm.

		   Our version is better, but slower, because it requires
		   two passes; that is unavoidable with top-level sharing.
		 */

		if (q->toplevel == TC_CBQ_MAXLEVEL &&
		    q->link.undertime == PSCHED_PASTPERFECT)
			break;

		q->toplevel = TC_CBQ_MAXLEVEL;
		q->link.undertime = PSCHED_PASTPERFECT;
	}

	/* No packets in scheduler or nobody wants to give them to us :-(
	   Sigh... start the watchdog timer in the last case. */

	if (sch->q.qlen) {
		sch->qstats.overlimits++;
		if (q->wd_expires)
			qdisc_watchdog_schedule(&q->watchdog,
						now + q->wd_expires);
	}
	return NULL;
}

/* CBQ class maintenance routines */

static void cbq_adjust_levels(struct cbq_class *this)
{
	if (this == NULL)
		return;

	do {
		int level = 0;
		struct cbq_class *cl;

		if ((cl = this->children) != NULL) {
			do {
				if (cl->level > level)
					level = cl->level;
			} while ((cl = cl->sibling) != this->children);
		}
		this->level = level+1;
	} while ((this = this->tparent) != NULL);
}

static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
{
	struct cbq_class *cl;
	struct hlist_node *n;
	unsigned int h;

	if (q->quanta[prio] == 0)
		return;

	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
			/* BUGGGG... Beware! This expression suffers from
			   arithmetic overflows!
			 */
			if (cl->priority == prio) {
				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
					q->quanta[prio];
			}
			if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) {
				printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n",
				       cl->common.classid, cl->quantum);
				cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
			}
		}
	}
}
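/*
 * Illustrative sketch (added for exposition, not part of the original
 * file): cbq_normalize_quanta() above warns that the product
 * weight*allot*nclasses can overflow a long. A 64-bit intermediate,
 * as sketched here, would avoid that; this is a suggestion, not what
 * the kernel code does.
 */
#if 0
static long ex_quantum(unsigned long weight, unsigned long allot,
		       unsigned long nclasses, unsigned long quanta_sum)
{
	unsigned long long tmp = (unsigned long long)weight * allot;

	tmp *= nclasses;
	return (long)(tmp / quanta_sum);	/* caller still range-checks */
}
#endif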
static void cbq_sync_defmap(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	struct cbq_class *split = cl->split;
	unsigned h;
	int i;

	if (split == NULL)
		return;

	for (i=0; i<=TC_PRIO_MAX; i++) {
		if (split->defaults[i] == cl && !(cl->defmap&(1<<i)))
			split->defaults[i] = NULL;
	}

	for (i=0; i<=TC_PRIO_MAX; i++) {
		int level = split->level;

		if (split->defaults[i])
			continue;

		for (h = 0; h < q->clhash.hashsize; h++) {
			struct hlist_node *n;
			struct cbq_class *c;

			hlist_for_each_entry(c, n, &q->clhash.hash[h],
					     common.hnode) {
				if (c->split == split && c->level < level &&
				    c->defmap&(1<<i)) {
					split->defaults[i] = c;
					level = c->level;
				}
			}
		}
	}
}

static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
{
	struct cbq_class *split = NULL;

	if (splitid == 0) {
		if ((split = cl->split) == NULL)
			return;
		splitid = split->common.classid;
	}

	if (split == NULL || split->common.classid != splitid) {
		for (split = cl->tparent; split; split = split->tparent)
			if (split->common.classid == splitid)
				break;
	}

	if (split == NULL)
		return;

	if (cl->split != split) {
		cl->defmap = 0;
		cbq_sync_defmap(cl);
		cl->split = split;
		cl->defmap = def&mask;
	} else
		cl->defmap = (cl->defmap&~mask)|(def&mask);

	cbq_sync_defmap(cl);
}

static void cbq_unlink_class(struct cbq_class *this)
{
	struct cbq_class *cl, **clp;
	struct cbq_sched_data *q = qdisc_priv(this->qdisc);

	qdisc_class_hash_remove(&q->clhash, &this->common);

	if (this->tparent) {
		clp=&this->sibling;
		cl = *clp;
		do {
			if (cl == this) {
				*clp = cl->sibling;
				break;
			}
			clp = &cl->sibling;
		} while ((cl = *clp) != this->sibling);

		if (this->tparent->children == this) {
			this->tparent->children = this->sibling;
			if (this->sibling == this)
				this->tparent->children = NULL;
		}
	} else {
		WARN_ON(this->sibling != this);
	}
}

static void cbq_link_class(struct cbq_class *this)
{
	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
	struct cbq_class *parent = this->tparent;

	this->sibling = this;
	qdisc_class_hash_insert(&q->clhash, &this->common);

	if (parent == NULL)
		return;

	if (parent->children == NULL) {
		parent->children = this;
	} else {
		this->sibling = parent->children->sibling;
		parent->children->sibling = this;
	}
}
static unsigned int cbq_drop(struct Qdisc* sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl, *cl_head;
	int prio;
	unsigned int len;

	for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
		if ((cl_head = q->active[prio]) == NULL)
			continue;

		cl = cl_head;
		do {
			if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
				sch->q.qlen--;
				if (!cl->q->q.qlen)
					cbq_deactivate_class(cl);
				return len;
			}
		} while ((cl = cl->next_alive) != cl_head);
	}
	return 0;
}

static void
cbq_reset(struct Qdisc* sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl;
	struct hlist_node *n;
	int prio;
	unsigned h;

	q->activemask = 0;
	q->pmask = 0;
	q->tx_class = NULL;
	q->tx_borrowed = NULL;
	qdisc_watchdog_cancel(&q->watchdog);
	hrtimer_cancel(&q->delay_timer);
	q->toplevel = TC_CBQ_MAXLEVEL;
	q->now = psched_get_time();
	q->now_rt = q->now;

	for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
		q->active[prio] = NULL;

	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
			qdisc_reset(cl->q);

			cl->next_alive = NULL;
			cl->undertime = PSCHED_PASTPERFECT;
			cl->avgidle = cl->maxidle;
			cl->deficit = cl->quantum;
			cl->cpriority = cl->priority;
		}
	}
	sch->q.qlen = 0;
}


static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
{
	if (lss->change&TCF_CBQ_LSS_FLAGS) {
		cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
		cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
	}
	if (lss->change&TCF_CBQ_LSS_EWMA)
		cl->ewma_log = lss->ewma_log;
	if (lss->change&TCF_CBQ_LSS_AVPKT)
		cl->avpkt = lss->avpkt;
	if (lss->change&TCF_CBQ_LSS_MINIDLE)
		cl->minidle = -(long)lss->minidle;
	if (lss->change&TCF_CBQ_LSS_MAXIDLE) {
		cl->maxidle = lss->maxidle;
		cl->avgidle = lss->maxidle;
	}
	if (lss->change&TCF_CBQ_LSS_OFFTIME)
		cl->offtime = lss->offtime;
	return 0;
}

static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
{
	q->nclasses[cl->priority]--;
	q->quanta[cl->priority] -= cl->weight;
	cbq_normalize_quanta(q, cl->priority);
}

static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
{
	q->nclasses[cl->priority]++;
	q->quanta[cl->priority] += cl->weight;
	cbq_normalize_quanta(q, cl->priority);
}

static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);

	if (wrr->allot)
		cl->allot = wrr->allot;
	if (wrr->weight)
		cl->weight = wrr->weight;
	if (wrr->priority) {
		cl->priority = wrr->priority-1;
		cl->cpriority = cl->priority;
		if (cl->priority >= cl->priority2)
			cl->priority2 = TC_CBQ_MAXPRIO-1;
	}

	cbq_addprio(q, cl);
	return 0;
}

static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
{
	switch (ovl->strategy) {
	case TC_CBQ_OVL_CLASSIC:
		cl->overlimit = cbq_ovl_classic;
		break;
	case TC_CBQ_OVL_DELAY:
		cl->overlimit = cbq_ovl_delay;
		break;
	case TC_CBQ_OVL_LOWPRIO:
		if (ovl->priority2-1 >= TC_CBQ_MAXPRIO ||
		    ovl->priority2-1 <= cl->priority)
			return -EINVAL;
		cl->priority2 = ovl->priority2-1;
		cl->overlimit = cbq_ovl_lowprio;
		break;
	case TC_CBQ_OVL_DROP:
		cl->overlimit = cbq_ovl_drop;
		break;
	case TC_CBQ_OVL_RCLASSIC:
		cl->overlimit = cbq_ovl_rclassic;
		break;
	default:
		return -EINVAL;
	}
	cl->penalty = ovl->penalty;
	return 0;
}
#ifdef CONFIG_NET_CLS_ACT
static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
{
	cl->police = p->police;

	if (cl->q->handle) {
		if (p->police == TC_POLICE_RECLASSIFY)
			cl->q->reshape_fail = cbq_reshape_fail;
		else
			cl->q->reshape_fail = NULL;
	}
	return 0;
}
#endif

static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
{
	cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
	return 0;
}

static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
	[TCA_CBQ_LSSOPT]	= { .len = sizeof(struct tc_cbq_lssopt) },
	[TCA_CBQ_WRROPT]	= { .len = sizeof(struct tc_cbq_wrropt) },
	[TCA_CBQ_FOPT]		= { .len = sizeof(struct tc_cbq_fopt) },
	[TCA_CBQ_OVL_STRATEGY]	= { .len = sizeof(struct tc_cbq_ovl) },
	[TCA_CBQ_RATE]		= { .len = sizeof(struct tc_ratespec) },
	[TCA_CBQ_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_CBQ_POLICE]	= { .len = sizeof(struct tc_cbq_police) },
};

static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_CBQ_MAX + 1];
	struct tc_ratespec *r;
	int err;

	err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
	if (err < 0)
		return err;

	if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
		return -EINVAL;

	r = nla_data(tb[TCA_CBQ_RATE]);

	if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
		return -EINVAL;

	err = qdisc_class_hash_init(&q->clhash);
	if (err < 0)
		goto put_rtab;

	q->link.refcnt = 1;
	q->link.sibling = &q->link;
	q->link.common.classid = sch->handle;
	q->link.qdisc = sch;
	if (!(q->link.q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
					    &pfifo_qdisc_ops,
					    sch->handle)))
		q->link.q = &noop_qdisc;

	q->link.priority = TC_CBQ_MAXPRIO-1;
	q->link.priority2 = TC_CBQ_MAXPRIO-1;
	q->link.cpriority = TC_CBQ_MAXPRIO-1;
	q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
	q->link.overlimit = cbq_ovl_classic;
	q->link.allot = psched_mtu(qdisc_dev(sch));
	q->link.quantum = q->link.allot;
	q->link.weight = q->link.R_tab->rate.rate;

	q->link.ewma_log = TC_CBQ_DEF_EWMA;
	q->link.avpkt = q->link.allot/2;
	q->link.minidle = -0x7FFFFFFF;

	qdisc_watchdog_init(&q->watchdog, sch);
	hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	q->delay_timer.function = cbq_undelay;
	q->toplevel = TC_CBQ_MAXLEVEL;
	q->now = psched_get_time();
	q->now_rt = q->now;

	cbq_link_class(&q->link);

	if (tb[TCA_CBQ_LSSOPT])
		cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));

	cbq_addprio(q, &q->link);
	return 0;

put_rtab:
	qdisc_put_rtab(q->link.R_tab);
	return err;
}

static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);

	NLA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate);
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_lssopt opt;

	opt.flags = 0;
	if (cl->borrow == NULL)
		opt.flags |= TCF_CBQ_LSS_BOUNDED;
	if (cl->share == NULL)
		opt.flags |= TCF_CBQ_LSS_ISOLATED;
	opt.ewma_log = cl->ewma_log;
	opt.level = cl->level;
	opt.avpkt = cl->avpkt;
	opt.maxidle = cl->maxidle;
	opt.minidle = (u32)(-cl->minidle);
	opt.offtime = cl->offtime;
	opt.change = ~0;
	NLA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt);
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_wrropt opt;

	opt.flags = 0;
	opt.allot = cl->allot;
	opt.priority = cl->priority+1;
	opt.cpriority = cl->cpriority+1;
	opt.weight = cl->weight;
	NLA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_ovl opt;

	opt.strategy = cl->ovl_strategy;
	opt.priority2 = cl->priority2+1;
	opt.pad = 0;
	opt.penalty = cl->penalty;
	NLA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_fopt opt;

	if (cl->split || cl->defmap) {
		opt.split = cl->split ? cl->split->common.classid : 0;
		opt.defmap = cl->defmap;
		opt.defchange = ~0;
		NLA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt);
	}
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

#ifdef CONFIG_NET_CLS_ACT
static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_police opt;

	if (cl->police) {
		opt.police = cl->police;
		opt.__res1 = 0;
		opt.__res2 = 0;
		NLA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
	}
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
#endif

static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
{
	if (cbq_dump_lss(skb, cl) < 0 ||
	    cbq_dump_rate(skb, cl) < 0 ||
	    cbq_dump_wrr(skb, cl) < 0 ||
	    cbq_dump_ovl(skb, cl) < 0 ||
#ifdef CONFIG_NET_CLS_ACT
	    cbq_dump_police(skb, cl) < 0 ||
#endif
	    cbq_dump_fopt(skb, cl) < 0)
		return -1;
	return 0;
}

static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;
	if (cbq_dump_attr(skb, &q->link) < 0)
		goto nla_put_failure;
	nla_nest_end(skb, nest);
	return skb->len;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int
cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct cbq_sched_data *q = qdisc_priv(sch);

	q->link.xstats.avgidle = q->link.avgidle;
	return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
}
static int
cbq_dump_class(struct Qdisc *sch, unsigned long arg,
	       struct sk_buff *skb, struct tcmsg *tcm)
{
	struct cbq_class *cl = (struct cbq_class*)arg;
	struct nlattr *nest;

	if (cl->tparent)
		tcm->tcm_parent = cl->tparent->common.classid;
	else
		tcm->tcm_parent = TC_H_ROOT;
	tcm->tcm_handle = cl->common.classid;
	tcm->tcm_info = cl->q->handle;

	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;
	if (cbq_dump_attr(skb, cl) < 0)
		goto nla_put_failure;
	nla_nest_end(skb, nest);
	return skb->len;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int
cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
		     struct gnet_dump *d)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class*)arg;

	cl->qstats.qlen = cl->q->q.qlen;
	cl->xstats.avgidle = cl->avgidle;
	cl->xstats.undertime = 0;

	if (cl->undertime != PSCHED_PASTPERFECT)
		cl->xstats.undertime = cl->undertime - q->now;

	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
		return -1;

	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
}

static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct cbq_class *cl = (struct cbq_class*)arg;

	if (cl) {
		if (new == NULL) {
			new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
						&pfifo_qdisc_ops,
						cl->common.classid);
			if (new == NULL)
				return -ENOBUFS;
		} else {
#ifdef CONFIG_NET_CLS_ACT
			if (cl->police == TC_POLICE_RECLASSIFY)
				new->reshape_fail = cbq_reshape_fail;
#endif
		}
		sch_tree_lock(sch);
		*old = xchg(&cl->q, new);
		qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
		qdisc_reset(*old);
		sch_tree_unlock(sch);

		return 0;
	}
	return -ENOENT;
}

static struct Qdisc *
cbq_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class*)arg;

	return cl ? cl->q : NULL;
}

static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class *)arg;

	if (cl->q->q.qlen == 0)
		cbq_deactivate_class(cl);
}

static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = cbq_class_lookup(q, classid);

	if (cl) {
		cl->refcnt++;
		return (unsigned long)cl;
	}
	return 0;
}

static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(sch);

	WARN_ON(cl->filters);

	tcf_destroy_chain(&cl->filter_list);
	qdisc_destroy(cl->q);
	qdisc_put_rtab(cl->R_tab);
	gen_kill_estimator(&cl->bstats, &cl->rate_est);
	if (cl != &q->link)
		kfree(cl);
}
static void
cbq_destroy(struct Qdisc* sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct hlist_node *n, *next;
	struct cbq_class *cl;
	unsigned h;

#ifdef CONFIG_NET_CLS_ACT
	q->rx_class = NULL;
#endif
	/*
	 * Filters must be destroyed first because we don't destroy the
	 * classes from root to leaves, which means that filters can still
	 * be bound to classes which have been destroyed already. --TGR '04
	 */
	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode)
			tcf_destroy_chain(&cl->filter_list);
	}
	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[h],
					  common.hnode)
			cbq_destroy_class(sch, cl);
	}
	qdisc_class_hash_destroy(&q->clhash);
}

static void cbq_put(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class*)arg;

	if (--cl->refcnt == 0) {
#ifdef CONFIG_NET_CLS_ACT
		spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
		struct cbq_sched_data *q = qdisc_priv(sch);

		spin_lock_bh(root_lock);
		if (q->rx_class == cl)
			q->rx_class = NULL;
		spin_unlock_bh(root_lock);
#endif

		cbq_destroy_class(sch, cl);
	}
}

static int
cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
		 unsigned long *arg)
{
	int err;
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class*)*arg;
	struct nlattr *opt = tca[TCA_OPTIONS];
	struct nlattr *tb[TCA_CBQ_MAX + 1];
	struct cbq_class *parent;
	struct qdisc_rate_table *rtab = NULL;

	if (opt == NULL)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
	if (err < 0)
		return err;

	if (cl) {
		/* Check parent */
		if (parentid) {
			if (cl->tparent &&
			    cl->tparent->common.classid != parentid)
				return -EINVAL;
			if (!cl->tparent && parentid != TC_H_ROOT)
				return -EINVAL;
		}

		if (tb[TCA_CBQ_RATE]) {
			rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
			if (rtab == NULL)
				return -EINVAL;
		}

		/* Change class parameters */
		sch_tree_lock(sch);

		if (cl->next_alive != NULL)
			cbq_deactivate_class(cl);

		if (rtab) {
			rtab = xchg(&cl->R_tab, rtab);
			qdisc_put_rtab(rtab);
		}

		if (tb[TCA_CBQ_LSSOPT])
			cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));

		if (tb[TCA_CBQ_WRROPT]) {
			cbq_rmprio(q, cl);
			cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
		}

		if (tb[TCA_CBQ_OVL_STRATEGY])
			cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));

#ifdef CONFIG_NET_CLS_ACT
		if (tb[TCA_CBQ_POLICE])
			cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
#endif

		if (tb[TCA_CBQ_FOPT])
			cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));

		if (cl->q->q.qlen)
			cbq_activate_class(cl);

		sch_tree_unlock(sch);

		if (tca[TCA_RATE])
			gen_replace_estimator(&cl->bstats, &cl->rate_est,
					      qdisc_root_sleeping_lock(sch),
					      tca[TCA_RATE]);
		return 0;
	}

	if (parentid == TC_H_ROOT)
		return -EINVAL;

	if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
	    tb[TCA_CBQ_LSSOPT] == NULL)
		return -EINVAL;

	rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
	if (rtab == NULL)
		return -EINVAL;

	if (classid) {
		err = -EINVAL;
		if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid))
			goto failure;
	} else {
		int i;
		classid = TC_H_MAKE(sch->handle,0x8000);

		for (i=0; i<0x8000; i++) {
			if (++q->hgenerator >= 0x8000)
				q->hgenerator = 1;
			if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
				break;
		}
		err = -ENOSR;
		if (i >= 0x8000)
			goto failure;
		classid = classid|q->hgenerator;
	}
	parent = &q->link;
	if (parentid) {
		parent = cbq_class_lookup(q, parentid);
		err = -EINVAL;
		if (parent == NULL)
			goto failure;
	}

	err = -ENOBUFS;
	cl = kzalloc(sizeof(*cl), GFP_KERNEL);
	if (cl == NULL)
		goto failure;
	cl->R_tab = rtab;
	rtab = NULL;
	cl->refcnt = 1;
	if (!(cl->q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
					&pfifo_qdisc_ops, classid)))
		cl->q = &noop_qdisc;
	cl->common.classid = classid;
	cl->tparent = parent;
	cl->qdisc = sch;
	cl->allot = parent->allot;
	cl->quantum = cl->allot;
	cl->weight = cl->R_tab->rate.rate;

	sch_tree_lock(sch);
	cbq_link_class(cl);
	cl->borrow = cl->tparent;
	if (cl->tparent != &q->link)
		cl->share = cl->tparent;
	cbq_adjust_levels(parent);
	cl->minidle = -0x7FFFFFFF;
	cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
	cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
	if (cl->ewma_log==0)
		cl->ewma_log = q->link.ewma_log;
	if (cl->maxidle==0)
		cl->maxidle = q->link.maxidle;
	if (cl->avpkt==0)
		cl->avpkt = q->link.avpkt;
	cl->overlimit = cbq_ovl_classic;
	if (tb[TCA_CBQ_OVL_STRATEGY])
		cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
#ifdef CONFIG_NET_CLS_ACT
	if (tb[TCA_CBQ_POLICE])
		cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
#endif
	if (tb[TCA_CBQ_FOPT])
		cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
	sch_tree_unlock(sch);

	qdisc_class_hash_grow(sch, &q->clhash);

	if (tca[TCA_RATE])
		gen_new_estimator(&cl->bstats, &cl->rate_est,
				  qdisc_root_sleeping_lock(sch), tca[TCA_RATE]);

	*arg = (unsigned long)cl;
	return 0;

failure:
	qdisc_put_rtab(rtab);
	return err;
}

static int cbq_delete(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class*)arg;
	unsigned int qlen;

	if (cl->filters || cl->children || cl == &q->link)
		return -EBUSY;

	sch_tree_lock(sch);

	qlen = cl->q->q.qlen;
	qdisc_reset(cl->q);
	qdisc_tree_decrease_qlen(cl->q, qlen);

	if (cl->next_alive)
		cbq_deactivate_class(cl);

	if (q->tx_borrowed == cl)
		q->tx_borrowed = q->tx_class;
	if (q->tx_class == cl) {
		q->tx_class = NULL;
		q->tx_borrowed = NULL;
	}
#ifdef CONFIG_NET_CLS_ACT
	if (q->rx_class == cl)
		q->rx_class = NULL;
#endif

	cbq_unlink_class(cl);
	cbq_adjust_levels(cl->tparent);
	cl->defmap = 0;
	cbq_sync_defmap(cl);

	cbq_rmprio(q, cl);
	sch_tree_unlock(sch);

	if (--cl->refcnt == 0)
		cbq_destroy_class(sch, cl);

	return 0;
}

static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class *)arg;

	if (cl == NULL)
		cl = &q->link;

	return &cl->filter_list;
}
static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
				     u32 classid)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *p = (struct cbq_class*)parent;
	struct cbq_class *cl = cbq_class_lookup(q, classid);

	if (cl) {
		if (p && p->level <= cl->level)
			return 0;
		cl->filters++;
		return (unsigned long)cl;
	}
	return 0;
}

static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class*)arg;

	cl->filters--;
}

static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl;
	struct hlist_node *n;
	unsigned h;

	if (arg->stop)
		return;

	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
			if (arg->count < arg->skip) {
				arg->count++;
				continue;
			}
			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
				arg->stop = 1;
				return;
			}
			arg->count++;
		}
	}
}

static const struct Qdisc_class_ops cbq_class_ops = {
	.graft		=	cbq_graft,
	.leaf		=	cbq_leaf,
	.qlen_notify	=	cbq_qlen_notify,
	.get		=	cbq_get,
	.put		=	cbq_put,
	.change		=	cbq_change_class,
	.delete		=	cbq_delete,
	.walk		=	cbq_walk,
	.tcf_chain	=	cbq_find_tcf,
	.bind_tcf	=	cbq_bind_filter,
	.unbind_tcf	=	cbq_unbind_filter,
	.dump		=	cbq_dump_class,
	.dump_stats	=	cbq_dump_class_stats,
};

static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
	.next		=	NULL,
	.cl_ops		=	&cbq_class_ops,
	.id		=	"cbq",
	.priv_size	=	sizeof(struct cbq_sched_data),
	.enqueue	=	cbq_enqueue,
	.dequeue	=	cbq_dequeue,
	.requeue	=	cbq_requeue,
	.drop		=	cbq_drop,
	.init		=	cbq_init,
	.reset		=	cbq_reset,
	.destroy	=	cbq_destroy,
	.change		=	NULL,
	.dump		=	cbq_dump,
	.dump_stats	=	cbq_dump_stats,
	.owner		=	THIS_MODULE,
};

static int __init cbq_module_init(void)
{
	return register_qdisc(&cbq_qdisc_ops);
}
static void __exit cbq_module_exit(void)
{
	unregister_qdisc(&cbq_qdisc_ops);
}
module_init(cbq_module_init)
module_exit(cbq_module_exit)
MODULE_LICENSE("GPL");